RubyGems - typohero - Versions diffs - 0.0.2 → 0.0.3 - Mend

typohero 0.0.2 → 0.0.3

This diff compares the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (12)

data/lib/typohero/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TypoHero
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
 end

data/lib/typohero.rb CHANGED Viewed

@@ -1,27 +1,37 @@
+# -*- coding: utf-8 -*-
 require 'typohero/version'
 require 'typohero/latex'
 module TypoHero
   extend self
-  EXCLUDED_TAGS = %w(head pre code kbd math script textarea)
+  EXCLUDED_TAGS = %w(head pre code kbd math script style textarea)
   EXCLUDED_TAGS_RE = /\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im
-  TOKENIZER_RE = /<[^>]+>|\\[\(\)\[\]]|\$\$|(?:[^<\$\\]|\$(?:[^$]|\Z)|\\(?:[^\(\)\[\]]|\Z))+/im
+  TOKENIZER_RE = %r{
+    <!--(?:(?:(?!-->).)*)-->|            # comment
+    <!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
+    <[^>]+>|                             # opening or closing tag
+    \\[\(\)\[\]]|                        # latex begin/end
+    \$\$|                                # dollar latex begin/end
+    (?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+)   # text without double dollar or latex
+  }xm
   ESCAPE = {
-    '\\\\'   => '&#92;',
-    '\"'     => '&#34;',
-    "\\\'"   => '&#39;',
-    '\.'     => '&#46;',
-    '\,'     => '&#44;',
-    '\-'     => '&#45;',
-    '\`'     => '&#96;',
-    '\('     => '&#40',
+    '\\\\'  => '&#92;',
+    '\"'    => '&#34;',
+    "\\'"   => '&#39;',
+    '\.'    => '&#46;',
+    '\,'    => '&#44;',
+    '\-'    => '&#45;',
+    '\`'    => '&#96;',
   }
+  UNESCAPE = Hash[ESCAPE.map {|k,v| [v,k[1..-1]] }]
   ESCAPE_RE = Regexp.union(*ESCAPE.keys)
+  UNESCAPE_RE = Regexp.union(*UNESCAPE.keys)
   NBSP  = "\u00a0"
+  NBSP_THIN = "\u202F"
   MDASH = "\u2014"
   NDASH = "\u2013"
   LDQUO = "\u201C"
@@ -29,16 +39,19 @@ module TypoHero
   LSQUO = "\u2018"
   RSQUO = "\u2019"
   BDQUO = "\u201E"
+  ELLIPSIS = "\u2026"
   SPECIAL = {
     # enhance!
+    ' - '      => " #{NDASH} ",
     '---'      => MDASH,
     '--'       => NDASH,
-    '...'      => "\u2026",
-    '. . .'    => "\u2026",
+    '...'      => ELLIPSIS,
+    '. . .'    => ELLIPSIS,
     '``'       => LDQUO,
     "''"       => RDQUO,
     '`'        => LSQUO,
+    #'\''        => RSQUO, # needs more complex treatment
     ',,'       => BDQUO,
     '(c)'      => "\u00A9",
     '(C)'      => "\u00A9",
@@ -48,37 +61,21 @@ module TypoHero
     '(TM)'     => "\u2122",
     # normalize for further processing
     '&ldquo;'  => LDQUO,
-    '&#8220;'  => LDQUO,
-    '&#x201C;' => LDQUO,
     '&rdquo;'  => RDQUO,
-    '&#8221;'  => RDQUO,
-    '&#x201D;' => RDQUO,
     '&lsquo;'  => LSQUO,
-    '&#8216;'  => LSQUO,
-    '&#x2018;' => LSQUO,
     '&rsquo;'  => RSQUO,
-    '&#8217;'  => RSQUO,
-    '&#x2019;' => RSQUO,
-    '&#160;'   => NBSP,
-    '&#xA0;'   => NBSP,
     '&nbsp;'   => NBSP,
     '&ndash;'  => NDASH,
-    '&#x2013;' => NDASH,
-    '&#8211;'  => NDASH,
-    '&#x2014;' => MDASH,
-    '&mdash;'  => MDASH,
-    '&#8212;'  => MDASH,
-    '&#38;'    => '&amp;',
-    '&#x26;'   => '&amp;',
+    '&mdash;'  => MDASH
   }
   SPECIAL_RE = Regexp.union(*SPECIAL.keys)
-  LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/
+  LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m
   DASH_RE  = "[#{MDASH}#{NDASH}]"
   AMP_RE   = '&(?:amp;)?'
   LEFT_QUOTE_RE = "[#{LDQUO}#{LSQUO}#{BDQUO}]"
-  PRIME_RE = /(?<=\d)(''?)(?=\p{Space}|\d|$)/
+  PRIME_RE = /(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m
   PRIMES = {
    "'"   => "\u2032",
    "''"  => "\u2033",
@@ -86,22 +83,23 @@ module TypoHero
   }
   ORDINAL_RE = /(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/
-  MDASH_SPACE_RE = /\p{Space}*(#{MDASH})\p{Space}*/
-  NDASH_SPACE_RE = /\p{Space}*(#{NDASH})\p{Space}*/
+  MDASH_SPACE_RE = /\p{Space}*#{MDASH}\p{Space}*/
+  NDASH_SPACE_RE = /\p{Space}*#{NDASH}\p{Space}*/
+  MDASH_SPACE = "#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"
+  NDASH_SPACE = "#{NBSP}#{NDASH}#{NBSP}"
-  REPLACE_AMP_RE  = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/m
+  REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/
-  CAPS_BEGIN_RE   = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
-  CAPS_INNER_RE   = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for posession (e.g. JIMMY'S)
-  REPLACE_CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
+  CAPS_BEGIN_RE  = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
+  CAPS_INNER_RE  = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for posession (e.g. JIMMY'S)
+  CAPS_RE        = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
-  PUNCT_CLASS = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'
-  RIGHT_QUOTE_RE  = %r{
-    ^['"](?=#{PUNCT_CLASS})\B|                  # Very first character is a closing quote followed by punctuation at a non-word-break
+  RIGHT_QUOTE_RE = %r{
+    ^['"](?=\p{Punct})\B|                       # Very first character is a closing quote followed by punctuation at a non-word-break
     (?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
     ['"](?=\p{Space}|$)|                        # Followed by space or end of line
     's\b|                                       # Apostrophe
-    (?<=#{DASH_RE})['"](?=#{PUNCT_CLASS})|      # Dash quote punctuation (e.g. --'!), for quotations
+    (?<=#{DASH_RE})['"](?=\p{Punct})|           # Dash quote punctuation (e.g. --'!), for quotations
     '(?=(\d\d(?:s|\p{Space}|$)))                # Decade abbreviations (the '80s)
   }xm
@@ -125,9 +123,9 @@ module TypoHero
   WIDONT_PARAGRAPH_RE = /\A<\/(?:#{PARAGRAPH_RE})>\Z/im
   WIDONT_INLINE_RE = /\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im
-  WIDONT_NBSP_RE = /#{NBSP}|<|>/
+  WIDONT_NBSP_RE = /[#{NBSP}#{NBSP_THIN}<>]/
-  INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/
+  INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m
   INITIAL_QUOTES = {
     LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
     LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
@@ -135,30 +133,126 @@ module TypoHero
   }
   def tokenize(input)
-    excluded, latex, dollar = 0, 0, 0
+    comment, excluded, latex, dollar = false, 0, 0, 0
     input.scan TOKENIZER_RE do |s|
-      text = false
-      case s
-      when /\A</
-        excluded += ($1 ? -1 : 1) if s =~ EXCLUDED_TAGS_RE
-      when /\A\\[\(\[]\Z/
-        latex += 1
-      when /\A\\[\)\]]\Z/
-        latex -= 1
-      when '$$'
-        dollar += 1
+      type =
+        if s =~ /\A<!--/
+          :comment
+        elsif s =~ /\A<!\[/
+          :cdata
+        end
+      if !type && latex == 0 && dollar.even?
+        if s=~ /\A</
+          if s =~ EXCLUDED_TAGS_RE
+            excluded += $1 ? -1 : 1
+            excluded = 0 if excluded < 0
+            type = :excluded
+          else
+            type = excluded == 0 ? :tag : :excluded
+          end
+        end
+      end
+      if !type && excluded == 0
+        case s
+        when /\A\\[\(\[]\Z/
+          latex += 1
+          type = :latex
+        when /\A\\[\)\]]\Z/
+          latex -= 1 if latex > 0
+          type = :latex
+        when '$$'
+          dollar += 1
+          type = :latex
+        end
+      end
+      type ||=
+        if excluded != 0
+          :excluded
+        elsif latex != 0 || dollar.odd?
+          :latex
+        else
+          :text
+        end
+      yield(s, type)
+    end
+  end
+  def tokenize_with_tags(input)
+    tags = []
+    tokenize(input) do |s, type|
+      if type == :tag && s =~ /\A<(\/)?([^\p{Space}\/>]+)/
+        if $1
+          until tags.empty? || tags.pop == $2; end
+        else
+          tags << $2
+        end
+      end
+      yield(s, type, tags)
+    end
+  end
+  def truncate(input, *max_words_or_separator)
+    max_words = max_words_or_separator.select {|i| Fixnum === i }.first
+    if separator = max_words_or_separator.reject {|i| Fixnum === i }.first
+      separator = Regexp.union(*separator) unless Regexp === separator
+      separator = nil unless input =~ separator
+    end
+    out, tail, truncated = '', '', false
+    tokenize_with_tags(input) do |s, type, tags|
+      if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
+        out << $` if type == :text
+        if type == :tag
+          if s =~ /\A<\//
+            tail << s
+          else
+            tags.pop
+          end
+        end
+        truncated = tags
+        break
+      elsif max_words == 0
+        if type == :text
+          truncated = tags
+          break
+        end
+        tail << s
       else
-        text = true if latex == 0 && dollar.even? && excluded == 0
+        if max_words && type == :text
+          s =~ /\A(\p{Space}*)(.*)\Z/m
+          ws, w = $1, $2.split(/\p{Space}+/)
+          if w.size > max_words
+            out << ws << w[0...max_words].join(' ')
+            truncated = tags
+            break
+          end
+          max_words -= w.size
+        end
+        out << s
       end
-      yield(s, text)
     end
+    if truncated
+      out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
+      tail << "</#{truncated.pop}>" until truncated.empty?
+    end
+    html_safe(input, out << tail)
+  end
+  def strip_tags(input)
+    out = ''
+    tokenize(input) {|s, type| out << s if type == :text || type == :latex }
+    html_safe(input, out)
   end
   def enhance(input)
     tokens, text, prev_last_char = [], []
-    tokenize(input) do |s, t|
-      if t
+    tokenize(input) do |s, type|
+      if type == :text
         last_char = s[-1]
+        decode(s)
         escape(s)
         primes(s)
         special(s)
@@ -176,8 +270,10 @@ module TypoHero
       amp(s)
       caps(s)
       ordinals(s)
+      nobr(s)
+      unescape(s)
     end
-    tokens.join
+    html_safe(input, tokens.join)
   end
   def widont(tokens)
@@ -189,7 +285,8 @@ module TypoHero
         if tokens[i] =~ WIDONT_NBSP_RE
           state = 0
         elsif state == 1 || state == 3
-          if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m : /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
+          if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
+                                        /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
             if $1 && $2
               tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
               state = 0
@@ -209,10 +306,25 @@ module TypoHero
     end
   end
+  def html_safe(src, dst)
+    src.respond_to?(:html_safe?) && src.html_safe? ? dst.html_safe : dst
+  end
+  def decode(s)
+    s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do
+      i = $1 ? $1.to_i(16) : $2.to_i(10)
+      i == 38 ? '&amp;' : i.chr('UTF-8')
+    end
+  end
   def escape(s)
     s.gsub!(ESCAPE_RE, ESCAPE)
   end
+  def unescape(s)
+    s.gsub!(UNESCAPE_RE, UNESCAPE)
+  end
   def special(s)
     s.gsub!(SPECIAL_RE, SPECIAL)
   end
@@ -222,8 +334,8 @@ module TypoHero
   end
   def dash_spaces(s)
-    s.gsub!(MDASH_SPACE_RE, "\u2009\\1\u2009")
-    s.gsub!(NDASH_SPACE_RE, ' \1 ')
+    s.gsub!(MDASH_SPACE_RE, MDASH_SPACE)
+    s.gsub!(NDASH_SPACE_RE, NDASH_SPACE)
   end
   def amp(s)
@@ -231,20 +343,24 @@ module TypoHero
   end
   def caps(s)
-    s.gsub!(REPLACE_CAPS_RE, '\1<span class="caps">\2</span>')
+    s.gsub!(CAPS_RE, '\1<span class="caps">\2</span>')
   end
   def initial_quotes(s)
     s.gsub!(INITIAL_QUOTE_RE, INITIAL_QUOTES)
   end
+  def nobr(s)
+    s.gsub!(/[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/, '<span class="nobr">\0</span>')
+  end
   def primes(s)
     # Special case for inches and minutes, seconds
     s.gsub!(PRIME_RE, PRIMES)
   end
   def ordinals(s)
-    s.gsub!(ORDINAL_RE, '<sup>\1</sup>')
+    s.gsub!(ORDINAL_RE, '<span class="ord">\1</span>')
   end
   def quotes(s, prev_last_char)