typohero 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  module TypoHero
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
data/lib/typohero.rb CHANGED
@@ -1,27 +1,37 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'typohero/version'
2
3
  require 'typohero/latex'
3
4
 
4
5
  module TypoHero
5
6
  extend self
6
7
 
7
- EXCLUDED_TAGS = %w(head pre code kbd math script textarea)
8
+ EXCLUDED_TAGS = %w(head pre code kbd math script style textarea)
8
9
  EXCLUDED_TAGS_RE = /\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im
9
10
 
10
- TOKENIZER_RE = /<[^>]+>|\\[\(\)\[\]]|\$\$|(?:[^<\$\\]|\$(?:[^$]|\Z)|\\(?:[^\(\)\[\]]|\Z))+/im
11
+ TOKENIZER_RE = %r{
12
+ <!--(?:(?:(?!-->).)*)-->| # comment
13
+ <!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
14
+ <[^>]+>| # opening or closing tag
15
+ \\[\(\)\[\]]| # latex begin/end
16
+ \$\$| # dollar latex begin/end
17
+ (?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+) # text without double dollar or latex
18
+ }xm
11
19
 
12
20
  ESCAPE = {
13
- '\\\\' => '&#92;',
14
- '\"' => '&#34;',
15
- "\\\'" => '&#39;',
16
- '\.' => '&#46;',
17
- '\,' => '&#44;',
18
- '\-' => '&#45;',
19
- '\`' => '&#96;',
20
- '\(' => '&#40',
21
+ '\\\\' => '&#92;',
22
+ '\"' => '&#34;',
23
+ "\\'" => '&#39;',
24
+ '\.' => '&#46;',
25
+ '\,' => '&#44;',
26
+ '\-' => '&#45;',
27
+ '\`' => '&#96;',
21
28
  }
29
+ UNESCAPE = Hash[ESCAPE.map {|k,v| [v,k[1..-1]] }]
22
30
  ESCAPE_RE = Regexp.union(*ESCAPE.keys)
31
+ UNESCAPE_RE = Regexp.union(*UNESCAPE.keys)
23
32
 
24
33
  NBSP = "\u00a0"
34
+ NBSP_THIN = "\u202F"
25
35
  MDASH = "\u2014"
26
36
  NDASH = "\u2013"
27
37
  LDQUO = "\u201C"
@@ -29,16 +39,19 @@ module TypoHero
29
39
  LSQUO = "\u2018"
30
40
  RSQUO = "\u2019"
31
41
  BDQUO = "\u201E"
42
+ ELLIPSIS = "\u2026"
32
43
 
33
44
  SPECIAL = {
34
45
  # enhance!
46
+ ' - ' => " #{NDASH} ",
35
47
  '---' => MDASH,
36
48
  '--' => NDASH,
37
- '...' => "\u2026",
38
- '. . .' => "\u2026",
49
+ '...' => ELLIPSIS,
50
+ '. . .' => ELLIPSIS,
39
51
  '``' => LDQUO,
40
52
  "''" => RDQUO,
41
53
  '`' => LSQUO,
54
+ #'\'' => RSQUO, # needs more complex treatment
42
55
  ',,' => BDQUO,
43
56
  '(c)' => "\u00A9",
44
57
  '(C)' => "\u00A9",
@@ -48,37 +61,21 @@ module TypoHero
48
61
  '(TM)' => "\u2122",
49
62
  # normalize for further processing
50
63
  '&ldquo;' => LDQUO,
51
- '&#8220;' => LDQUO,
52
- '&#x201C;' => LDQUO,
53
64
  '&rdquo;' => RDQUO,
54
- '&#8221;' => RDQUO,
55
- '&#x201D;' => RDQUO,
56
65
  '&lsquo;' => LSQUO,
57
- '&#8216;' => LSQUO,
58
- '&#x2018;' => LSQUO,
59
66
  '&rsquo;' => RSQUO,
60
- '&#8217;' => RSQUO,
61
- '&#x2019;' => RSQUO,
62
- '&#160;' => NBSP,
63
- '&#xA0;' => NBSP,
64
67
  '&nbsp;' => NBSP,
65
68
  '&ndash;' => NDASH,
66
- '&#x2013;' => NDASH,
67
- '&#8211;' => NDASH,
68
- '&#x2014;' => MDASH,
69
- '&mdash;' => MDASH,
70
- '&#8212;' => MDASH,
71
- '&#38;' => '&amp;',
72
- '&#x26;' => '&amp;',
69
+ '&mdash;' => MDASH
73
70
  }
74
71
  SPECIAL_RE = Regexp.union(*SPECIAL.keys)
75
- LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/
72
+ LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m
76
73
 
77
74
  DASH_RE = "[#{MDASH}#{NDASH}]"
78
75
  AMP_RE = '&(?:amp;)?'
79
76
  LEFT_QUOTE_RE = "[#{LDQUO}#{LSQUO}#{BDQUO}]"
80
77
 
81
- PRIME_RE = /(?<=\d)(''?)(?=\p{Space}|\d|$)/
78
+ PRIME_RE = /(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m
82
79
  PRIMES = {
83
80
  "'" => "\u2032",
84
81
  "''" => "\u2033",
@@ -86,22 +83,23 @@ module TypoHero
86
83
  }
87
84
  ORDINAL_RE = /(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/
88
85
 
89
- MDASH_SPACE_RE = /\p{Space}*(#{MDASH})\p{Space}*/
90
- NDASH_SPACE_RE = /\p{Space}*(#{NDASH})\p{Space}*/
86
+ MDASH_SPACE_RE = /\p{Space}*#{MDASH}\p{Space}*/
87
+ NDASH_SPACE_RE = /\p{Space}*#{NDASH}\p{Space}*/
88
+ MDASH_SPACE = "#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"
89
+ NDASH_SPACE = "#{NBSP}#{NDASH}#{NBSP}"
91
90
 
92
- REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/m
91
+ REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/
93
92
 
94
- CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
95
- CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for posession (e.g. JIMMY'S)
96
- REPLACE_CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
93
+ CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
94
+ CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for posession (e.g. JIMMY'S)
95
+ CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
97
96
 
98
- PUNCT_CLASS = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'
99
- RIGHT_QUOTE_RE = %r{
100
- ^['"](?=#{PUNCT_CLASS})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
97
+ RIGHT_QUOTE_RE = %r{
98
+ ^['"](?=\p{Punct})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
101
99
  (?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
102
100
  ['"](?=\p{Space}|$)| # Followed by space or end of line
103
101
  's\b| # Apostrophe
104
- (?<=#{DASH_RE})['"](?=#{PUNCT_CLASS})| # Dash quote punctuation (e.g. --'!), for quotations
102
+ (?<=#{DASH_RE})['"](?=\p{Punct})| # Dash quote punctuation (e.g. --'!), for quotations
105
103
  '(?=(\d\d(?:s|\p{Space}|$))) # Decade abbreviations (the '80s)
106
104
  }xm
107
105
 
@@ -125,9 +123,9 @@ module TypoHero
125
123
 
126
124
  WIDONT_PARAGRAPH_RE = /\A<\/(?:#{PARAGRAPH_RE})>\Z/im
127
125
  WIDONT_INLINE_RE = /\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im
128
- WIDONT_NBSP_RE = /#{NBSP}|<|>/
126
+ WIDONT_NBSP_RE = /[#{NBSP}#{NBSP_THIN}<>]/
129
127
 
130
- INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/
128
+ INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m
131
129
  INITIAL_QUOTES = {
132
130
  LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
133
131
  LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
@@ -135,30 +133,126 @@ module TypoHero
135
133
  }
136
134
 
137
135
  def tokenize(input)
138
- excluded, latex, dollar = 0, 0, 0
136
+ comment, excluded, latex, dollar = false, 0, 0, 0
139
137
  input.scan TOKENIZER_RE do |s|
140
- text = false
141
- case s
142
- when /\A</
143
- excluded += ($1 ? -1 : 1) if s =~ EXCLUDED_TAGS_RE
144
- when /\A\\[\(\[]\Z/
145
- latex += 1
146
- when /\A\\[\)\]]\Z/
147
- latex -= 1
148
- when '$$'
149
- dollar += 1
138
+ type =
139
+ if s =~ /\A<!--/
140
+ :comment
141
+ elsif s =~ /\A<!\[/
142
+ :cdata
143
+ end
144
+
145
+ if !type && latex == 0 && dollar.even?
146
+ if s=~ /\A</
147
+ if s =~ EXCLUDED_TAGS_RE
148
+ excluded += $1 ? -1 : 1
149
+ excluded = 0 if excluded < 0
150
+ type = :excluded
151
+ else
152
+ type = excluded == 0 ? :tag : :excluded
153
+ end
154
+ end
155
+ end
156
+
157
+ if !type && excluded == 0
158
+ case s
159
+ when /\A\\[\(\[]\Z/
160
+ latex += 1
161
+ type = :latex
162
+ when /\A\\[\)\]]\Z/
163
+ latex -= 1 if latex > 0
164
+ type = :latex
165
+ when '$$'
166
+ dollar += 1
167
+ type = :latex
168
+ end
169
+ end
170
+
171
+ type ||=
172
+ if excluded != 0
173
+ :excluded
174
+ elsif latex != 0 || dollar.odd?
175
+ :latex
176
+ else
177
+ :text
178
+ end
179
+
180
+ yield(s, type)
181
+ end
182
+ end
183
+
184
+ def tokenize_with_tags(input)
185
+ tags = []
186
+ tokenize(input) do |s, type|
187
+ if type == :tag && s =~ /\A<(\/)?([^\p{Space}\/>]+)/
188
+ if $1
189
+ until tags.empty? || tags.pop == $2; end
190
+ else
191
+ tags << $2
192
+ end
193
+ end
194
+ yield(s, type, tags)
195
+ end
196
+ end
197
+
198
+ def truncate(input, *max_words_or_separator)
199
+ max_words = max_words_or_separator.select {|i| Fixnum === i }.first
200
+ if separator = max_words_or_separator.reject {|i| Fixnum === i }.first
201
+ separator = Regexp.union(*separator) unless Regexp === separator
202
+ separator = nil unless input =~ separator
203
+ end
204
+ out, tail, truncated = '', '', false
205
+ tokenize_with_tags(input) do |s, type, tags|
206
+ if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
207
+ out << $` if type == :text
208
+ if type == :tag
209
+ if s =~ /\A<\//
210
+ tail << s
211
+ else
212
+ tags.pop
213
+ end
214
+ end
215
+ truncated = tags
216
+ break
217
+ elsif max_words == 0
218
+ if type == :text
219
+ truncated = tags
220
+ break
221
+ end
222
+ tail << s
150
223
  else
151
- text = true if latex == 0 && dollar.even? && excluded == 0
224
+ if max_words && type == :text
225
+ s =~ /\A(\p{Space}*)(.*)\Z/m
226
+ ws, w = $1, $2.split(/\p{Space}+/)
227
+ if w.size > max_words
228
+ out << ws << w[0...max_words].join(' ')
229
+ truncated = tags
230
+ break
231
+ end
232
+ max_words -= w.size
233
+ end
234
+ out << s
152
235
  end
153
- yield(s, text)
154
236
  end
237
+ if truncated
238
+ out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
239
+ tail << "</#{truncated.pop}>" until truncated.empty?
240
+ end
241
+ html_safe(input, out << tail)
242
+ end
243
+
244
+ def strip_tags(input)
245
+ out = ''
246
+ tokenize(input) {|s, type| out << s if type == :text || type == :latex }
247
+ html_safe(input, out)
155
248
  end
156
249
 
157
250
  def enhance(input)
158
251
  tokens, text, prev_last_char = [], []
159
- tokenize(input) do |s, t|
160
- if t
252
+ tokenize(input) do |s, type|
253
+ if type == :text
161
254
  last_char = s[-1]
255
+ decode(s)
162
256
  escape(s)
163
257
  primes(s)
164
258
  special(s)
@@ -176,8 +270,10 @@ module TypoHero
176
270
  amp(s)
177
271
  caps(s)
178
272
  ordinals(s)
273
+ nobr(s)
274
+ unescape(s)
179
275
  end
180
- tokens.join
276
+ html_safe(input, tokens.join)
181
277
  end
182
278
 
183
279
  def widont(tokens)
@@ -189,7 +285,8 @@ module TypoHero
189
285
  if tokens[i] =~ WIDONT_NBSP_RE
190
286
  state = 0
191
287
  elsif state == 1 || state == 3
192
- if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m : /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
288
+ if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
289
+ /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
193
290
  if $1 && $2
194
291
  tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
195
292
  state = 0
@@ -209,10 +306,25 @@ module TypoHero
209
306
  end
210
307
  end
211
308
 
309
+ def html_safe(src, dst)
310
+ src.respond_to?(:html_safe?) && src.html_safe? ? dst.html_safe : dst
311
+ end
312
+
313
+ def decode(s)
314
+ s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do
315
+ i = $1 ? $1.to_i(16) : $2.to_i(10)
316
+ i == 38 ? '&amp;' : i.chr('UTF-8')
317
+ end
318
+ end
319
+
212
320
  def escape(s)
213
321
  s.gsub!(ESCAPE_RE, ESCAPE)
214
322
  end
215
323
 
324
+ def unescape(s)
325
+ s.gsub!(UNESCAPE_RE, UNESCAPE)
326
+ end
327
+
216
328
  def special(s)
217
329
  s.gsub!(SPECIAL_RE, SPECIAL)
218
330
  end
@@ -222,8 +334,8 @@ module TypoHero
222
334
  end
223
335
 
224
336
  def dash_spaces(s)
225
- s.gsub!(MDASH_SPACE_RE, "\u2009\\1\u2009")
226
- s.gsub!(NDASH_SPACE_RE, ' \1 ')
337
+ s.gsub!(MDASH_SPACE_RE, MDASH_SPACE)
338
+ s.gsub!(NDASH_SPACE_RE, NDASH_SPACE)
227
339
  end
228
340
 
229
341
  def amp(s)
@@ -231,20 +343,24 @@ module TypoHero
231
343
  end
232
344
 
233
345
  def caps(s)
234
- s.gsub!(REPLACE_CAPS_RE, '\1<span class="caps">\2</span>')
346
+ s.gsub!(CAPS_RE, '\1<span class="caps">\2</span>')
235
347
  end
236
348
 
237
349
  def initial_quotes(s)
238
350
  s.gsub!(INITIAL_QUOTE_RE, INITIAL_QUOTES)
239
351
  end
240
352
 
353
+ def nobr(s)
354
+ s.gsub!(/[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/, '<span class="nobr">\0</span>')
355
+ end
356
+
241
357
  def primes(s)
242
358
  # Special case for inches and minutes, seconds
243
359
  s.gsub!(PRIME_RE, PRIMES)
244
360
  end
245
361
 
246
362
  def ordinals(s)
247
- s.gsub!(ORDINAL_RE, '<sup>\1</sup>')
363
+ s.gsub!(ORDINAL_RE, '<span class="ord">\1</span>')
248
364
  end
249
365
 
250
366
  def quotes(s, prev_last_char)