typohero 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
1
  module TypoHero
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
data/lib/typohero.rb CHANGED
@@ -1,27 +1,37 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'typohero/version'
2
3
  require 'typohero/latex'
3
4
 
4
5
  module TypoHero
5
6
  extend self
6
7
 
7
- EXCLUDED_TAGS = %w(head pre code kbd math script textarea)
8
+ EXCLUDED_TAGS = %w(head pre code kbd math script style textarea)
8
9
  EXCLUDED_TAGS_RE = /\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im
9
10
 
10
- TOKENIZER_RE = /<[^>]+>|\\[\(\)\[\]]|\$\$|(?:[^<\$\\]|\$(?:[^$]|\Z)|\\(?:[^\(\)\[\]]|\Z))+/im
11
+ TOKENIZER_RE = %r{
12
+ <!--(?:(?:(?!-->).)*)-->| # comment
13
+ <!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
14
+ <[^>]+>| # opening or closing tag
15
+ \\[\(\)\[\]]| # latex begin/end
16
+ \$\$| # dollar latex begin/end
17
+ (?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+) # text without double dollar or latex
18
+ }xm
11
19
 
12
20
  ESCAPE = {
13
- '\\\\' => '&#92;',
14
- '\"' => '&#34;',
15
- "\\\'" => '&#39;',
16
- '\.' => '&#46;',
17
- '\,' => '&#44;',
18
- '\-' => '&#45;',
19
- '\`' => '&#96;',
20
- '\(' => '&#40',
21
+ '\\\\' => '&#92;',
22
+ '\"' => '&#34;',
23
+ "\\'" => '&#39;',
24
+ '\.' => '&#46;',
25
+ '\,' => '&#44;',
26
+ '\-' => '&#45;',
27
+ '\`' => '&#96;',
21
28
  }
29
+ UNESCAPE = Hash[ESCAPE.map {|k,v| [v,k[1..-1]] }]
22
30
  ESCAPE_RE = Regexp.union(*ESCAPE.keys)
31
+ UNESCAPE_RE = Regexp.union(*UNESCAPE.keys)
23
32
 
24
33
  NBSP = "\u00a0"
34
+ NBSP_THIN = "\u202F"
25
35
  MDASH = "\u2014"
26
36
  NDASH = "\u2013"
27
37
  LDQUO = "\u201C"
@@ -29,16 +39,19 @@ module TypoHero
29
39
  LSQUO = "\u2018"
30
40
  RSQUO = "\u2019"
31
41
  BDQUO = "\u201E"
42
+ ELLIPSIS = "\u2026"
32
43
 
33
44
  SPECIAL = {
34
45
  # enhance!
46
+ ' - ' => " #{NDASH} ",
35
47
  '---' => MDASH,
36
48
  '--' => NDASH,
37
- '...' => "\u2026",
38
- '. . .' => "\u2026",
49
+ '...' => ELLIPSIS,
50
+ '. . .' => ELLIPSIS,
39
51
  '``' => LDQUO,
40
52
  "''" => RDQUO,
41
53
  '`' => LSQUO,
54
+ #'\'' => RSQUO, # needs more complex treatment
42
55
  ',,' => BDQUO,
43
56
  '(c)' => "\u00A9",
44
57
  '(C)' => "\u00A9",
@@ -48,37 +61,21 @@ module TypoHero
48
61
  '(TM)' => "\u2122",
49
62
  # normalize for further processing
50
63
  '&ldquo;' => LDQUO,
51
- '&#8220;' => LDQUO,
52
- '&#x201C;' => LDQUO,
53
64
  '&rdquo;' => RDQUO,
54
- '&#8221;' => RDQUO,
55
- '&#x201D;' => RDQUO,
56
65
  '&lsquo;' => LSQUO,
57
- '&#8216;' => LSQUO,
58
- '&#x2018;' => LSQUO,
59
66
  '&rsquo;' => RSQUO,
60
- '&#8217;' => RSQUO,
61
- '&#x2019;' => RSQUO,
62
- '&#160;' => NBSP,
63
- '&#xA0;' => NBSP,
64
67
  '&nbsp;' => NBSP,
65
68
  '&ndash;' => NDASH,
66
- '&#x2013;' => NDASH,
67
- '&#8211;' => NDASH,
68
- '&#x2014;' => MDASH,
69
- '&mdash;' => MDASH,
70
- '&#8212;' => MDASH,
71
- '&#38;' => '&amp;',
72
- '&#x26;' => '&amp;',
69
+ '&mdash;' => MDASH
73
70
  }
74
71
  SPECIAL_RE = Regexp.union(*SPECIAL.keys)
75
- LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/
72
+ LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m
76
73
 
77
74
  DASH_RE = "[#{MDASH}#{NDASH}]"
78
75
  AMP_RE = '&(?:amp;)?'
79
76
  LEFT_QUOTE_RE = "[#{LDQUO}#{LSQUO}#{BDQUO}]"
80
77
 
81
- PRIME_RE = /(?<=\d)(''?)(?=\p{Space}|\d|$)/
78
+ PRIME_RE = /(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m
82
79
  PRIMES = {
83
80
  "'" => "\u2032",
84
81
  "''" => "\u2033",
@@ -86,22 +83,23 @@ module TypoHero
86
83
  }
87
84
  ORDINAL_RE = /(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/
88
85
 
89
- MDASH_SPACE_RE = /\p{Space}*(#{MDASH})\p{Space}*/
90
- NDASH_SPACE_RE = /\p{Space}*(#{NDASH})\p{Space}*/
86
+ MDASH_SPACE_RE = /\p{Space}*#{MDASH}\p{Space}*/
87
+ NDASH_SPACE_RE = /\p{Space}*#{NDASH}\p{Space}*/
88
+ MDASH_SPACE = "#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"
89
+ NDASH_SPACE = "#{NBSP}#{NDASH}#{NBSP}"
91
90
 
92
- REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/m
91
+ REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/
93
92
 
94
- CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
95
- CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for possession (e.g. JIMMY'S)
96
- REPLACE_CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
93
+ CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
94
+ CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for possession (e.g. JIMMY'S)
95
+ CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
97
96
 
98
- PUNCT_CLASS = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'
99
- RIGHT_QUOTE_RE = %r{
100
- ^['"](?=#{PUNCT_CLASS})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
97
+ RIGHT_QUOTE_RE = %r{
98
+ ^['"](?=\p{Punct})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
101
99
  (?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
102
100
  ['"](?=\p{Space}|$)| # Followed by space or end of line
103
101
  's\b| # Apostrophe
104
- (?<=#{DASH_RE})['"](?=#{PUNCT_CLASS})| # Dash quote punctuation (e.g. --'!), for quotations
102
+ (?<=#{DASH_RE})['"](?=\p{Punct})| # Dash quote punctuation (e.g. --'!), for quotations
105
103
  '(?=(\d\d(?:s|\p{Space}|$))) # Decade abbreviations (the '80s)
106
104
  }xm
107
105
 
@@ -125,9 +123,9 @@ module TypoHero
125
123
 
126
124
  WIDONT_PARAGRAPH_RE = /\A<\/(?:#{PARAGRAPH_RE})>\Z/im
127
125
  WIDONT_INLINE_RE = /\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im
128
- WIDONT_NBSP_RE = /#{NBSP}|<|>/
126
+ WIDONT_NBSP_RE = /[#{NBSP}#{NBSP_THIN}<>]/
129
127
 
130
- INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/
128
+ INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m
131
129
  INITIAL_QUOTES = {
132
130
  LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
133
131
  LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
@@ -135,30 +133,126 @@ module TypoHero
135
133
  }
136
134
 
137
135
  def tokenize(input)
138
- excluded, latex, dollar = 0, 0, 0
136
+ comment, excluded, latex, dollar = false, 0, 0, 0
139
137
  input.scan TOKENIZER_RE do |s|
140
- text = false
141
- case s
142
- when /\A</
143
- excluded += ($1 ? -1 : 1) if s =~ EXCLUDED_TAGS_RE
144
- when /\A\\[\(\[]\Z/
145
- latex += 1
146
- when /\A\\[\)\]]\Z/
147
- latex -= 1
148
- when '$$'
149
- dollar += 1
138
+ type =
139
+ if s =~ /\A<!--/
140
+ :comment
141
+ elsif s =~ /\A<!\[/
142
+ :cdata
143
+ end
144
+
145
+ if !type && latex == 0 && dollar.even?
146
+ if s=~ /\A</
147
+ if s =~ EXCLUDED_TAGS_RE
148
+ excluded += $1 ? -1 : 1
149
+ excluded = 0 if excluded < 0
150
+ type = :excluded
151
+ else
152
+ type = excluded == 0 ? :tag : :excluded
153
+ end
154
+ end
155
+ end
156
+
157
+ if !type && excluded == 0
158
+ case s
159
+ when /\A\\[\(\[]\Z/
160
+ latex += 1
161
+ type = :latex
162
+ when /\A\\[\)\]]\Z/
163
+ latex -= 1 if latex > 0
164
+ type = :latex
165
+ when '$$'
166
+ dollar += 1
167
+ type = :latex
168
+ end
169
+ end
170
+
171
+ type ||=
172
+ if excluded != 0
173
+ :excluded
174
+ elsif latex != 0 || dollar.odd?
175
+ :latex
176
+ else
177
+ :text
178
+ end
179
+
180
+ yield(s, type)
181
+ end
182
+ end
183
+
184
+ def tokenize_with_tags(input)
185
+ tags = []
186
+ tokenize(input) do |s, type|
187
+ if type == :tag && s =~ /\A<(\/)?([^\p{Space}\/>]+)/
188
+ if $1
189
+ until tags.empty? || tags.pop == $2; end
190
+ else
191
+ tags << $2
192
+ end
193
+ end
194
+ yield(s, type, tags)
195
+ end
196
+ end
197
+
198
+ def truncate(input, *max_words_or_separator)
199
+ max_words = max_words_or_separator.select {|i| Fixnum === i }.first
200
+ if separator = max_words_or_separator.reject {|i| Fixnum === i }.first
201
+ separator = Regexp.union(*separator) unless Regexp === separator
202
+ separator = nil unless input =~ separator
203
+ end
204
+ out, tail, truncated = '', '', false
205
+ tokenize_with_tags(input) do |s, type, tags|
206
+ if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
207
+ out << $` if type == :text
208
+ if type == :tag
209
+ if s =~ /\A<\//
210
+ tail << s
211
+ else
212
+ tags.pop
213
+ end
214
+ end
215
+ truncated = tags
216
+ break
217
+ elsif max_words == 0
218
+ if type == :text
219
+ truncated = tags
220
+ break
221
+ end
222
+ tail << s
150
223
  else
151
- text = true if latex == 0 && dollar.even? && excluded == 0
224
+ if max_words && type == :text
225
+ s =~ /\A(\p{Space}*)(.*)\Z/m
226
+ ws, w = $1, $2.split(/\p{Space}+/)
227
+ if w.size > max_words
228
+ out << ws << w[0...max_words].join(' ')
229
+ truncated = tags
230
+ break
231
+ end
232
+ max_words -= w.size
233
+ end
234
+ out << s
152
235
  end
153
- yield(s, text)
154
236
  end
237
+ if truncated
238
+ out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
239
+ tail << "</#{truncated.pop}>" until truncated.empty?
240
+ end
241
+ html_safe(input, out << tail)
242
+ end
243
+
244
+ def strip_tags(input)
245
+ out = ''
246
+ tokenize(input) {|s, type| out << s if type == :text || type == :latex }
247
+ html_safe(input, out)
155
248
  end
156
249
 
157
250
  def enhance(input)
158
251
  tokens, text, prev_last_char = [], []
159
- tokenize(input) do |s, t|
160
- if t
252
+ tokenize(input) do |s, type|
253
+ if type == :text
161
254
  last_char = s[-1]
255
+ decode(s)
162
256
  escape(s)
163
257
  primes(s)
164
258
  special(s)
@@ -176,8 +270,10 @@ module TypoHero
176
270
  amp(s)
177
271
  caps(s)
178
272
  ordinals(s)
273
+ nobr(s)
274
+ unescape(s)
179
275
  end
180
- tokens.join
276
+ html_safe(input, tokens.join)
181
277
  end
182
278
 
183
279
  def widont(tokens)
@@ -189,7 +285,8 @@ module TypoHero
189
285
  if tokens[i] =~ WIDONT_NBSP_RE
190
286
  state = 0
191
287
  elsif state == 1 || state == 3
192
- if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m : /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
288
+ if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
289
+ /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
193
290
  if $1 && $2
194
291
  tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
195
292
  state = 0
@@ -209,10 +306,25 @@ module TypoHero
209
306
  end
210
307
  end
211
308
 
309
+ def html_safe(src, dst)
310
+ src.respond_to?(:html_safe?) && src.html_safe? ? dst.html_safe : dst
311
+ end
312
+
313
+ def decode(s)
314
+ s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do
315
+ i = $1 ? $1.to_i(16) : $2.to_i(10)
316
+ i == 38 ? '&amp;' : i.chr('UTF-8')
317
+ end
318
+ end
319
+
212
320
  def escape(s)
213
321
  s.gsub!(ESCAPE_RE, ESCAPE)
214
322
  end
215
323
 
324
+ def unescape(s)
325
+ s.gsub!(UNESCAPE_RE, UNESCAPE)
326
+ end
327
+
216
328
  def special(s)
217
329
  s.gsub!(SPECIAL_RE, SPECIAL)
218
330
  end
@@ -222,8 +334,8 @@ module TypoHero
222
334
  end
223
335
 
224
336
  def dash_spaces(s)
225
- s.gsub!(MDASH_SPACE_RE, "\u2009\\1\u2009")
226
- s.gsub!(NDASH_SPACE_RE, ' \1 ')
337
+ s.gsub!(MDASH_SPACE_RE, MDASH_SPACE)
338
+ s.gsub!(NDASH_SPACE_RE, NDASH_SPACE)
227
339
  end
228
340
 
229
341
  def amp(s)
@@ -231,20 +343,24 @@ module TypoHero
231
343
  end
232
344
 
233
345
  def caps(s)
234
- s.gsub!(REPLACE_CAPS_RE, '\1<span class="caps">\2</span>')
346
+ s.gsub!(CAPS_RE, '\1<span class="caps">\2</span>')
235
347
  end
236
348
 
237
349
  def initial_quotes(s)
238
350
  s.gsub!(INITIAL_QUOTE_RE, INITIAL_QUOTES)
239
351
  end
240
352
 
353
+ def nobr(s)
354
+ s.gsub!(/[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/, '<span class="nobr">\0</span>')
355
+ end
356
+
241
357
  def primes(s)
242
358
  # Special case for inches and minutes, seconds
243
359
  s.gsub!(PRIME_RE, PRIMES)
244
360
  end
245
361
 
246
362
  def ordinals(s)
247
- s.gsub!(ORDINAL_RE, '<sup>\1</sup>')
363
+ s.gsub!(ORDINAL_RE, '<span class="ord">\1</span>')
248
364
  end
249
365
 
250
366
  def quotes(s, prev_last_char)