typohero 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +1 -0
- data/README.md +36 -11
- data/latex.pl +11 -4
- data/lib/typohero/latex.rb +538 -533
- data/lib/typohero/version.rb +1 -1
- data/lib/typohero.rb +180 -64
- data/test/typohero_test.rb +171 -128
- data/typohero.gemspec +1 -1
- metadata +2 -3
- data/bench/bench.txt +0 -16413
data/lib/typohero/version.rb
CHANGED
data/lib/typohero.rb
CHANGED
@@ -1,27 +1,37 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'typohero/version'
|
2
3
|
require 'typohero/latex'
|
3
4
|
|
4
5
|
module TypoHero
|
5
6
|
extend self
|
6
7
|
|
7
|
-
EXCLUDED_TAGS = %w(head pre code kbd math script textarea)
|
8
|
+
EXCLUDED_TAGS = %w(head pre code kbd math script style textarea)
|
8
9
|
EXCLUDED_TAGS_RE = /\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im
|
9
10
|
|
10
|
-
TOKENIZER_RE =
|
11
|
+
TOKENIZER_RE = %r{
|
12
|
+
<!--(?:(?:(?!-->).)*)-->| # comment
|
13
|
+
<!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
|
14
|
+
<[^>]+>| # opening or closing tag
|
15
|
+
\\[\(\)\[\]]| # latex begin/end
|
16
|
+
\$\$| # dollar latex begin/end
|
17
|
+
(?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+) # text without double dollar or latex
|
18
|
+
}xm
|
11
19
|
|
12
20
|
ESCAPE = {
|
13
|
-
'\\\\'
|
14
|
-
'\"'
|
15
|
-
"
|
16
|
-
'\.'
|
17
|
-
'\,'
|
18
|
-
'\-'
|
19
|
-
'\`'
|
20
|
-
'\(' => '(',
|
21
|
+
'\\\\' => '\',
|
22
|
+
'\"' => '"',
|
23
|
+
"\\'" => ''',
|
24
|
+
'\.' => '.',
|
25
|
+
'\,' => ',',
|
26
|
+
'\-' => '-',
|
27
|
+
'\`' => '`',
|
21
28
|
}
|
29
|
+
UNESCAPE = Hash[ESCAPE.map {|k,v| [v,k[1..-1]] }]
|
22
30
|
ESCAPE_RE = Regexp.union(*ESCAPE.keys)
|
31
|
+
UNESCAPE_RE = Regexp.union(*UNESCAPE.keys)
|
23
32
|
|
24
33
|
NBSP = "\u00a0"
|
34
|
+
NBSP_THIN = "\u202F"
|
25
35
|
MDASH = "\u2014"
|
26
36
|
NDASH = "\u2013"
|
27
37
|
LDQUO = "\u201C"
|
@@ -29,16 +39,19 @@ module TypoHero
|
|
29
39
|
LSQUO = "\u2018"
|
30
40
|
RSQUO = "\u2019"
|
31
41
|
BDQUO = "\u201E"
|
42
|
+
ELLIPSIS = "\u2026"
|
32
43
|
|
33
44
|
SPECIAL = {
|
34
45
|
# enhance!
|
46
|
+
' - ' => " #{NDASH} ",
|
35
47
|
'---' => MDASH,
|
36
48
|
'--' => NDASH,
|
37
|
-
'...' =>
|
38
|
-
'. . .' =>
|
49
|
+
'...' => ELLIPSIS,
|
50
|
+
'. . .' => ELLIPSIS,
|
39
51
|
'``' => LDQUO,
|
40
52
|
"''" => RDQUO,
|
41
53
|
'`' => LSQUO,
|
54
|
+
#'\'' => RSQUO, # needs more complex treatment
|
42
55
|
',,' => BDQUO,
|
43
56
|
'(c)' => "\u00A9",
|
44
57
|
'(C)' => "\u00A9",
|
@@ -48,37 +61,21 @@ module TypoHero
|
|
48
61
|
'(TM)' => "\u2122",
|
49
62
|
# normalize for further processing
|
50
63
|
'“' => LDQUO,
|
51
|
-
'“' => LDQUO,
|
52
|
-
'“' => LDQUO,
|
53
64
|
'”' => RDQUO,
|
54
|
-
'”' => RDQUO,
|
55
|
-
'”' => RDQUO,
|
56
65
|
'‘' => LSQUO,
|
57
|
-
'‘' => LSQUO,
|
58
|
-
'‘' => LSQUO,
|
59
66
|
'’' => RSQUO,
|
60
|
-
'’' => RSQUO,
|
61
|
-
'’' => RSQUO,
|
62
|
-
' ' => NBSP,
|
63
|
-
' ' => NBSP,
|
64
67
|
' ' => NBSP,
|
65
68
|
'–' => NDASH,
|
66
|
-
'
|
67
|
-
'–' => NDASH,
|
68
|
-
'—' => MDASH,
|
69
|
-
'—' => MDASH,
|
70
|
-
'—' => MDASH,
|
71
|
-
'&' => '&',
|
72
|
-
'&' => '&',
|
69
|
+
'—' => MDASH
|
73
70
|
}
|
74
71
|
SPECIAL_RE = Regexp.union(*SPECIAL.keys)
|
75
|
-
LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/
|
72
|
+
LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m
|
76
73
|
|
77
74
|
DASH_RE = "[#{MDASH}#{NDASH}]"
|
78
75
|
AMP_RE = '&(?:amp;)?'
|
79
76
|
LEFT_QUOTE_RE = "[#{LDQUO}#{LSQUO}#{BDQUO}]"
|
80
77
|
|
81
|
-
PRIME_RE = /(?<=\d)(''?)(
|
78
|
+
PRIME_RE = /(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m
|
82
79
|
PRIMES = {
|
83
80
|
"'" => "\u2032",
|
84
81
|
"''" => "\u2033",
|
@@ -86,22 +83,23 @@ module TypoHero
|
|
86
83
|
}
|
87
84
|
ORDINAL_RE = /(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/
|
88
85
|
|
89
|
-
MDASH_SPACE_RE = /\p{Space}
|
90
|
-
NDASH_SPACE_RE = /\p{Space}
|
86
|
+
MDASH_SPACE_RE = /\p{Space}*#{MDASH}\p{Space}*/
|
87
|
+
NDASH_SPACE_RE = /\p{Space}*#{NDASH}\p{Space}*/
|
88
|
+
MDASH_SPACE = "#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"
|
89
|
+
NDASH_SPACE = "#{NBSP}#{NDASH}#{NBSP}"
|
91
90
|
|
92
|
-
REPLACE_AMP_RE
|
91
|
+
REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/
|
93
92
|
|
94
|
-
CAPS_BEGIN_RE
|
95
|
-
CAPS_INNER_RE
|
96
|
-
|
93
|
+
CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
|
94
|
+
CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for possession (e.g. JIMMY'S)
|
95
|
+
CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
|
97
96
|
|
98
|
-
|
99
|
-
|
100
|
-
^['"](?=#{PUNCT_CLASS})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
|
97
|
+
RIGHT_QUOTE_RE = %r{
|
98
|
+
^['"](?=\p{Punct})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
|
101
99
|
(?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
|
102
100
|
['"](?=\p{Space}|$)| # Followed by space or end of line
|
103
101
|
's\b| # Apostrophe
|
104
|
-
(?<=#{DASH_RE})['"](
|
102
|
+
(?<=#{DASH_RE})['"](?=\p{Punct})| # Dash quote punctuation (e.g. --'!), for quotations
|
105
103
|
'(?=(\d\d(?:s|\p{Space}|$))) # Decade abbreviations (the '80s)
|
106
104
|
}xm
|
107
105
|
|
@@ -125,9 +123,9 @@ module TypoHero
|
|
125
123
|
|
126
124
|
WIDONT_PARAGRAPH_RE = /\A<\/(?:#{PARAGRAPH_RE})>\Z/im
|
127
125
|
WIDONT_INLINE_RE = /\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im
|
128
|
-
WIDONT_NBSP_RE =
|
126
|
+
WIDONT_NBSP_RE = /[#{NBSP}#{NBSP_THIN}<>]/
|
129
127
|
|
130
|
-
INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/
|
128
|
+
INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m
|
131
129
|
INITIAL_QUOTES = {
|
132
130
|
LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
|
133
131
|
LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
|
@@ -135,30 +133,126 @@ module TypoHero
|
|
135
133
|
}
|
136
134
|
|
137
135
|
# Splits +input+ into tokens with TOKENIZER_RE and yields every token
# together with a symbol classifying it:
#
#   :comment  - token starts with "<!--"
#   :cdata    - token starts with "<!["
#   :excluded - a tag matched by EXCLUDED_TAGS_RE, or any token seen while
#               at least one such tag is still open
#   :tag      - any other token starting with "<"
#   :latex    - a LaTeX delimiter (\( \) \[ \] or $$), or any token seen
#               while such a region is still open
#   :text     - everything else
#
# input - the String to scan.
#
# Yields the token String and its type Symbol.
def tokenize(input)
  # excluded: nesting depth of excluded tags
  # latex:    nesting depth of \( ... \) and \[ ... \] regions
  # dollar:   number of $$ delimiters seen; an odd count means "inside $$ ... $$"
  excluded, latex, dollar = 0, 0, 0
  input.scan TOKENIZER_RE do |s|
    type =
      if s =~ /\A<!--/
        :comment
      elsif s =~ /\A<!\[/
        :cdata
      end

    # Markup tags are only interpreted outside of LaTeX regions.
    if !type && latex == 0 && dollar.even? && s =~ /\A</
      if s =~ EXCLUDED_TAGS_RE
        # $1 is the "/" of a closing tag, so closing tags decrease the depth.
        excluded += $1 ? -1 : 1
        # Never let stray closing tags drive the depth negative.
        excluded = 0 if excluded < 0
        type = :excluded
      else
        type = excluded == 0 ? :tag : :excluded
      end
    end

    # LaTeX delimiters are only interpreted outside of excluded tags.
    if !type && excluded == 0
      case s
      when /\A\\[\(\[]\Z/
        latex += 1
        type = :latex
      when /\A\\[\)\]]\Z/
        latex -= 1 if latex > 0
        type = :latex
      when '$$'
        dollar += 1
        type = :latex
      end
    end

    # Anything still unclassified inherits the type of the region it is in.
    type ||=
      if excluded != 0
        :excluded
      elsif latex != 0 || dollar.odd?
        :latex
      else
        :text
      end

    yield(s, type)
  end
end
|
183
|
+
|
184
|
+
# Like #tokenize, but additionally tracks which tags are currently open.
#
# input - the String to scan.
#
# Yields the token String, its type Symbol and the Array of open tag names
# (innermost last). The same Array instance is yielded for every token, so
# changes made to it by the block are seen by later iterations.
#
# Note: every non-closing :tag token is pushed, including self-closing ones
# such as "<br/>".
def tokenize_with_tags(input)
  open_tags = []
  tokenize(input) do |token, kind|
    if kind == :tag && token =~ /\A<(\/)?([^\p{Space}\/>]+)/
      closing, tag_name = $1, $2
      if closing
        # Unwind the stack up to and including the matching opening tag
        # (or until it is empty when there is no match).
        loop do
          break if open_tags.empty? || open_tags.pop == tag_name
        end
      else
        open_tags << tag_name
      end
    end
    yield(token, kind, open_tags)
  end
end
|
197
|
+
|
198
|
+
# Truncates +input+ after a number of words and/or at a separator, closing
# any tags that are still open at the cut.
#
# input                  - the String to truncate.
# max_words_or_separator - optional arguments in any order: the first Integer
#                          is the maximum number of words, the first
#                          non-Integer is the separator (a Regexp, or a
#                          String / Array of Strings passed to Regexp.union).
#                          A separator that does not occur in +input+ is
#                          ignored.
#
# When text was cut off, trailing whitespace and punctuation of the kept part
# are replaced by ELLIPSIS and closing tags are appended for every tag still
# open. The result is passed through #html_safe.
def truncate(input, *max_words_or_separator)
  # Integer instead of Fixnum: Fixnum was deprecated in Ruby 2.4 and removed
  # in Ruby 3.2, while Integer matches the same values on every version.
  max_words = max_words_or_separator.find { |arg| Integer === arg }
  separator = max_words_or_separator.find { |arg| !(Integer === arg) }
  if separator
    separator = Regexp.union(*separator) unless Regexp === separator
    separator = nil unless input =~ separator
  end

  # out:       everything kept before the cut
  # tail:      closing markup appended after the ellipsis
  # truncated: false, or the stack of tags open at the cut
  # dup keeps the buffers mutable even under frozen string literals.
  out, tail, truncated = ''.dup, ''.dup, false
  tokenize_with_tags(input) do |s, type, tags|
    if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
      # Keep the text in front of the separator ($` is set by the match above).
      out << $` if type == :text
      if type == :tag
        if s =~ /\A<\//
          # The separator is a closing tag: emit it after the ellipsis.
          tail << s
        else
          # The separator is an opening tag: it is dropped, so it must not be closed.
          tags.pop
        end
      end
      truncated = tags
      break
    elsif max_words == 0
      # Word budget is used up: stop at the next text, but keep collecting
      # the markup that directly follows the last kept word.
      if type == :text
        truncated = tags
        break
      end
      tail << s
    else
      if max_words && type == :text
        s =~ /\A(\p{Space}*)(.*)\Z/m
        ws, w = $1, $2.split(/\p{Space}+/)
        if w.size > max_words
          out << ws << w[0...max_words].join(' ')
          truncated = tags
          break
        end
        max_words -= w.size
      end
      out << s
    end
  end

  if truncated
    out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
    tail << "</#{truncated.pop}>" until truncated.empty?
  end
  html_safe(input, out << tail)
end
|
243
|
+
|
244
|
+
# Returns +input+ reduced to its :text and :latex tokens; every other token
# type produced by #tokenize is dropped. The result is passed through
# #html_safe.
def strip_tags(input)
  kept = ''
  tokenize(input) do |chunk, kind|
    kept << chunk if kind == :text || kind == :latex
  end
  html_safe(input, kept)
end
|
156
249
|
|
157
250
|
def enhance(input)
|
158
251
|
tokens, text, prev_last_char = [], []
|
159
|
-
tokenize(input) do |s,
|
160
|
-
if
|
252
|
+
tokenize(input) do |s, type|
|
253
|
+
if type == :text
|
161
254
|
last_char = s[-1]
|
255
|
+
decode(s)
|
162
256
|
escape(s)
|
163
257
|
primes(s)
|
164
258
|
special(s)
|
@@ -176,8 +270,10 @@ module TypoHero
|
|
176
270
|
amp(s)
|
177
271
|
caps(s)
|
178
272
|
ordinals(s)
|
273
|
+
nobr(s)
|
274
|
+
unescape(s)
|
179
275
|
end
|
180
|
-
tokens.join
|
276
|
+
html_safe(input, tokens.join)
|
181
277
|
end
|
182
278
|
|
183
279
|
def widont(tokens)
|
@@ -189,7 +285,8 @@ module TypoHero
|
|
189
285
|
if tokens[i] =~ WIDONT_NBSP_RE
|
190
286
|
state = 0
|
191
287
|
elsif state == 1 || state == 3
|
192
|
-
if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
|
288
|
+
if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
|
289
|
+
/(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
|
193
290
|
if $1 && $2
|
194
291
|
tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
|
195
292
|
state = 0
|
@@ -209,10 +306,25 @@ module TypoHero
|
|
209
306
|
end
|
210
307
|
end
|
211
308
|
|
309
|
+
# Carries the "html safe" flag of +src+ over to +dst+.
#
# src - the original input; only queried via #html_safe? when it responds to it.
# dst - the processed output.
#
# Returns dst.html_safe when src reports itself as html safe, otherwise dst
# unchanged.
def html_safe(src, dst)
  return dst unless src.respond_to?(:html_safe?) && src.html_safe?

  dst.html_safe
end
|
312
|
+
|
313
|
+
# Replaces numeric character references (decimal "&#65;" or hexadecimal
# "&#x41;", case-insensitive) in +s+ with the character they denote.
# Modifies +s+ in place.
#
# References that do not denote a valid UTF-8 code point (surrogates, values
# above U+10FFFF) are left untouched instead of raising RangeError.
#
# Returns +s+, or nil when it contains no numeric reference.
def decode(s)
  s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do |entity|
    hex, dec = $1, $2
    code = hex ? hex.to_i(16) : dec.to_i(10)
    if code == 38
      # NOTE(review): as written this yields the same result as the generic
      # branch; it looks like the ampersand was meant to stay escaped
      # (e.g. as "&amp;") - confirm against the upstream source.
      '&'
    else
      begin
        code.chr('UTF-8')
      rescue RangeError
        # Not a valid code point: keep the reference verbatim.
        entity
      end
    end
  end
end
|
319
|
+
|
212
320
|
# Replaces every backslash escape sequence matched by ESCAPE_RE with the
# value ESCAPE maps it to. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def escape(s)
  s.gsub!(ESCAPE_RE) { |sequence| ESCAPE[sequence] }
end
|
215
323
|
|
324
|
+
# Reverses #escape: every occurrence matched by UNESCAPE_RE is replaced with
# the value UNESCAPE maps it to (the escaped sequence without its leading
# backslash). Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def unescape(s)
  s.gsub!(UNESCAPE_RE) { |placeholder| UNESCAPE[placeholder] }
end
|
327
|
+
|
216
328
|
# Replaces every sequence matched by SPECIAL_RE with the value SPECIAL maps
# it to. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def special(s)
  s.gsub!(SPECIAL_RE) { |sequence| SPECIAL[sequence] }
end
|
@@ -222,8 +334,8 @@ module TypoHero
|
|
222
334
|
end
|
223
335
|
|
224
336
|
# Normalizes the spacing around dashes in +s+ (in place): an em dash with its
# surrounding whitespace becomes MDASH_SPACE, an en dash with its surrounding
# whitespace becomes NDASH_SPACE.
#
# Returns the result of the en dash substitution: +s+, or nil when no en dash
# was replaced.
def dash_spaces(s)
  s.gsub!(MDASH_SPACE_RE) { MDASH_SPACE }
  s.gsub!(NDASH_SPACE_RE) { NDASH_SPACE }
end
|
228
340
|
|
229
341
|
def amp(s)
|
@@ -231,20 +343,24 @@ module TypoHero
|
|
231
343
|
end
|
232
344
|
|
233
345
|
# Wraps every run matched by CAPS_RE in <span class="caps">, keeping the
# prefix captured in front of it (group 1) outside of the span.
# Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def caps(s)
  wrapped = '\1<span class="caps">\2</span>'
  s.gsub!(CAPS_RE, wrapped)
end
|
236
348
|
|
237
349
|
# Replaces every quote matched by INITIAL_QUOTE_RE with its span-wrapped
# counterpart from INITIAL_QUOTES. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def initial_quotes(s)
  s.gsub!(INITIAL_QUOTE_RE) { |quote| INITIAL_QUOTES[quote] }
end
|
240
352
|
|
353
|
+
# Wraps hyphenated compounds (word characters joined by one or more "-")
# in <span class="nobr">. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def nobr(s)
  hyphenated = /[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/
  wrapped = '<span class="nobr">\0</span>'
  s.gsub!(hyphenated, wrapped)
end
|
356
|
+
|
241
357
|
# Turns the straight quotes matched by PRIME_RE (quotes directly after a
# digit, as used for inches, minutes and seconds) into the prime characters
# listed in PRIMES. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def primes(s)
  s.gsub!(PRIME_RE) { |marks| PRIMES[marks] }
end
|
245
361
|
|
246
362
|
# Wraps every ordinal suffix matched by ORDINAL_RE (st, nd, rd or th directly
# after a digit) in <span class="ord">. Modifies +s+ in place.
#
# Returns +s+, or nil when nothing was replaced.
def ordinals(s)
  wrapped = '<span class="ord">\1</span>'
  s.gsub!(ORDINAL_RE, wrapped)
end
|
249
365
|
|
250
366
|
def quotes(s, prev_last_char)
|