typohero 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +1 -0
- data/README.md +36 -11
- data/latex.pl +11 -4
- data/lib/typohero/latex.rb +538 -533
- data/lib/typohero/version.rb +1 -1
- data/lib/typohero.rb +180 -64
- data/test/typohero_test.rb +171 -128
- data/typohero.gemspec +1 -1
- metadata +2 -3
- data/bench/bench.txt +0 -16413
data/lib/typohero/version.rb
CHANGED
data/lib/typohero.rb
CHANGED
@@ -1,27 +1,37 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'typohero/version'
|
2
3
|
require 'typohero/latex'
|
3
4
|
|
4
5
|
module TypoHero
|
5
6
|
extend self
|
6
7
|
|
7
|
-
EXCLUDED_TAGS = %w(head pre code kbd math script textarea)
|
8
|
+
EXCLUDED_TAGS = %w(head pre code kbd math script style textarea)
|
8
9
|
EXCLUDED_TAGS_RE = /\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im
|
9
10
|
|
10
|
-
TOKENIZER_RE =
|
11
|
+
TOKENIZER_RE = %r{
|
12
|
+
<!--(?:(?:(?!-->).)*)-->| # comment
|
13
|
+
<!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
|
14
|
+
<[^>]+>| # opening or closing tag
|
15
|
+
\\[\(\)\[\]]| # latex begin/end
|
16
|
+
\$\$| # dollar latex begin/end
|
17
|
+
(?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+) # text without double dollar or latex
|
18
|
+
}xm
|
11
19
|
|
12
20
|
ESCAPE = {
|
13
|
-
'\\\\'
|
14
|
-
'\"'
|
15
|
-
"
|
16
|
-
'\.'
|
17
|
-
'\,'
|
18
|
-
'\-'
|
19
|
-
'\`'
|
20
|
-
'\(' => '(',
|
21
|
+
'\\\\' => '&#92;',
|
22
|
+
'\"' => '&#34;',
|
23
|
+
"\\'" => '&#39;',
|
24
|
+
'\.' => '&#46;',
|
25
|
+
'\,' => '&#44;',
|
26
|
+
'\-' => '&#45;',
|
27
|
+
'\`' => '&#96;',
|
21
28
|
}
|
29
|
+
UNESCAPE = Hash[ESCAPE.map {|k,v| [v,k[1..-1]] }]
|
22
30
|
ESCAPE_RE = Regexp.union(*ESCAPE.keys)
|
31
|
+
UNESCAPE_RE = Regexp.union(*UNESCAPE.keys)
|
23
32
|
|
24
33
|
NBSP = "\u00a0"
|
34
|
+
NBSP_THIN = "\u202F"
|
25
35
|
MDASH = "\u2014"
|
26
36
|
NDASH = "\u2013"
|
27
37
|
LDQUO = "\u201C"
|
@@ -29,16 +39,19 @@ module TypoHero
|
|
29
39
|
LSQUO = "\u2018"
|
30
40
|
RSQUO = "\u2019"
|
31
41
|
BDQUO = "\u201E"
|
42
|
+
ELLIPSIS = "\u2026"
|
32
43
|
|
33
44
|
SPECIAL = {
|
34
45
|
# enhance!
|
46
|
+
' - ' => " #{NDASH} ",
|
35
47
|
'---' => MDASH,
|
36
48
|
'--' => NDASH,
|
37
|
-
'...' =>
|
38
|
-
'. . .' =>
|
49
|
+
'...' => ELLIPSIS,
|
50
|
+
'. . .' => ELLIPSIS,
|
39
51
|
'``' => LDQUO,
|
40
52
|
"''" => RDQUO,
|
41
53
|
'`' => LSQUO,
|
54
|
+
#'\'' => RSQUO, # needs more complex treatment
|
42
55
|
',,' => BDQUO,
|
43
56
|
'(c)' => "\u00A9",
|
44
57
|
'(C)' => "\u00A9",
|
@@ -48,37 +61,21 @@ module TypoHero
|
|
48
61
|
'(TM)' => "\u2122",
|
49
62
|
# normalize for further processing
|
50
63
|
'&ldquo;' => LDQUO,
|
51
|
-
'“' => LDQUO,
|
52
|
-
'“' => LDQUO,
|
53
64
|
'&rdquo;' => RDQUO,
|
54
|
-
'”' => RDQUO,
|
55
|
-
'”' => RDQUO,
|
56
65
|
'&lsquo;' => LSQUO,
|
57
|
-
'‘' => LSQUO,
|
58
|
-
'‘' => LSQUO,
|
59
66
|
'&rsquo;' => RSQUO,
|
60
|
-
'’' => RSQUO,
|
61
|
-
'’' => RSQUO,
|
62
|
-
' ' => NBSP,
|
63
|
-
' ' => NBSP,
|
64
67
|
'&nbsp;' => NBSP,
|
65
68
|
'&ndash;' => NDASH,
|
66
|
-
'
|
67
|
-
'–' => NDASH,
|
68
|
-
'—' => MDASH,
|
69
|
-
'—' => MDASH,
|
70
|
-
'—' => MDASH,
|
71
|
-
'&' => '&',
|
72
|
-
'&' => '&',
|
69
|
+
'&mdash;' => MDASH
|
73
70
|
}
|
74
71
|
SPECIAL_RE = Regexp.union(*SPECIAL.keys)
|
75
|
-
LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/
|
72
|
+
LATEX_RE = /(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m
|
76
73
|
|
77
74
|
DASH_RE = "[#{MDASH}#{NDASH}]"
|
78
75
|
AMP_RE = '&(?:amp;)?'
|
79
76
|
LEFT_QUOTE_RE = "[#{LDQUO}#{LSQUO}#{BDQUO}]"
|
80
77
|
|
81
|
-
PRIME_RE = /(?<=\d)(''?)(
|
78
|
+
PRIME_RE = /(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m
|
82
79
|
PRIMES = {
|
83
80
|
"'" => "\u2032",
|
84
81
|
"''" => "\u2033",
|
@@ -86,22 +83,23 @@ module TypoHero
|
|
86
83
|
}
|
87
84
|
ORDINAL_RE = /(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/
|
88
85
|
|
89
|
-
MDASH_SPACE_RE = /\p{Space}
|
90
|
-
NDASH_SPACE_RE = /\p{Space}
|
86
|
+
MDASH_SPACE_RE = /\p{Space}*#{MDASH}\p{Space}*/
|
87
|
+
NDASH_SPACE_RE = /\p{Space}*#{NDASH}\p{Space}*/
|
88
|
+
MDASH_SPACE = "#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"
|
89
|
+
NDASH_SPACE = "#{NBSP}#{NDASH}#{NBSP}"
|
91
90
|
|
92
|
-
REPLACE_AMP_RE
|
91
|
+
REPLACE_AMP_RE = /(?<=\p{Space})#{AMP_RE}(?=\p{Space})/
|
93
92
|
|
94
|
-
CAPS_BEGIN_RE
|
95
|
-
CAPS_INNER_RE
|
96
|
-
|
93
|
+
CAPS_BEGIN_RE = "(^|\\p{Space}|#{LEFT_QUOTE_RE})"
|
94
|
+
CAPS_INNER_RE = "(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*" # right quote for posession (e.g. JIMMY'S)
|
95
|
+
CAPS_RE = /#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m
|
97
96
|
|
98
|
-
|
99
|
-
|
100
|
-
^['"](?=#{PUNCT_CLASS})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
|
97
|
+
RIGHT_QUOTE_RE = %r{
|
98
|
+
^['"](?=\p{Punct})\B| # Very first character is a closing quote followed by punctuation at a non-word-break
|
101
99
|
(?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
|
102
100
|
['"](?=\p{Space}|$)| # Followed by space or end of line
|
103
101
|
's\b| # Apostrophe
|
104
|
-
(?<=#{DASH_RE})['"](
|
102
|
+
(?<=#{DASH_RE})['"](?=\p{Punct})| # Dash quote punctuation (e.g. --'!), for quotations
|
105
103
|
'(?=(\d\d(?:s|\p{Space}|$))) # Decade abbreviations (the '80s)
|
106
104
|
}xm
|
107
105
|
|
@@ -125,9 +123,9 @@ module TypoHero
|
|
125
123
|
|
126
124
|
WIDONT_PARAGRAPH_RE = /\A<\/(?:#{PARAGRAPH_RE})>\Z/im
|
127
125
|
WIDONT_INLINE_RE = /\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im
|
128
|
-
WIDONT_NBSP_RE =
|
126
|
+
WIDONT_NBSP_RE = /[#{NBSP}#{NBSP_THIN}<>]/
|
129
127
|
|
130
|
-
INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/
|
128
|
+
INITIAL_QUOTE_RE = /(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m
|
131
129
|
INITIAL_QUOTES = {
|
132
130
|
LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
|
133
131
|
LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
|
@@ -135,30 +133,126 @@ module TypoHero
|
|
135
133
|
}
|
136
134
|
|
137
135
|
def tokenize(input)
|
138
|
-
excluded, latex, dollar = 0, 0, 0
|
136
|
+
comment, excluded, latex, dollar = false, 0, 0, 0
|
139
137
|
input.scan TOKENIZER_RE do |s|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
138
|
+
type =
|
139
|
+
if s =~ /\A<!--/
|
140
|
+
:comment
|
141
|
+
elsif s =~ /\A<!\[/
|
142
|
+
:cdata
|
143
|
+
end
|
144
|
+
|
145
|
+
if !type && latex == 0 && dollar.even?
|
146
|
+
if s=~ /\A</
|
147
|
+
if s =~ EXCLUDED_TAGS_RE
|
148
|
+
excluded += $1 ? -1 : 1
|
149
|
+
excluded = 0 if excluded < 0
|
150
|
+
type = :excluded
|
151
|
+
else
|
152
|
+
type = excluded == 0 ? :tag : :excluded
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
if !type && excluded == 0
|
158
|
+
case s
|
159
|
+
when /\A\\[\(\[]\Z/
|
160
|
+
latex += 1
|
161
|
+
type = :latex
|
162
|
+
when /\A\\[\)\]]\Z/
|
163
|
+
latex -= 1 if latex > 0
|
164
|
+
type = :latex
|
165
|
+
when '$$'
|
166
|
+
dollar += 1
|
167
|
+
type = :latex
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
type ||=
|
172
|
+
if excluded != 0
|
173
|
+
:excluded
|
174
|
+
elsif latex != 0 || dollar.odd?
|
175
|
+
:latex
|
176
|
+
else
|
177
|
+
:text
|
178
|
+
end
|
179
|
+
|
180
|
+
yield(s, type)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
def tokenize_with_tags(input)
|
185
|
+
tags = []
|
186
|
+
tokenize(input) do |s, type|
|
187
|
+
if type == :tag && s =~ /\A<(\/)?([^\p{Space}\/>]+)/
|
188
|
+
if $1
|
189
|
+
until tags.empty? || tags.pop == $2; end
|
190
|
+
else
|
191
|
+
tags << $2
|
192
|
+
end
|
193
|
+
end
|
194
|
+
yield(s, type, tags)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
def truncate(input, *max_words_or_separator)
|
199
|
+
max_words = max_words_or_separator.select {|i| Fixnum === i }.first
|
200
|
+
if separator = max_words_or_separator.reject {|i| Fixnum === i }.first
|
201
|
+
separator = Regexp.union(*separator) unless Regexp === separator
|
202
|
+
separator = nil unless input =~ separator
|
203
|
+
end
|
204
|
+
out, tail, truncated = '', '', false
|
205
|
+
tokenize_with_tags(input) do |s, type, tags|
|
206
|
+
if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
|
207
|
+
out << $` if type == :text
|
208
|
+
if type == :tag
|
209
|
+
if s =~ /\A<\//
|
210
|
+
tail << s
|
211
|
+
else
|
212
|
+
tags.pop
|
213
|
+
end
|
214
|
+
end
|
215
|
+
truncated = tags
|
216
|
+
break
|
217
|
+
elsif max_words == 0
|
218
|
+
if type == :text
|
219
|
+
truncated = tags
|
220
|
+
break
|
221
|
+
end
|
222
|
+
tail << s
|
150
223
|
else
|
151
|
-
|
224
|
+
if max_words && type == :text
|
225
|
+
s =~ /\A(\p{Space}*)(.*)\Z/m
|
226
|
+
ws, w = $1, $2.split(/\p{Space}+/)
|
227
|
+
if w.size > max_words
|
228
|
+
out << ws << w[0...max_words].join(' ')
|
229
|
+
truncated = tags
|
230
|
+
break
|
231
|
+
end
|
232
|
+
max_words -= w.size
|
233
|
+
end
|
234
|
+
out << s
|
152
235
|
end
|
153
|
-
yield(s, text)
|
154
236
|
end
|
237
|
+
if truncated
|
238
|
+
out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
|
239
|
+
tail << "</#{truncated.pop}>" until truncated.empty?
|
240
|
+
end
|
241
|
+
html_safe(input, out << tail)
|
242
|
+
end
|
243
|
+
|
244
|
+
def strip_tags(input)
|
245
|
+
out = ''
|
246
|
+
tokenize(input) {|s, type| out << s if type == :text || type == :latex }
|
247
|
+
html_safe(input, out)
|
155
248
|
end
|
156
249
|
|
157
250
|
def enhance(input)
|
158
251
|
tokens, text, prev_last_char = [], []
|
159
|
-
tokenize(input) do |s,
|
160
|
-
if
|
252
|
+
tokenize(input) do |s, type|
|
253
|
+
if type == :text
|
161
254
|
last_char = s[-1]
|
255
|
+
decode(s)
|
162
256
|
escape(s)
|
163
257
|
primes(s)
|
164
258
|
special(s)
|
@@ -176,8 +270,10 @@ module TypoHero
|
|
176
270
|
amp(s)
|
177
271
|
caps(s)
|
178
272
|
ordinals(s)
|
273
|
+
nobr(s)
|
274
|
+
unescape(s)
|
179
275
|
end
|
180
|
-
tokens.join
|
276
|
+
html_safe(input, tokens.join)
|
181
277
|
end
|
182
278
|
|
183
279
|
def widont(tokens)
|
@@ -189,7 +285,8 @@ module TypoHero
|
|
189
285
|
if tokens[i] =~ WIDONT_NBSP_RE
|
190
286
|
state = 0
|
191
287
|
elsif state == 1 || state == 3
|
192
|
-
if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
|
288
|
+
if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
|
289
|
+
/(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
|
193
290
|
if $1 && $2
|
194
291
|
tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
|
195
292
|
state = 0
|
@@ -209,10 +306,25 @@ module TypoHero
|
|
209
306
|
end
|
210
307
|
end
|
211
308
|
|
309
|
+
def html_safe(src, dst)
|
310
|
+
src.respond_to?(:html_safe?) && src.html_safe? ? dst.html_safe : dst
|
311
|
+
end
|
312
|
+
|
313
|
+
def decode(s)
|
314
|
+
s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do
|
315
|
+
i = $1 ? $1.to_i(16) : $2.to_i(10)
|
316
|
+
i == 38 ? '&amp;' : i.chr('UTF-8')
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
212
320
|
def escape(s)
|
213
321
|
s.gsub!(ESCAPE_RE, ESCAPE)
|
214
322
|
end
|
215
323
|
|
324
|
+
def unescape(s)
|
325
|
+
s.gsub!(UNESCAPE_RE, UNESCAPE)
|
326
|
+
end
|
327
|
+
|
216
328
|
def special(s)
|
217
329
|
s.gsub!(SPECIAL_RE, SPECIAL)
|
218
330
|
end
|
@@ -222,8 +334,8 @@ module TypoHero
|
|
222
334
|
end
|
223
335
|
|
224
336
|
def dash_spaces(s)
|
225
|
-
s.gsub!(MDASH_SPACE_RE,
|
226
|
-
s.gsub!(NDASH_SPACE_RE,
|
337
|
+
s.gsub!(MDASH_SPACE_RE, MDASH_SPACE)
|
338
|
+
s.gsub!(NDASH_SPACE_RE, NDASH_SPACE)
|
227
339
|
end
|
228
340
|
|
229
341
|
def amp(s)
|
@@ -231,20 +343,24 @@ module TypoHero
|
|
231
343
|
end
|
232
344
|
|
233
345
|
def caps(s)
|
234
|
-
s.gsub!(
|
346
|
+
s.gsub!(CAPS_RE, '\1<span class="caps">\2</span>')
|
235
347
|
end
|
236
348
|
|
237
349
|
def initial_quotes(s)
|
238
350
|
s.gsub!(INITIAL_QUOTE_RE, INITIAL_QUOTES)
|
239
351
|
end
|
240
352
|
|
353
|
+
def nobr(s)
|
354
|
+
s.gsub!(/[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/, '<span class="nobr">\0</span>')
|
355
|
+
end
|
356
|
+
|
241
357
|
def primes(s)
|
242
358
|
# Special case for inches and minutes, seconds
|
243
359
|
s.gsub!(PRIME_RE, PRIMES)
|
244
360
|
end
|
245
361
|
|
246
362
|
def ordinals(s)
|
247
|
-
s.gsub!(ORDINAL_RE, '<
|
363
|
+
s.gsub!(ORDINAL_RE, '<span class="ord">\1</span>')
|
248
364
|
end
|
249
365
|
|
250
366
|
def quotes(s, prev_last_char)
|