sterile 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +1 -1
- data/.yaropts +1 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -1
- data/lib/sterile/data/codepoints_data.rb +46527 -0
- data/lib/sterile/data/html_entities_data.rb +264 -0
- data/lib/sterile/data/smart_format_rules.rb +45 -0
- data/lib/sterile/entities.rb +48 -0
- data/lib/sterile/smart_format.rb +41 -0
- data/lib/sterile/string_extensions.rb +19 -0
- data/lib/sterile/tags.rb +78 -0
- data/lib/sterile/titlecase.rb +123 -0
- data/lib/sterile/transliterate.rb +65 -0
- data/lib/sterile/utilities.rb +43 -0
- data/lib/sterile/version.rb +1 -1
- data/lib/sterile.rb +7 -314
- metadata +13 -5
- data/lib/sterile/codepoints.rb +0 -46523
- data/lib/sterile/html_entities.rb +0 -260
- data/lib/sterile/smart_format_rules.rb +0 -41
@@ -0,0 +1,264 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
# @private
|
5
|
+
class Data
|
6
|
+
def self.html_entities_data
|
7
|
+
{
|
8
|
+
"quot" => 34,
|
9
|
+
"amp" => 38,
|
10
|
+
"apos" => 39,
|
11
|
+
"lt" => 60,
|
12
|
+
"gt" => 62,
|
13
|
+
"nbsp" => 160,
|
14
|
+
"iexcl" => 161,
|
15
|
+
"cent" => 162,
|
16
|
+
"pound" => 163,
|
17
|
+
"curren" => 164,
|
18
|
+
"yen" => 165,
|
19
|
+
"brvbar" => 166,
|
20
|
+
"sect" => 167,
|
21
|
+
"uml" => 168,
|
22
|
+
"copy" => 169,
|
23
|
+
"ordf" => 170,
|
24
|
+
"laquo" => 171,
|
25
|
+
"not" => 172,
|
26
|
+
"shy" => 173,
|
27
|
+
"reg" => 174,
|
28
|
+
"macr" => 175,
|
29
|
+
"deg" => 176,
|
30
|
+
"plusmn" => 177,
|
31
|
+
"sup2" => 178,
|
32
|
+
"sup3" => 179,
|
33
|
+
"acute" => 180,
|
34
|
+
"micro" => 181,
|
35
|
+
"para" => 182,
|
36
|
+
"middot" => 183,
|
37
|
+
"cedil" => 184,
|
38
|
+
"sup1" => 185,
|
39
|
+
"ordm" => 186,
|
40
|
+
"raquo" => 187,
|
41
|
+
"frac14" => 188,
|
42
|
+
"frac12" => 189,
|
43
|
+
"frac34" => 190,
|
44
|
+
"iquest" => 191,
|
45
|
+
"Agrave" => 192,
|
46
|
+
"Aacute" => 193,
|
47
|
+
"Acirc" => 194,
|
48
|
+
"Atilde" => 195,
|
49
|
+
"Auml" => 196,
|
50
|
+
"Aring" => 197,
|
51
|
+
"AElig" => 198,
|
52
|
+
"Ccedil" => 199,
|
53
|
+
"Egrave" => 200,
|
54
|
+
"Eacute" => 201,
|
55
|
+
"Ecirc" => 202,
|
56
|
+
"Euml" => 203,
|
57
|
+
"Igrave" => 204,
|
58
|
+
"Iacute" => 205,
|
59
|
+
"Icirc" => 206,
|
60
|
+
"Iuml" => 207,
|
61
|
+
"ETH" => 208,
|
62
|
+
"Ntilde" => 209,
|
63
|
+
"Ograve" => 210,
|
64
|
+
"Oacute" => 211,
|
65
|
+
"Ocirc" => 212,
|
66
|
+
"Otilde" => 213,
|
67
|
+
"Ouml" => 214,
|
68
|
+
"times" => 215,
|
69
|
+
"Oslash" => 216,
|
70
|
+
"Ugrave" => 217,
|
71
|
+
"Uacute" => 218,
|
72
|
+
"Ucirc" => 219,
|
73
|
+
"Uuml" => 220,
|
74
|
+
"Yacute" => 221,
|
75
|
+
"THORN" => 222,
|
76
|
+
"szlig" => 223,
|
77
|
+
"agrave" => 224,
|
78
|
+
"aacute" => 225,
|
79
|
+
"acirc" => 226,
|
80
|
+
"atilde" => 227,
|
81
|
+
"auml" => 228,
|
82
|
+
"aring" => 229,
|
83
|
+
"aelig" => 230,
|
84
|
+
"ccedil" => 231,
|
85
|
+
"egrave" => 232,
|
86
|
+
"eacute" => 233,
|
87
|
+
"ecirc" => 234,
|
88
|
+
"euml" => 235,
|
89
|
+
"igrave" => 236,
|
90
|
+
"iacute" => 237,
|
91
|
+
"icirc" => 238,
|
92
|
+
"iuml" => 239,
|
93
|
+
"eth" => 240,
|
94
|
+
"ntilde" => 241,
|
95
|
+
"ograve" => 242,
|
96
|
+
"oacute" => 243,
|
97
|
+
"ocirc" => 244,
|
98
|
+
"otilde" => 245,
|
99
|
+
"ouml" => 246,
|
100
|
+
"divide" => 247,
|
101
|
+
"oslash" => 248,
|
102
|
+
"ugrave" => 249,
|
103
|
+
"uacute" => 250,
|
104
|
+
"ucirc" => 251,
|
105
|
+
"uuml" => 252,
|
106
|
+
"yacute" => 253,
|
107
|
+
"thorn" => 254,
|
108
|
+
"yuml" => 255,
|
109
|
+
"OElig" => 338,
|
110
|
+
"oelig" => 339,
|
111
|
+
"Scaron" => 352,
|
112
|
+
"scaron" => 353,
|
113
|
+
"Yuml" => 376,
|
114
|
+
"fnof" => 402,
|
115
|
+
"circ" => 710,
|
116
|
+
"tilde" => 732,
|
117
|
+
"Alpha" => 913,
|
118
|
+
"Beta" => 914,
|
119
|
+
"Gamma" => 915,
|
120
|
+
"Delta" => 916,
|
121
|
+
"Epsilon" => 917,
|
122
|
+
"Zeta" => 918,
|
123
|
+
"Eta" => 919,
|
124
|
+
"Theta" => 920,
|
125
|
+
"Iota" => 921,
|
126
|
+
"Kappa" => 922,
|
127
|
+
"Lambda" => 923,
|
128
|
+
"Mu" => 924,
|
129
|
+
"Nu" => 925,
|
130
|
+
"Xi" => 926,
|
131
|
+
"Omicron" => 927,
|
132
|
+
"Pi" => 928,
|
133
|
+
"Rho" => 929,
|
134
|
+
"Sigma" => 931,
|
135
|
+
"Tau" => 932,
|
136
|
+
"Upsilon" => 933,
|
137
|
+
"Phi" => 934,
|
138
|
+
"Chi" => 935,
|
139
|
+
"Psi" => 936,
|
140
|
+
"Omega" => 937,
|
141
|
+
"alpha" => 945,
|
142
|
+
"beta" => 946,
|
143
|
+
"gamma" => 947,
|
144
|
+
"delta" => 948,
|
145
|
+
"epsilon" => 949,
|
146
|
+
"zeta" => 950,
|
147
|
+
"eta" => 951,
|
148
|
+
"theta" => 952,
|
149
|
+
"iota" => 953,
|
150
|
+
"kappa" => 954,
|
151
|
+
"lambda" => 955,
|
152
|
+
"mu" => 956,
|
153
|
+
"nu" => 957,
|
154
|
+
"xi" => 958,
|
155
|
+
"omicron" => 959,
|
156
|
+
"pi" => 960,
|
157
|
+
"rho" => 961,
|
158
|
+
"sigmaf" => 962,
|
159
|
+
"sigma" => 963,
|
160
|
+
"tau" => 964,
|
161
|
+
"upsilon" => 965,
|
162
|
+
"phi" => 966,
|
163
|
+
"chi" => 967,
|
164
|
+
"psi" => 968,
|
165
|
+
"omega" => 969,
|
166
|
+
"thetasym" => 977,
|
167
|
+
"upsih" => 978,
|
168
|
+
"piv" => 982,
|
169
|
+
"ensp" => 8194,
|
170
|
+
"emsp" => 8195,
|
171
|
+
"thinsp" => 8201,
|
172
|
+
"zwnj" => 8204,
|
173
|
+
"zwj" => 8205,
|
174
|
+
"lrm" => 8206,
|
175
|
+
"rlm" => 8207,
|
176
|
+
"ndash" => 8211,
|
177
|
+
"mdash" => 8212,
|
178
|
+
"lsquo" => 8216,
|
179
|
+
"rsquo" => 8217,
|
180
|
+
"sbquo" => 8218,
|
181
|
+
"ldquo" => 8220,
|
182
|
+
"rdquo" => 8221,
|
183
|
+
"bdquo" => 8222,
|
184
|
+
"dagger" => 8224,
|
185
|
+
"Dagger" => 8225,
|
186
|
+
"bull" => 8226,
|
187
|
+
"hellip" => 8230,
|
188
|
+
"permil" => 8240,
|
189
|
+
"prime" => 8242,
|
190
|
+
"Prime" => 8243,
|
191
|
+
"lsaquo" => 8249,
|
192
|
+
"rsaquo" => 8250,
|
193
|
+
"oline" => 8254,
|
194
|
+
"frasl" => 8260,
|
195
|
+
"euro" => 8364,
|
196
|
+
"image" => 8465,
|
197
|
+
"weierp" => 8472,
|
198
|
+
"real" => 8476,
|
199
|
+
"trade" => 8482,
|
200
|
+
"alefsym" => 8501,
|
201
|
+
"larr" => 8592,
|
202
|
+
"uarr" => 8593,
|
203
|
+
"rarr" => 8594,
|
204
|
+
"darr" => 8595,
|
205
|
+
"harr" => 8596,
|
206
|
+
"crarr" => 8629,
|
207
|
+
"lArr" => 8656,
|
208
|
+
"uArr" => 8657,
|
209
|
+
"rArr" => 8658,
|
210
|
+
"dArr" => 8659,
|
211
|
+
"hArr" => 8660,
|
212
|
+
"forall" => 8704,
|
213
|
+
"part" => 8706,
|
214
|
+
"exist" => 8707,
|
215
|
+
"empty" => 8709,
|
216
|
+
"nabla" => 8711,
|
217
|
+
"isin" => 8712,
|
218
|
+
"notin" => 8713,
|
219
|
+
"ni" => 8715,
|
220
|
+
"prod" => 8719,
|
221
|
+
"sum" => 8721,
|
222
|
+
"minus" => 8722,
|
223
|
+
"lowast" => 8727,
|
224
|
+
"radic" => 8730,
|
225
|
+
"prop" => 8733,
|
226
|
+
"infin" => 8734,
|
227
|
+
"ang" => 8736,
|
228
|
+
"and" => 8743,
|
229
|
+
"or" => 8744,
|
230
|
+
"cap" => 8745,
|
231
|
+
"cup" => 8746,
|
232
|
+
"int" => 8747,
|
233
|
+
"there4" => 8756,
|
234
|
+
"sim" => 8764,
|
235
|
+
"cong" => 8773,
|
236
|
+
"asymp" => 8776,
|
237
|
+
"ne" => 8800,
|
238
|
+
"equiv" => 8801,
|
239
|
+
"le" => 8804,
|
240
|
+
"ge" => 8805,
|
241
|
+
"sub" => 8834,
|
242
|
+
"sup" => 8835,
|
243
|
+
"nsub" => 8836,
|
244
|
+
"sube" => 8838,
|
245
|
+
"supe" => 8839,
|
246
|
+
"oplus" => 8853,
|
247
|
+
"otimes" => 8855,
|
248
|
+
"perp" => 8869,
|
249
|
+
"sdot" => 8901,
|
250
|
+
"lceil" => 8968,
|
251
|
+
"rceil" => 8969,
|
252
|
+
"lfloor" => 8970,
|
253
|
+
"rfloor" => 8971,
|
254
|
+
"lang" => 10216,
|
255
|
+
"rang" => 10217,
|
256
|
+
"loz" => 9674,
|
257
|
+
"spades" => 9824,
|
258
|
+
"clubs" => 9827,
|
259
|
+
"hearts" => 9829,
|
260
|
+
"diams" => 9830
|
261
|
+
}
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
  # @private
  class Data
    # Ordered list of [pattern, replacement] pairs used for smart formatting.
    #
    # Each pattern is either a literal String or a Regexp, and each pair is
    # meant to be handed to String#gsub in the order given here. Order
    # matters: the literal contractions and dash rules run before the generic
    # quote rules so that the generic rules do not claim their quotes first.
    #
    # Returns a new Array on every call.
    def self.smart_format_rules
      [
        # Leading-apostrophe contractions (apostrophe, not an opening quote).
        ["'tain't", "’tain’t"],
        ["'twere", "’twere"],
        ["'twas", "’twas"],
        ["'tis", "’tis"],
        ["'twill", "’twill"],
        ["'til", "’til"],
        ["'bout", "’bout"],
        ["'nuff", "’nuff"],
        ["'round", "’round"],
        ["'cause", "’cause"],
        ["'cos", "’cos"],
        ["i'm", "i’m"],
        # Dashes, ellipsis and symbol shorthands.
        ['--"', "—”"],
        ["--'", "—’"],
        ["--", "—"],
        ["...", "…"],
        ["(tm)", "™"],
        ["(TM)", "™"],
        ["(c)", "©"],
        ["(r)", "®"],
        ["(R)", "®"],
        # Context-sensitive quote rules.
        [/s\'([^a-zA-Z0-9])/, "s’\\1"],
        [/"([:;])/, "”\\1"],
        [/\'s$/, "’s"],
        [/\'(\d\d(?:’|\')?s)/, "’\\1"],
        [/(\s|\A|"|\(|\[)\'/, "\\1‘"],
        # A double quote after digits is a double prime (inches, seconds) and
        # a single quote after digits is a prime (feet, minutes).
        [/(\d+)"/, "\\1″"],
        [/(\d+)\'/, "\\1′"],
        [/(\S)\'([^\'\s])/, "\\1’\\2"],
        # Only one capture group exists in this pattern, so only \1 is used.
        [/(\s|\A|\(|\[)"(?!\s)/, "\\1“"],
        [/"(\s|\S|\Z)/, "”\\1"],
        [/\'([\s.]|\Z)/, "’\\1"],
        [/(\d+)x(\d+)/, "\\1×\\2"],
        [/([a-z])'(t|d|s|ll|re|ve)(\b)/i, "\\1’\\2\\3"]
      ]
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Turn Unicode characters into their HTML equivalents.
|
8
|
+
# If a valid HTML entity is not possible, it will create a numeric entity.
|
9
|
+
#
|
10
|
+
# %q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
|
11
|
+
#
|
12
|
+
# Walks +string+ through +transmogrify+ and builds the encoded result.
# Code points 32..126 are emitted as the first element of their mapping;
# every other code point becomes "&name;" when the mapping carries a third
# element, or a decimal "&#NNN;" reference otherwise.
def encode_entities(string)
  printable_ascii = 32..126

  transmogrify(string) do |mapping, codepoint|
    next mapping[0] if printable_ascii.include?(codepoint)

    entity_body = mapping[2] || "#" + codepoint.to_s
    "&" + entity_body + ";"
  end
end
|
21
|
+
|
22
|
+
|
23
|
+
# The reverse of +encode_entities+. Turns HTML or numeric entities into
|
24
|
+
# their Unicode counterparts.
|
25
|
+
#
|
26
|
+
# Returns a new string in which numeric references (decimal "&#8217;" and
# hexadecimal "&#x2019;") and named entities known to +html_entities_data+
# are replaced by the characters they denote. The argument itself is never
# modified, so frozen strings are accepted.
#
# References that are not valid Unicode scalar values (above U+10FFFF or in
# the surrogate range) and unknown entity names are left in the text as is.
def decode_entities(string)
  numeric_decoded = string.gsub(/&#(?:(\d{1,7})|[xX]([0-9a-fA-F]{1,6}));/) do
    reference = $&
    codepoint = $1 ? $1.to_i : $2.to_i(16)
    scalar = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).include?(codepoint)
    scalar ? [codepoint].pack("U") : reference
  end

  numeric_decoded.gsub(/&([a-zA-Z0-9]+);/) do
    reference = $&
    codepoint = html_entities_data[$1]
    codepoint ? [codepoint].pack("U") : reference
  end
end
|
33
|
+
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# Lazy load html entities
|
38
|
+
#
|
39
|
+
# Returns the entity table, requiring the data file and caching the table
# in @html_entities_data the first time it is needed.
def html_entities_data
  return @html_entities_data if @html_entities_data

  require "sterile/data/html_entities_data"
  @html_entities_data = Data.html_entities_data
end
|
45
|
+
|
46
|
+
end # class << self
|
47
|
+
|
48
|
+
end # module Sterile
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
|
8
|
+
#
|
9
|
+
# %q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
|
10
|
+
#
|
11
|
+
# Applies every [pattern, replacement] pair from +smart_format_rules+, in
# order, and returns the formatted text as a new string. The argument is
# left untouched, so frozen strings are accepted and the non-bang String
# extension no longer alters its receiver.
def smart_format(string)
  smart_format_rules.inject(string.dup) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
|
17
|
+
|
18
|
+
|
19
|
+
# Like +smart_format+, but works with HTML/XML (somewhat).
|
20
|
+
#
|
21
|
+
# Runs +smart_format+ followed by +encode_entities+ on each run of text
# between tags, leaving the tags themselves alone. The sibling module
# methods are called directly, so this works whether or not the String
# extensions have been mixed in.
def smart_format_tags(string)
  gsub_tags(string) do |text|
    encode_entities(smart_format(text))
  end
end
|
26
|
+
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Lazy load smart formatting rules
|
31
|
+
#
|
32
|
+
# Returns the smart formatting rules, requiring the data file and caching
# the list in @smart_format_rules the first time it is needed.
def smart_format_rules
  return @smart_format_rules if @smart_format_rules

  require "sterile/data/smart_format_rules"
  @smart_format_rules = Data.smart_format_rules
end
|
38
|
+
|
39
|
+
end # class << self
|
40
|
+
|
41
|
+
end # module Sterile
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile

  # Mixin that gives the including class one pair of instance methods for
  # every public singleton method Sterile has at the moment of inclusion:
  #
  #   name(*args, &block)   # => Sterile.name(self, *args, &block)
  #   name!(*args, &block)  # => replaces the receiver with that result
  #
  # The methods are defined on this module with define_method rather than
  # by evaluating generated source text.
  module StringExtensions
    def self.included(base)
      Sterile.methods(false).each do |name|
        define_method(name) do |*args, &block|
          Sterile.__send__(name, self, *args, &block)
        end

        define_method("#{name}!") do |*args, &block|
          replace Sterile.__send__(name, self, *args, &block)
        end
      end
    end
  end

end


class String
  include Sterile::StringExtensions
end
|
data/lib/sterile/tags.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
|
8
|
+
# CDATA is considered text unless :keep_cdata => false is specified.
|
9
|
+
# Redundant whitespace will be removed unless :keep_whitespace => true is specified.
|
10
|
+
#
|
11
|
+
# Returns a copy of +string+ with PHP/ERB tags, comments and HTML/XML tags
# removed. The argument itself is never modified.
#
# Options:
#   :keep_cdata      - when true (default) the content of CDATA sections is
#                      kept as text; when false the sections are dropped.
#   :keep_whitespace - when false (default) the result is passed through
#                      +trim_whitespace+; when true it is returned as is.
def strip_tags(string, options = {})
  options = {
    :keep_whitespace => false,
    :keep_cdata => true
  }.merge(options)

  # String#gsub always returns a new string, so the bang calls below only
  # ever touch this local copy.
  text = string.gsub(/<[%?](php)?[^>]*>/, '') # strip php, erb et al

  # Non-greedy and multiline so comments may contain "-" and newlines.
  text.gsub!(/<!--.*?-->/m, '') # strip comments

  # Non-greedy so the content may contain "]" (e.g. "a[0]").
  text.gsub!(
    /
      <!\[CDATA\[
      (.*?)
      \]\]>
    /xmi,
    options[:keep_cdata] ? '\\1' : ''
  )

  html_name = /[\w:-]+/
  html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

  text.gsub!(
    /
      <
      [\/]?
      #{html_name}
      (\s+(#{html_attr}(\s+#{html_attr})*))?
      \s*
      [\/]?
      >
    /xi,
    ''
  )

  options[:keep_whitespace] ? text : trim_whitespace(text)
end
|
48
|
+
|
49
|
+
|
50
|
+
# Similar to +gsub+, except it works in between HTML/XML tags and
|
51
|
+
# yields text to a block. Text will be replaced by what the block
|
52
|
+
# returns.
|
53
|
+
# Warning: does not work in some degenerate cases.
|
54
|
+
#
|
55
|
+
# Yields each run of text found between tags to the block and returns a new
# string in which that text is replaced by the block's result; anything
# matching <...> is copied through unchanged. The argument is not modified
# and a string is always returned, including for empty input.
#
# Raises RuntimeError ("No block given") when called without a block.
def gsub_tags(string, &block)
  raise "No block given" unless block_given?

  string.gsub(/(<[^>]*>)|([^<]+)/) do
    # Copy the captures before yielding; exactly one of them is non-nil.
    tag, text = $1, $2
    text ? yield(text) : tag
  end
end
|
62
|
+
|
63
|
+
|
64
|
+
# Iterates over all text in between HTML/XML tags and yields
|
65
|
+
# it to a block.
|
66
|
+
# Warning: does not work in some degenerate cases.
|
67
|
+
#
|
68
|
+
# Scans +string+ for alternating tags (<...>) and text runs, passing each
# text run to the block and skipping the tags. Raises RuntimeError
# ("No block given") when called without a block.
def scan_tags(string, &block)
  raise "No block given" unless block_given?

  string.scan(/(<[^>]*>)|([^<]+)/) do |_tag, text|
    yield(text) if text
  end
end
|
75
|
+
|
76
|
+
end # class << self
|
77
|
+
|
78
|
+
end # module Sterile
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Format text appropriately for titles. This method is much smarter
|
8
|
+
# than ActiveSupport's +titlecase+. The algorithm is based on work done
|
9
|
+
# by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
|
10
|
+
#
|
11
|
+
# Returns a title-cased copy of +string+. The argument is never modified,
# so frozen strings are accepted.
#
# Steps, in order:
#   1. strip and collapse whitespace; downcase when the text has no
#      lowercase letter at all (e.g. ALL CAPS input);
#   2. capitalise ordinary words, lowercase "small words", and leave
#      URLs/emails and words with internal capitals as they are;
#   3. capitalise a small word that starts the title, a subsentence or an
#      inserted subphrase;
#   4. capitalise a small word that ends the title or an inserted subphrase;
#   5. lowercase the letter after a single-letter-plus-dash prefix;
#   6. normalise "q&a" to "Q&A".
def titlecase(string)

  lsquo = [8216].pack("U")
  rsquo = [8217].pack("U")
  ldquo = [8220].pack("U")
  rdquo = [8221].pack("U")
  ndash = [8211].pack("U")

  # strip and gsub both return new strings, so every bang call below works
  # on this local copy only.
  string = string.strip.gsub(/\s+/, " ")
  string.downcase! unless string =~ /[[:lower:]]/

  small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
  apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

  string.gsub!(
    /
      \b
      ([_\*]*)
      (?:
        ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
        |
        ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
        |
        ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )  # or word without internal caps
        |
        ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )  # or some other word
      )
      ([_\*]*)
      \b
    /xu
  ) do
    ($1 ? $1 : "") +
    ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
    ($6 ? $6 : "")
  end

  if RUBY_VERSION < "1.9.0"
    # Capitalise both words of an en-dash compound: $1 is the left word,
    # $2 the dash and $3 the right word.
    string.gsub!(
      /
        \b
        ([[:alpha:]]+)
        (#{ndash})
        ([[:alpha:]]+)
        \b
      /xu
    ) do
      $1.downcase.capitalize + $2 + $3.downcase.capitalize
    end
  end

  string.gsub!(
    /
      (
        \A [[:punct:]]*            # start of title
        | [:.;?!][ ]+              # or of subsentence
        | [ ]['"#{ldquo}#{lsquo}(\[][ ]*  # or of inserted subphrase
      )
      ( #{small_words} )           # followed by a small-word
      \b
    /xiu
  ) do
    $1 + $2.downcase.capitalize
  end

  string.gsub!(
    /
      \b
      ( #{small_words} )           # small-word
      (?=
        [[:punct:]]* \Z            # at the end of the title
        |
        ['"#{rsquo}#{rdquo})\]] [ ]  # or of an inserted subphrase
      )
    /xu
  ) do
    $1.downcase.capitalize
  end

  string.gsub!(
    /
      (
        \b
        [[:alpha:]]                # single first letter
        [\-#{ndash}]               # followed by a dash
      )
      ( [[:alpha:]] )              # followed by a letter
    /xu
  ) do
    $1 + $2.downcase
  end

  string.gsub!(/q&a/i, 'Q&A')

  string
end
|
107
|
+
alias_method :titleize, :titlecase
|
108
|
+
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
# Lazy load smart formatting rules
|
113
|
+
#
|
114
|
+
# Returns the smart formatting rules, requiring the data file and caching
# the list in @smart_format_rules on first use.
def smart_format_rules
  unless @smart_format_rules
    require "sterile/data/smart_format_rules"
    @smart_format_rules = Data.smart_format_rules
  end

  @smart_format_rules
end
|
120
|
+
|
121
|
+
end # class << self
|
122
|
+
|
123
|
+
end # module Sterile
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Builds a new string by yielding (mapping, codepoint) for every code point
# of +string+ and appending whatever the block returns. The mapping is
# looked up in +codepoints_data+ using the code point's high bits
# (codepoint >> 8) and then its low byte (codepoint & 0xFF).
#
# Best effort: if the lookup, the block or the append raises a
# StandardError, that code point contributes nothing to the result.
#
# Raises RuntimeError ("No block given") when called without a block.
def transmogrify(string, &block)
  raise "No block given" unless block_given?

  string.unpack("U*").inject("") do |output, codepoint|
    page = codepoint >> 8
    offset = codepoint & 0xFF

    begin
      entry = codepoints_data[page][offset]
      output << yield(entry, codepoint)
    rescue
      # Deliberately skip code points that cannot be mapped.
    end

    output
  end
end
|
23
|
+
|
24
|
+
# Transliterate Unicode [and accented ASCII] characters to their plain-text
|
25
|
+
# ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
|
26
|
+
# which is in turn a port of Perl's Unidecode and ostensibly provides
|
27
|
+
# superior results to iconv. The optical conversion data is based on work
|
28
|
+
# by Eric Boehs at https://github.com/ericboehs/to_slug
|
29
|
+
# Passing an option of :optical => true will prefer optical mapping instead
|
30
|
+
# of more pedantic matches.
|
31
|
+
#
|
32
|
+
# "ýůçký".transliterate # => "yucky"
|
33
|
+
#
|
34
|
+
# Replaces each character with an entry from its mapping via +transmogrify+.
# By default element 0 of the mapping is preferred and element 1 is the
# fallback; with :optical => true the preference is reversed. A character
# whose mapping has neither contributes an empty string.
def transliterate(string, options = {})
  settings = { :optical => false }.merge!(options)
  preferred, fallback = settings[:optical] ? [1, 0] : [0, 1]

  transmogrify(string) do |mapping, _codepoint|
    mapping[preferred] || mapping[fallback] || ""
  end
end
|
49
|
+
alias_method :to_ascii, :transliterate
|
50
|
+
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
# Lazy load codepoints data
|
55
|
+
#
|
56
|
+
# Returns the code point table, requiring the data file and caching the
# table in @codepoints_data the first time it is needed.
def codepoints_data
  return @codepoints_data if @codepoints_data

  require "sterile/data/codepoints_data"
  @codepoints_data = Data.codepoints_data
end
|
62
|
+
|
63
|
+
end # class << self
|
64
|
+
|
65
|
+
end # module Sterile
|