sterile 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+ # @private
5
+ class Data
6
# Table of HTML named character references.
#
# Keys are entity names without the surrounding "&" and ";"; values are the
# Integer codepoints they stand for. A new Hash is built on every call.
def self.html_entities_data
  {
    # codepoints below 160
    "quot" => 34, "amp" => 38, "apos" => 39, "lt" => 60, "gt" => 62,

    # codepoints 160-191
    "nbsp" => 160, "iexcl" => 161, "cent" => 162, "pound" => 163,
    "curren" => 164, "yen" => 165, "brvbar" => 166, "sect" => 167,
    "uml" => 168, "copy" => 169, "ordf" => 170, "laquo" => 171,
    "not" => 172, "shy" => 173, "reg" => 174, "macr" => 175,
    "deg" => 176, "plusmn" => 177, "sup2" => 178, "sup3" => 179,
    "acute" => 180, "micro" => 181, "para" => 182, "middot" => 183,
    "cedil" => 184, "sup1" => 185, "ordm" => 186, "raquo" => 187,
    "frac14" => 188, "frac12" => 189, "frac34" => 190, "iquest" => 191,

    # codepoints 192-223
    "Agrave" => 192, "Aacute" => 193, "Acirc" => 194, "Atilde" => 195,
    "Auml" => 196, "Aring" => 197, "AElig" => 198, "Ccedil" => 199,
    "Egrave" => 200, "Eacute" => 201, "Ecirc" => 202, "Euml" => 203,
    "Igrave" => 204, "Iacute" => 205, "Icirc" => 206, "Iuml" => 207,
    "ETH" => 208, "Ntilde" => 209, "Ograve" => 210, "Oacute" => 211,
    "Ocirc" => 212, "Otilde" => 213, "Ouml" => 214, "times" => 215,
    "Oslash" => 216, "Ugrave" => 217, "Uacute" => 218, "Ucirc" => 219,
    "Uuml" => 220, "Yacute" => 221, "THORN" => 222, "szlig" => 223,

    # codepoints 224-255
    "agrave" => 224, "aacute" => 225, "acirc" => 226, "atilde" => 227,
    "auml" => 228, "aring" => 229, "aelig" => 230, "ccedil" => 231,
    "egrave" => 232, "eacute" => 233, "ecirc" => 234, "euml" => 235,
    "igrave" => 236, "iacute" => 237, "icirc" => 238, "iuml" => 239,
    "eth" => 240, "ntilde" => 241, "ograve" => 242, "oacute" => 243,
    "ocirc" => 244, "otilde" => 245, "ouml" => 246, "divide" => 247,
    "oslash" => 248, "ugrave" => 249, "uacute" => 250, "ucirc" => 251,
    "uuml" => 252, "yacute" => 253, "thorn" => 254, "yuml" => 255,

    # codepoints 338-732
    "OElig" => 338, "oelig" => 339, "Scaron" => 352, "scaron" => 353,
    "Yuml" => 376, "fnof" => 402, "circ" => 710, "tilde" => 732,

    # codepoints 913-937
    "Alpha" => 913, "Beta" => 914, "Gamma" => 915, "Delta" => 916,
    "Epsilon" => 917, "Zeta" => 918, "Eta" => 919, "Theta" => 920,
    "Iota" => 921, "Kappa" => 922, "Lambda" => 923, "Mu" => 924,
    "Nu" => 925, "Xi" => 926, "Omicron" => 927, "Pi" => 928,
    "Rho" => 929, "Sigma" => 931, "Tau" => 932, "Upsilon" => 933,
    "Phi" => 934, "Chi" => 935, "Psi" => 936, "Omega" => 937,

    # codepoints 945-982
    "alpha" => 945, "beta" => 946, "gamma" => 947, "delta" => 948,
    "epsilon" => 949, "zeta" => 950, "eta" => 951, "theta" => 952,
    "iota" => 953, "kappa" => 954, "lambda" => 955, "mu" => 956,
    "nu" => 957, "xi" => 958, "omicron" => 959, "pi" => 960,
    "rho" => 961, "sigmaf" => 962, "sigma" => 963, "tau" => 964,
    "upsilon" => 965, "phi" => 966, "chi" => 967, "psi" => 968,
    "omega" => 969, "thetasym" => 977, "upsih" => 978, "piv" => 982,

    # codepoints 8194-8364
    "ensp" => 8194, "emsp" => 8195, "thinsp" => 8201, "zwnj" => 8204,
    "zwj" => 8205, "lrm" => 8206, "rlm" => 8207,
    "ndash" => 8211, "mdash" => 8212, "lsquo" => 8216, "rsquo" => 8217,
    "sbquo" => 8218, "ldquo" => 8220, "rdquo" => 8221, "bdquo" => 8222,
    "dagger" => 8224, "Dagger" => 8225, "bull" => 8226, "hellip" => 8230,
    "permil" => 8240, "prime" => 8242, "Prime" => 8243, "lsaquo" => 8249,
    "rsaquo" => 8250, "oline" => 8254, "frasl" => 8260, "euro" => 8364,

    # codepoints 8465-8660
    "image" => 8465, "weierp" => 8472, "real" => 8476, "trade" => 8482,
    "alefsym" => 8501,
    "larr" => 8592, "uarr" => 8593, "rarr" => 8594, "darr" => 8595,
    "harr" => 8596, "crarr" => 8629,
    "lArr" => 8656, "uArr" => 8657, "rArr" => 8658, "dArr" => 8659,
    "hArr" => 8660,

    # codepoints 8704-8901
    "forall" => 8704, "part" => 8706, "exist" => 8707, "empty" => 8709,
    "nabla" => 8711, "isin" => 8712, "notin" => 8713, "ni" => 8715,
    "prod" => 8719, "sum" => 8721, "minus" => 8722, "lowast" => 8727,
    "radic" => 8730, "prop" => 8733, "infin" => 8734, "ang" => 8736,
    "and" => 8743, "or" => 8744, "cap" => 8745, "cup" => 8746,
    "int" => 8747, "there4" => 8756, "sim" => 8764, "cong" => 8773,
    "asymp" => 8776, "ne" => 8800, "equiv" => 8801, "le" => 8804,
    "ge" => 8805, "sub" => 8834, "sup" => 8835, "nsub" => 8836,
    "sube" => 8838, "supe" => 8839, "oplus" => 8853, "otimes" => 8855,
    "perp" => 8869, "sdot" => 8901,

    # remaining entries, kept in their original (non-sorted) order
    "lceil" => 8968, "rceil" => 8969, "lfloor" => 8970, "rfloor" => 8971,
    "lang" => 10216, "rang" => 10217, "loz" => 9674,
    "spades" => 9824, "clubs" => 9827, "hearts" => 9829, "diams" => 9830
  }
end
263
+ end
264
+ end
@@ -0,0 +1,45 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+ # @private
5
+ class Data
6
# Ordered list of [pattern, replacement] pairs used for "smart" typography.
#
# Each pattern is either a literal String or a Regexp; each replacement is a
# String that may contain back-references. The pairs are meant to be applied
# in order with gsub, so earlier rules take precedence over later ones.
# A new Array is built on every call.
def self.smart_format_rules
  [
    # Words that start with an elided letter take a closing apostrophe.
    ["'tain't", "’tain’t"],
    ["'twere", "’twere"],
    ["'twas", "’twas"],
    ["'tis", "’tis"],
    ["'twill", "’twill"],
    ["'til", "’til"],
    ["'bout", "’bout"],
    ["'nuff", "’nuff"],
    ["'round", "’round"],
    ["'cause", "’cause"],
    ["'cos", "’cos"],
    ["i'm", "i’m"],

    # Dashes, ellipsis and symbol shorthands.
    ['--"', "—”"],
    ["--'", "—’"],
    ["--", "—"],
    ["...", "…"],
    ["(tm)", "™"],
    ["(TM)", "™"],
    ["(c)", "©"],
    ["(r)", "®"],
    ["(R)", "®"],

    # Context-sensitive quotes and apostrophes.
    [/s\'([^a-zA-Z0-9])/, "s’\\1"],
    [/"([:;])/, "”\\1"],
    [/\'s$/, "’s"],
    [/\'(\d\d(?:’|\')?s)/, "’\\1"],
    [/(\s|\A|"|\(|\[)\'/, "\\1‘"],

    # Measurements: a straight double quote after digits is a double prime
    # and a straight single quote after digits is a prime, so that 6'2"
    # becomes 6′2″.
    [/(\d+)"/, "\\1″"],
    [/(\d+)\'/, "\\1′"],

    [/(\S)\'([^\'\s])/, "\\1’\\2"],
    # Opening double quote; the lookahead captures nothing, so only \1 is
    # referenced in the replacement.
    [/(\s|\A|\(|\[)"(?!\s)/, "\\1“"],
    [/"(\s|\S|\Z)/, "”\\1"],
    [/\'([\s.]|\Z)/, "’\\1"],
    [/(\d+)x(\d+)/, "\\1×\\2"],
    [/([a-z])'(t|d|s|ll|re|ve)(\b)/i, "\\1’\\2\\3"]
  ]
end
44
+ end
45
+ end
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
+ class << self
6
+
7
# Turn Unicode characters into their HTML equivalents.
#
# Codepoints 32..126 are emitted as slot 0 of the mapping that +transmogrify+
# yields for them (so characters such as "&" and "<" are not escaped here).
# Every other codepoint becomes "&name;" when slot 2 of its mapping holds an
# entity name, or a decimal numeric reference such as "&#8220;" otherwise.
#
#   Sterile.encode_entities("“Economy Hits Bottom,” ran the headline")
#   # => "&ldquo;Economy Hits Bottom,&rdquo; ran the headline"
#
def encode_entities(string)
  transmogrify(string) do |mapping, codepoint|
    next mapping[0] if (32..126).include?(codepoint)

    reference = mapping[2] || "#" + codepoint.to_s
    "&" + reference + ";"
  end
end
21
+
22
+
23
# The reverse of +encode_entities+. Turns named and numeric HTML entities
# into their Unicode counterparts and returns the result as a new String;
# the argument itself is left untouched (frozen strings are fine).
#
# Recognised forms:
#   &name;   looked up in the HTML entity table
#   &#NNN;   decimal codepoint, any number of digits
#   &#xHH;   hexadecimal codepoint
#
# Unknown names and references that are not valid Unicode scalar values
# (above U+10FFFF or inside the surrogate range) are left as written.
# Everything is decoded in a single pass, so text produced by a replacement
# is never decoded a second time ("&#38;amp;" becomes "&amp;", not "&").
#
def decode_entities(string)
  string.gsub(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z0-9]+));/) do
    entity = $&
    codepoint =
      if $1
        $1.to_i
      elsif $2
        $2.hex
      else
        html_entities_data[$3]
      end

    decodable = codepoint &&
                codepoint <= 0x10FFFF &&
                !(0xD800..0xDFFF).include?(codepoint)

    decodable ? [codepoint].pack("U") : entity
  end
end
33
+
34
+
35
+ private
36
+
37
# Entity-name => codepoint table, loaded on first use.
#
# The data file is only required the first time the table is needed; the
# Hash it provides is then cached in an instance variable for later calls.
#
def html_entities_data
  return @html_entities_data if @html_entities_data

  require "sterile/data/html_entities_data"
  @html_entities_data = Data.html_entities_data
end
45
+
46
+ end # class << self
47
+
48
+ end # module Sterile
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
+ class << self
6
+
7
# Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
#
#   Sterile.smart_format(%q{"He said, 'Away with you, Drake!'"})
#   # => “He said, ‘Away with you, Drake!’”
#
# Applies every [pattern, replacement] pair from +smart_format_rules+ in
# order. The result is always a new String: the argument is never modified,
# so frozen strings are accepted and the non-bang String wrapper no longer
# changes its receiver.
#
def smart_format(string)
  smart_format_rules.inject(string.dup) do |text, rule|
    text.gsub(rule[0], rule[1])
  end
end
17
+
18
+
19
# Like +smart_format+, but works with HTML/XML (somewhat).
#
# Only the text between tags is touched: each text run is passed through
# +smart_format+ and then +encode_entities+, while the tags themselves are
# kept as they are. The module's own methods are called directly, so this
# works whether or not the String core extensions have been loaded.
#
def smart_format_tags(string)
  gsub_tags(string) do |text|
    encode_entities(smart_format(text))
  end
end
26
+
27
+
28
+ private
29
+
30
# Smart formatting rules, loaded on first use.
#
# The rules file is only required the first time the rules are needed; the
# Array it provides is then cached in an instance variable for later calls.
#
def smart_format_rules
  return @smart_format_rules if @smart_format_rules

  require "sterile/data/smart_format_rules"
  @smart_format_rules = Data.smart_format_rules
end
38
+
39
+ end # class << self
40
+
41
+ end # module Sterile
@@ -0,0 +1,19 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
# Instance-level wrappers around the Sterile module methods.
#
# When the module is included, every name returned by Sterile.methods(false)
# at that moment gets two instance methods defined on this module:
#
#   name(*args)   returns Sterile.name(self, *args)
#   name!(*args)  replaces the receiver with that result
#
# Only methods Sterile already has at include time are wrapped.
module StringExtensions
  def self.included(base)
    Sterile.methods(false).each do |name|
      plain = "def #{name}(*args, &block); Sterile.#{name}(self, *args, &block); end"
      bang = "def #{name}!(*args, &block); replace Sterile.#{name}(self, *args, &block); end"

      module_eval(plain)
      module_eval(bang)
    end
  end
end
13
+
14
+ end
15
+
16
+
17
# Mix the Sterile helpers into every String. Including the module runs its
# +included+ hook, which defines a plain and a bang wrapper for each method
# Sterile exposes at that point.
class String
  include Sterile::StringExtensions
end
@@ -0,0 +1,78 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
+ class << self
6
+
7
# Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
#
# Options:
#   :keep_cdata      - when true (default) the content of CDATA sections is
#                      kept as text; when false the sections are dropped.
#   :keep_whitespace - when false (default) the result is passed through
#                      +trim_whitespace+ (defined outside this excerpt);
#                      when true it is returned as is.
#
# The argument is never modified; work happens on a copy. Comments may
# contain hyphens and span several lines, and CDATA content may contain "]".
#
def strip_tags(string, options = {})
  options = {
    :keep_whitespace => false,
    :keep_cdata => true
  }.merge(options)

  text = string.dup

  text.gsub!(/<[%?](php)?[^>]*>/, '') # strip php, erb et al
  text.gsub!(/<!--.*?-->/m, '')       # strip comments, up to the first closing marker

  # Lazy match so the section ends at the first "]]>", whatever it contains.
  text.gsub!(
    /
      <!\[CDATA\[
      (.*?)
      \]\]>
    /xmi,
    options[:keep_cdata] ? '\\1' : ''
  )

  html_name = /[\w:-]+/
  html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

  # Opening, closing and self-closing tags with optional attributes.
  text.gsub!(
    /
      <
      [\/]?
      #{html_name}
      (\s+(#{html_attr}(\s+#{html_attr})*))?
      \s*
      [\/]?
      >
    /xi,
    ''
  )

  options[:keep_whitespace] ? text : trim_whitespace(text)
end
48
+
49
+
50
# Similar to +gsub+, except it works in between HTML/XML tags and
# yields text to a block. Text will be replaced by what the block
# returns; tags are copied through unchanged.
#
# Always returns a new String (never nil, even when nothing matches, e.g.
# for an empty string) and leaves the argument untouched.
# Raises RuntimeError when no block is given.
# Warning: does not work in some degenerate cases.
#
def gsub_tags(string, &block)
  raise "No block given" unless block_given?

  string.gsub(/(<[^>]*>)|([^<]+)/) do
    # Read both captures before yielding: the block may run its own matches.
    tag = $1
    text = $2
    text ? yield(text) : tag
  end
end
62
+
63
+
64
# Walks over the text found in between HTML/XML tags and hands each run of
# text to the block; the tags themselves are skipped.
# Raises RuntimeError when no block is given.
# Warning: does not work in some degenerate cases.
#
def scan_tags(string, &block)
  raise "No block given" unless block_given?

  string.scan(/(<[^>]*>)|([^<]+)/) do |_tag, text|
    next if text.nil?
    yield(text)
  end
end
75
+
76
+ end # class << self
77
+
78
+ end # module Sterile
@@ -0,0 +1,123 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
+ class << self
6
+
7
# Format text appropriately for titles. This method is much smarter
# than ActiveSupport's +titlecase+. The algorithm is based on work done
# by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
#
# Steps, in order: surrounding whitespace is stripped and inner runs are
# collapsed to one space; text without any lowercase letter is downcased
# first; each word is then capitalised unless it looks like a URL, domain or
# email, is a "small word" (a, an, and, of, the, ...) or already has
# internal capitals; small words are capitalised again at the start or end
# of the title or of a sub-phrase.
#
# Returns a new String; the argument is never modified, so frozen strings
# are accepted.
#
def titlecase(string)

  lsquo = [8216].pack("U")
  rsquo = [8217].pack("U")
  ldquo = [8220].pack("U")
  rdquo = [8221].pack("U")
  ndash = [8211].pack("U")

  # All in-place edits below happen on this private copy.
  title = string.dup
  title.strip!
  title.gsub!(/\s+/, " ")
  title.downcase! unless title =~ /[[:lower:]]/

  small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
  apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

  # Main pass: decide per word whether to keep, downcase or capitalise it.
  title.gsub!(
    /
      \b
      ([_\*]*)
      (?:
        ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
        |
        ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
        |
        ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )  # or word without internal caps
        |
        ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )  # or some other word
      )
      ([_\*]*)
      \b
    /xu
  ) do
    ($1 ? $1 : "") +
    ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
    ($6 ? $6 : "")
  end

  # Only reached on Ruby older than 1.9: capitalise both words joined by an
  # en dash. The second word is capture 3 (capture 2 is the dash itself).
  if RUBY_VERSION < "1.9.0"
    title.gsub!(
      /
        \b
        ([[:alpha:]]+)
        (#{ndash})
        ([[:alpha:]]+)
        \b
      /xu
    ) do
      $1.downcase.capitalize + $2 + $3.downcase.capitalize
    end
  end

  # Small words are capitalised when they open the title or a sub-phrase.
  title.gsub!(
    /
      (
        \A [[:punct:]]*         # start of title
        | [:.;?!][ ]+           # or of subsentence
        | [ ]['"#{ldquo}#{lsquo}(\[][ ]*        # or of inserted subphrase
      )
      ( #{small_words} )        # followed by a small-word
      \b
    /xiu
  ) do
    $1 + $2.downcase.capitalize
  end

  # ... and when they close the title or a sub-phrase.
  title.gsub!(
    /
      \b
      ( #{small_words} )        # small-word
      (?=
        [[:punct:]]* \Z         # at the end of the title
        |
        ['"#{rsquo}#{rdquo})\]] [ ]     # or of an inserted subphrase
      )
    /xu
  ) do
    $1.downcase.capitalize
  end

  # A single letter joined to a word by a dash keeps the next letter lowercase.
  title.gsub!(
    /
      (
        \b
        [[:alpha:]]             # single first letter
        [\-#{ndash}]            # followed by a dash
      )
      ( [[:alpha:]] )           # followed by a letter
    /xu
  ) do
    $1 + $2.downcase
  end

  title.gsub!(/q&a/i, 'Q&A')

  title
end
107
+ alias_method :titleize, :titlecase
108
+
109
+
110
+ private
111
+
112
# Smart formatting rules, loaded on first use.
#
# The rules file is only required the first time the rules are needed; the
# Array it provides is then cached in an instance variable for later calls.
# NOTE(review): nothing else in this file appears to call this method;
# confirm whether this copy of the loader is still needed here.
#
def smart_format_rules
  return @smart_format_rules if @smart_format_rules

  require "sterile/data/smart_format_rules"
  @smart_format_rules = Data.smart_format_rules
end
120
+
121
+ end # class << self
122
+
123
+ end # module Sterile
@@ -0,0 +1,65 @@
1
+ # encoding: UTF-8
2
+
3
+ module Sterile
4
+
5
+ class << self
6
+
7
# Rebuild a string codepoint by codepoint.
#
# For every codepoint of +string+ the entry found in the codepoints table
# (indexed by the high bits, then by the low byte) is yielded together with
# the codepoint itself; whatever the block returns is appended to the
# result. Any StandardError raised for a codepoint - a missing table entry,
# a failure inside the block, a non-String block result - is swallowed and
# that codepoint simply contributes nothing.
#
# Raises RuntimeError when no block is given. Returns the assembled String.
def transmogrify(string, &block)
  raise "No block given" unless block_given?

  string.unpack("U*").inject("") do |output, codepoint|
    page, offset = codepoint.divmod(0x100)
    begin
      output << yield(codepoints_data[page][offset], codepoint)
    rescue
      # best effort: skip this codepoint
    end
    output
  end
end
23
+
24
# Transliterate Unicode [and accented ASCII] characters to their plain-text
# ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
# which is in turn a port of Perl's Unidecode and ostensibly provides
# superior results to iconv. The optical conversion data is based on work
# by Eric Boehs at https://github.com/ericboehs/to_slug
#
# Each character's mapping offers two candidates: slot 0 and slot 1. By
# default slot 0 is preferred and slot 1 is the fallback; passing
# :optical => true swaps that preference. When neither slot holds a value
# the character is replaced by an empty string.
#
#   "ýůçký".transliterate # => "yucky"
#
def transliterate(string, options = {})
  settings = { :optical => false }.merge(options)
  preferred, fallback = settings[:optical] ? [1, 0] : [0, 1]

  transmogrify(string) do |mapping, _codepoint|
    mapping[preferred] || mapping[fallback] || ""
  end
end
49
+ alias_method :to_ascii, :transliterate
50
+
51
+
52
+ private
53
+
54
# Codepoints table, loaded on first use.
#
# The data file is only required the first time the table is needed; the
# object it provides is then cached in an instance variable for later calls.
#
def codepoints_data
  return @codepoints_data if @codepoints_data

  require "sterile/data/codepoints_data"
  @codepoints_data = Data.codepoints_data
end
62
+
63
+ end # class << self
64
+
65
+ end # module Sterile