sterile 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,264 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile
  # @private
  class Data
    # Named HTML character entities mapped to their Unicode codepoints.
    #
    # Keys are the entity names without the surrounding "&" and ";"
    # (e.g. "amp"); values are Integer codepoints. A fresh Hash is built on
    # every call.
    #
    # @return [Hash{String => Integer}]
    def self.html_entities_data
      entities = {
        # Codepoints below 128
        "quot" => 34, "amp" => 38, "apos" => 39, "lt" => 60, "gt" => 62,

        # Codepoints 160..191
        "nbsp" => 160, "iexcl" => 161, "cent" => 162, "pound" => 163,
        "curren" => 164, "yen" => 165, "brvbar" => 166, "sect" => 167,
        "uml" => 168, "copy" => 169, "ordf" => 170, "laquo" => 171,
        "not" => 172, "shy" => 173, "reg" => 174, "macr" => 175,
        "deg" => 176, "plusmn" => 177, "sup2" => 178, "sup3" => 179,
        "acute" => 180, "micro" => 181, "para" => 182, "middot" => 183,
        "cedil" => 184, "sup1" => 185, "ordm" => 186, "raquo" => 187,
        "frac14" => 188, "frac12" => 189, "frac34" => 190, "iquest" => 191,

        # Codepoints 192..223
        "Agrave" => 192, "Aacute" => 193, "Acirc" => 194, "Atilde" => 195,
        "Auml" => 196, "Aring" => 197, "AElig" => 198, "Ccedil" => 199,
        "Egrave" => 200, "Eacute" => 201, "Ecirc" => 202, "Euml" => 203,
        "Igrave" => 204, "Iacute" => 205, "Icirc" => 206, "Iuml" => 207,
        "ETH" => 208, "Ntilde" => 209, "Ograve" => 210, "Oacute" => 211,
        "Ocirc" => 212, "Otilde" => 213, "Ouml" => 214, "times" => 215,
        "Oslash" => 216, "Ugrave" => 217, "Uacute" => 218, "Ucirc" => 219,
        "Uuml" => 220, "Yacute" => 221, "THORN" => 222, "szlig" => 223,

        # Codepoints 224..255
        "agrave" => 224, "aacute" => 225, "acirc" => 226, "atilde" => 227,
        "auml" => 228, "aring" => 229, "aelig" => 230, "ccedil" => 231,
        "egrave" => 232, "eacute" => 233, "ecirc" => 234, "euml" => 235,
        "igrave" => 236, "iacute" => 237, "icirc" => 238, "iuml" => 239,
        "eth" => 240, "ntilde" => 241, "ograve" => 242, "oacute" => 243,
        "ocirc" => 244, "otilde" => 245, "ouml" => 246, "divide" => 247,
        "oslash" => 248, "ugrave" => 249, "uacute" => 250, "ucirc" => 251,
        "uuml" => 252, "yacute" => 253, "thorn" => 254, "yuml" => 255,

        # Codepoints 338..732
        "OElig" => 338, "oelig" => 339, "Scaron" => 352, "scaron" => 353,
        "Yuml" => 376, "fnof" => 402, "circ" => 710, "tilde" => 732,

        # Codepoints 913..937
        "Alpha" => 913, "Beta" => 914, "Gamma" => 915, "Delta" => 916,
        "Epsilon" => 917, "Zeta" => 918, "Eta" => 919, "Theta" => 920,
        "Iota" => 921, "Kappa" => 922, "Lambda" => 923, "Mu" => 924,
        "Nu" => 925, "Xi" => 926, "Omicron" => 927, "Pi" => 928,
        "Rho" => 929, "Sigma" => 931, "Tau" => 932, "Upsilon" => 933,
        "Phi" => 934, "Chi" => 935, "Psi" => 936, "Omega" => 937,

        # Codepoints 945..982
        "alpha" => 945, "beta" => 946, "gamma" => 947, "delta" => 948,
        "epsilon" => 949, "zeta" => 950, "eta" => 951, "theta" => 952,
        "iota" => 953, "kappa" => 954, "lambda" => 955, "mu" => 956,
        "nu" => 957, "xi" => 958, "omicron" => 959, "pi" => 960,
        "rho" => 961, "sigmaf" => 962, "sigma" => 963, "tau" => 964,
        "upsilon" => 965, "phi" => 966, "chi" => 967, "psi" => 968,
        "omega" => 969, "thetasym" => 977, "upsih" => 978, "piv" => 982,

        # Codepoints 8194..8501
        "ensp" => 8194, "emsp" => 8195, "thinsp" => 8201, "zwnj" => 8204,
        "zwj" => 8205, "lrm" => 8206, "rlm" => 8207, "ndash" => 8211,
        "mdash" => 8212, "lsquo" => 8216, "rsquo" => 8217, "sbquo" => 8218,
        "ldquo" => 8220, "rdquo" => 8221, "bdquo" => 8222, "dagger" => 8224,
        "Dagger" => 8225, "bull" => 8226, "hellip" => 8230, "permil" => 8240,
        "prime" => 8242, "Prime" => 8243, "lsaquo" => 8249, "rsaquo" => 8250,
        "oline" => 8254, "frasl" => 8260, "euro" => 8364, "image" => 8465,
        "weierp" => 8472, "real" => 8476, "trade" => 8482, "alefsym" => 8501,

        # Codepoints 8592..8660
        "larr" => 8592, "uarr" => 8593, "rarr" => 8594, "darr" => 8595,
        "harr" => 8596, "crarr" => 8629, "lArr" => 8656, "uArr" => 8657,
        "rArr" => 8658, "dArr" => 8659, "hArr" => 8660,

        # Codepoints 8704..8901
        "forall" => 8704, "part" => 8706, "exist" => 8707, "empty" => 8709,
        "nabla" => 8711, "isin" => 8712, "notin" => 8713, "ni" => 8715,
        "prod" => 8719, "sum" => 8721, "minus" => 8722, "lowast" => 8727,
        "radic" => 8730, "prop" => 8733, "infin" => 8734, "ang" => 8736,
        "and" => 8743, "or" => 8744, "cap" => 8745, "cup" => 8746,
        "int" => 8747, "there4" => 8756, "sim" => 8764, "cong" => 8773,
        "asymp" => 8776, "ne" => 8800, "equiv" => 8801, "le" => 8804,
        "ge" => 8805, "sub" => 8834, "sup" => 8835, "nsub" => 8836,
        "sube" => 8838, "supe" => 8839, "oplus" => 8853, "otimes" => 8855,
        "perp" => 8869, "sdot" => 8901,

        # Remaining entries, kept in their original order
        "lceil" => 8968, "rceil" => 8969, "lfloor" => 8970, "rfloor" => 8971,
        "lang" => 10216, "rang" => 10217, "loz" => 9674, "spades" => 9824,
        "clubs" => 9827, "hearts" => 9829, "diams" => 9830
      }
      entities
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile
  # @private
  class Data
    # Ordered list of typographic substitution rules.
    #
    # Each rule is a two-element Array: a pattern (String or Regexp) and the
    # replacement handed to String#gsub. Order matters: the literal
    # contractions and multi-character sequences come first so the generic
    # quote rules further down do not claim their apostrophes.
    #
    # @return [Array<Array>] pairs of [pattern, replacement]
    def self.smart_format_rules
      [
        # Leading-apostrophe contractions
        ["'tain't", "’tain’t"],
        ["'twere", "’twere"],
        ["'twas", "’twas"],
        ["'tis", "’tis"],
        ["'twill", "’twill"],
        ["'til", "’til"],
        ["'bout", "’bout"],
        ["'nuff", "’nuff"],
        ["'round", "’round"],
        ["'cause", "’cause"],
        ["'cos", "’cos"],
        ["i'm", "i’m"],
        # Dashes, ellipsis and legal marks
        ['--"', "—”"],
        ["--'", "—’"],
        ["--", "—"],
        ["...", "…"],
        ["(tm)", "™"],
        ["(TM)", "™"],
        ["(c)", "©"],
        ["(r)", "®"],
        ["(R)", "®"],
        # Quote handling
        [/s\'([^a-zA-Z0-9])/, "s’\\1"],
        [/"([:;])/, "”\\1"],
        [/\'s$/, "’s"],
        [/\'(\d\d(?:’|\')?s)/, "’\\1"],
        [/(\s|\A|"|\(|\[)\'/, "\\1‘"],
        # A double quote after digits becomes a double prime (U+2033) and a
        # single quote becomes a prime (U+2032); these two replacements were
        # previously swapped.
        [/(\d+)"/, "\\1″"],
        [/(\d+)\'/, "\\1′"],
        [/(\S)\'([^\'\s])/, "\\1’\\2"],
        # Opening double quote. The pattern has a single capture group, so
        # the replacement only refers to \1.
        [/(\s|\A|\(|\[)"(?!\s)/, "\\1“"],
        [/"(\s|\S|\Z)/, "”\\1"],
        [/\'([\s.]|\Z)/, "’\\1"],
        # Dimensions such as 2x4
        [/(\d+)x(\d+)/, "\\1×\\2"],
        [/([a-z])'(t|d|s|ll|re|ve)(\b)/i, "\\1’\\2\\3"]
      ]
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  class << self

    # Turn Unicode characters into their HTML equivalents.
    # If a valid HTML entity is not possible, it will create a numeric entity.
    # Codepoints 32..126 are emitted as the first element of their mapping
    # rather than as entities.
    #
    #   q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
    #
    def encode_entities(string)
      transmogrify(string) do |mapping, codepoint|
        if (32..126).include?(codepoint)
          mapping[0]
        else
          # mapping[2] is used as the entity name when present; otherwise
          # fall back to a decimal numeric entity.
          "&" + (mapping[2] || "#" + codepoint.to_s) + ";"
        end
      end
    end


    # The reverse of +encode_entities+. Turns HTML or numeric entities into
    # their Unicode counterparts.
    #
    # Decimal (&#8220;), hexadecimal (&#x201C;) and named (&ldquo;) entities
    # are decoded in a single pass, so the output of one substitution is
    # never decoded a second time. Unknown names and codepoints outside the
    # Unicode scalar range are left untouched. The argument is not modified;
    # a new String is returned.
    #
    def decode_entities(string)
      string.gsub(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z0-9]+));/) do
        entity = $&
        codepoint =
          if $1
            $1.to_i
          elsif $2
            $2.hex
          else
            html_entities_data[$3]
          end

        decodable = codepoint &&
                    codepoint <= 0x10FFFF &&
                    !(0xD800..0xDFFF).include?(codepoint)
        decodable ? [codepoint].pack("U") : entity
      end
    end


    private

    # Lazy load html entities
    #
    def html_entities_data
      @html_entities_data ||= begin
        require "sterile/data/html_entities_data"
        Data.html_entities_data
      end
    end

  end # class << self

end # module Sterile
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  class << self

    # Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
    #
    #   q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
    #
    # The rules are applied in order, each one to the output of the previous
    # one. The argument itself is left untouched (frozen strings are fine);
    # a new String is returned.
    #
    def smart_format(string)
      smart_format_rules.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end


    # Like +smart_format+, but works with HTML/XML (somewhat).
    #
    # Each run of text between tags is smart-formatted and then entity
    # encoded. The module functions are called directly so this does not
    # depend on the String extensions being loaded.
    #
    def smart_format_tags(string)
      gsub_tags(string) do |text|
        encode_entities(smart_format(text))
      end
    end


    private

    # Lazy load smart formatting rules
    #
    def smart_format_rules
      @smart_format_rules ||= begin
        require "sterile/data/smart_format_rules"
        Data.smart_format_rules
      end
    end

  end # class << self

end # module Sterile
@@ -0,0 +1,19 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  # Mixin that mirrors Sterile's module-level methods as instance methods.
  #
  # When the module is included, every name reported by
  # Sterile.methods(false) at that moment gets two instance methods:
  # +name+, which passes the receiver as the first argument to Sterile.name,
  # and +name!+, which replaces the receiver with that result.
  module StringExtensions
    def self.included(base)
      Sterile.methods(false).each do |name|
        module_eval <<-RUBY
          def #{name}(*args, &block); Sterile.#{name}(self, *args, &block); end
          def #{name}!(*args, &block); replace Sterile.#{name}(self, *args, &block); end
        RUBY
      end
    end
  end

end
15
+
16
+
17
# Mix the Sterile helpers into every String.
String.send(:include, Sterile::StringExtensions)
@@ -0,0 +1,78 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  class << self

    # Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
    # CDATA is considered text unless :keep_cdata => false is specified.
    # Redundant whitespace will be removed unless :keep_whitespace => true is specified.
    #
    # The argument is not modified; a new String is returned.
    #
    def strip_tags(string, options = {})
      options = {
        :keep_whitespace => false,
        :keep_cdata => true
      }.merge!(options)

      # String#gsub always returns a new String, so the in-place edits below
      # only ever touch this private copy.
      text = string.gsub(/<[%?](php)?[^>]*>/, '') # strip php, erb et al

      # Non-greedy and multiline so comments may contain hyphens and newlines.
      text.gsub!(/<!--.*?-->/m, '') # strip comments

      # Non-greedy so the section ends at the first "]]>" even when the
      # content itself contains "]".
      text.gsub!(
        /
          <!\[CDATA\[
          (.*?)
          \]\]>
        /xmi,
        options[:keep_cdata] ? '\\1' : ''
      )

      html_name = /[\w:-]+/
      html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
      html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

      text.gsub!(
        /
          <
          [\/]?
          #{html_name}
          (\s+(#{html_attr}(\s+#{html_attr})*))?
          \s*
          [\/]?
          >
        /xi,
        ''
      )

      options[:keep_whitespace] ? text : trim_whitespace(text)
    end


    # Similar to +gsub+, except it works in between HTML/XML tags and
    # yields text to a block. Text will be replaced by what the block
    # returns.
    # Warning: does not work in some degenerate cases.
    #
    # Always returns a new String (never nil) and leaves the argument
    # untouched.
    #
    def gsub_tags(string, &block)
      raise "No block given" unless block_given?

      string.gsub(/(<[^>]*>)|([^<]+)/) do
        text = $2
        text ? yield(text) : $1
      end
    end


    # Iterates over all text in between HTML/XML tags and yields
    # it to a block.
    # Warning: does not work in some degenerate cases.
    #
    def scan_tags(string, &block)
      raise "No block given" unless block_given?

      string.scan(/(<[^>]*>)|([^<]+)/) do |match|
        yield($2) unless $2.nil?
      end
    end

  end # class << self

end # module Sterile
@@ -0,0 +1,123 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  class << self

    # Format text appropriately for titles. This method is much smarter
    # than ActiveSupport's +titlecase+. The algorithm is based on work done
    # by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
    #
    # Surrounding whitespace is stripped and internal runs of whitespace are
    # collapsed to a single space. Input with no lowercase letters at all is
    # downcased first. The argument is not modified; a new String is returned.
    #
    def titlecase(string)

      lsquo = [8216].pack("U")
      rsquo = [8217].pack("U")
      ldquo = [8220].pack("U")
      rdquo = [8221].pack("U")
      ndash = [8211].pack("U")

      # strip and gsub return new strings, so every gsub! below works on a
      # private copy and frozen input is accepted.
      title = string.strip.gsub(/\s+/, " ")
      title.downcase! unless title =~ /[[:lower:]]/

      small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
      apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

      # Pass 1: classify every word and case it accordingly.
      title.gsub!(
        /
          \b
          ([_\*]*)
          (?:
            ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )               # URL, domain, or email
            |
            ( (?i: #{small_words} ) #{apos} )                         # or small word, case-insensitive
            |
            ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )    # or word without internal caps
            |
            ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )    # or some other word
          )
          ([_\*]*)
          \b
        /xu
      ) do
        ($1 ? $1 : "") +
        ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
        ($6 ? $6 : "")
      end

      if RUBY_VERSION < "1.9.0"
        # Capitalize both sides of an en-dash compound. The second word comes
        # from the third capture group, and the POSIX class needs its own
        # bracket expression.
        title.gsub!(
          /
            \b
            ([[:alpha:]]+)
            (#{ndash})
            ([[:alpha:]]+)
            \b
          /xu
        ) do
          $1.downcase.capitalize + $2 + $3.downcase.capitalize
        end
      end

      # Pass 2: a small word that opens the title, a subsentence or an
      # inserted subphrase is capitalized after all.
      title.gsub!(
        /
          (
            \A [[:punct:]]*                           # start of title
            | [:.;?!][ ]+                             # or of subsentence
            | [ ]['"#{ldquo}#{lsquo}(\[][ ]*          # or of inserted subphrase
          )
          ( #{small_words} )                          # followed by a small-word
          \b
        /xiu
      ) do
        $1 + $2.downcase.capitalize
      end

      # Pass 3: likewise for a small word that closes the title or an
      # inserted subphrase.
      title.gsub!(
        /
          \b
          ( #{small_words} )                          # small-word
          (?=
            [[:punct:]]* \Z                           # at the end of the title
            |
            ['"#{rsquo}#{rdquo})\]] [ ]               # or of an inserted subphrase
          )
        /xu
      ) do
        $1.downcase.capitalize
      end

      # Pass 4: lowercase the letter that follows a single letter and a dash.
      title.gsub!(
        /
          (
            \b
            [[:alpha:]]                               # single first letter
            [\-#{ndash}]                              # followed by a dash
          )
          ( [[:alpha:]] )                             # followed by a letter
        /xu
      ) do
        $1 + $2.downcase
      end

      title.gsub!(/q&a/i, 'Q&A')

      title
    end
    alias_method :titleize, :titlecase


    private

    # Lazy load smart formatting rules
    #
    # Not called by anything in this block.
    #
    def smart_format_rules
      @smart_format_rules ||= begin
        require "sterile/data/smart_format_rules"
        Data.smart_format_rules
      end
    end

  end # class << self

end # module Sterile
@@ -0,0 +1,65 @@
1
+ # encoding: UTF-8
2
+
3
module Sterile

  class << self

    # Rebuild +string+ one codepoint at a time.
    #
    # Each codepoint is split into a high part (codepoint >> 8) and a low
    # byte (codepoint & 0xFF), which index the codepoints table in that
    # order. The block receives the table entry and the codepoint, and its
    # return value is appended to the result.
    #
    # Any StandardError raised while looking up or converting a codepoint
    # (including one raised by the block) is swallowed and that codepoint
    # contributes nothing to the result.
    #
    # Raises RuntimeError ("No block given") when called without a block.
    #
    def transmogrify(string, &block)
      raise "No block given" unless block_given?

      string.unpack("U*").inject("") do |output, codepoint|
        high, low = codepoint.divmod(256)
        begin
          output << yield(codepoints_data[high][low], codepoint)
        rescue StandardError
          # Skip this codepoint and carry on with the next one.
        end
        output
      end
    end

    # Transliterate Unicode [and accented ASCII] characters to their plain-text
    # ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
    # which is in turn a port of Perl's Unidecode and ostensibly provides
    # superior results to iconv. The optical conversion data is based on work
    # by Eric Boehs at https://github.com/ericboehs/to_slug
    # Passing an option of :optical => true will prefer optical mapping instead
    # of more pedantic matches.
    #
    #   "ýůçký".transliterate # => "yucky"
    #
    def transliterate(string, options = {})
      options = {
        :optical => false
      }.merge!(options)

      # Element 1 of a mapping is tried first in optical mode, element 0
      # otherwise; the other element is the fallback.
      preferred, fallback = options[:optical] ? [1, 0] : [0, 1]

      transmogrify(string) do |mapping, codepoint|
        mapping[preferred] || mapping[fallback] || ""
      end
    end
    alias_method :to_ascii, :transliterate


    private

    # Lazy load codepoints data
    #
    def codepoints_data
      @codepoints_data ||= begin
        require "sterile/data/codepoints_data"
        Data.codepoints_data
      end
    end

  end # class << self

end # module Sterile