sterile 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +1 -1
- data/.yaropts +1 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -1
- data/lib/sterile/data/codepoints_data.rb +46527 -0
- data/lib/sterile/data/html_entities_data.rb +264 -0
- data/lib/sterile/data/smart_format_rules.rb +45 -0
- data/lib/sterile/entities.rb +48 -0
- data/lib/sterile/smart_format.rb +41 -0
- data/lib/sterile/string_extensions.rb +19 -0
- data/lib/sterile/tags.rb +78 -0
- data/lib/sterile/titlecase.rb +123 -0
- data/lib/sterile/transliterate.rb +65 -0
- data/lib/sterile/utilities.rb +43 -0
- data/lib/sterile/version.rb +1 -1
- data/lib/sterile.rb +7 -314
- metadata +13 -5
- data/lib/sterile/codepoints.rb +0 -46523
- data/lib/sterile/html_entities.rb +0 -260
- data/lib/sterile/smart_format_rules.rb +0 -41
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Trim whitespace from start and end of string and remove any redundant
|
8
|
+
# whitespace in between.
|
9
|
+
#
|
10
|
+
# " Hello world! ".trim_whitespace # => "Hello world!"
|
11
|
+
#
|
12
|
+
def trim_whitespace(string)
|
13
|
+
string.gsub(/\s+/, " ").strip
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
# Transliterate to ASCII and strip out any HTML/XML tags.
|
18
|
+
#
|
19
|
+
# "<b>nåsty</b>".sterilize # => "nasty"
|
20
|
+
#
|
21
|
+
def sterilize(string)
|
22
|
+
strip_tags(transliterate(string))
|
23
|
+
end
|
24
|
+
|
25
|
+
|
26
|
+
# Transliterate to ASCII, downcase and format for URL permalink/slug
|
27
|
+
# by stripping out all non-alphanumeric characters and replacing spaces
|
28
|
+
# with a delimiter (defaults to '-').
|
29
|
+
#
|
30
|
+
# "Hello World!".sluggerize # => "hello-world"
|
31
|
+
#
|
32
|
+
def sluggerize(string, options = {})
|
33
|
+
options = {
|
34
|
+
:delimiter => "-"
|
35
|
+
}.merge!(options)
|
36
|
+
|
37
|
+
sterilize(string).strip.gsub(/\s+/, "-").gsub(/[^a-zA-Z0-9\-]/, "").gsub(/-+/, options[:delimiter]).downcase
|
38
|
+
end
|
39
|
+
alias_method :to_slug, :sluggerize
|
40
|
+
|
41
|
+
end # class << self
|
42
|
+
|
43
|
+
end # module Sterile
|
data/lib/sterile/version.rb
CHANGED
data/lib/sterile.rb
CHANGED
@@ -21,319 +21,12 @@
|
|
21
21
|
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
|
24
|
-
require "sterile/codepoints"
|
25
|
-
require "sterile/html_entities"
|
26
|
-
require "sterile/smart_format_rules"
|
27
24
|
|
25
|
+
require "sterile/transliterate"
|
26
|
+
require "sterile/smart_format"
|
27
|
+
require "sterile/titlecase"
|
28
|
+
require "sterile/utilities"
|
29
|
+
require "sterile/entities"
|
30
|
+
require "sterile/tags"
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
class << self
|
32
|
-
|
33
|
-
def transmogrify(string, &block)
|
34
|
-
raise "No block given" unless block_given?
|
35
|
-
|
36
|
-
result = ""
|
37
|
-
string.unpack("U*").each do |codepoint|
|
38
|
-
cg = codepoint >> 8
|
39
|
-
cp = codepoint & 0xFF
|
40
|
-
begin
|
41
|
-
mapping = CODEPOINTS[cg][cp]
|
42
|
-
result << yield(mapping, codepoint)
|
43
|
-
rescue
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
result
|
48
|
-
end
|
49
|
-
|
50
|
-
# Transliterate Unicode [and accented ASCII] characters to their plain-text
|
51
|
-
# ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
|
52
|
-
# which is in turn a port of Perl's Unidecode and ostensibly provides
|
53
|
-
# superior results to iconv. The optical conversion data is based on work
|
54
|
-
# by Eric Boehs at https://github.com/ericboehs/to_slug
|
55
|
-
# Passing an option of :optical => true will prefer optical mapping instead
|
56
|
-
# of more pedantic matches.
|
57
|
-
#
|
58
|
-
# "ýůçký".transliterate # => "yucky"
|
59
|
-
#
|
60
|
-
def transliterate(string, options = {})
|
61
|
-
options = {
|
62
|
-
:optical => false
|
63
|
-
}.merge!(options)
|
64
|
-
|
65
|
-
if options[:optical]
|
66
|
-
transmogrify(string) do |mapping, codepoint|
|
67
|
-
mapping[1] || mapping[0] || ""
|
68
|
-
end
|
69
|
-
else
|
70
|
-
transmogrify(string) do |mapping, codepoint|
|
71
|
-
mapping[0] || mapping[1] || ""
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
alias_method :to_ascii, :transliterate
|
76
|
-
|
77
|
-
|
78
|
-
# Trim whitespace from start and end of string and remove any redundant
|
79
|
-
# whitespace in between.
|
80
|
-
#
|
81
|
-
# " Hello world! ".trim_whitespace # => "Hello world!"
|
82
|
-
#
|
83
|
-
def trim_whitespace(string)
|
84
|
-
string.gsub(/\s+/, " ").strip
|
85
|
-
end
|
86
|
-
|
87
|
-
|
88
|
-
# Transliterate to ASCII and strip out any HTML/XML tags.
|
89
|
-
#
|
90
|
-
# "<b>nåsty</b>".sterilize # => "nasty"
|
91
|
-
#
|
92
|
-
def sterilize(string)
|
93
|
-
strip_tags(transliterate(string))
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
# Transliterate to ASCII, downcase and format for URL permalink/slug
|
98
|
-
# by stripping out all non-alphanumeric characters and replacing spaces
|
99
|
-
# with a delimiter (defaults to '-').
|
100
|
-
#
|
101
|
-
# "Hello World!".sluggerize # => "hello-world"
|
102
|
-
#
|
103
|
-
def sluggerize(string, options = {})
|
104
|
-
options = {
|
105
|
-
:delimiter => "-"
|
106
|
-
}.merge!(options)
|
107
|
-
|
108
|
-
sterilize(string).strip.gsub(/\s+/, "-").gsub(/[^a-zA-Z0-9\-]/, "").gsub(/-+/, options[:delimiter]).downcase
|
109
|
-
end
|
110
|
-
alias_method :to_slug, :sluggerize
|
111
|
-
|
112
|
-
|
113
|
-
# Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
|
114
|
-
#
|
115
|
-
# q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
|
116
|
-
#
|
117
|
-
def smart_format(string)
|
118
|
-
SMART_FORMAT_RULES.each do |rule|
|
119
|
-
string.gsub!(rule[0], rule[1])
|
120
|
-
end
|
121
|
-
string
|
122
|
-
end
|
123
|
-
|
124
|
-
|
125
|
-
# Turn Unicode characters into their HTML equivalents.
|
126
|
-
# If a valid HTML entity is not possible, it will create a numeric entity.
|
127
|
-
#
|
128
|
-
# q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
|
129
|
-
#
|
130
|
-
def encode_entities(string)
|
131
|
-
transmogrify(string) do |mapping, codepoint|
|
132
|
-
if (32..126).include?(codepoint)
|
133
|
-
mapping[0]
|
134
|
-
else
|
135
|
-
"&" + (mapping[2] || "#" + codepoint.to_s) + ";"
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
|
141
|
-
# The reverse of +encode_entities+. Turns HTML or numeric entities into
|
142
|
-
# their Unicode counterparts.
|
143
|
-
#
|
144
|
-
def decode_entities(string)
|
145
|
-
string.gsub!(/&#(\d{1,4});/) { [$1.to_i].pack("U") }
|
146
|
-
string.gsub(/&([a-zA-Z0-9]+);/) do
|
147
|
-
codepoint = HTML_ENTITIES[$1]
|
148
|
-
codepoint ? [codepoint].pack("U") : $&
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
|
153
|
-
# Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
|
154
|
-
# CDATA is considered text unless :keep_cdata => false is specified.
|
155
|
-
# Redundant whitespace will be removed unless :keep_whitespace => true is specified.
|
156
|
-
#
|
157
|
-
def strip_tags(string, options = {})
|
158
|
-
options = {
|
159
|
-
:keep_whitespace => false,
|
160
|
-
:keep_cdata => true
|
161
|
-
}.merge!(options)
|
162
|
-
|
163
|
-
string.gsub!(/<[%?](php)?[^>]*>/, '') # strip php, erb et al
|
164
|
-
string.gsub!(/<!--[^-]*-->/, '') # strip comments
|
165
|
-
|
166
|
-
string.gsub!(
|
167
|
-
/
|
168
|
-
<!\[CDATA\[
|
169
|
-
([^\]]*)
|
170
|
-
\]\]>
|
171
|
-
/xi,
|
172
|
-
options[:keep_cdata] ? '\\1' : ''
|
173
|
-
)
|
174
|
-
|
175
|
-
html_name = /[\w:-]+/
|
176
|
-
html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
|
177
|
-
html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/
|
178
|
-
|
179
|
-
string.gsub!(
|
180
|
-
/
|
181
|
-
<
|
182
|
-
[\/]?
|
183
|
-
#{html_name}
|
184
|
-
(\s+(#{html_attr}(\s+#{html_attr})*))?
|
185
|
-
\s*
|
186
|
-
[\/]?
|
187
|
-
>
|
188
|
-
/xi,
|
189
|
-
''
|
190
|
-
)
|
191
|
-
|
192
|
-
options[:keep_whitespace] ? string : trim_whitespace(string)
|
193
|
-
end
|
194
|
-
|
195
|
-
|
196
|
-
# Similar to +gsub+, except it works in between HTML/XML tags and
|
197
|
-
# yields text to a block. Text will be replaced by what the block
|
198
|
-
# returns.
|
199
|
-
# Warning: does not work in some degenerate cases.
|
200
|
-
#
|
201
|
-
def gsub_tags(string, &block)
|
202
|
-
raise "No block given" unless block_given?
|
203
|
-
|
204
|
-
string.gsub!(/(<[^>]*>)|([^<]+)/) do |match|
|
205
|
-
$2 ? yield($2) : $1
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
|
210
|
-
# Iterates over all text in between HTML/XML tags and yields
|
211
|
-
# it to a block.
|
212
|
-
# Warning: does not work in some degenerate cases.
|
213
|
-
#
|
214
|
-
def scan_tags(string, &block)
|
215
|
-
raise "No block given" unless block_given?
|
216
|
-
|
217
|
-
string.scan(/(<[^>]*>)|([^<]+)/) do |match|
|
218
|
-
yield($2) unless $2.nil?
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
|
223
|
-
# Like +smart_format+, but works with HTML/XML (somewhat).
|
224
|
-
#
|
225
|
-
def smart_format_tags(string)
|
226
|
-
string.gsub_tags do |text|
|
227
|
-
text.smart_format.encode_entities
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
|
232
|
-
# Format text appropriately for titles. This method is much smarter
|
233
|
-
# than ActiveSupport's +titlecase+. The algorithm is based on work done
|
234
|
-
# by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
|
235
|
-
#
|
236
|
-
def titlecase(string)
|
237
|
-
string.strip!
|
238
|
-
string.gsub!(/\s+/, " ")
|
239
|
-
string.downcase! unless string =~ /[[:lower:]]/
|
240
|
-
|
241
|
-
small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
|
242
|
-
apos = / (?: ['’] [[:lower:]]* )? /xu
|
243
|
-
|
244
|
-
string.gsub!(
|
245
|
-
/
|
246
|
-
\b
|
247
|
-
([_\*]*)
|
248
|
-
(?:
|
249
|
-
( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} ) # URL, domain, or email
|
250
|
-
|
|
251
|
-
( (?i: #{small_words} ) #{apos} ) # or small word, case-insensitive
|
252
|
-
|
|
253
|
-
( [[:alpha:]] [[:lower:]'’()\[\]{}]* #{apos} ) # or word without internal caps
|
254
|
-
|
|
255
|
-
( [[:alpha:]] [[:alpha:]'’()\[\]{}]* #{apos} ) # or some other word
|
256
|
-
)
|
257
|
-
([_\*]*)
|
258
|
-
\b
|
259
|
-
/xu
|
260
|
-
) do
|
261
|
-
($1 ? $1 : "") +
|
262
|
-
($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
|
263
|
-
($6 ? $6 : "")
|
264
|
-
end
|
265
|
-
|
266
|
-
if RUBY_VERSION < "1.9.0"
|
267
|
-
string.gsub!(
|
268
|
-
/
|
269
|
-
\b
|
270
|
-
([:alpha:]+)
|
271
|
-
(‑)
|
272
|
-
([:alpha:]+)
|
273
|
-
\b
|
274
|
-
/xu
|
275
|
-
) do
|
276
|
-
$1.downcase.capitalize + $2 + $1.downcase.capitalize
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
string.gsub!(
|
281
|
-
/
|
282
|
-
(
|
283
|
-
\A [[:punct:]]* # start of title
|
284
|
-
| [:.;?!][ ]+ # or of subsentence
|
285
|
-
| [ ]['"“‘(\[][ ]* # or of inserted subphrase
|
286
|
-
)
|
287
|
-
( #{small_words} ) # followed by a small-word
|
288
|
-
\b
|
289
|
-
/xiu
|
290
|
-
) do
|
291
|
-
$1 + $2.downcase.capitalize
|
292
|
-
end
|
293
|
-
|
294
|
-
string.gsub!(
|
295
|
-
/
|
296
|
-
\b
|
297
|
-
( #{small_words} ) # small-word
|
298
|
-
(?=
|
299
|
-
[[:punct:]]* \Z # at the end of the title
|
300
|
-
|
|
301
|
-
['"’”)\]] [ ] # or of an inserted subphrase
|
302
|
-
)
|
303
|
-
/xu
|
304
|
-
) do
|
305
|
-
$1.downcase.capitalize
|
306
|
-
end
|
307
|
-
|
308
|
-
string.gsub!(
|
309
|
-
/
|
310
|
-
(
|
311
|
-
\b
|
312
|
-
[[:alpha:]] # single first letter
|
313
|
-
[\-‑] # followed by a dash
|
314
|
-
)
|
315
|
-
( [[:alpha:]] ) # followed by a letter
|
316
|
-
/xu
|
317
|
-
) do
|
318
|
-
$1 + $2.downcase
|
319
|
-
end
|
320
|
-
|
321
|
-
string.gsub!(/q&a/i, 'Q&A')
|
322
|
-
|
323
|
-
string
|
324
|
-
end
|
325
|
-
|
326
|
-
end
|
327
|
-
|
328
|
-
end
|
329
|
-
|
330
|
-
|
331
|
-
# Add extensions to String
|
332
|
-
#
|
333
|
-
class String
|
334
|
-
Sterile.methods(false).each do |method|
|
335
|
-
eval("def #{method}(*args, &block); Sterile.#{method}(self, *args, &block); end")
|
336
|
-
eval("def #{method}!(*args, &block); replace Sterile.#{method}(self, *args, &block); end")
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
32
|
+
require "sterile/string_extensions"
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: sterile
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.0.1
|
5
|
+
version: 1.0.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Patrick Hogan
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-06-
|
13
|
+
date: 2011-06-03 00:00:00 -05:00
|
14
14
|
default_executable:
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -27,14 +27,22 @@ files:
|
|
27
27
|
- .autotest
|
28
28
|
- .gitignore
|
29
29
|
- .rvmrc
|
30
|
+
- .yaropts
|
30
31
|
- Gemfile
|
31
32
|
- Gemfile.lock
|
32
33
|
- README.markdown
|
33
34
|
- Rakefile
|
34
35
|
- lib/sterile.rb
|
35
|
-
- lib/sterile/codepoints.rb
|
36
|
-
- lib/sterile/html_entities.rb
|
37
|
-
- lib/sterile/smart_format_rules.rb
|
36
|
+
- lib/sterile/data/codepoints_data.rb
|
37
|
+
- lib/sterile/data/html_entities_data.rb
|
38
|
+
- lib/sterile/data/smart_format_rules.rb
|
39
|
+
- lib/sterile/entities.rb
|
40
|
+
- lib/sterile/smart_format.rb
|
41
|
+
- lib/sterile/string_extensions.rb
|
42
|
+
- lib/sterile/tags.rb
|
43
|
+
- lib/sterile/titlecase.rb
|
44
|
+
- lib/sterile/transliterate.rb
|
45
|
+
- lib/sterile/utilities.rb
|
38
46
|
- lib/sterile/version.rb
|
39
47
|
- sterile.gemspec
|
40
48
|
has_rdoc: true
|