sterile 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +1 -1
- data/.yaropts +1 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -1
- data/lib/sterile/data/codepoints_data.rb +46527 -0
- data/lib/sterile/data/html_entities_data.rb +264 -0
- data/lib/sterile/data/smart_format_rules.rb +45 -0
- data/lib/sterile/entities.rb +48 -0
- data/lib/sterile/smart_format.rb +41 -0
- data/lib/sterile/string_extensions.rb +19 -0
- data/lib/sterile/tags.rb +78 -0
- data/lib/sterile/titlecase.rb +123 -0
- data/lib/sterile/transliterate.rb +65 -0
- data/lib/sterile/utilities.rb +43 -0
- data/lib/sterile/version.rb +1 -1
- data/lib/sterile.rb +7 -314
- metadata +13 -5
- data/lib/sterile/codepoints.rb +0 -46523
- data/lib/sterile/html_entities.rb +0 -260
- data/lib/sterile/smart_format_rules.rb +0 -41
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Sterile
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# Trim whitespace from start and end of string and remove any redundant
|
8
|
+
# whitespace in between.
|
9
|
+
#
|
10
|
+
# " Hello world! ".trim_whitespace # => "Hello world!"
|
11
|
+
#
|
12
|
+
def trim_whitespace(string)
|
13
|
+
string.gsub(/\s+/, " ").strip
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
# Transliterate to ASCII and strip out any HTML/XML tags.
|
18
|
+
#
|
19
|
+
# "<b>nåsty</b>".sterilize # => "nasty"
|
20
|
+
#
|
21
|
+
def sterilize(string)
|
22
|
+
strip_tags(transliterate(string))
|
23
|
+
end
|
24
|
+
|
25
|
+
|
26
|
+
# Transliterate to ASCII, downcase and format for URL permalink/slug
|
27
|
+
# by stripping out all non-alphanumeric characters and replacing spaces
|
28
|
+
# with a delimiter (defaults to '-').
|
29
|
+
#
|
30
|
+
# "Hello World!".sluggerize # => "hello-world"
|
31
|
+
#
|
32
|
+
def sluggerize(string, options = {})
|
33
|
+
options = {
|
34
|
+
:delimiter => "-"
|
35
|
+
}.merge!(options)
|
36
|
+
|
37
|
+
sterilize(string).strip.gsub(/\s+/, "-").gsub(/[^a-zA-Z0-9\-]/, "").gsub(/-+/, options[:delimiter]).downcase
|
38
|
+
end
|
39
|
+
alias_method :to_slug, :sluggerize
|
40
|
+
|
41
|
+
end # class << self
|
42
|
+
|
43
|
+
end # module Sterile
|
data/lib/sterile/version.rb
CHANGED
data/lib/sterile.rb
CHANGED
@@ -21,319 +21,12 @@
|
|
21
21
|
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
|
24
|
-
require "sterile/codepoints"
|
25
|
-
require "sterile/html_entities"
|
26
|
-
require "sterile/smart_format_rules"
|
27
24
|
|
25
|
+
require "sterile/transliterate"
|
26
|
+
require "sterile/smart_format"
|
27
|
+
require "sterile/titlecase"
|
28
|
+
require "sterile/utilities"
|
29
|
+
require "sterile/entities"
|
30
|
+
require "sterile/tags"
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
class << self
|
32
|
-
|
33
|
-
def transmogrify(string, &block)
|
34
|
-
raise "No block given" unless block_given?
|
35
|
-
|
36
|
-
result = ""
|
37
|
-
string.unpack("U*").each do |codepoint|
|
38
|
-
cg = codepoint >> 8
|
39
|
-
cp = codepoint & 0xFF
|
40
|
-
begin
|
41
|
-
mapping = CODEPOINTS[cg][cp]
|
42
|
-
result << yield(mapping, codepoint)
|
43
|
-
rescue
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
result
|
48
|
-
end
|
49
|
-
|
50
|
-
# Transliterate Unicode [and accented ASCII] characters to their plain-text
|
51
|
-
# ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
|
52
|
-
# which is in turn a port of Perl's Unidecode and ostensibly provides
|
53
|
-
# superior results to iconv. The optical conversion data is based on work
|
54
|
-
# by Eric Boehs at https://github.com/ericboehs/to_slug
|
55
|
-
# Passing an option of :optical => true will prefer optical mapping instead
|
56
|
-
# of more pedantic matches.
|
57
|
-
#
|
58
|
-
# "ýůçký".transliterate # => "yucky"
|
59
|
-
#
|
60
|
-
def transliterate(string, options = {})
|
61
|
-
options = {
|
62
|
-
:optical => false
|
63
|
-
}.merge!(options)
|
64
|
-
|
65
|
-
if options[:optical]
|
66
|
-
transmogrify(string) do |mapping, codepoint|
|
67
|
-
mapping[1] || mapping[0] || ""
|
68
|
-
end
|
69
|
-
else
|
70
|
-
transmogrify(string) do |mapping, codepoint|
|
71
|
-
mapping[0] || mapping[1] || ""
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
alias_method :to_ascii, :transliterate
|
76
|
-
|
77
|
-
|
78
|
-
# Trim whitespace from start and end of string and remove any redundant
|
79
|
-
# whitespace in between.
|
80
|
-
#
|
81
|
-
# " Hello world! ".trim_whitespace # => "Hello world!"
|
82
|
-
#
|
83
|
-
def trim_whitespace(string)
|
84
|
-
string.gsub(/\s+/, " ").strip
|
85
|
-
end
|
86
|
-
|
87
|
-
|
88
|
-
# Transliterate to ASCII and strip out any HTML/XML tags.
|
89
|
-
#
|
90
|
-
# "<b>nåsty</b>".sterilize # => "nasty"
|
91
|
-
#
|
92
|
-
def sterilize(string)
|
93
|
-
strip_tags(transliterate(string))
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
# Transliterate to ASCII, downcase and format for URL permalink/slug
|
98
|
-
# by stripping out all non-alphanumeric characters and replacing spaces
|
99
|
-
# with a delimiter (defaults to '-').
|
100
|
-
#
|
101
|
-
# "Hello World!".sluggerize # => "hello-world"
|
102
|
-
#
|
103
|
-
def sluggerize(string, options = {})
|
104
|
-
options = {
|
105
|
-
:delimiter => "-"
|
106
|
-
}.merge!(options)
|
107
|
-
|
108
|
-
sterilize(string).strip.gsub(/\s+/, "-").gsub(/[^a-zA-Z0-9\-]/, "").gsub(/-+/, options[:delimiter]).downcase
|
109
|
-
end
|
110
|
-
alias_method :to_slug, :sluggerize
|
111
|
-
|
112
|
-
|
113
|
-
# Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
|
114
|
-
#
|
115
|
-
# q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
|
116
|
-
#
|
117
|
-
def smart_format(string)
|
118
|
-
SMART_FORMAT_RULES.each do |rule|
|
119
|
-
string.gsub!(rule[0], rule[1])
|
120
|
-
end
|
121
|
-
string
|
122
|
-
end
|
123
|
-
|
124
|
-
|
125
|
-
# Turn Unicode characters into their HTML equivalents.
|
126
|
-
# If a valid HTML entity is not possible, it will create a numeric entity.
|
127
|
-
#
|
128
|
-
# q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
|
129
|
-
#
|
130
|
-
def encode_entities(string)
|
131
|
-
transmogrify(string) do |mapping, codepoint|
|
132
|
-
if (32..126).include?(codepoint)
|
133
|
-
mapping[0]
|
134
|
-
else
|
135
|
-
"&" + (mapping[2] || "#" + codepoint.to_s) + ";"
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
|
141
|
-
# The reverse of +encode_entities+. Turns HTML or numeric entities into
|
142
|
-
# their Unicode counterparts.
|
143
|
-
#
|
144
|
-
def decode_entities(string)
|
145
|
-
string.gsub!(/&#(\d{1,4});/) { [$1.to_i].pack("U") }
|
146
|
-
string.gsub(/&([a-zA-Z0-9]+);/) do
|
147
|
-
codepoint = HTML_ENTITIES[$1]
|
148
|
-
codepoint ? [codepoint].pack("U") : $&
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
|
153
|
-
# Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
|
154
|
-
# CDATA is considered text unless :keep_cdata => false is specified.
|
155
|
-
# Redundant whitespace will be removed unless :keep_whitespace => true is specified.
|
156
|
-
#
|
157
|
-
def strip_tags(string, options = {})
|
158
|
-
options = {
|
159
|
-
:keep_whitespace => false,
|
160
|
-
:keep_cdata => true
|
161
|
-
}.merge!(options)
|
162
|
-
|
163
|
-
string.gsub!(/<[%?](php)?[^>]*>/, '') # strip php, erb et al
|
164
|
-
string.gsub!(/<!--[^-]*-->/, '') # strip comments
|
165
|
-
|
166
|
-
string.gsub!(
|
167
|
-
/
|
168
|
-
<!\[CDATA\[
|
169
|
-
([^\]]*)
|
170
|
-
\]\]>
|
171
|
-
/xi,
|
172
|
-
options[:keep_cdata] ? '\\1' : ''
|
173
|
-
)
|
174
|
-
|
175
|
-
html_name = /[\w:-]+/
|
176
|
-
html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
|
177
|
-
html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/
|
178
|
-
|
179
|
-
string.gsub!(
|
180
|
-
/
|
181
|
-
<
|
182
|
-
[\/]?
|
183
|
-
#{html_name}
|
184
|
-
(\s+(#{html_attr}(\s+#{html_attr})*))?
|
185
|
-
\s*
|
186
|
-
[\/]?
|
187
|
-
>
|
188
|
-
/xi,
|
189
|
-
''
|
190
|
-
)
|
191
|
-
|
192
|
-
options[:keep_whitespace] ? string : trim_whitespace(string)
|
193
|
-
end
|
194
|
-
|
195
|
-
|
196
|
-
# Similar to +gsub+, except it works in between HTML/XML tags and
|
197
|
-
# yields text to a block. Text will be replaced by what the block
|
198
|
-
# returns.
|
199
|
-
# Warning: does not work in some degenerate cases.
|
200
|
-
#
|
201
|
-
def gsub_tags(string, &block)
|
202
|
-
raise "No block given" unless block_given?
|
203
|
-
|
204
|
-
string.gsub!(/(<[^>]*>)|([^<]+)/) do |match|
|
205
|
-
$2 ? yield($2) : $1
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
|
210
|
-
# Iterates over all text in between HTML/XML tags and yields
|
211
|
-
# it to a block.
|
212
|
-
# Warning: does not work in some degenerate cases.
|
213
|
-
#
|
214
|
-
def scan_tags(string, &block)
|
215
|
-
raise "No block given" unless block_given?
|
216
|
-
|
217
|
-
string.scan(/(<[^>]*>)|([^<]+)/) do |match|
|
218
|
-
yield($2) unless $2.nil?
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
|
223
|
-
# Like +smart_format+, but works with HTML/XML (somewhat).
|
224
|
-
#
|
225
|
-
def smart_format_tags(string)
|
226
|
-
string.gsub_tags do |text|
|
227
|
-
text.smart_format.encode_entities
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
|
232
|
-
# Format text appropriately for titles. This method is much smarter
|
233
|
-
# than ActiveSupport's +titlecase+. The algorithm is based on work done
|
234
|
-
# by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
|
235
|
-
#
|
236
|
-
def titlecase(string)
|
237
|
-
string.strip!
|
238
|
-
string.gsub!(/\s+/, " ")
|
239
|
-
string.downcase! unless string =~ /[[:lower:]]/
|
240
|
-
|
241
|
-
small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
|
242
|
-
apos = / (?: ['’] [[:lower:]]* )? /xu
|
243
|
-
|
244
|
-
string.gsub!(
|
245
|
-
/
|
246
|
-
\b
|
247
|
-
([_\*]*)
|
248
|
-
(?:
|
249
|
-
( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} ) # URL, domain, or email
|
250
|
-
|
|
251
|
-
( (?i: #{small_words} ) #{apos} ) # or small word, case-insensitive
|
252
|
-
|
|
253
|
-
( [[:alpha:]] [[:lower:]'’()\[\]{}]* #{apos} ) # or word without internal caps
|
254
|
-
|
|
255
|
-
( [[:alpha:]] [[:alpha:]'’()\[\]{}]* #{apos} ) # or some other word
|
256
|
-
)
|
257
|
-
([_\*]*)
|
258
|
-
\b
|
259
|
-
/xu
|
260
|
-
) do
|
261
|
-
($1 ? $1 : "") +
|
262
|
-
($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
|
263
|
-
($6 ? $6 : "")
|
264
|
-
end
|
265
|
-
|
266
|
-
if RUBY_VERSION < "1.9.0"
|
267
|
-
string.gsub!(
|
268
|
-
/
|
269
|
-
\b
|
270
|
-
([:alpha:]+)
|
271
|
-
(‑)
|
272
|
-
([:alpha:]+)
|
273
|
-
\b
|
274
|
-
/xu
|
275
|
-
) do
|
276
|
-
$1.downcase.capitalize + $2 + $1.downcase.capitalize
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
string.gsub!(
|
281
|
-
/
|
282
|
-
(
|
283
|
-
\A [[:punct:]]* # start of title
|
284
|
-
| [:.;?!][ ]+ # or of subsentence
|
285
|
-
| [ ]['"“‘(\[][ ]* # or of inserted subphrase
|
286
|
-
)
|
287
|
-
( #{small_words} ) # followed by a small-word
|
288
|
-
\b
|
289
|
-
/xiu
|
290
|
-
) do
|
291
|
-
$1 + $2.downcase.capitalize
|
292
|
-
end
|
293
|
-
|
294
|
-
string.gsub!(
|
295
|
-
/
|
296
|
-
\b
|
297
|
-
( #{small_words} ) # small-word
|
298
|
-
(?=
|
299
|
-
[[:punct:]]* \Z # at the end of the title
|
300
|
-
|
|
301
|
-
['"’”)\]] [ ] # or of an inserted subphrase
|
302
|
-
)
|
303
|
-
/xu
|
304
|
-
) do
|
305
|
-
$1.downcase.capitalize
|
306
|
-
end
|
307
|
-
|
308
|
-
string.gsub!(
|
309
|
-
/
|
310
|
-
(
|
311
|
-
\b
|
312
|
-
[[:alpha:]] # single first letter
|
313
|
-
[\-‑] # followed by a dash
|
314
|
-
)
|
315
|
-
( [[:alpha:]] ) # followed by a letter
|
316
|
-
/xu
|
317
|
-
) do
|
318
|
-
$1 + $2.downcase
|
319
|
-
end
|
320
|
-
|
321
|
-
string.gsub!(/q&a/i, 'Q&A')
|
322
|
-
|
323
|
-
string
|
324
|
-
end
|
325
|
-
|
326
|
-
end
|
327
|
-
|
328
|
-
end
|
329
|
-
|
330
|
-
|
331
|
-
# Add extensions to String
|
332
|
-
#
|
333
|
-
class String
|
334
|
-
Sterile.methods(false).each do |method|
|
335
|
-
eval("def #{method}(*args, &block); Sterile.#{method}(self, *args, &block); end")
|
336
|
-
eval("def #{method}!(*args, &block); replace Sterile.#{method}(self, *args, &block); end")
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
32
|
+
require "sterile/string_extensions"
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: sterile
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.0.1
|
5
|
+
version: 1.0.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Patrick Hogan
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-06-
|
13
|
+
date: 2011-06-03 00:00:00 -05:00
|
14
14
|
default_executable:
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -27,14 +27,22 @@ files:
|
|
27
27
|
- .autotest
|
28
28
|
- .gitignore
|
29
29
|
- .rvmrc
|
30
|
+
- .yaropts
|
30
31
|
- Gemfile
|
31
32
|
- Gemfile.lock
|
32
33
|
- README.markdown
|
33
34
|
- Rakefile
|
34
35
|
- lib/sterile.rb
|
35
|
-
- lib/sterile/codepoints.rb
|
36
|
-
- lib/sterile/html_entities.rb
|
37
|
-
- lib/sterile/smart_format_rules.rb
|
36
|
+
- lib/sterile/data/codepoints_data.rb
|
37
|
+
- lib/sterile/data/html_entities_data.rb
|
38
|
+
- lib/sterile/data/smart_format_rules.rb
|
39
|
+
- lib/sterile/entities.rb
|
40
|
+
- lib/sterile/smart_format.rb
|
41
|
+
- lib/sterile/string_extensions.rb
|
42
|
+
- lib/sterile/tags.rb
|
43
|
+
- lib/sterile/titlecase.rb
|
44
|
+
- lib/sterile/transliterate.rb
|
45
|
+
- lib/sterile/utilities.rb
|
38
46
|
- lib/sterile/version.rb
|
39
47
|
- sterile.gemspec
|
40
48
|
has_rdoc: true
|