stringex 1.3.1 → 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +184 -184
- data/lib/stringex/acts_as_url.rb +107 -0
- data/lib/stringex/string_extensions.rb +198 -0
- data/lib/stringex/unidecoder.rb +159 -0
- data/lib/stringex/unidecoder_data/x00.yml +257 -0
- data/lib/stringex/unidecoder_data/x01.yml +257 -0
- data/lib/stringex/unidecoder_data/x02.yml +256 -0
- data/lib/stringex/unidecoder_data/x03.yml +256 -0
- data/lib/stringex/unidecoder_data/x04.yml +256 -0
- data/lib/stringex/unidecoder_data/x05.yml +256 -0
- data/lib/stringex/unidecoder_data/x06.yml +256 -0
- data/lib/stringex/unidecoder_data/x07.yml +256 -0
- data/lib/stringex/unidecoder_data/x09.yml +256 -0
- data/lib/stringex/unidecoder_data/x0a.yml +256 -0
- data/lib/stringex/unidecoder_data/x0b.yml +256 -0
- data/lib/stringex/unidecoder_data/x0c.yml +256 -0
- data/lib/stringex/unidecoder_data/x0d.yml +256 -0
- data/lib/stringex/unidecoder_data/x0e.yml +256 -0
- data/lib/stringex/unidecoder_data/x0f.yml +256 -0
- data/lib/stringex/unidecoder_data/x10.yml +256 -0
- data/lib/stringex/unidecoder_data/x11.yml +256 -0
- data/lib/stringex/unidecoder_data/x12.yml +257 -0
- data/lib/stringex/unidecoder_data/x13.yml +256 -0
- data/lib/stringex/unidecoder_data/x14.yml +257 -0
- data/lib/stringex/unidecoder_data/x15.yml +257 -0
- data/lib/stringex/unidecoder_data/x16.yml +256 -0
- data/lib/stringex/unidecoder_data/x17.yml +256 -0
- data/lib/stringex/unidecoder_data/x18.yml +256 -0
- data/lib/stringex/unidecoder_data/x1e.yml +256 -0
- data/lib/stringex/unidecoder_data/x1f.yml +256 -0
- data/lib/stringex/unidecoder_data/x20.yml +256 -0
- data/lib/stringex/unidecoder_data/x21.yml +256 -0
- data/lib/stringex/unidecoder_data/x22.yml +256 -0
- data/lib/stringex/unidecoder_data/x23.yml +256 -0
- data/lib/stringex/unidecoder_data/x24.yml +256 -0
- data/lib/stringex/unidecoder_data/x25.yml +256 -0
- data/lib/stringex/unidecoder_data/x26.yml +256 -0
- data/lib/stringex/unidecoder_data/x27.yml +256 -0
- data/lib/stringex/unidecoder_data/x28.yml +257 -0
- data/lib/stringex/unidecoder_data/x2e.yml +256 -0
- data/lib/stringex/unidecoder_data/x2f.yml +256 -0
- data/lib/stringex/unidecoder_data/x30.yml +256 -0
- data/lib/stringex/unidecoder_data/x31.yml +256 -0
- data/lib/stringex/unidecoder_data/x32.yml +256 -0
- data/lib/stringex/unidecoder_data/x33.yml +256 -0
- data/lib/stringex/unidecoder_data/x4d.yml +256 -0
- data/lib/stringex/unidecoder_data/x4e.yml +257 -0
- data/lib/stringex/unidecoder_data/x4f.yml +257 -0
- data/lib/stringex/unidecoder_data/x50.yml +257 -0
- data/lib/stringex/unidecoder_data/x51.yml +257 -0
- data/lib/stringex/unidecoder_data/x52.yml +257 -0
- data/lib/stringex/unidecoder_data/x53.yml +257 -0
- data/lib/stringex/unidecoder_data/x54.yml +257 -0
- data/lib/stringex/unidecoder_data/x55.yml +257 -0
- data/lib/stringex/unidecoder_data/x56.yml +257 -0
- data/lib/stringex/unidecoder_data/x57.yml +257 -0
- data/lib/stringex/unidecoder_data/x58.yml +257 -0
- data/lib/stringex/unidecoder_data/x59.yml +257 -0
- data/lib/stringex/unidecoder_data/x5a.yml +257 -0
- data/lib/stringex/unidecoder_data/x5b.yml +257 -0
- data/lib/stringex/unidecoder_data/x5c.yml +257 -0
- data/lib/stringex/unidecoder_data/x5d.yml +257 -0
- data/lib/stringex/unidecoder_data/x5e.yml +257 -0
- data/lib/stringex/unidecoder_data/x5f.yml +257 -0
- data/lib/stringex/unidecoder_data/x60.yml +257 -0
- data/lib/stringex/unidecoder_data/x61.yml +257 -0
- data/lib/stringex/unidecoder_data/x62.yml +257 -0
- data/lib/stringex/unidecoder_data/x63.yml +257 -0
- data/lib/stringex/unidecoder_data/x64.yml +257 -0
- data/lib/stringex/unidecoder_data/x65.yml +257 -0
- data/lib/stringex/unidecoder_data/x66.yml +257 -0
- data/lib/stringex/unidecoder_data/x67.yml +257 -0
- data/lib/stringex/unidecoder_data/x68.yml +257 -0
- data/lib/stringex/unidecoder_data/x69.yml +257 -0
- data/lib/stringex/unidecoder_data/x6a.yml +257 -0
- data/lib/stringex/unidecoder_data/x6b.yml +257 -0
- data/lib/stringex/unidecoder_data/x6c.yml +257 -0
- data/lib/stringex/unidecoder_data/x6d.yml +257 -0
- data/lib/stringex/unidecoder_data/x6e.yml +257 -0
- data/lib/stringex/unidecoder_data/x6f.yml +257 -0
- data/lib/stringex/unidecoder_data/x70.yml +257 -0
- data/lib/stringex/unidecoder_data/x71.yml +257 -0
- data/lib/stringex/unidecoder_data/x72.yml +257 -0
- data/lib/stringex/unidecoder_data/x73.yml +257 -0
- data/lib/stringex/unidecoder_data/x74.yml +257 -0
- data/lib/stringex/unidecoder_data/x75.yml +257 -0
- data/lib/stringex/unidecoder_data/x76.yml +257 -0
- data/lib/stringex/unidecoder_data/x77.yml +257 -0
- data/lib/stringex/unidecoder_data/x78.yml +257 -0
- data/lib/stringex/unidecoder_data/x79.yml +257 -0
- data/lib/stringex/unidecoder_data/x7a.yml +257 -0
- data/lib/stringex/unidecoder_data/x7b.yml +257 -0
- data/lib/stringex/unidecoder_data/x7c.yml +257 -0
- data/lib/stringex/unidecoder_data/x7d.yml +257 -0
- data/lib/stringex/unidecoder_data/x7e.yml +257 -0
- data/lib/stringex/unidecoder_data/x7f.yml +257 -0
- data/lib/stringex/unidecoder_data/x80.yml +257 -0
- data/lib/stringex/unidecoder_data/x81.yml +257 -0
- data/lib/stringex/unidecoder_data/x82.yml +257 -0
- data/lib/stringex/unidecoder_data/x83.yml +257 -0
- data/lib/stringex/unidecoder_data/x84.yml +257 -0
- data/lib/stringex/unidecoder_data/x85.yml +257 -0
- data/lib/stringex/unidecoder_data/x86.yml +257 -0
- data/lib/stringex/unidecoder_data/x87.yml +257 -0
- data/lib/stringex/unidecoder_data/x88.yml +257 -0
- data/lib/stringex/unidecoder_data/x89.yml +257 -0
- data/lib/stringex/unidecoder_data/x8a.yml +257 -0
- data/lib/stringex/unidecoder_data/x8b.yml +257 -0
- data/lib/stringex/unidecoder_data/x8c.yml +257 -0
- data/lib/stringex/unidecoder_data/x8d.yml +257 -0
- data/lib/stringex/unidecoder_data/x8e.yml +257 -0
- data/lib/stringex/unidecoder_data/x8f.yml +257 -0
- data/lib/stringex/unidecoder_data/x90.yml +257 -0
- data/lib/stringex/unidecoder_data/x91.yml +257 -0
- data/lib/stringex/unidecoder_data/x92.yml +257 -0
- data/lib/stringex/unidecoder_data/x93.yml +257 -0
- data/lib/stringex/unidecoder_data/x94.yml +257 -0
- data/lib/stringex/unidecoder_data/x95.yml +257 -0
- data/lib/stringex/unidecoder_data/x96.yml +257 -0
- data/lib/stringex/unidecoder_data/x97.yml +257 -0
- data/lib/stringex/unidecoder_data/x98.yml +257 -0
- data/lib/stringex/unidecoder_data/x99.yml +257 -0
- data/lib/stringex/unidecoder_data/x9a.yml +257 -0
- data/lib/stringex/unidecoder_data/x9b.yml +257 -0
- data/lib/stringex/unidecoder_data/x9c.yml +257 -0
- data/lib/stringex/unidecoder_data/x9d.yml +257 -0
- data/lib/stringex/unidecoder_data/x9e.yml +257 -0
- data/lib/stringex/unidecoder_data/x9f.yml +256 -0
- data/lib/stringex/unidecoder_data/xa0.yml +257 -0
- data/lib/stringex/unidecoder_data/xa1.yml +257 -0
- data/lib/stringex/unidecoder_data/xa2.yml +257 -0
- data/lib/stringex/unidecoder_data/xa3.yml +257 -0
- data/lib/stringex/unidecoder_data/xa4.yml +256 -0
- data/lib/stringex/unidecoder_data/xac.yml +257 -0
- data/lib/stringex/unidecoder_data/xad.yml +257 -0
- data/lib/stringex/unidecoder_data/xae.yml +257 -0
- data/lib/stringex/unidecoder_data/xaf.yml +257 -0
- data/lib/stringex/unidecoder_data/xb0.yml +257 -0
- data/lib/stringex/unidecoder_data/xb1.yml +257 -0
- data/lib/stringex/unidecoder_data/xb2.yml +257 -0
- data/lib/stringex/unidecoder_data/xb3.yml +257 -0
- data/lib/stringex/unidecoder_data/xb4.yml +257 -0
- data/lib/stringex/unidecoder_data/xb5.yml +257 -0
- data/lib/stringex/unidecoder_data/xb6.yml +257 -0
- data/lib/stringex/unidecoder_data/xb7.yml +257 -0
- data/lib/stringex/unidecoder_data/xb8.yml +257 -0
- data/lib/stringex/unidecoder_data/xb9.yml +257 -0
- data/lib/stringex/unidecoder_data/xba.yml +257 -0
- data/lib/stringex/unidecoder_data/xbb.yml +257 -0
- data/lib/stringex/unidecoder_data/xbc.yml +257 -0
- data/lib/stringex/unidecoder_data/xbd.yml +257 -0
- data/lib/stringex/unidecoder_data/xbe.yml +257 -0
- data/lib/stringex/unidecoder_data/xbf.yml +257 -0
- data/lib/stringex/unidecoder_data/xc0.yml +257 -0
- data/lib/stringex/unidecoder_data/xc1.yml +257 -0
- data/lib/stringex/unidecoder_data/xc2.yml +257 -0
- data/lib/stringex/unidecoder_data/xc3.yml +257 -0
- data/lib/stringex/unidecoder_data/xc4.yml +257 -0
- data/lib/stringex/unidecoder_data/xc5.yml +257 -0
- data/lib/stringex/unidecoder_data/xc6.yml +257 -0
- data/lib/stringex/unidecoder_data/xc7.yml +257 -0
- data/lib/stringex/unidecoder_data/xc8.yml +257 -0
- data/lib/stringex/unidecoder_data/xc9.yml +257 -0
- data/lib/stringex/unidecoder_data/xca.yml +257 -0
- data/lib/stringex/unidecoder_data/xcb.yml +257 -0
- data/lib/stringex/unidecoder_data/xcc.yml +257 -0
- data/lib/stringex/unidecoder_data/xcd.yml +257 -0
- data/lib/stringex/unidecoder_data/xce.yml +257 -0
- data/lib/stringex/unidecoder_data/xcf.yml +257 -0
- data/lib/stringex/unidecoder_data/xd0.yml +257 -0
- data/lib/stringex/unidecoder_data/xd1.yml +257 -0
- data/lib/stringex/unidecoder_data/xd2.yml +257 -0
- data/lib/stringex/unidecoder_data/xd3.yml +257 -0
- data/lib/stringex/unidecoder_data/xd4.yml +257 -0
- data/lib/stringex/unidecoder_data/xd5.yml +257 -0
- data/lib/stringex/unidecoder_data/xd6.yml +257 -0
- data/lib/stringex/unidecoder_data/xd7.yml +256 -0
- data/lib/stringex/unidecoder_data/xf9.yml +257 -0
- data/lib/stringex/unidecoder_data/xfa.yml +256 -0
- data/lib/stringex/unidecoder_data/xfb.yml +257 -0
- data/lib/stringex/unidecoder_data/xfc.yml +257 -0
- data/lib/stringex/unidecoder_data/xfd.yml +256 -0
- data/lib/stringex/unidecoder_data/xfe.yml +257 -0
- data/lib/stringex/unidecoder_data/xff.yml +257 -0
- data/stringex.gemspec +184 -1
- metadata +186 -3
@@ -0,0 +1,198 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Stringex
|
4
|
+
# These methods are all added on String class.
|
5
|
+
module StringExtensions
|
6
|
+
def self.included(base) # :nodoc:
|
7
|
+
base.extend(ClassMethods)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns the string converted (via Textile/RedCloth) to HTML format
|
11
|
+
# or self [with a friendly warning] if Redcloth is not available.
|
12
|
+
#
|
13
|
+
# Using <tt>:lite</tt> argument will cause RedCloth to not wrap the HTML in a container
|
14
|
+
# P element, which is useful behavior for generating header element text, etc.
|
15
|
+
# This is roughly equivalent to ActionView's <tt>textilize_without_paragraph</tt>
|
16
|
+
# except that it makes RedCloth do all the work instead of just gsubbing the return
|
17
|
+
# from RedCloth.
|
18
|
+
def to_html(lite_mode = false)
|
19
|
+
if defined?(RedCloth)
|
20
|
+
if lite_mode
|
21
|
+
RedCloth.new(self, [:lite_mode]).to_html
|
22
|
+
else
|
23
|
+
if self =~ /<pre>/
|
24
|
+
RedCloth.new(self).to_html.tr("\t", "")
|
25
|
+
else
|
26
|
+
RedCloth.new(self).to_html.tr("\t", "").gsub(/\n\n/, "")
|
27
|
+
end
|
28
|
+
end
|
29
|
+
else
|
30
|
+
warn "String#to_html was called without RedCloth being successfully required"
|
31
|
+
self
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Create a URI-friendly representation of the string. This is used internally by
|
36
|
+
# acts_as_url[link:classes/Stringex/ActsAsUrl/ClassMethods.html#M000012]
|
37
|
+
# but can be called manually in order to generate an URI-friendly version of any string.
|
38
|
+
def to_url(options = {})
|
39
|
+
remove_formatting(options).downcase.replace_whitespace("-").collapse("-")
|
40
|
+
end
|
41
|
+
|
42
|
+
# Performs multiple text manipulations. Essentially a shortcut for typing them all. View source
|
43
|
+
# below to see which methods are run.
|
44
|
+
def remove_formatting(options = {})
|
45
|
+
strip_html_tags.convert_smart_punctuation.convert_accented_entities.convert_misc_entities.convert_misc_characters(options).to_ascii.collapse
|
46
|
+
end
|
47
|
+
|
48
|
+
# Removes HTML tags from text. This code is simplified from Tobias Luettke's regular expression
|
49
|
+
# in Typo[http://typosphere.org].
|
50
|
+
def strip_html_tags(leave_whitespace = false)
|
51
|
+
name = /[\w:_-]+/
|
52
|
+
value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
|
53
|
+
attr = /(#{name}(\s*=\s*#{value})?)/
|
54
|
+
rx = /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/
|
55
|
+
(leave_whitespace) ? gsub(rx, "").strip : gsub(rx, "").gsub(/\s+/, " ").strip
|
56
|
+
end
|
57
|
+
|
58
|
+
# Converts HTML entities into the respective non-accented letters. Examples:
|
59
|
+
#
|
60
|
+
# "&aacute;".convert_accented_entities # => "a"
|
61
|
+
# "&ccedil;".convert_accented_entities # => "c"
|
62
|
+
# "&egrave;".convert_accented_entities # => "e"
|
63
|
+
# "&icirc;".convert_accented_entities # => "i"
|
64
|
+
# "&oslash;".convert_accented_entities # => "o"
|
65
|
+
# "&uuml;".convert_accented_entities # => "u"
|
66
|
+
#
|
67
|
+
# Note: This does not do any conversion of Unicode/ASCII accented-characters. For that
|
68
|
+
# functionality please use <tt>to_ascii</tt>.
|
69
|
+
def convert_accented_entities
|
70
|
+
gsub(/&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/, '\1')
|
71
|
+
end
|
72
|
+
|
73
|
+
# Converts HTML entities (taken from common Textile/RedCloth formattings) into plain text formats.
|
74
|
+
#
|
75
|
+
# Note: This isn't an attempt at complete conversion of HTML entities, just those most likely
|
76
|
+
# to be generated by Textile.
|
77
|
+
def convert_misc_entities
|
78
|
+
dummy = dup
|
79
|
+
{
|
80
|
+
"#822[01]" => "\"",
|
81
|
+
"#821[67]" => "'",
|
82
|
+
"#8230" => "...",
|
83
|
+
"#8211" => "-",
|
84
|
+
"#8212" => "--",
|
85
|
+
"#215" => "x",
|
86
|
+
"gt" => ">",
|
87
|
+
"lt" => "<",
|
88
|
+
"(#8482|trade)" => "(tm)",
|
89
|
+
"(#174|reg)" => "(r)",
|
90
|
+
"(#169|copy)" => "(c)",
|
91
|
+
"(#38|amp)" => "and",
|
92
|
+
"nbsp" => " ",
|
93
|
+
"(#162|cent)" => " cent",
|
94
|
+
"(#163|pound)" => " pound",
|
95
|
+
"(#188|frac14)" => "one fourth",
|
96
|
+
"(#189|frac12)" => "half",
|
97
|
+
"(#190|frac34)" => "three fourths",
|
98
|
+
"(#176|deg)" => " degrees"
|
99
|
+
}.each do |textiled, normal|
|
100
|
+
dummy.gsub!(/&#{textiled};/, normal)
|
101
|
+
end
|
102
|
+
dummy.gsub(/&[^;]+;/, "")
|
103
|
+
end
|
104
|
+
|
105
|
+
# Converts MS Word 'smart punctuation' to ASCII
|
106
|
+
#
|
107
|
+
def convert_smart_punctuation
|
108
|
+
dummy = dup
|
109
|
+
{
|
110
|
+
|
111
|
+
"(“|”|\302\223|\302\224|\303\222|\303\223)" => '"',
|
112
|
+
"(‘|’|\302\221|\302\222|\303\225)" => "'",
|
113
|
+
"…" => "...",
|
114
|
+
}.each do |smart, normal|
|
115
|
+
dummy.gsub!(/#{smart}/, normal)
|
116
|
+
end
|
117
|
+
dummy
|
118
|
+
end
|
119
|
+
|
120
|
+
# Converts various common plaintext characters to a more URI-friendly representation.
|
121
|
+
# Examples:
|
122
|
+
#
|
123
|
+
# "foo & bar".convert_misc_characters # => "foo and bar"
|
124
|
+
# "Chanel #9".convert_misc_characters # => "Chanel number nine"
|
125
|
+
# "user@host".convert_misc_characters # => "user at host"
|
126
|
+
# "google.com".convert_misc_characters # => "google dot com"
|
127
|
+
# "$10".convert_misc_characters # => "10 dollars"
|
128
|
+
# "*69".convert_misc_characters # => "star 69"
|
129
|
+
# "100%".convert_misc_characters # => "100 percent"
|
130
|
+
# "windows/mac/linux".convert_misc_characters # => "windows slash mac slash linux"
|
131
|
+
#
|
132
|
+
# Note: Because this method will convert any & symbols to the string "and",
|
133
|
+
# you should run any methods which convert HTML entities (convert_accented_entities and convert_misc_entities)
|
134
|
+
# before running this method.
|
135
|
+
def convert_misc_characters(options = {})
|
136
|
+
dummy = dup.gsub(/\.{3,}/, " dot dot dot ") # Catch ellipses before single dot rule!
|
137
|
+
# Special rules for money
|
138
|
+
{
|
139
|
+
/(\s|^)\$(\d+)\.(\d+)(\s|$)/ => '\2 dollars \3 cents',
|
140
|
+
/(\s|^)£(\d+)\.(\d+)(\s|$)/u => '\2 pounds \3 pence',
|
141
|
+
}.each do |found, replaced|
|
142
|
+
replaced = " #{replaced} " unless replaced =~ /\\1/
|
143
|
+
dummy.gsub!(found, replaced)
|
144
|
+
end
|
145
|
+
# Back to normal rules
|
146
|
+
misc_characters =
|
147
|
+
{
|
148
|
+
/\s*&\s*/ => "and",
|
149
|
+
/\s*#/ => "number",
|
150
|
+
/\s*@\s*/ => "at",
|
151
|
+
/(\S|^)\.(\S)/ => '\1 dot \2',
|
152
|
+
/(\s|^)\$(\d*)(\s|$)/ => '\2 dollars',
|
153
|
+
/(\s|^)£(\d*)(\s|$)/u => '\2 pounds',
|
154
|
+
/(\s|^)¥(\d*)(\s|$)/u => '\2 yen',
|
155
|
+
/\s*\*\s*/ => "star",
|
156
|
+
/\s*%\s*/ => "percent",
|
157
|
+
/(\s*=\s*)/ => " equals ",
|
158
|
+
/\s*\+\s*/ => "plus"
|
159
|
+
}
|
160
|
+
misc_characters[/\s*(\\|\/)\s*/] = 'slash' unless options[:allow_slash]
|
161
|
+
misc_characters.each do |found, replaced|
|
162
|
+
replaced = " #{replaced} " unless replaced =~ /\\1/
|
163
|
+
dummy.gsub!(found, replaced)
|
164
|
+
end
|
165
|
+
dummy = dummy.gsub(/(^|\w)'(\w|$)/, '\1\2').gsub(/[\.,:;()\[\]\?!\^'"_]/, " ")
|
166
|
+
end
|
167
|
+
|
168
|
+
# Replace runs of whitespace in string. Defaults to a single space but any replacement
|
169
|
+
# string may be specified as an argument. Examples:
|
170
|
+
#
|
171
|
+
# "Foo       bar".replace_whitespace # => "Foo bar"
|
172
|
+
# "Foo       bar".replace_whitespace("-") # => "Foo-bar"
|
173
|
+
def replace_whitespace(replace = " ")
|
174
|
+
gsub(/\s+/, replace)
|
175
|
+
end
|
176
|
+
|
177
|
+
# Removes specified character from the beginning and/or end of the string and then performs
|
178
|
+
# <tt>String#squeeze(character)</tt>, condensing runs of the character within the string.
|
179
|
+
#
|
180
|
+
# Note: This method has been superseded by ActiveSupport's squish method.
|
181
|
+
def collapse(character = " ")
  # Strips leading and trailing runs of +character+ and then squeezes the
  # remaining runs of it down to a single occurrence.
  #
  # character:: the string to trim and condense (defaults to a single space)
  #
  # Returns a new String; the receiver is not modified.
  #
  # The argument is escaped before being interpolated so that regex
  # metacharacters are matched literally: unescaped, collapse(".") would
  # delete the whole first line and collapse("*") would build an invalid
  # pattern. The non-capturing group makes the repeat apply to the whole
  # argument rather than only its last character.
  run = "(?:#{Regexp.escape(character)})*"
  sub(/^#{run}/, "").sub(/#{run}$/, "").squeeze(character)
end
|
184
|
+
|
185
|
+
module ClassMethods
|
186
|
+
# Returns string of random characters with a length matching the specified limit. Excludes 0
|
187
|
+
# to avoid confusion between 0 and O.
|
188
|
+
def random(limit)
|
189
|
+
strong_alphanumerics = %w{
|
190
|
+
a b c d e f g h i j k l m n o p q r s t u v w x y z
|
191
|
+
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
|
192
|
+
1 2 3 4 5 6 7 8 9
|
193
|
+
}
|
194
|
+
Array.new(limit, "").collect{strong_alphanumerics[rand(61)]}.join
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "yaml"
|
3
|
+
|
4
|
+
module Stringex
|
5
|
+
module Unidecoder
|
6
|
+
# Contains Unicode codepoints, loading as needed from YAML files
|
7
|
+
CODEPOINTS = Hash.new{|h, k|
|
8
|
+
h[k] = YAML.load_file(File.join(File.expand_path(File.dirname(__FILE__)), "unidecoder_data", "#{k}.yml"))
|
9
|
+
} unless defined?(CODEPOINTS)
|
10
|
+
LOCAL_CODEPOINTS = Hash.new unless defined?(LOCAL_CODEPOINTS)
|
11
|
+
|
12
|
+
class << self
|
13
|
+
# Returns string with its UTF-8 characters transliterated to ASCII ones
|
14
|
+
#
|
15
|
+
# You're probably better off just using the added String#to_ascii
|
16
|
+
def decode(string)
|
17
|
+
string.gsub(/[^\x00-\x7f]/u) do |codepoint|
|
18
|
+
if localized = local_codepoint(codepoint)
|
19
|
+
localized
|
20
|
+
else
|
21
|
+
begin
|
22
|
+
unpacked = codepoint.unpack("U")[0]
|
23
|
+
CODEPOINTS[code_group(unpacked)][grouped_point(unpacked)]
|
24
|
+
rescue
|
25
|
+
# Hopefully this won't come up much
|
26
|
+
# TODO: Make this note something to the user that is reportable to me perhaps
|
27
|
+
"?"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns character for the given Unicode codepoint
|
34
|
+
def encode(codepoint)
|
35
|
+
["0x#{codepoint}".to_i(16)].pack("U")
|
36
|
+
end
|
37
|
+
|
38
|
+
# Returns string indicating which file (and line) contains the
|
39
|
+
# transliteration value for the character
|
40
|
+
def in_yaml_file(character)
|
41
|
+
unpacked = character.unpack("U")[0]
|
42
|
+
"#{code_group(unpacked)}.yml (line #{grouped_point(unpacked) + 2})"
|
43
|
+
end
|
44
|
+
|
45
|
+
# Adds localized transliterations to Unidecoder
|
46
|
+
def localize_from(hash_or_path_to_file)
|
47
|
+
hash = if hash_or_path_to_file.is_a?(Hash)
|
48
|
+
hash_or_path_to_file
|
49
|
+
else
|
50
|
+
YAML.load_file(hash_or_path_to_file)
|
51
|
+
end
|
52
|
+
verify_local_codepoints hash
|
53
|
+
end
|
54
|
+
|
55
|
+
# Returns locale for localized transliterations
|
56
|
+
def locale
|
57
|
+
if @locale
|
58
|
+
@locale
|
59
|
+
elsif defined?(I18n)
|
60
|
+
I18n.locale
|
61
|
+
else
|
62
|
+
default_locale
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# Sets locale for localized transliterations
|
67
|
+
def locale=(new_locale)
|
68
|
+
@locale = new_locale
|
69
|
+
end
|
70
|
+
|
71
|
+
# Returns default locale for localized transliterations. NOTE: Will set @locale as well.
|
72
|
+
def default_locale
|
73
|
+
@default_locale ||= "en"
|
74
|
+
@locale = @default_locale
|
75
|
+
end
|
76
|
+
|
77
|
+
# Sets the default locale for localized transliterations. NOTE: Will set @locale as well.
|
78
|
+
def default_locale=(new_locale)
|
79
|
+
@default_locale = new_locale
|
80
|
+
# Seems logical that @locale should be the new default
|
81
|
+
@locale = new_locale
|
82
|
+
end
|
83
|
+
|
84
|
+
# Returns the localized transliteration for a codepoint
|
85
|
+
def local_codepoint(codepoint)
|
86
|
+
locale_hash = LOCAL_CODEPOINTS[locale] || LOCAL_CODEPOINTS[locale.is_a?(Symbol) ? locale.to_s : locale.to_sym]
|
87
|
+
locale_hash && locale_hash[codepoint]
|
88
|
+
end
|
89
|
+
|
90
|
+
# Runs a block with a temporary locale setting, returning the locale to the original state when complete
|
91
|
+
def with_locale(new_locale, &block)
  # Runs +block+ with +new_locale+ in effect, then puts back the locale that
  # was active when the method was called. Passing :default selects
  # default_locale. Returns the value of the block.
  #
  # The current locale is captured BEFORE :default is resolved because
  # default_locale assigns @locale as a side effect; resolving it first
  # would overwrite the very value that has to be restored.
  original_locale = locale
  new_locale = default_locale if new_locale == :default
  self.locale = new_locale
  block.call
ensure
  # Restore even when the block raises, so a failure cannot leave the
  # temporary locale in place for later callers.
  self.locale = original_locale
end
|
98
|
+
|
99
|
+
# Runs a block with default locale
|
100
|
+
def with_default_locale(&block)
  # Runs +block+ with the default locale in effect and restores the
  # previously active locale afterwards.
  #
  # default_locale assigns @locale as a side effect, so the active locale
  # has to be remembered before it is evaluated as the argument below;
  # otherwise with_locale would capture (and "restore") the default itself.
  original_locale = locale
  with_locale default_locale, &block
ensure
  # Put the caller's locale back regardless of how the block exits.
  self.locale = original_locale
end
|
103
|
+
|
104
|
+
private
|
105
|
+
# Returns the Unicode codepoint grouping for the given character
|
106
|
+
def code_group(unpacked_character)
|
107
|
+
"x%02x" % (unpacked_character >> 8)
|
108
|
+
end
|
109
|
+
|
110
|
+
# Returns the index of the given character in the YAML file for its codepoint group
|
111
|
+
def grouped_point(unpacked_character)
|
112
|
+
unpacked_character & 255
|
113
|
+
end
|
114
|
+
|
115
|
+
# Checks LOCAL_CODEPOINTS's Hash is in the format we expect before assigning it and raises
|
116
|
+
# instructive exception if not
|
117
|
+
def verify_local_codepoints(hash)
|
118
|
+
pass_check = hash.is_a?(Hash) && hash.all?{|key, value|
|
119
|
+
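# Explicit class checks rather than duck typing: keys must be Symbol or String, values must be String => String hashes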
|
120
|
+
[Symbol, String].include?(key.class) && value.is_a?(Hash) &&
|
121
|
+
value.keys.all?{|k| k.is_a?(String)} && value.values.all?{|v| v.is_a?(String)}
|
122
|
+
}
|
123
|
+
if pass_check
|
124
|
+
hash.each do |k, v|
|
125
|
+
LOCAL_CODEPOINTS[k] = v
|
126
|
+
end
|
127
|
+
else
|
128
|
+
raise ArgumentError, "LOCAL_CODEPOINTS is not correctly defined. Please see the README for more information on how to correctly format this data."
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# Provide a simpler interface for localization implementations
|
135
|
+
class << self
|
136
|
+
%w{
|
137
|
+
localize_from
|
138
|
+
locale locale=
|
139
|
+
default_locale default_locale=
|
140
|
+
local_codepoint
|
141
|
+
with_locale with_default_locale
|
142
|
+
}.each do |name|
|
143
|
+
define_method name do |*args, &block|
|
144
|
+
Unidecoder.send name, *args, &block
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
module Stringex
|
151
|
+
module StringExtensions
|
152
|
+
# Returns string with its UTF-8 characters transliterated to ASCII ones. Example:
|
153
|
+
#
|
154
|
+
# "⠋⠗⠁⠝⠉⠑".to_ascii #=> "braille"
|
155
|
+
def to_ascii
|
156
|
+
Stringex::Unidecoder.decode(self)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
@@ -0,0 +1,257 @@
|
|
1
|
+
---
|
2
|
+
- "\x00"
|
3
|
+
- "\x01"
|
4
|
+
- "\x02"
|
5
|
+
- "\x03"
|
6
|
+
- "\x04"
|
7
|
+
- "\x05"
|
8
|
+
- "\x06"
|
9
|
+
- "\a"
|
10
|
+
- "\x08"
|
11
|
+
- ' '
|
12
|
+
- "\n"
|
13
|
+
- "\v"
|
14
|
+
- "\f"
|
15
|
+
- "\r"
|
16
|
+
- "\x0e"
|
17
|
+
- "\x0f"
|
18
|
+
- "\x10"
|
19
|
+
- "\x11"
|
20
|
+
- "\x12"
|
21
|
+
- "\x13"
|
22
|
+
- "\x14"
|
23
|
+
- "\x15"
|
24
|
+
- "\x16"
|
25
|
+
- "\x17"
|
26
|
+
- "\x18"
|
27
|
+
- "\x19"
|
28
|
+
- "\x1a"
|
29
|
+
- "\e"
|
30
|
+
- "\x1c"
|
31
|
+
- "\x1d"
|
32
|
+
- "\x1e"
|
33
|
+
- "\x1f"
|
34
|
+
- ' '
|
35
|
+
- '!'
|
36
|
+
- '"'
|
37
|
+
- '#'
|
38
|
+
- $
|
39
|
+
- '%'
|
40
|
+
- '&'
|
41
|
+
- "'"
|
42
|
+
- (
|
43
|
+
- )
|
44
|
+
- '*'
|
45
|
+
- +
|
46
|
+
- ','
|
47
|
+
- '-'
|
48
|
+
- .
|
49
|
+
- /
|
50
|
+
- 0
|
51
|
+
- 1
|
52
|
+
- 2
|
53
|
+
- 3
|
54
|
+
- 4
|
55
|
+
- 5
|
56
|
+
- 6
|
57
|
+
- 7
|
58
|
+
- 8
|
59
|
+
- 9
|
60
|
+
- ':'
|
61
|
+
- ;
|
62
|
+
- <
|
63
|
+
- '='
|
64
|
+
- '>'
|
65
|
+
- '?'
|
66
|
+
- '@'
|
67
|
+
- A
|
68
|
+
- B
|
69
|
+
- C
|
70
|
+
- D
|
71
|
+
- E
|
72
|
+
- F
|
73
|
+
- G
|
74
|
+
- H
|
75
|
+
- I
|
76
|
+
- J
|
77
|
+
- K
|
78
|
+
- L
|
79
|
+
- M
|
80
|
+
- N
|
81
|
+
- O
|
82
|
+
- P
|
83
|
+
- Q
|
84
|
+
- R
|
85
|
+
- S
|
86
|
+
- T
|
87
|
+
- U
|
88
|
+
- V
|
89
|
+
- W
|
90
|
+
- X
|
91
|
+
- Y
|
92
|
+
- Z
|
93
|
+
- '['
|
94
|
+
- \
|
95
|
+
- ']'
|
96
|
+
- '^'
|
97
|
+
- _
|
98
|
+
- '`'
|
99
|
+
- a
|
100
|
+
- b
|
101
|
+
- c
|
102
|
+
- d
|
103
|
+
- e
|
104
|
+
- f
|
105
|
+
- g
|
106
|
+
- h
|
107
|
+
- i
|
108
|
+
- j
|
109
|
+
- k
|
110
|
+
- l
|
111
|
+
- m
|
112
|
+
- n
|
113
|
+
- o
|
114
|
+
- p
|
115
|
+
- q
|
116
|
+
- r
|
117
|
+
- s
|
118
|
+
- t
|
119
|
+
- u
|
120
|
+
- v
|
121
|
+
- w
|
122
|
+
- x
|
123
|
+
- y
|
124
|
+
- z
|
125
|
+
- '{'
|
126
|
+
- '|'
|
127
|
+
- '}'
|
128
|
+
- '~'
|
129
|
+
- ''
|
130
|
+
- ''
|
131
|
+
- ''
|
132
|
+
- ''
|
133
|
+
- ''
|
134
|
+
- ''
|
135
|
+
- ''
|
136
|
+
- ''
|
137
|
+
- ''
|
138
|
+
- ''
|
139
|
+
- ''
|
140
|
+
- ''
|
141
|
+
- ''
|
142
|
+
- ''
|
143
|
+
- ''
|
144
|
+
- ''
|
145
|
+
- ''
|
146
|
+
- ''
|
147
|
+
- "'"
|
148
|
+
- "'"
|
149
|
+
- '"'
|
150
|
+
- '"'
|
151
|
+
- ''
|
152
|
+
- ''
|
153
|
+
- ''
|
154
|
+
- ''
|
155
|
+
- ''
|
156
|
+
- ''
|
157
|
+
- ''
|
158
|
+
- ''
|
159
|
+
- ''
|
160
|
+
- ''
|
161
|
+
- ''
|
162
|
+
- ' '
|
163
|
+
- ''
|
164
|
+
- C/
|
165
|
+
- PS
|
166
|
+
- $?
|
167
|
+
- Y=
|
168
|
+
- '|'
|
169
|
+
- SS
|
170
|
+
- '"'
|
171
|
+
- (c)
|
172
|
+
- a
|
173
|
+
- '<<'
|
174
|
+
- '!'
|
175
|
+
- ''
|
176
|
+
- (r)
|
177
|
+
- '-'
|
178
|
+
- deg
|
179
|
+
- +-
|
180
|
+
- 2
|
181
|
+
- 3
|
182
|
+
- "'"
|
183
|
+
- u
|
184
|
+
- P
|
185
|
+
- '*'
|
186
|
+
- ','
|
187
|
+
- 1
|
188
|
+
- o
|
189
|
+
- '>>'
|
190
|
+
- 1/4
|
191
|
+
- 1/2
|
192
|
+
- 3/4
|
193
|
+
- ''
|
194
|
+
- A
|
195
|
+
- A
|
196
|
+
- A
|
197
|
+
- A
|
198
|
+
- A
|
199
|
+
- A
|
200
|
+
- AE
|
201
|
+
- C
|
202
|
+
- E
|
203
|
+
- E
|
204
|
+
- E
|
205
|
+
- E
|
206
|
+
- I
|
207
|
+
- I
|
208
|
+
- I
|
209
|
+
- I
|
210
|
+
- D
|
211
|
+
- N
|
212
|
+
- O
|
213
|
+
- O
|
214
|
+
- O
|
215
|
+
- O
|
216
|
+
- O
|
217
|
+
- x
|
218
|
+
- O
|
219
|
+
- U
|
220
|
+
- U
|
221
|
+
- U
|
222
|
+
- U
|
223
|
+
- Y
|
224
|
+
- Th
|
225
|
+
- ss
|
226
|
+
- a
|
227
|
+
- a
|
228
|
+
- a
|
229
|
+
- a
|
230
|
+
- a
|
231
|
+
- a
|
232
|
+
- ae
|
233
|
+
- c
|
234
|
+
- e
|
235
|
+
- e
|
236
|
+
- e
|
237
|
+
- e
|
238
|
+
- i
|
239
|
+
- i
|
240
|
+
- i
|
241
|
+
- i
|
242
|
+
- d
|
243
|
+
- n
|
244
|
+
- o
|
245
|
+
- o
|
246
|
+
- o
|
247
|
+
- o
|
248
|
+
- o
|
249
|
+
- /
|
250
|
+
- o
|
251
|
+
- u
|
252
|
+
- u
|
253
|
+
- u
|
254
|
+
- u
|
255
|
+
- y
|
256
|
+
- th
|
257
|
+
- y
|