stringex 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. data/Rakefile +184 -184
  2. data/lib/stringex/acts_as_url.rb +107 -0
  3. data/lib/stringex/string_extensions.rb +198 -0
  4. data/lib/stringex/unidecoder.rb +159 -0
  5. data/lib/stringex/unidecoder_data/x00.yml +257 -0
  6. data/lib/stringex/unidecoder_data/x01.yml +257 -0
  7. data/lib/stringex/unidecoder_data/x02.yml +256 -0
  8. data/lib/stringex/unidecoder_data/x03.yml +256 -0
  9. data/lib/stringex/unidecoder_data/x04.yml +256 -0
  10. data/lib/stringex/unidecoder_data/x05.yml +256 -0
  11. data/lib/stringex/unidecoder_data/x06.yml +256 -0
  12. data/lib/stringex/unidecoder_data/x07.yml +256 -0
  13. data/lib/stringex/unidecoder_data/x09.yml +256 -0
  14. data/lib/stringex/unidecoder_data/x0a.yml +256 -0
  15. data/lib/stringex/unidecoder_data/x0b.yml +256 -0
  16. data/lib/stringex/unidecoder_data/x0c.yml +256 -0
  17. data/lib/stringex/unidecoder_data/x0d.yml +256 -0
  18. data/lib/stringex/unidecoder_data/x0e.yml +256 -0
  19. data/lib/stringex/unidecoder_data/x0f.yml +256 -0
  20. data/lib/stringex/unidecoder_data/x10.yml +256 -0
  21. data/lib/stringex/unidecoder_data/x11.yml +256 -0
  22. data/lib/stringex/unidecoder_data/x12.yml +257 -0
  23. data/lib/stringex/unidecoder_data/x13.yml +256 -0
  24. data/lib/stringex/unidecoder_data/x14.yml +257 -0
  25. data/lib/stringex/unidecoder_data/x15.yml +257 -0
  26. data/lib/stringex/unidecoder_data/x16.yml +256 -0
  27. data/lib/stringex/unidecoder_data/x17.yml +256 -0
  28. data/lib/stringex/unidecoder_data/x18.yml +256 -0
  29. data/lib/stringex/unidecoder_data/x1e.yml +256 -0
  30. data/lib/stringex/unidecoder_data/x1f.yml +256 -0
  31. data/lib/stringex/unidecoder_data/x20.yml +256 -0
  32. data/lib/stringex/unidecoder_data/x21.yml +256 -0
  33. data/lib/stringex/unidecoder_data/x22.yml +256 -0
  34. data/lib/stringex/unidecoder_data/x23.yml +256 -0
  35. data/lib/stringex/unidecoder_data/x24.yml +256 -0
  36. data/lib/stringex/unidecoder_data/x25.yml +256 -0
  37. data/lib/stringex/unidecoder_data/x26.yml +256 -0
  38. data/lib/stringex/unidecoder_data/x27.yml +256 -0
  39. data/lib/stringex/unidecoder_data/x28.yml +257 -0
  40. data/lib/stringex/unidecoder_data/x2e.yml +256 -0
  41. data/lib/stringex/unidecoder_data/x2f.yml +256 -0
  42. data/lib/stringex/unidecoder_data/x30.yml +256 -0
  43. data/lib/stringex/unidecoder_data/x31.yml +256 -0
  44. data/lib/stringex/unidecoder_data/x32.yml +256 -0
  45. data/lib/stringex/unidecoder_data/x33.yml +256 -0
  46. data/lib/stringex/unidecoder_data/x4d.yml +256 -0
  47. data/lib/stringex/unidecoder_data/x4e.yml +257 -0
  48. data/lib/stringex/unidecoder_data/x4f.yml +257 -0
  49. data/lib/stringex/unidecoder_data/x50.yml +257 -0
  50. data/lib/stringex/unidecoder_data/x51.yml +257 -0
  51. data/lib/stringex/unidecoder_data/x52.yml +257 -0
  52. data/lib/stringex/unidecoder_data/x53.yml +257 -0
  53. data/lib/stringex/unidecoder_data/x54.yml +257 -0
  54. data/lib/stringex/unidecoder_data/x55.yml +257 -0
  55. data/lib/stringex/unidecoder_data/x56.yml +257 -0
  56. data/lib/stringex/unidecoder_data/x57.yml +257 -0
  57. data/lib/stringex/unidecoder_data/x58.yml +257 -0
  58. data/lib/stringex/unidecoder_data/x59.yml +257 -0
  59. data/lib/stringex/unidecoder_data/x5a.yml +257 -0
  60. data/lib/stringex/unidecoder_data/x5b.yml +257 -0
  61. data/lib/stringex/unidecoder_data/x5c.yml +257 -0
  62. data/lib/stringex/unidecoder_data/x5d.yml +257 -0
  63. data/lib/stringex/unidecoder_data/x5e.yml +257 -0
  64. data/lib/stringex/unidecoder_data/x5f.yml +257 -0
  65. data/lib/stringex/unidecoder_data/x60.yml +257 -0
  66. data/lib/stringex/unidecoder_data/x61.yml +257 -0
  67. data/lib/stringex/unidecoder_data/x62.yml +257 -0
  68. data/lib/stringex/unidecoder_data/x63.yml +257 -0
  69. data/lib/stringex/unidecoder_data/x64.yml +257 -0
  70. data/lib/stringex/unidecoder_data/x65.yml +257 -0
  71. data/lib/stringex/unidecoder_data/x66.yml +257 -0
  72. data/lib/stringex/unidecoder_data/x67.yml +257 -0
  73. data/lib/stringex/unidecoder_data/x68.yml +257 -0
  74. data/lib/stringex/unidecoder_data/x69.yml +257 -0
  75. data/lib/stringex/unidecoder_data/x6a.yml +257 -0
  76. data/lib/stringex/unidecoder_data/x6b.yml +257 -0
  77. data/lib/stringex/unidecoder_data/x6c.yml +257 -0
  78. data/lib/stringex/unidecoder_data/x6d.yml +257 -0
  79. data/lib/stringex/unidecoder_data/x6e.yml +257 -0
  80. data/lib/stringex/unidecoder_data/x6f.yml +257 -0
  81. data/lib/stringex/unidecoder_data/x70.yml +257 -0
  82. data/lib/stringex/unidecoder_data/x71.yml +257 -0
  83. data/lib/stringex/unidecoder_data/x72.yml +257 -0
  84. data/lib/stringex/unidecoder_data/x73.yml +257 -0
  85. data/lib/stringex/unidecoder_data/x74.yml +257 -0
  86. data/lib/stringex/unidecoder_data/x75.yml +257 -0
  87. data/lib/stringex/unidecoder_data/x76.yml +257 -0
  88. data/lib/stringex/unidecoder_data/x77.yml +257 -0
  89. data/lib/stringex/unidecoder_data/x78.yml +257 -0
  90. data/lib/stringex/unidecoder_data/x79.yml +257 -0
  91. data/lib/stringex/unidecoder_data/x7a.yml +257 -0
  92. data/lib/stringex/unidecoder_data/x7b.yml +257 -0
  93. data/lib/stringex/unidecoder_data/x7c.yml +257 -0
  94. data/lib/stringex/unidecoder_data/x7d.yml +257 -0
  95. data/lib/stringex/unidecoder_data/x7e.yml +257 -0
  96. data/lib/stringex/unidecoder_data/x7f.yml +257 -0
  97. data/lib/stringex/unidecoder_data/x80.yml +257 -0
  98. data/lib/stringex/unidecoder_data/x81.yml +257 -0
  99. data/lib/stringex/unidecoder_data/x82.yml +257 -0
  100. data/lib/stringex/unidecoder_data/x83.yml +257 -0
  101. data/lib/stringex/unidecoder_data/x84.yml +257 -0
  102. data/lib/stringex/unidecoder_data/x85.yml +257 -0
  103. data/lib/stringex/unidecoder_data/x86.yml +257 -0
  104. data/lib/stringex/unidecoder_data/x87.yml +257 -0
  105. data/lib/stringex/unidecoder_data/x88.yml +257 -0
  106. data/lib/stringex/unidecoder_data/x89.yml +257 -0
  107. data/lib/stringex/unidecoder_data/x8a.yml +257 -0
  108. data/lib/stringex/unidecoder_data/x8b.yml +257 -0
  109. data/lib/stringex/unidecoder_data/x8c.yml +257 -0
  110. data/lib/stringex/unidecoder_data/x8d.yml +257 -0
  111. data/lib/stringex/unidecoder_data/x8e.yml +257 -0
  112. data/lib/stringex/unidecoder_data/x8f.yml +257 -0
  113. data/lib/stringex/unidecoder_data/x90.yml +257 -0
  114. data/lib/stringex/unidecoder_data/x91.yml +257 -0
  115. data/lib/stringex/unidecoder_data/x92.yml +257 -0
  116. data/lib/stringex/unidecoder_data/x93.yml +257 -0
  117. data/lib/stringex/unidecoder_data/x94.yml +257 -0
  118. data/lib/stringex/unidecoder_data/x95.yml +257 -0
  119. data/lib/stringex/unidecoder_data/x96.yml +257 -0
  120. data/lib/stringex/unidecoder_data/x97.yml +257 -0
  121. data/lib/stringex/unidecoder_data/x98.yml +257 -0
  122. data/lib/stringex/unidecoder_data/x99.yml +257 -0
  123. data/lib/stringex/unidecoder_data/x9a.yml +257 -0
  124. data/lib/stringex/unidecoder_data/x9b.yml +257 -0
  125. data/lib/stringex/unidecoder_data/x9c.yml +257 -0
  126. data/lib/stringex/unidecoder_data/x9d.yml +257 -0
  127. data/lib/stringex/unidecoder_data/x9e.yml +257 -0
  128. data/lib/stringex/unidecoder_data/x9f.yml +256 -0
  129. data/lib/stringex/unidecoder_data/xa0.yml +257 -0
  130. data/lib/stringex/unidecoder_data/xa1.yml +257 -0
  131. data/lib/stringex/unidecoder_data/xa2.yml +257 -0
  132. data/lib/stringex/unidecoder_data/xa3.yml +257 -0
  133. data/lib/stringex/unidecoder_data/xa4.yml +256 -0
  134. data/lib/stringex/unidecoder_data/xac.yml +257 -0
  135. data/lib/stringex/unidecoder_data/xad.yml +257 -0
  136. data/lib/stringex/unidecoder_data/xae.yml +257 -0
  137. data/lib/stringex/unidecoder_data/xaf.yml +257 -0
  138. data/lib/stringex/unidecoder_data/xb0.yml +257 -0
  139. data/lib/stringex/unidecoder_data/xb1.yml +257 -0
  140. data/lib/stringex/unidecoder_data/xb2.yml +257 -0
  141. data/lib/stringex/unidecoder_data/xb3.yml +257 -0
  142. data/lib/stringex/unidecoder_data/xb4.yml +257 -0
  143. data/lib/stringex/unidecoder_data/xb5.yml +257 -0
  144. data/lib/stringex/unidecoder_data/xb6.yml +257 -0
  145. data/lib/stringex/unidecoder_data/xb7.yml +257 -0
  146. data/lib/stringex/unidecoder_data/xb8.yml +257 -0
  147. data/lib/stringex/unidecoder_data/xb9.yml +257 -0
  148. data/lib/stringex/unidecoder_data/xba.yml +257 -0
  149. data/lib/stringex/unidecoder_data/xbb.yml +257 -0
  150. data/lib/stringex/unidecoder_data/xbc.yml +257 -0
  151. data/lib/stringex/unidecoder_data/xbd.yml +257 -0
  152. data/lib/stringex/unidecoder_data/xbe.yml +257 -0
  153. data/lib/stringex/unidecoder_data/xbf.yml +257 -0
  154. data/lib/stringex/unidecoder_data/xc0.yml +257 -0
  155. data/lib/stringex/unidecoder_data/xc1.yml +257 -0
  156. data/lib/stringex/unidecoder_data/xc2.yml +257 -0
  157. data/lib/stringex/unidecoder_data/xc3.yml +257 -0
  158. data/lib/stringex/unidecoder_data/xc4.yml +257 -0
  159. data/lib/stringex/unidecoder_data/xc5.yml +257 -0
  160. data/lib/stringex/unidecoder_data/xc6.yml +257 -0
  161. data/lib/stringex/unidecoder_data/xc7.yml +257 -0
  162. data/lib/stringex/unidecoder_data/xc8.yml +257 -0
  163. data/lib/stringex/unidecoder_data/xc9.yml +257 -0
  164. data/lib/stringex/unidecoder_data/xca.yml +257 -0
  165. data/lib/stringex/unidecoder_data/xcb.yml +257 -0
  166. data/lib/stringex/unidecoder_data/xcc.yml +257 -0
  167. data/lib/stringex/unidecoder_data/xcd.yml +257 -0
  168. data/lib/stringex/unidecoder_data/xce.yml +257 -0
  169. data/lib/stringex/unidecoder_data/xcf.yml +257 -0
  170. data/lib/stringex/unidecoder_data/xd0.yml +257 -0
  171. data/lib/stringex/unidecoder_data/xd1.yml +257 -0
  172. data/lib/stringex/unidecoder_data/xd2.yml +257 -0
  173. data/lib/stringex/unidecoder_data/xd3.yml +257 -0
  174. data/lib/stringex/unidecoder_data/xd4.yml +257 -0
  175. data/lib/stringex/unidecoder_data/xd5.yml +257 -0
  176. data/lib/stringex/unidecoder_data/xd6.yml +257 -0
  177. data/lib/stringex/unidecoder_data/xd7.yml +256 -0
  178. data/lib/stringex/unidecoder_data/xf9.yml +257 -0
  179. data/lib/stringex/unidecoder_data/xfa.yml +256 -0
  180. data/lib/stringex/unidecoder_data/xfb.yml +257 -0
  181. data/lib/stringex/unidecoder_data/xfc.yml +257 -0
  182. data/lib/stringex/unidecoder_data/xfd.yml +256 -0
  183. data/lib/stringex/unidecoder_data/xfe.yml +257 -0
  184. data/lib/stringex/unidecoder_data/xff.yml +257 -0
  185. data/stringex.gemspec +184 -1
  186. metadata +186 -3
@@ -0,0 +1,198 @@
1
+ # encoding: UTF-8
2
+
3
+ module Stringex
4
+ # These methods are all added on String class.
5
+ module StringExtensions
6
def self.included(base) # :nodoc:
  # Hook run when this module is included: the including class also gains
  # the class-level helpers defined in ClassMethods.
  base.send(:extend, ClassMethods)
end
9
+
10
+ # Returns the string converted (via Textile/RedCloth) to HTML format
11
+ # or self [with a friendly warning] if RedCloth is not available.
12
+ #
13
+ # Using <tt>:lite</tt> argument will cause RedCloth to not wrap the HTML in a container
14
+ # P element, which is useful behavior for generating header element text, etc.
15
+ # This is roughly equivalent to ActionView's <tt>textilize_without_paragraph</tt>
16
+ # except that it makes RedCloth do all the work instead of just gsubbing the return
17
+ # from RedCloth.
18
def to_html(lite_mode = false)
  # Without RedCloth there is nothing to render with: warn and hand back
  # the receiver untouched.
  unless defined?(RedCloth)
    warn "String#to_html was called without RedCloth being successfully required"
    return self
  end

  # Lite mode asks RedCloth itself to skip the wrapping paragraph element.
  return RedCloth.new(self, [:lite_mode]).to_html if lite_mode

  # Tabs are always removed; blank lines are kept only when the source
  # contains a <pre> tag.
  contains_pre = self =~ /<pre>/
  html = RedCloth.new(self).to_html.tr("\t", "")
  contains_pre ? html : html.gsub(/\n\n/, "")
end
34
+
35
+ # Create a URI-friendly representation of the string. This is used internally by
36
+ # acts_as_url[link:classes/Stringex/ActsAsUrl/ClassMethods.html#M000012]
37
+ # but can be called manually in order to generate an URI-friendly version of any string.
38
def to_url(options = {})
  # Strip formatting first, then lowercase, turn whitespace runs into
  # dashes, and finally trim/squeeze the dashes.
  plain = remove_formatting(options)
  dashed = plain.downcase.replace_whitespace("-")
  dashed.collapse("-")
end
41
+
42
+ # Performs multiple text manipulations. Essentially a shortcut for typing them all. View source
43
+ # below to see which methods are run.
44
def remove_formatting(options = {})
  # Each step feeds the next; the order matters (entities are converted
  # before convert_misc_characters rewrites bare "&" as "and").
  text = strip_html_tags
  text = text.convert_smart_punctuation
  text = text.convert_accented_entities
  text = text.convert_misc_entities
  text = text.convert_misc_characters(options)
  text = text.to_ascii
  text.collapse
end
47
+
48
+ # Removes HTML tags from text. This code is simplified from Tobias Luettke's regular expression
49
+ # in Typo[http://typosphere.org].
50
def strip_html_tags(leave_whitespace = false)
  # Building blocks of the tag matcher: element/attribute name, attribute
  # value (bare word or quoted string), and a full name[=value] attribute.
  tag_name = /[\w:_-]+/
  attr_value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  attribute = /(#{tag_name}(\s*=\s*#{attr_value})?)/
  tag_pattern = /<[!\/?\[]?(#{tag_name}|--)(\s+(#{attribute}(\s+#{attribute})*))?\s*([!\/?\]]+|--)?>/

  stripped = gsub(tag_pattern, "")
  # Unless the caller wants whitespace left alone, collapse every run of
  # whitespace into a single space.
  stripped = stripped.gsub(/\s+/, " ") unless leave_whitespace
  stripped.strip
end
57
+
58
+ # Converts HTML entities into the respective non-accented letters. Examples:
59
+ #
60
+ # "&aacute;".convert_accented_entities # => "a"
61
+ # "&ccedil;".convert_accented_entities # => "c"
62
+ # "&egrave;".convert_accented_entities # => "e"
63
+ # "&icirc;".convert_accented_entities # => "i"
64
+ # "&oslash;".convert_accented_entities # => "o"
65
+ # "&uuml;".convert_accented_entities # => "u"
66
+ #
67
+ # Note: This does not do any conversion of Unicode/ASCII accented-characters. For that
68
+ # functionality please use <tt>to_ascii</tt>.
69
def convert_accented_entities
  # An accented-letter entity is "&", one base letter, an accent name and
  # ";" -- only the base letter is kept.
  accented_entity = /&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/
  gsub(accented_entity) { $1 }
end
72
+
73
+ # Converts HTML entities (taken from common Textile/RedCloth formattings) into plain text formats.
74
+ #
75
+ # Note: This isn't an attempt at complete conversion of HTML entities, just those most likely
76
+ # to be generated by Textile.
77
def convert_misc_entities
  # Entity body (regex source, without the surrounding "&" and ";") paired
  # with its plain-text replacement. Applied in the order listed.
  entity_table = [
    ["#822[01]", "\""],
    ["#821[67]", "'"],
    ["#8230", "..."],
    ["#8211", "-"],
    ["#8212", "--"],
    ["#215", "x"],
    ["gt", ">"],
    ["lt", "<"],
    ["(#8482|trade)", "(tm)"],
    ["(#174|reg)", "(r)"],
    ["(#169|copy)", "(c)"],
    ["(#38|amp)", "and"],
    ["nbsp", " "],
    ["(#162|cent)", " cent"],
    ["(#163|pound)", " pound"],
    ["(#188|frac14)", "one fourth"],
    ["(#189|frac12)", "half"],
    ["(#190|frac34)", "three fourths"],
    ["(#176|deg)", " degrees"]
  ]
  converted = entity_table.inject(dup) do |text, (textiled, normal)|
    text.gsub(/&#{textiled};/, normal)
  end
  # Whatever still looks like an entity ("&" up to the next ";") is dropped.
  converted.gsub(/&[^;]+;/, "")
end
104
+
105
+ # Converts MS Word 'smart punctuation' to ASCII
106
+ #
107
def convert_smart_punctuation
  # Regex source paired with its ASCII replacement, applied in order.
  # NOTE(review): \303\222, \303\223 and \303\225 are the UTF-8 byte
  # sequences for U+00D2, U+00D3 and U+00D5, so those accented capitals are
  # rewritten as quotes too -- presumably to catch mis-decoded smart
  # quotes; confirm this is intended.
  replacements = [
    ["(“|”|\302\223|\302\224|\303\222|\303\223)", '"'],
    ["(‘|’|\302\221|\302\222|\303\225)", "'"],
    ["…", "..."]
  ]
  replacements.inject(dup) do |text, (smart, normal)|
    text.gsub(/#{smart}/, normal)
  end
end
119
+
120
+ # Converts various common plaintext characters to a more URI-friendly representation.
121
+ # Examples:
122
+ #
123
+ # "foo & bar".convert_misc_characters # => "foo and bar"
124
+ # "Chanel #9".convert_misc_characters # => "Chanel number nine"
125
+ # "user@host".convert_misc_characters # => "user at host"
126
+ # "google.com".convert_misc_characters # => "google dot com"
127
+ # "$10".convert_misc_characters # => "10 dollars"
128
+ # "*69".convert_misc_characters # => "star 69"
129
+ # "100%".convert_misc_characters # => "100 percent"
130
+ # "windows/mac/linux".convert_misc_characters # => "windows slash mac slash linux"
131
+ #
132
+ # Note: Because this method will convert any & symbols to the string "and",
133
+ # you should run any methods which convert HTML entities (convert_accented_entities and convert_misc_entities)
134
+ # before running this method.
135
def convert_misc_characters(options = {})
  # Ellipses must be rewritten before the single-dot rule below sees them.
  text = dup.gsub(/\.{3,}/, " dot dot dot ")

  # Amounts with a fractional part are handled before the plain currency
  # rules so that the "." is consumed here.
  money_rules = {
    /(\s|^)\$(\d+)\.(\d+)(\s|$)/ => '\2 dollars \3 cents',
    /(\s|^)£(\d+)\.(\d+)(\s|$)/u => '\2 pounds \3 pence',
  }

  general_rules = {
    /\s*&\s*/ => "and",
    /\s*#/ => "number",
    /\s*@\s*/ => "at",
    /(\S|^)\.(\S)/ => '\1 dot \2',
    /(\s|^)\$(\d*)(\s|$)/ => '\2 dollars',
    /(\s|^)£(\d*)(\s|$)/u => '\2 pounds',
    /(\s|^)¥(\d*)(\s|$)/u => '\2 yen',
    /\s*\*\s*/ => "star",
    /\s*%\s*/ => "percent",
    /(\s*=\s*)/ => " equals ",
    /\s*\+\s*/ => "plus"
  }
  # Slashes are spelled out unless the caller opts out via :allow_slash.
  general_rules[/\s*(\\|\/)\s*/] = 'slash' unless options[:allow_slash]

  [money_rules, general_rules].each do |rules|
    rules.each do |pattern, replacement|
      # Replacements that do not re-insert group 1 get padded with spaces
      # so the inserted word never fuses with its neighbours.
      replacement = " #{replacement} " unless replacement =~ /\\1/
      text.gsub!(pattern, replacement)
    end
  end

  # Drop apostrophes next to word characters, then blank out the remaining
  # punctuation.
  text.gsub(/(^|\w)'(\w|$)/, '\1\2').gsub(/[\.,:;()\[\]\?!\^'"_]/, " ")
end
167
+
168
+ # Replace runs of whitespace in string. Defaults to a single space but any replacement
169
+ # string may be specified as an argument. Examples:
170
+ #
171
+ # "Foo       bar".replace_whitespace # => "Foo bar"
172
+ # "Foo       bar".replace_whitespace("-") # => "Foo-bar"
173
def replace_whitespace(replace = " ")
  # Every run of one or more whitespace characters becomes one copy of
  # +replace+.
  whitespace_run = /\s+/
  gsub(whitespace_run, replace)
end
176
+
177
+ # Removes specified character from the beginning and/or end of the string and then performs
178
+ # <tt>String#squeeze(character)</tt>, condensing runs of the character within the string.
179
+ #
180
+ # Note: This method has been superseded by ActiveSupport's squish method.
181
def collapse(character = " ")
  # +character+ is matched literally: it is escaped before being placed in
  # the patterns, so regex metacharacters such as "." or "*" behave like any
  # other character instead of matching arbitrary text (or failing to
  # compile).
  literal = Regexp.escape(character)
  # \A and \Z anchor to the string itself rather than to individual lines,
  # so multi-line strings are trimmed at their real start and end. \Z (not
  # \z) still matches just before a final newline.
  trimmed = sub(/\A#{literal}*/, "").sub(/#{literal}*\Z/, "")
  # Condense any remaining runs of the character to a single occurrence.
  trimmed.squeeze(character)
end
184
+
185
module ClassMethods
  # Builds a string of +limit+ randomly chosen characters drawn from the
  # letters a-z, A-Z and the digits 1-9. The digit 0 is deliberately left
  # out so it cannot be confused with the letter O.
  def random(limit)
    pool = %w{
      a b c d e f g h i j k l m n o p q r s t u v w x y z
      A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
      1 2 3 4 5 6 7 8 9
    }
    # One independent draw per position, indexed over the whole pool.
    Array.new(limit) { pool[rand(pool.size)] }.join
  end
end
197
+ end
198
+ end
@@ -0,0 +1,159 @@
1
+ # encoding: UTF-8
2
+ require "yaml"
3
+
4
+ module Stringex
5
+ module Unidecoder
6
+ # Contains Unicode codepoints, loading as needed from YAML files
7
# Transliteration tables keyed by group name (e.g. "x4e"). A missing key is
# loaded on first access from unidecoder_data/<key>.yml next to this file
# and then cached in the hash.
unless defined?(CODEPOINTS)
  CODEPOINTS = Hash.new do |cache, group|
    data_dir = File.join(File.expand_path(File.dirname(__FILE__)), "unidecoder_data")
    cache[group] = YAML.load_file(File.join(data_dir, "#{group}.yml"))
  end
end
# Per-locale overrides, filled in through localize_from.
LOCAL_CODEPOINTS = {} unless defined?(LOCAL_CODEPOINTS)
11
+
12
+ class << self
13
+ # Returns string with its UTF-8 characters transliterated to ASCII ones
14
+ #
15
+ # You're probably better off just using the added String#to_ascii
16
def decode(string)
  # Only non-ASCII characters are visited; ASCII passes through untouched.
  string.gsub(/[^\x00-\x7f]/u) do |character|
    # A localized transliteration wins; otherwise fall back to the YAML
    # tables.
    local_codepoint(character) || begin
      codepoint = character.unpack("U")[0]
      CODEPOINTS[code_group(codepoint)][grouped_point(codepoint)]
    rescue
      # Any failure during lookup (e.g. no table for this group) yields a
      # placeholder instead of an exception.
      # TODO: Make this note something to the user that is reportable to me perhaps
      "?"
    end
  end
end
32
+
33
+ # Returns character for the given Unicode codepoint
34
def encode(codepoint)
  # +codepoint+ is the hexadecimal codepoint without a "0x" prefix; it is
  # parsed as base 16 and packed as a single UTF-8 character.
  hex_literal = "0x#{codepoint}"
  [hex_literal.to_i(16)].pack("U")
end
37
+
38
+ # Returns string indicating which file (and line) contains the
39
+ # transliteration value for the character
40
def in_yaml_file(character)
  codepoint = character.unpack("U").first
  group = code_group(codepoint)
  index = grouped_point(codepoint)
  # Reported line is the entry's index within its group plus 2.
  "#{group}.yml (line #{index + 2})"
end
44
+
45
+ # Adds localized transliterations to Unidecoder
46
def localize_from(hash_or_path_to_file)
  # Anything that is not already a Hash is treated as the path of a YAML
  # file to load.
  source = hash_or_path_to_file
  source = YAML.load_file(source) unless source.is_a?(Hash)
  # Validation also performs the merge into LOCAL_CODEPOINTS.
  verify_local_codepoints(source)
end
54
+
55
+ # Returns locale for localized transliterations
56
def locale
  # Precedence: an explicitly set locale, then I18n's locale when I18n is
  # loaded, then the default locale.
  return @locale if @locale
  defined?(I18n) ? I18n.locale : default_locale
end
65
+
66
+ # Sets locale for localized transliterations
67
def locale=(new_locale)
  # Stored as-is; no validation or normalization happens here.
  @locale = new_locale
end
70
+
71
+ # Returns default locale for localized transliterations. NOTE: Will set @locale as well.
72
def default_locale
  # Falls back to "en" the first time it is needed.
  @default_locale = "en" unless @default_locale
  # Reading the default also makes it the current locale.
  @locale = @default_locale
end
76
+
77
+ # Sets the default locale for localized transliterations. NOTE: Will set @locale as well.
78
def default_locale=(new_locale)
  # The new default immediately becomes the current locale as well.
  @locale = @default_locale = new_locale
end
83
+
84
+ # Returns the localized transliteration for a codepoint
85
def local_codepoint(codepoint)
  current = locale
  # Tables may be keyed by String or Symbol: try the locale as given, then
  # its String/Symbol counterpart.
  table = LOCAL_CODEPOINTS[current]
  table ||= LOCAL_CODEPOINTS[current.is_a?(Symbol) ? current.to_s : current.to_sym]
  table && table[codepoint]
end
89
+
90
+ # Runs a block with a temporary locale setting, returning the locale to the original state when complete
91
# Runs the block with +new_locale+ as the current locale and puts the
# previous setting back afterwards, even when the block raises.
#
# Passing <tt>:default</tt> selects the default locale. Returns the value of
# the block.
def with_locale(new_locale, &block)
  # Capture the raw setting first: default_locale overwrites @locale as a
  # side effect, so capturing after resolving :default would lose the
  # caller's locale.
  saved_locale = @locale
  new_locale = default_locale if new_locale == :default
  @locale = new_locale
  block.call
ensure
  @locale = saved_locale
end

# Runs a block with default locale
def with_default_locale(&block)
  # Pass the :default marker instead of calling default_locale here, so
  # with_locale can capture the current locale before it is overwritten.
  with_locale :default, &block
end
103
+
104
+ private
105
+ # Returns the Unicode codepoint grouping for the given character
106
def code_group(unpacked_character)
  # Everything above the low byte, as "x" plus at least two hex digits;
  # this is the base name of the YAML table for the codepoint.
  format("x%02x", unpacked_character >> 8)
end
109
+
110
+ # Returns the index of the given character in the YAML file for its codepoint group
111
def grouped_point(unpacked_character)
  # The low byte is the position inside the group's 256-entry table.
  unpacked_character & 0xff
end
114
+
115
+ # Checks LOCAL_CODEPOINTS's Hash is in the format we expect before assigning it and raises
116
+ # instructive exception if not
117
def verify_local_codepoints(hash)
  # Expected shape: { locale (Symbol or String) => { String => String } }.
  well_formed = hash.is_a?(Hash) && hash.all? { |locale_key, mappings|
    [Symbol, String].include?(locale_key.class) &&
      mappings.is_a?(Hash) &&
      mappings.all? { |from, to| from.is_a?(String) && to.is_a?(String) }
  }
  unless well_formed
    raise ArgumentError, "LOCAL_CODEPOINTS is not correctly defined. Please see the README for more information on how to correctly format this data."
  end
  # Valid data replaces any table already registered for the same locale key.
  hash.each { |locale_key, mappings| LOCAL_CODEPOINTS[locale_key] = mappings }
end
131
+ end
132
+ end
133
+
134
+ # Provide a simpler interface for localization implementations
135
class << self
  # Each listed name becomes a singleton method here that forwards its
  # arguments and block unchanged to the Unidecoder method of the same name.
  delegated_names = %w{
    localize_from
    locale locale=
    default_locale default_locale=
    local_codepoint
    with_locale with_default_locale
  }
  delegated_names.each do |name|
    define_method(name) { |*args, &block| Unidecoder.send(name, *args, &block) }
  end
end
148
+ end
149
+
150
+ module Stringex
151
+ module StringExtensions
152
+ # Returns string with its UTF-8 characters transliterated to ASCII ones. Example:
153
+ #
154
+ # "⠋⠗⠁⠝⠉⠑".to_ascii #=> "braille"
155
def to_ascii
  # All of the work is done by the Unidecoder; the receiver is passed in
  # as-is.
  decoder = Stringex::Unidecoder
  decoder.decode(self)
end
158
+ end
159
+ end
@@ -0,0 +1,257 @@
1
+ ---
2
+ - "\x00"
3
+ - "\x01"
4
+ - "\x02"
5
+ - "\x03"
6
+ - "\x04"
7
+ - "\x05"
8
+ - "\x06"
9
+ - "\a"
10
+ - "\x08"
11
+ - "\t"
12
+ - "\n"
13
+ - "\v"
14
+ - "\f"
15
+ - "\r"
16
+ - "\x0e"
17
+ - "\x0f"
18
+ - "\x10"
19
+ - "\x11"
20
+ - "\x12"
21
+ - "\x13"
22
+ - "\x14"
23
+ - "\x15"
24
+ - "\x16"
25
+ - "\x17"
26
+ - "\x18"
27
+ - "\x19"
28
+ - "\x1a"
29
+ - "\e"
30
+ - "\x1c"
31
+ - "\x1d"
32
+ - "\x1e"
33
+ - "\x1f"
34
+ - ' '
35
+ - '!'
36
+ - '"'
37
+ - '#'
38
+ - $
39
+ - '%'
40
+ - '&'
41
+ - "'"
42
+ - (
43
+ - )
44
+ - '*'
45
+ - +
46
+ - ','
47
+ - '-'
48
+ - .
49
+ - /
50
+ - 0
51
+ - 1
52
+ - 2
53
+ - 3
54
+ - 4
55
+ - 5
56
+ - 6
57
+ - 7
58
+ - 8
59
+ - 9
60
+ - ':'
61
+ - ;
62
+ - <
63
+ - '='
64
+ - '>'
65
+ - '?'
66
+ - '@'
67
+ - A
68
+ - B
69
+ - C
70
+ - D
71
+ - E
72
+ - F
73
+ - G
74
+ - H
75
+ - I
76
+ - J
77
+ - K
78
+ - L
79
+ - M
80
+ - N
81
+ - O
82
+ - P
83
+ - Q
84
+ - R
85
+ - S
86
+ - T
87
+ - U
88
+ - V
89
+ - W
90
+ - X
91
+ - Y
92
+ - Z
93
+ - '['
94
+ - \
95
+ - ']'
96
+ - '^'
97
+ - _
98
+ - '`'
99
+ - a
100
+ - b
101
+ - c
102
+ - d
103
+ - e
104
+ - f
105
+ - g
106
+ - h
107
+ - i
108
+ - j
109
+ - k
110
+ - l
111
+ - m
112
+ - n
113
+ - o
114
+ - p
115
+ - q
116
+ - r
117
+ - s
118
+ - t
119
+ - u
120
+ - v
121
+ - w
122
+ - x
123
+ - y
124
+ - z
125
+ - '{'
126
+ - '|'
127
+ - '}'
128
+ - '~'
129
+ - ''
130
+ - ''
131
+ - ''
132
+ - ''
133
+ - ''
134
+ - ''
135
+ - ''
136
+ - ''
137
+ - ''
138
+ - ''
139
+ - ''
140
+ - ''
141
+ - ''
142
+ - ''
143
+ - ''
144
+ - ''
145
+ - ''
146
+ - ''
147
+ - "'"
148
+ - "'"
149
+ - '"'
150
+ - '"'
151
+ - ''
152
+ - ''
153
+ - ''
154
+ - ''
155
+ - ''
156
+ - ''
157
+ - ''
158
+ - ''
159
+ - ''
160
+ - ''
161
+ - ''
162
+ - ' '
163
+ - ''
164
+ - C/
165
+ - PS
166
+ - $?
167
+ - Y=
168
+ - '|'
169
+ - SS
170
+ - '"'
171
+ - (c)
172
+ - a
173
+ - '<<'
174
+ - '!'
175
+ - ''
176
+ - (r)
177
+ - '-'
178
+ - deg
179
+ - +-
180
+ - 2
181
+ - 3
182
+ - "'"
183
+ - u
184
+ - P
185
+ - '*'
186
+ - ','
187
+ - 1
188
+ - o
189
+ - '>>'
190
+ - 1/4
191
+ - 1/2
192
+ - 3/4
193
+ - ''
194
+ - A
195
+ - A
196
+ - A
197
+ - A
198
+ - A
199
+ - A
200
+ - AE
201
+ - C
202
+ - E
203
+ - E
204
+ - E
205
+ - E
206
+ - I
207
+ - I
208
+ - I
209
+ - I
210
+ - D
211
+ - N
212
+ - O
213
+ - O
214
+ - O
215
+ - O
216
+ - O
217
+ - x
218
+ - O
219
+ - U
220
+ - U
221
+ - U
222
+ - U
223
+ - Y
224
+ - Th
225
+ - ss
226
+ - a
227
+ - a
228
+ - a
229
+ - a
230
+ - a
231
+ - a
232
+ - ae
233
+ - c
234
+ - e
235
+ - e
236
+ - e
237
+ - e
238
+ - i
239
+ - i
240
+ - i
241
+ - i
242
+ - d
243
+ - n
244
+ - o
245
+ - o
246
+ - o
247
+ - o
248
+ - o
249
+ - /
250
+ - o
251
+ - u
252
+ - u
253
+ - u
254
+ - u
255
+ - y
256
+ - th
257
+ - y