stringex 1.3.1 → 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (186) hide show
  1. data/Rakefile +184 -184
  2. data/lib/stringex/acts_as_url.rb +107 -0
  3. data/lib/stringex/string_extensions.rb +198 -0
  4. data/lib/stringex/unidecoder.rb +159 -0
  5. data/lib/stringex/unidecoder_data/x00.yml +257 -0
  6. data/lib/stringex/unidecoder_data/x01.yml +257 -0
  7. data/lib/stringex/unidecoder_data/x02.yml +256 -0
  8. data/lib/stringex/unidecoder_data/x03.yml +256 -0
  9. data/lib/stringex/unidecoder_data/x04.yml +256 -0
  10. data/lib/stringex/unidecoder_data/x05.yml +256 -0
  11. data/lib/stringex/unidecoder_data/x06.yml +256 -0
  12. data/lib/stringex/unidecoder_data/x07.yml +256 -0
  13. data/lib/stringex/unidecoder_data/x09.yml +256 -0
  14. data/lib/stringex/unidecoder_data/x0a.yml +256 -0
  15. data/lib/stringex/unidecoder_data/x0b.yml +256 -0
  16. data/lib/stringex/unidecoder_data/x0c.yml +256 -0
  17. data/lib/stringex/unidecoder_data/x0d.yml +256 -0
  18. data/lib/stringex/unidecoder_data/x0e.yml +256 -0
  19. data/lib/stringex/unidecoder_data/x0f.yml +256 -0
  20. data/lib/stringex/unidecoder_data/x10.yml +256 -0
  21. data/lib/stringex/unidecoder_data/x11.yml +256 -0
  22. data/lib/stringex/unidecoder_data/x12.yml +257 -0
  23. data/lib/stringex/unidecoder_data/x13.yml +256 -0
  24. data/lib/stringex/unidecoder_data/x14.yml +257 -0
  25. data/lib/stringex/unidecoder_data/x15.yml +257 -0
  26. data/lib/stringex/unidecoder_data/x16.yml +256 -0
  27. data/lib/stringex/unidecoder_data/x17.yml +256 -0
  28. data/lib/stringex/unidecoder_data/x18.yml +256 -0
  29. data/lib/stringex/unidecoder_data/x1e.yml +256 -0
  30. data/lib/stringex/unidecoder_data/x1f.yml +256 -0
  31. data/lib/stringex/unidecoder_data/x20.yml +256 -0
  32. data/lib/stringex/unidecoder_data/x21.yml +256 -0
  33. data/lib/stringex/unidecoder_data/x22.yml +256 -0
  34. data/lib/stringex/unidecoder_data/x23.yml +256 -0
  35. data/lib/stringex/unidecoder_data/x24.yml +256 -0
  36. data/lib/stringex/unidecoder_data/x25.yml +256 -0
  37. data/lib/stringex/unidecoder_data/x26.yml +256 -0
  38. data/lib/stringex/unidecoder_data/x27.yml +256 -0
  39. data/lib/stringex/unidecoder_data/x28.yml +257 -0
  40. data/lib/stringex/unidecoder_data/x2e.yml +256 -0
  41. data/lib/stringex/unidecoder_data/x2f.yml +256 -0
  42. data/lib/stringex/unidecoder_data/x30.yml +256 -0
  43. data/lib/stringex/unidecoder_data/x31.yml +256 -0
  44. data/lib/stringex/unidecoder_data/x32.yml +256 -0
  45. data/lib/stringex/unidecoder_data/x33.yml +256 -0
  46. data/lib/stringex/unidecoder_data/x4d.yml +256 -0
  47. data/lib/stringex/unidecoder_data/x4e.yml +257 -0
  48. data/lib/stringex/unidecoder_data/x4f.yml +257 -0
  49. data/lib/stringex/unidecoder_data/x50.yml +257 -0
  50. data/lib/stringex/unidecoder_data/x51.yml +257 -0
  51. data/lib/stringex/unidecoder_data/x52.yml +257 -0
  52. data/lib/stringex/unidecoder_data/x53.yml +257 -0
  53. data/lib/stringex/unidecoder_data/x54.yml +257 -0
  54. data/lib/stringex/unidecoder_data/x55.yml +257 -0
  55. data/lib/stringex/unidecoder_data/x56.yml +257 -0
  56. data/lib/stringex/unidecoder_data/x57.yml +257 -0
  57. data/lib/stringex/unidecoder_data/x58.yml +257 -0
  58. data/lib/stringex/unidecoder_data/x59.yml +257 -0
  59. data/lib/stringex/unidecoder_data/x5a.yml +257 -0
  60. data/lib/stringex/unidecoder_data/x5b.yml +257 -0
  61. data/lib/stringex/unidecoder_data/x5c.yml +257 -0
  62. data/lib/stringex/unidecoder_data/x5d.yml +257 -0
  63. data/lib/stringex/unidecoder_data/x5e.yml +257 -0
  64. data/lib/stringex/unidecoder_data/x5f.yml +257 -0
  65. data/lib/stringex/unidecoder_data/x60.yml +257 -0
  66. data/lib/stringex/unidecoder_data/x61.yml +257 -0
  67. data/lib/stringex/unidecoder_data/x62.yml +257 -0
  68. data/lib/stringex/unidecoder_data/x63.yml +257 -0
  69. data/lib/stringex/unidecoder_data/x64.yml +257 -0
  70. data/lib/stringex/unidecoder_data/x65.yml +257 -0
  71. data/lib/stringex/unidecoder_data/x66.yml +257 -0
  72. data/lib/stringex/unidecoder_data/x67.yml +257 -0
  73. data/lib/stringex/unidecoder_data/x68.yml +257 -0
  74. data/lib/stringex/unidecoder_data/x69.yml +257 -0
  75. data/lib/stringex/unidecoder_data/x6a.yml +257 -0
  76. data/lib/stringex/unidecoder_data/x6b.yml +257 -0
  77. data/lib/stringex/unidecoder_data/x6c.yml +257 -0
  78. data/lib/stringex/unidecoder_data/x6d.yml +257 -0
  79. data/lib/stringex/unidecoder_data/x6e.yml +257 -0
  80. data/lib/stringex/unidecoder_data/x6f.yml +257 -0
  81. data/lib/stringex/unidecoder_data/x70.yml +257 -0
  82. data/lib/stringex/unidecoder_data/x71.yml +257 -0
  83. data/lib/stringex/unidecoder_data/x72.yml +257 -0
  84. data/lib/stringex/unidecoder_data/x73.yml +257 -0
  85. data/lib/stringex/unidecoder_data/x74.yml +257 -0
  86. data/lib/stringex/unidecoder_data/x75.yml +257 -0
  87. data/lib/stringex/unidecoder_data/x76.yml +257 -0
  88. data/lib/stringex/unidecoder_data/x77.yml +257 -0
  89. data/lib/stringex/unidecoder_data/x78.yml +257 -0
  90. data/lib/stringex/unidecoder_data/x79.yml +257 -0
  91. data/lib/stringex/unidecoder_data/x7a.yml +257 -0
  92. data/lib/stringex/unidecoder_data/x7b.yml +257 -0
  93. data/lib/stringex/unidecoder_data/x7c.yml +257 -0
  94. data/lib/stringex/unidecoder_data/x7d.yml +257 -0
  95. data/lib/stringex/unidecoder_data/x7e.yml +257 -0
  96. data/lib/stringex/unidecoder_data/x7f.yml +257 -0
  97. data/lib/stringex/unidecoder_data/x80.yml +257 -0
  98. data/lib/stringex/unidecoder_data/x81.yml +257 -0
  99. data/lib/stringex/unidecoder_data/x82.yml +257 -0
  100. data/lib/stringex/unidecoder_data/x83.yml +257 -0
  101. data/lib/stringex/unidecoder_data/x84.yml +257 -0
  102. data/lib/stringex/unidecoder_data/x85.yml +257 -0
  103. data/lib/stringex/unidecoder_data/x86.yml +257 -0
  104. data/lib/stringex/unidecoder_data/x87.yml +257 -0
  105. data/lib/stringex/unidecoder_data/x88.yml +257 -0
  106. data/lib/stringex/unidecoder_data/x89.yml +257 -0
  107. data/lib/stringex/unidecoder_data/x8a.yml +257 -0
  108. data/lib/stringex/unidecoder_data/x8b.yml +257 -0
  109. data/lib/stringex/unidecoder_data/x8c.yml +257 -0
  110. data/lib/stringex/unidecoder_data/x8d.yml +257 -0
  111. data/lib/stringex/unidecoder_data/x8e.yml +257 -0
  112. data/lib/stringex/unidecoder_data/x8f.yml +257 -0
  113. data/lib/stringex/unidecoder_data/x90.yml +257 -0
  114. data/lib/stringex/unidecoder_data/x91.yml +257 -0
  115. data/lib/stringex/unidecoder_data/x92.yml +257 -0
  116. data/lib/stringex/unidecoder_data/x93.yml +257 -0
  117. data/lib/stringex/unidecoder_data/x94.yml +257 -0
  118. data/lib/stringex/unidecoder_data/x95.yml +257 -0
  119. data/lib/stringex/unidecoder_data/x96.yml +257 -0
  120. data/lib/stringex/unidecoder_data/x97.yml +257 -0
  121. data/lib/stringex/unidecoder_data/x98.yml +257 -0
  122. data/lib/stringex/unidecoder_data/x99.yml +257 -0
  123. data/lib/stringex/unidecoder_data/x9a.yml +257 -0
  124. data/lib/stringex/unidecoder_data/x9b.yml +257 -0
  125. data/lib/stringex/unidecoder_data/x9c.yml +257 -0
  126. data/lib/stringex/unidecoder_data/x9d.yml +257 -0
  127. data/lib/stringex/unidecoder_data/x9e.yml +257 -0
  128. data/lib/stringex/unidecoder_data/x9f.yml +256 -0
  129. data/lib/stringex/unidecoder_data/xa0.yml +257 -0
  130. data/lib/stringex/unidecoder_data/xa1.yml +257 -0
  131. data/lib/stringex/unidecoder_data/xa2.yml +257 -0
  132. data/lib/stringex/unidecoder_data/xa3.yml +257 -0
  133. data/lib/stringex/unidecoder_data/xa4.yml +256 -0
  134. data/lib/stringex/unidecoder_data/xac.yml +257 -0
  135. data/lib/stringex/unidecoder_data/xad.yml +257 -0
  136. data/lib/stringex/unidecoder_data/xae.yml +257 -0
  137. data/lib/stringex/unidecoder_data/xaf.yml +257 -0
  138. data/lib/stringex/unidecoder_data/xb0.yml +257 -0
  139. data/lib/stringex/unidecoder_data/xb1.yml +257 -0
  140. data/lib/stringex/unidecoder_data/xb2.yml +257 -0
  141. data/lib/stringex/unidecoder_data/xb3.yml +257 -0
  142. data/lib/stringex/unidecoder_data/xb4.yml +257 -0
  143. data/lib/stringex/unidecoder_data/xb5.yml +257 -0
  144. data/lib/stringex/unidecoder_data/xb6.yml +257 -0
  145. data/lib/stringex/unidecoder_data/xb7.yml +257 -0
  146. data/lib/stringex/unidecoder_data/xb8.yml +257 -0
  147. data/lib/stringex/unidecoder_data/xb9.yml +257 -0
  148. data/lib/stringex/unidecoder_data/xba.yml +257 -0
  149. data/lib/stringex/unidecoder_data/xbb.yml +257 -0
  150. data/lib/stringex/unidecoder_data/xbc.yml +257 -0
  151. data/lib/stringex/unidecoder_data/xbd.yml +257 -0
  152. data/lib/stringex/unidecoder_data/xbe.yml +257 -0
  153. data/lib/stringex/unidecoder_data/xbf.yml +257 -0
  154. data/lib/stringex/unidecoder_data/xc0.yml +257 -0
  155. data/lib/stringex/unidecoder_data/xc1.yml +257 -0
  156. data/lib/stringex/unidecoder_data/xc2.yml +257 -0
  157. data/lib/stringex/unidecoder_data/xc3.yml +257 -0
  158. data/lib/stringex/unidecoder_data/xc4.yml +257 -0
  159. data/lib/stringex/unidecoder_data/xc5.yml +257 -0
  160. data/lib/stringex/unidecoder_data/xc6.yml +257 -0
  161. data/lib/stringex/unidecoder_data/xc7.yml +257 -0
  162. data/lib/stringex/unidecoder_data/xc8.yml +257 -0
  163. data/lib/stringex/unidecoder_data/xc9.yml +257 -0
  164. data/lib/stringex/unidecoder_data/xca.yml +257 -0
  165. data/lib/stringex/unidecoder_data/xcb.yml +257 -0
  166. data/lib/stringex/unidecoder_data/xcc.yml +257 -0
  167. data/lib/stringex/unidecoder_data/xcd.yml +257 -0
  168. data/lib/stringex/unidecoder_data/xce.yml +257 -0
  169. data/lib/stringex/unidecoder_data/xcf.yml +257 -0
  170. data/lib/stringex/unidecoder_data/xd0.yml +257 -0
  171. data/lib/stringex/unidecoder_data/xd1.yml +257 -0
  172. data/lib/stringex/unidecoder_data/xd2.yml +257 -0
  173. data/lib/stringex/unidecoder_data/xd3.yml +257 -0
  174. data/lib/stringex/unidecoder_data/xd4.yml +257 -0
  175. data/lib/stringex/unidecoder_data/xd5.yml +257 -0
  176. data/lib/stringex/unidecoder_data/xd6.yml +257 -0
  177. data/lib/stringex/unidecoder_data/xd7.yml +256 -0
  178. data/lib/stringex/unidecoder_data/xf9.yml +257 -0
  179. data/lib/stringex/unidecoder_data/xfa.yml +256 -0
  180. data/lib/stringex/unidecoder_data/xfb.yml +257 -0
  181. data/lib/stringex/unidecoder_data/xfc.yml +257 -0
  182. data/lib/stringex/unidecoder_data/xfd.yml +256 -0
  183. data/lib/stringex/unidecoder_data/xfe.yml +257 -0
  184. data/lib/stringex/unidecoder_data/xff.yml +257 -0
  185. data/stringex.gemspec +184 -1
  186. metadata +186 -3
@@ -0,0 +1,198 @@
1
+ # encoding: UTF-8
2
+
3
module Stringex
  # These methods are all added on String class.
  module StringExtensions
    # Hook run when the module is included: also adds ClassMethods (e.g.
    # <tt>random</tt>) as singleton methods on the including class.
    def self.included(base) # :nodoc:
      base.extend(ClassMethods)
    end

    # Returns the string converted (via Textile/RedCloth) to HTML format
    # or self [with a friendly warning] if RedCloth is not available.
    #
    # Using <tt>:lite</tt> argument will cause RedCloth to not wrap the HTML in a container
    # P element, which is useful behavior for generating header element text, etc.
    # This is roughly equivalent to ActionView's <tt>textilize_without_paragraph</tt>
    # except that it makes RedCloth do all the work instead of just gsubbing the return
    # from RedCloth.
    def to_html(lite_mode = false)
      unless defined?(RedCloth)
        warn "String#to_html was called without RedCloth being successfully required"
        return self
      end

      if lite_mode
        RedCloth.new(self, [:lite_mode]).to_html
      elsif self =~ /<pre>/
        # Blank lines are kept when the source contains a <pre> tag.
        RedCloth.new(self).to_html.tr("\t", "")
      else
        RedCloth.new(self).to_html.tr("\t", "").gsub(/\n\n/, "")
      end
    end

    # Create a URI-friendly representation of the string. This is used internally by
    # acts_as_url[link:classes/Stringex/ActsAsUrl/ClassMethods.html#M000012]
    # but can be called manually in order to generate an URI-friendly version of any string.
    #
    # +options+ is passed through to <tt>remove_formatting</tt>.
    def to_url(options = {})
      remove_formatting(options).downcase.replace_whitespace("-").collapse("-")
    end

    # Performs multiple text manipulations. Essentially a shortcut for typing them all. View source
    # below to see which methods are run.
    #
    # +options+ is passed through to <tt>convert_misc_characters</tt>.
    def remove_formatting(options = {})
      strip_html_tags.convert_smart_punctuation.convert_accented_entities.convert_misc_entities.convert_misc_characters(options).to_ascii.collapse
    end

    # Removes HTML tags from text. This code is simplified from Tobias Luettke's regular expression
    # in Typo[http://typosphere.org].
    #
    # Unless +leave_whitespace+ is true, runs of whitespace are also reduced to a single space.
    # The result is stripped of leading/trailing whitespace in both cases.
    def strip_html_tags(leave_whitespace = false)
      name = /[\w:_-]+/
      value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
      attr = /(#{name}(\s*=\s*#{value})?)/
      rx = /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/
      (leave_whitespace) ? gsub(rx, "").strip : gsub(rx, "").gsub(/\s+/, " ").strip
    end

    # Converts HTML entities into the respective non-accented letters. Examples:
    #
    #   "&aacute;".convert_accented_entities # => "a"
    #   "&ccedil;".convert_accented_entities # => "c"
    #   "&egrave;".convert_accented_entities # => "e"
    #   "&icirc;".convert_accented_entities # => "i"
    #   "&oslash;".convert_accented_entities # => "o"
    #   "&uuml;".convert_accented_entities # => "u"
    #
    # Note: This does not do any conversion of Unicode/ASCII accented-characters. For that
    # functionality please use <tt>to_ascii</tt>.
    def convert_accented_entities
      gsub(/&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/, '\1')
    end

    # Converts HTML entities (taken from common Textile/RedCloth formattings) into plain text formats.
    # Any entity left over after the known conversions is removed.
    #
    # Note: This isn't an attempt at complete conversion of HTML entities, just those most likely
    # to be generated by Textile.
    def convert_misc_entities
      dummy = dup
      {
        "#822[01]" => "\"",
        "#821[67]" => "'",
        "#8230" => "...",
        "#8211" => "-",
        "#8212" => "--",
        "#215" => "x",
        "gt" => ">",
        "lt" => "<",
        "(#8482|trade)" => "(tm)",
        "(#174|reg)" => "(r)",
        "(#169|copy)" => "(c)",
        "(#38|amp)" => "and",
        "nbsp" => " ",
        "(#162|cent)" => " cent",
        "(#163|pound)" => " pound",
        "(#188|frac14)" => "one fourth",
        "(#189|frac12)" => "half",
        "(#190|frac34)" => "three fourths",
        "(#176|deg)" => " degrees"
      }.each do |textiled, normal|
        dummy.gsub!(/&#{textiled};/, normal)
      end
      # Drop every remaining (unrecognized) entity.
      dummy.gsub(/&[^;]+;/, "")
    end

    # Converts MS Word 'smart punctuation' to ASCII
    #
    # Handles the Unicode curly quotes and ellipsis plus the C1 control characters
    # U+0091..U+0094 (written below as "\302\221".."\302\224").
    #
    # The sequences "\303\222", "\303\223" and "\303\225" used to be listed here as well,
    # but in UTF-8 they are the letters U+00D2, U+00D3 and U+00D5 (O with grave, acute
    # and tilde), so real text was being turned into quote marks. They are intentionally
    # no longer converted; <tt>to_ascii</tt> transliterates those letters instead.
    def convert_smart_punctuation
      dummy = dup
      {
        "(“|”|\302\223|\302\224)" => '"',
        "(‘|’|\302\221|\302\222)" => "'",
        "…" => "...",
      }.each do |smart, normal|
        dummy.gsub!(/#{smart}/, normal)
      end
      dummy
    end

    # Converts various common plaintext characters to a more URI-friendly representation.
    # Examples (results may carry extra leading/trailing whitespace, which
    # <tt>remove_formatting</tt> collapses afterwards):
    #
    #   "foo & bar".convert_misc_characters # => "foo and bar"
    #   "Chanel #9".convert_misc_characters # => "Chanel number 9"
    #   "user@host".convert_misc_characters # => "user at host"
    #   "google.com".convert_misc_characters # => "google dot com"
    #   "$10".convert_misc_characters # => " 10 dollars "
    #   "*69".convert_misc_characters # => " star 69"
    #   "100%".convert_misc_characters # => "100 percent "
    #   "windows/mac/linux".convert_misc_characters # => "windows slash mac slash linux"
    #
    # Pass <tt>:allow_slash => true</tt> in +options+ to leave slashes and backslashes untouched.
    #
    # Note: Because this method will convert any & symbols to the string "and",
    # you should run any methods which convert HTML entities (convert_accented_entities and
    # convert_misc_entities) before running this method.
    def convert_misc_characters(options = {})
      dummy = dup.gsub(/\.{3,}/, " dot dot dot ") # Catch ellipses before single dot rule!
      # Special rules for money
      {
        /(\s|^)\$(\d+)\.(\d+)(\s|$)/ => '\2 dollars \3 cents',
        /(\s|^)£(\d+)\.(\d+)(\s|$)/u => '\2 pounds \3 pence',
      }.each do |found, replaced|
        # Replacements that do not re-insert \1 get padded with spaces
        replaced = " #{replaced} " unless replaced =~ /\\1/
        dummy.gsub!(found, replaced)
      end
      # Back to normal rules
      misc_characters =
      {
        /\s*&\s*/ => "and",
        /\s*#/ => "number",
        /\s*@\s*/ => "at",
        /(\S|^)\.(\S)/ => '\1 dot \2',
        /(\s|^)\$(\d*)(\s|$)/ => '\2 dollars',
        /(\s|^)£(\d*)(\s|$)/u => '\2 pounds',
        /(\s|^)¥(\d*)(\s|$)/u => '\2 yen',
        /\s*\*\s*/ => "star",
        /\s*%\s*/ => "percent",
        /(\s*=\s*)/ => " equals ",
        /\s*\+\s*/ => "plus"
      }
      misc_characters[/\s*(\\|\/)\s*/] = 'slash' unless options[:allow_slash]
      misc_characters.each do |found, replaced|
        replaced = " #{replaced} " unless replaced =~ /\\1/
        dummy.gsub!(found, replaced)
      end
      # Drop apostrophes inside/at the edge of words, then blank out remaining punctuation.
      dummy.gsub(/(^|\w)'(\w|$)/, '\1\2').gsub(/[\.,:;()\[\]\?!\^'"_]/, " ")
    end

    # Replace runs of whitespace in string. Defaults to a single space but any replacement
    # string may be specified as an argument. Examples:
    #
    #   "Foo       bar".replace_whitespace # => "Foo bar"
    #   "Foo       bar".replace_whitespace("-") # => "Foo-bar"
    def replace_whitespace(replace = " ")
      gsub(/\s+/, replace)
    end

    # Removes specified character from the beginning and/or end of the string and then performs
    # <tt>String#squeeze(character)</tt>, condensing runs of the character within the string.
    #
    # The character is matched literally: it is escaped before being placed in the
    # regular expression, so arguments such as "." or "*" behave like any other character
    # instead of being interpreted as regex metacharacters.
    #
    # Note: This method has been superseded by ActiveSupport's squish method.
    def collapse(character = " ")
      literal = Regexp.escape(character)
      sub(/^(?:#{literal})*/, "").sub(/(?:#{literal})*$/, "").squeeze(character)
    end

    module ClassMethods
      # Returns string of random characters with a length matching the specified limit. Excludes 0
      # to avoid confusion between 0 and O.
      def random(limit)
        strong_alphanumerics = %w{
          a b c d e f g h i j k l m n o p q r s t u v w x y z
          A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
          1 2 3 4 5 6 7 8 9
        }
        Array.new(limit) { strong_alphanumerics[rand(strong_alphanumerics.size)] }.join
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # encoding: UTF-8
2
+ require "yaml"
3
+
4
module Stringex
  module Unidecoder
    # Contains Unicode codepoints, loading as needed from YAML files.
    # Keys are group names such as "x00"; a missing key triggers loading
    # unidecoder_data/<key>.yml from the directory of this file and caches the result.
    CODEPOINTS = Hash.new{|h, k|
      h[k] = YAML.load_file(File.join(File.expand_path(File.dirname(__FILE__)), "unidecoder_data", "#{k}.yml"))
    } unless defined?(CODEPOINTS)
    # Localized transliterations, keyed by locale (Symbol or String); filled by localize_from.
    LOCAL_CODEPOINTS = Hash.new unless defined?(LOCAL_CODEPOINTS)

    class << self
      # Returns string with its UTF-8 characters transliterated to ASCII ones
      #
      # Each non-ASCII character is replaced by its localized transliteration for the
      # current locale when one exists, otherwise by the entry from CODEPOINTS.
      # Characters whose lookup raises are replaced with "?".
      #
      # You're probably better off just using the added String#to_ascii
      def decode(string)
        string.gsub(/[^\x00-\x7f]/u) do |codepoint|
          if localized = local_codepoint(codepoint)
            localized
          else
            begin
              unpacked = codepoint.unpack("U")[0]
              CODEPOINTS[code_group(unpacked)][grouped_point(unpacked)]
            rescue
              # Hopefully this won't come up much
              # TODO: Make this note something to the user that is reportable to me perhaps
              "?"
            end
          end
        end
      end

      # Returns character for the given Unicode codepoint
      # (+codepoint+ is interpolated after "0x" and parsed as hexadecimal, e.g. "00e4").
      def encode(codepoint)
        ["0x#{codepoint}".to_i(16)].pack("U")
      end

      # Returns string indicating which file (and line) contains the
      # transliteration value for the character
      def in_yaml_file(character)
        unpacked = character.unpack("U")[0]
        # +2: line numbers are 1-based and the first line of each file is the YAML "---" header
        "#{code_group(unpacked)}.yml (line #{grouped_point(unpacked) + 2})"
      end

      # Adds localized transliterations to Unidecoder
      #
      # Accepts either a Hash of the form {locale => {character => replacement}} or a
      # path to a YAML file containing such a Hash. Raises ArgumentError when the data
      # does not have that shape.
      def localize_from(hash_or_path_to_file)
        hash = if hash_or_path_to_file.is_a?(Hash)
          hash_or_path_to_file
        else
          # NOTE(review): YAML.load_file on a caller-supplied path; only pass trusted files.
          YAML.load_file(hash_or_path_to_file)
        end
        verify_local_codepoints hash
      end

      # Returns locale for localized transliterations: the explicitly set locale if any,
      # else I18n.locale when I18n is defined, else the default locale.
      def locale
        if @locale
          @locale
        elsif defined?(I18n)
          I18n.locale
        else
          default_locale
        end
      end

      # Sets locale for localized transliterations
      def locale=(new_locale)
        @locale = new_locale
      end

      # Returns default locale for localized transliterations ("en" unless changed).
      # NOTE: Will set @locale as well.
      def default_locale
        @default_locale ||= "en"
        @locale = @default_locale
      end

      # Sets the default locale for localized transliterations. NOTE: Will set @locale as well.
      def default_locale=(new_locale)
        @default_locale = new_locale
        # Seems logical that @locale should be the new default
        @locale = new_locale
      end

      # Returns the localized transliteration for a codepoint, or nil when the current
      # locale has none. The locale is looked up both as given and in its other
      # Symbol/String form.
      def local_codepoint(codepoint)
        locale_hash = LOCAL_CODEPOINTS[locale] || LOCAL_CODEPOINTS[locale.is_a?(Symbol) ? locale.to_s : locale.to_sym]
        locale_hash && locale_hash[codepoint]
      end

      # Runs a block with a temporary locale setting, returning the locale to the original state when complete
      #
      # Pass <tt>:default</tt> to run the block with the default locale. The original
      # locale is restored even when the block raises. Returns the locale that was in
      # effect before the call.
      def with_locale(new_locale, &block)
        # Remember the current locale *before* resolving :default, because
        # default_locale overwrites @locale as a side effect.
        original_locale = locale
        new_locale = default_locale if new_locale == :default
        begin
          self.locale = new_locale
          block.call
        ensure
          self.locale = original_locale
        end
        original_locale
      end

      # Runs a block with default locale
      def with_default_locale(&block)
        # Pass the :default marker rather than calling default_locale here: evaluating
        # default_locale as an argument would clobber @locale before with_locale saves it.
        with_locale :default, &block
      end

    private
      # Returns the Unicode codepoint grouping for the given character
      # (the codepoint's high bits as "x" plus at least two hex digits, e.g. "x00").
      def code_group(unpacked_character)
        "x%02x" % (unpacked_character >> 8)
      end

      # Returns the index of the given character in the YAML file for its codepoint group
      # (the low 8 bits of the codepoint).
      def grouped_point(unpacked_character)
        unpacked_character & 255
      end

      # Checks LOCAL_CODEPOINTS's Hash is in the format we expect before assigning it and raises
      # instructive exception if not
      def verify_local_codepoints(hash)
        pass_check = hash.is_a?(Hash) && hash.all?{|key, value|
          # Each locale key must be a Symbol or String mapping to a String => String Hash
          [Symbol, String].include?(key.class) && value.is_a?(Hash) &&
          value.keys.all?{|k| k.is_a?(String)} && value.values.all?{|v| v.is_a?(String)}
        }
        if pass_check
          hash.each do |k, v|
            LOCAL_CODEPOINTS[k] = v
          end
        else
          raise ArgumentError, "LOCAL_CODEPOINTS is not correctly defined. Please see the README for more information on how to correctly format this data."
        end
      end
    end
  end

  # Provide a simpler interface for localization implementations:
  # each listed method on Stringex forwards to the same method on Unidecoder.
  class << self
    %w{
      localize_from
      locale locale=
      default_locale default_locale=
      local_codepoint
      with_locale with_default_locale
    }.each do |name|
      define_method name do |*args, &block|
        Unidecoder.send name, *args, &block
      end
    end
  end
end
149
+
150
module Stringex
  module StringExtensions
    # Transliterates the UTF-8 characters of the string to ASCII by handing the
    # string to Unidecoder.decode and returning its result. Example:
    #
    #   "⠋⠗⠁⠝⠉⠑".to_ascii #=> "braille"
    def to_ascii
      Unidecoder.decode(self)
    end
  end
end
@@ -0,0 +1,257 @@
1
+ ---
2
+ - "\x00"
3
+ - "\x01"
4
+ - "\x02"
5
+ - "\x03"
6
+ - "\x04"
7
+ - "\x05"
8
+ - "\x06"
9
+ - "\a"
10
+ - "\x08"
11
+ - ' '
12
+ - "\n"
13
+ - "\v"
14
+ - "\f"
15
+ - "\r"
16
+ - "\x0e"
17
+ - "\x0f"
18
+ - "\x10"
19
+ - "\x11"
20
+ - "\x12"
21
+ - "\x13"
22
+ - "\x14"
23
+ - "\x15"
24
+ - "\x16"
25
+ - "\x17"
26
+ - "\x18"
27
+ - "\x19"
28
+ - "\x1a"
29
+ - "\e"
30
+ - "\x1c"
31
+ - "\x1d"
32
+ - "\x1e"
33
+ - "\x1f"
34
+ - ' '
35
+ - '!'
36
+ - '"'
37
+ - '#'
38
+ - $
39
+ - '%'
40
+ - '&'
41
+ - "'"
42
+ - (
43
+ - )
44
+ - '*'
45
+ - +
46
+ - ','
47
+ - '-'
48
+ - .
49
+ - /
50
+ - 0
51
+ - 1
52
+ - 2
53
+ - 3
54
+ - 4
55
+ - 5
56
+ - 6
57
+ - 7
58
+ - 8
59
+ - 9
60
+ - ':'
61
+ - ;
62
+ - <
63
+ - '='
64
+ - '>'
65
+ - '?'
66
+ - '@'
67
+ - A
68
+ - B
69
+ - C
70
+ - D
71
+ - E
72
+ - F
73
+ - G
74
+ - H
75
+ - I
76
+ - J
77
+ - K
78
+ - L
79
+ - M
80
+ - N
81
+ - O
82
+ - P
83
+ - Q
84
+ - R
85
+ - S
86
+ - T
87
+ - U
88
+ - V
89
+ - W
90
+ - X
91
+ - Y
92
+ - Z
93
+ - '['
94
+ - \
95
+ - ']'
96
+ - '^'
97
+ - _
98
+ - '`'
99
+ - a
100
+ - b
101
+ - c
102
+ - d
103
+ - e
104
+ - f
105
+ - g
106
+ - h
107
+ - i
108
+ - j
109
+ - k
110
+ - l
111
+ - m
112
+ - n
113
+ - o
114
+ - p
115
+ - q
116
+ - r
117
+ - s
118
+ - t
119
+ - u
120
+ - v
121
+ - w
122
+ - x
123
+ - y
124
+ - z
125
+ - '{'
126
+ - '|'
127
+ - '}'
128
+ - '~'
129
+ - ''
130
+ - ''
131
+ - ''
132
+ - ''
133
+ - ''
134
+ - ''
135
+ - ''
136
+ - ''
137
+ - ''
138
+ - ''
139
+ - ''
140
+ - ''
141
+ - ''
142
+ - ''
143
+ - ''
144
+ - ''
145
+ - ''
146
+ - ''
147
+ - "'"
148
+ - "'"
149
+ - '"'
150
+ - '"'
151
+ - ''
152
+ - ''
153
+ - ''
154
+ - ''
155
+ - ''
156
+ - ''
157
+ - ''
158
+ - ''
159
+ - ''
160
+ - ''
161
+ - ''
162
+ - ' '
163
+ - ''
164
+ - C/
165
+ - PS
166
+ - $?
167
+ - Y=
168
+ - '|'
169
+ - SS
170
+ - '"'
171
+ - (c)
172
+ - a
173
+ - '<<'
174
+ - '!'
175
+ - ''
176
+ - (r)
177
+ - '-'
178
+ - deg
179
+ - +-
180
+ - 2
181
+ - 3
182
+ - "'"
183
+ - u
184
+ - P
185
+ - '*'
186
+ - ','
187
+ - 1
188
+ - o
189
+ - '>>'
190
+ - 1/4
191
+ - 1/2
192
+ - 3/4
193
+ - ''
194
+ - A
195
+ - A
196
+ - A
197
+ - A
198
+ - A
199
+ - A
200
+ - AE
201
+ - C
202
+ - E
203
+ - E
204
+ - E
205
+ - E
206
+ - I
207
+ - I
208
+ - I
209
+ - I
210
+ - D
211
+ - N
212
+ - O
213
+ - O
214
+ - O
215
+ - O
216
+ - O
217
+ - x
218
+ - O
219
+ - U
220
+ - U
221
+ - U
222
+ - U
223
+ - Y
224
+ - Th
225
+ - ss
226
+ - a
227
+ - a
228
+ - a
229
+ - a
230
+ - a
231
+ - a
232
+ - ae
233
+ - c
234
+ - e
235
+ - e
236
+ - e
237
+ - e
238
+ - i
239
+ - i
240
+ - i
241
+ - i
242
+ - d
243
+ - n
244
+ - o
245
+ - o
246
+ - o
247
+ - o
248
+ - o
249
+ - /
250
+ - o
251
+ - u
252
+ - u
253
+ - u
254
+ - u
255
+ - y
256
+ - th
257
+ - y