talentbox-unidecoder 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. data/Changelog.md +16 -0
  2. data/README.md +51 -0
  3. data/Rakefile +28 -0
  4. data/lib/unidecoder.rb +98 -0
  5. data/lib/unidecoder/data/x00.yml +257 -0
  6. data/lib/unidecoder/data/x01.yml +257 -0
  7. data/lib/unidecoder/data/x02.yml +256 -0
  8. data/lib/unidecoder/data/x03.yml +256 -0
  9. data/lib/unidecoder/data/x04.yml +256 -0
  10. data/lib/unidecoder/data/x05.yml +256 -0
  11. data/lib/unidecoder/data/x06.yml +256 -0
  12. data/lib/unidecoder/data/x07.yml +256 -0
  13. data/lib/unidecoder/data/x09.yml +256 -0
  14. data/lib/unidecoder/data/x0a.yml +256 -0
  15. data/lib/unidecoder/data/x0b.yml +256 -0
  16. data/lib/unidecoder/data/x0c.yml +256 -0
  17. data/lib/unidecoder/data/x0d.yml +256 -0
  18. data/lib/unidecoder/data/x0e.yml +256 -0
  19. data/lib/unidecoder/data/x0f.yml +256 -0
  20. data/lib/unidecoder/data/x10.yml +256 -0
  21. data/lib/unidecoder/data/x11.yml +256 -0
  22. data/lib/unidecoder/data/x12.yml +257 -0
  23. data/lib/unidecoder/data/x13.yml +256 -0
  24. data/lib/unidecoder/data/x14.yml +257 -0
  25. data/lib/unidecoder/data/x15.yml +257 -0
  26. data/lib/unidecoder/data/x16.yml +256 -0
  27. data/lib/unidecoder/data/x17.yml +256 -0
  28. data/lib/unidecoder/data/x18.yml +256 -0
  29. data/lib/unidecoder/data/x1e.yml +256 -0
  30. data/lib/unidecoder/data/x1f.yml +256 -0
  31. data/lib/unidecoder/data/x20.yml +256 -0
  32. data/lib/unidecoder/data/x21.yml +256 -0
  33. data/lib/unidecoder/data/x22.yml +256 -0
  34. data/lib/unidecoder/data/x23.yml +256 -0
  35. data/lib/unidecoder/data/x24.yml +256 -0
  36. data/lib/unidecoder/data/x25.yml +256 -0
  37. data/lib/unidecoder/data/x26.yml +256 -0
  38. data/lib/unidecoder/data/x27.yml +256 -0
  39. data/lib/unidecoder/data/x28.yml +257 -0
  40. data/lib/unidecoder/data/x2e.yml +256 -0
  41. data/lib/unidecoder/data/x2f.yml +256 -0
  42. data/lib/unidecoder/data/x30.yml +256 -0
  43. data/lib/unidecoder/data/x31.yml +256 -0
  44. data/lib/unidecoder/data/x32.yml +256 -0
  45. data/lib/unidecoder/data/x33.yml +256 -0
  46. data/lib/unidecoder/data/x4d.yml +256 -0
  47. data/lib/unidecoder/data/x4e.yml +257 -0
  48. data/lib/unidecoder/data/x4f.yml +257 -0
  49. data/lib/unidecoder/data/x50.yml +257 -0
  50. data/lib/unidecoder/data/x51.yml +257 -0
  51. data/lib/unidecoder/data/x52.yml +257 -0
  52. data/lib/unidecoder/data/x53.yml +257 -0
  53. data/lib/unidecoder/data/x54.yml +257 -0
  54. data/lib/unidecoder/data/x55.yml +257 -0
  55. data/lib/unidecoder/data/x56.yml +257 -0
  56. data/lib/unidecoder/data/x57.yml +257 -0
  57. data/lib/unidecoder/data/x58.yml +257 -0
  58. data/lib/unidecoder/data/x59.yml +257 -0
  59. data/lib/unidecoder/data/x5a.yml +257 -0
  60. data/lib/unidecoder/data/x5b.yml +257 -0
  61. data/lib/unidecoder/data/x5c.yml +257 -0
  62. data/lib/unidecoder/data/x5d.yml +257 -0
  63. data/lib/unidecoder/data/x5e.yml +257 -0
  64. data/lib/unidecoder/data/x5f.yml +257 -0
  65. data/lib/unidecoder/data/x60.yml +257 -0
  66. data/lib/unidecoder/data/x61.yml +257 -0
  67. data/lib/unidecoder/data/x62.yml +257 -0
  68. data/lib/unidecoder/data/x63.yml +257 -0
  69. data/lib/unidecoder/data/x64.yml +257 -0
  70. data/lib/unidecoder/data/x65.yml +257 -0
  71. data/lib/unidecoder/data/x66.yml +257 -0
  72. data/lib/unidecoder/data/x67.yml +257 -0
  73. data/lib/unidecoder/data/x68.yml +257 -0
  74. data/lib/unidecoder/data/x69.yml +257 -0
  75. data/lib/unidecoder/data/x6a.yml +257 -0
  76. data/lib/unidecoder/data/x6b.yml +257 -0
  77. data/lib/unidecoder/data/x6c.yml +257 -0
  78. data/lib/unidecoder/data/x6d.yml +257 -0
  79. data/lib/unidecoder/data/x6e.yml +257 -0
  80. data/lib/unidecoder/data/x6f.yml +257 -0
  81. data/lib/unidecoder/data/x70.yml +257 -0
  82. data/lib/unidecoder/data/x71.yml +257 -0
  83. data/lib/unidecoder/data/x72.yml +257 -0
  84. data/lib/unidecoder/data/x73.yml +257 -0
  85. data/lib/unidecoder/data/x74.yml +257 -0
  86. data/lib/unidecoder/data/x75.yml +257 -0
  87. data/lib/unidecoder/data/x76.yml +257 -0
  88. data/lib/unidecoder/data/x77.yml +257 -0
  89. data/lib/unidecoder/data/x78.yml +257 -0
  90. data/lib/unidecoder/data/x79.yml +257 -0
  91. data/lib/unidecoder/data/x7a.yml +257 -0
  92. data/lib/unidecoder/data/x7b.yml +257 -0
  93. data/lib/unidecoder/data/x7c.yml +257 -0
  94. data/lib/unidecoder/data/x7d.yml +257 -0
  95. data/lib/unidecoder/data/x7e.yml +257 -0
  96. data/lib/unidecoder/data/x7f.yml +257 -0
  97. data/lib/unidecoder/data/x80.yml +257 -0
  98. data/lib/unidecoder/data/x81.yml +257 -0
  99. data/lib/unidecoder/data/x82.yml +257 -0
  100. data/lib/unidecoder/data/x83.yml +257 -0
  101. data/lib/unidecoder/data/x84.yml +257 -0
  102. data/lib/unidecoder/data/x85.yml +257 -0
  103. data/lib/unidecoder/data/x86.yml +257 -0
  104. data/lib/unidecoder/data/x87.yml +257 -0
  105. data/lib/unidecoder/data/x88.yml +257 -0
  106. data/lib/unidecoder/data/x89.yml +257 -0
  107. data/lib/unidecoder/data/x8a.yml +257 -0
  108. data/lib/unidecoder/data/x8b.yml +257 -0
  109. data/lib/unidecoder/data/x8c.yml +257 -0
  110. data/lib/unidecoder/data/x8d.yml +257 -0
  111. data/lib/unidecoder/data/x8e.yml +257 -0
  112. data/lib/unidecoder/data/x8f.yml +257 -0
  113. data/lib/unidecoder/data/x90.yml +257 -0
  114. data/lib/unidecoder/data/x91.yml +257 -0
  115. data/lib/unidecoder/data/x92.yml +257 -0
  116. data/lib/unidecoder/data/x93.yml +257 -0
  117. data/lib/unidecoder/data/x94.yml +257 -0
  118. data/lib/unidecoder/data/x95.yml +257 -0
  119. data/lib/unidecoder/data/x96.yml +257 -0
  120. data/lib/unidecoder/data/x97.yml +257 -0
  121. data/lib/unidecoder/data/x98.yml +257 -0
  122. data/lib/unidecoder/data/x99.yml +257 -0
  123. data/lib/unidecoder/data/x9a.yml +257 -0
  124. data/lib/unidecoder/data/x9b.yml +257 -0
  125. data/lib/unidecoder/data/x9c.yml +257 -0
  126. data/lib/unidecoder/data/x9d.yml +257 -0
  127. data/lib/unidecoder/data/x9e.yml +257 -0
  128. data/lib/unidecoder/data/x9f.yml +256 -0
  129. data/lib/unidecoder/data/xa0.yml +257 -0
  130. data/lib/unidecoder/data/xa1.yml +257 -0
  131. data/lib/unidecoder/data/xa2.yml +257 -0
  132. data/lib/unidecoder/data/xa3.yml +257 -0
  133. data/lib/unidecoder/data/xa4.yml +256 -0
  134. data/lib/unidecoder/data/xac.yml +257 -0
  135. data/lib/unidecoder/data/xad.yml +257 -0
  136. data/lib/unidecoder/data/xae.yml +257 -0
  137. data/lib/unidecoder/data/xaf.yml +257 -0
  138. data/lib/unidecoder/data/xb0.yml +257 -0
  139. data/lib/unidecoder/data/xb1.yml +257 -0
  140. data/lib/unidecoder/data/xb2.yml +257 -0
  141. data/lib/unidecoder/data/xb3.yml +257 -0
  142. data/lib/unidecoder/data/xb4.yml +257 -0
  143. data/lib/unidecoder/data/xb5.yml +257 -0
  144. data/lib/unidecoder/data/xb6.yml +257 -0
  145. data/lib/unidecoder/data/xb7.yml +257 -0
  146. data/lib/unidecoder/data/xb8.yml +257 -0
  147. data/lib/unidecoder/data/xb9.yml +257 -0
  148. data/lib/unidecoder/data/xba.yml +257 -0
  149. data/lib/unidecoder/data/xbb.yml +257 -0
  150. data/lib/unidecoder/data/xbc.yml +257 -0
  151. data/lib/unidecoder/data/xbd.yml +257 -0
  152. data/lib/unidecoder/data/xbe.yml +257 -0
  153. data/lib/unidecoder/data/xbf.yml +257 -0
  154. data/lib/unidecoder/data/xc0.yml +257 -0
  155. data/lib/unidecoder/data/xc1.yml +257 -0
  156. data/lib/unidecoder/data/xc2.yml +257 -0
  157. data/lib/unidecoder/data/xc3.yml +257 -0
  158. data/lib/unidecoder/data/xc4.yml +257 -0
  159. data/lib/unidecoder/data/xc5.yml +257 -0
  160. data/lib/unidecoder/data/xc6.yml +257 -0
  161. data/lib/unidecoder/data/xc7.yml +257 -0
  162. data/lib/unidecoder/data/xc8.yml +257 -0
  163. data/lib/unidecoder/data/xc9.yml +257 -0
  164. data/lib/unidecoder/data/xca.yml +257 -0
  165. data/lib/unidecoder/data/xcb.yml +257 -0
  166. data/lib/unidecoder/data/xcc.yml +257 -0
  167. data/lib/unidecoder/data/xcd.yml +257 -0
  168. data/lib/unidecoder/data/xce.yml +257 -0
  169. data/lib/unidecoder/data/xcf.yml +257 -0
  170. data/lib/unidecoder/data/xd0.yml +257 -0
  171. data/lib/unidecoder/data/xd1.yml +257 -0
  172. data/lib/unidecoder/data/xd2.yml +257 -0
  173. data/lib/unidecoder/data/xd3.yml +257 -0
  174. data/lib/unidecoder/data/xd4.yml +257 -0
  175. data/lib/unidecoder/data/xd5.yml +257 -0
  176. data/lib/unidecoder/data/xd6.yml +257 -0
  177. data/lib/unidecoder/data/xd7.yml +256 -0
  178. data/lib/unidecoder/data/xf9.yml +257 -0
  179. data/lib/unidecoder/data/xfa.yml +256 -0
  180. data/lib/unidecoder/data/xfb.yml +257 -0
  181. data/lib/unidecoder/data/xfc.yml +257 -0
  182. data/lib/unidecoder/data/xfd.yml +256 -0
  183. data/lib/unidecoder/data/xfe.yml +257 -0
  184. data/lib/unidecoder/data/xff.yml +257 -0
  185. data/lib/unidecoder/version.rb +9 -0
  186. data/test/unicode_point_suite/basic_latin_test.rb +144 -0
  187. data/test/unicode_point_suite/codepoint_test_helper.rb +28 -0
  188. data/test/unidecoder_test.rb +114 -0
  189. metadata +238 -0
@@ -0,0 +1,257 @@
1
+ ---
2
+ - '[?]'
3
+ - '!'
4
+ - '"'
5
+ - '#'
6
+ - $
7
+ - '%'
8
+ - '&'
9
+ - "'"
10
+ - (
11
+ - )
12
+ - '*'
13
+ - +
14
+ - ','
15
+ - -
16
+ - .
17
+ - /
18
+ - 0
19
+ - 1
20
+ - 2
21
+ - 3
22
+ - 4
23
+ - 5
24
+ - 6
25
+ - 7
26
+ - 8
27
+ - 9
28
+ - ':'
29
+ - ;
30
+ - <
31
+ - =
32
+ - '>'
33
+ - '?'
34
+ - '@'
35
+ - A
36
+ - B
37
+ - C
38
+ - D
39
+ - E
40
+ - F
41
+ - G
42
+ - H
43
+ - I
44
+ - J
45
+ - K
46
+ - L
47
+ - M
48
+ - N
49
+ - O
50
+ - P
51
+ - Q
52
+ - R
53
+ - S
54
+ - T
55
+ - U
56
+ - V
57
+ - W
58
+ - X
59
+ - Y
60
+ - Z
61
+ - '['
62
+ - \
63
+ - ']'
64
+ - '^'
65
+ - _
66
+ - '`'
67
+ - a
68
+ - b
69
+ - c
70
+ - d
71
+ - e
72
+ - f
73
+ - g
74
+ - h
75
+ - i
76
+ - j
77
+ - k
78
+ - l
79
+ - m
80
+ - n
81
+ - o
82
+ - p
83
+ - q
84
+ - r
85
+ - s
86
+ - t
87
+ - u
88
+ - v
89
+ - w
90
+ - x
91
+ - y
92
+ - z
93
+ - '{'
94
+ - '|'
95
+ - '}'
96
+ - '~'
97
+ - '[?]'
98
+ - '[?]'
99
+ - .
100
+ - '['
101
+ - ']'
102
+ - ','
103
+ - '*'
104
+ - wo
105
+ - a
106
+ - i
107
+ - u
108
+ - e
109
+ - o
110
+ - ya
111
+ - yu
112
+ - yo
113
+ - tu
114
+ - +
115
+ - a
116
+ - i
117
+ - u
118
+ - e
119
+ - o
120
+ - ka
121
+ - ki
122
+ - ku
123
+ - ke
124
+ - ko
125
+ - sa
126
+ - si
127
+ - su
128
+ - se
129
+ - so
130
+ - ta
131
+ - ti
132
+ - tu
133
+ - te
134
+ - to
135
+ - na
136
+ - ni
137
+ - nu
138
+ - ne
139
+ - no
140
+ - ha
141
+ - hi
142
+ - hu
143
+ - he
144
+ - ho
145
+ - ma
146
+ - mi
147
+ - mu
148
+ - me
149
+ - mo
150
+ - ya
151
+ - yu
152
+ - yo
153
+ - ra
154
+ - ri
155
+ - ru
156
+ - re
157
+ - ro
158
+ - wa
159
+ - n
160
+ - ':'
161
+ - ;
162
+ - ''
163
+ - g
164
+ - gg
165
+ - gs
166
+ - n
167
+ - nj
168
+ - nh
169
+ - d
170
+ - dd
171
+ - r
172
+ - lg
173
+ - lm
174
+ - lb
175
+ - ls
176
+ - lt
177
+ - lp
178
+ - rh
179
+ - m
180
+ - b
181
+ - bb
182
+ - bs
183
+ - s
184
+ - ss
185
+ - ''
186
+ - j
187
+ - jj
188
+ - c
189
+ - k
190
+ - t
191
+ - p
192
+ - h
193
+ - '[?]'
194
+ - '[?]'
195
+ - '[?]'
196
+ - a
197
+ - ae
198
+ - ya
199
+ - yae
200
+ - eo
201
+ - e
202
+ - '[?]'
203
+ - '[?]'
204
+ - yeo
205
+ - ye
206
+ - o
207
+ - wa
208
+ - wae
209
+ - oe
210
+ - '[?]'
211
+ - '[?]'
212
+ - yo
213
+ - u
214
+ - weo
215
+ - we
216
+ - wi
217
+ - yu
218
+ - '[?]'
219
+ - '[?]'
220
+ - eu
221
+ - yi
222
+ - i
223
+ - '[?]'
224
+ - '[?]'
225
+ - '[?]'
226
+ - /C
227
+ - PS
228
+ - '!'
229
+ - -
230
+ - '|'
231
+ - Y=
232
+ - W=
233
+ - '[?]'
234
+ - '|'
235
+ - -
236
+ - '|'
237
+ - -
238
+ - '|'
239
+ - '#'
240
+ - O
241
+ - '[?]'
242
+ - '[?]'
243
+ - '[?]'
244
+ - '[?]'
245
+ - '[?]'
246
+ - '[?]'
247
+ - '[?]'
248
+ - '[?]'
249
+ - '[?]'
250
+ - '[?]'
251
+ - '{'
252
+ - '|'
253
+ - '}'
254
+ - ''
255
+ - ''
256
+ - ''
257
+ - ''
@@ -0,0 +1,9 @@
1
module Unidecoder
  # Version information for the gem, split into its numeric segments.
  module Version
    MAJOR = 1
    MINOR = 1
    TINY  = 2
    # Optional fourth segment; nil means the version has no build part.
    BUILD = nil

    # Dotted version string built from the segments above. +compact+ drops a
    # nil BUILD so the result is "1.1.2" rather than "1.1.2.". Frozen so the
    # shared constant cannot be mutated in place by a caller.
    STRING = [MAJOR, MINOR, TINY, BUILD].compact.join('.').freeze
  end
end
@@ -0,0 +1,144 @@
1
+ $:.unshift File.expand_path("../../lib", File.dirname(__FILE__))
2
+ $:.unshift File.expand_path(File.dirname(__FILE__))
3
+ $:.uniq!
4
+ require "test/unit"
5
+ require "unidecoder"
6
+ require "codepoint_test_helper"
7
+
8
# Regression and debugging suite for characters that should transliterate to
# Basic Latin (ASCII) output.
#
# Every test hands one or more hexadecimal codepoints to
# +assert_equal_encoded+ (from CodepointTestHelper) together with the ASCII
# text they are expected to produce.
#
# Charts used as reference:
#   http://unicode.org/charts/
#   http://unicode.org/charts/PDF/U0000.pdf
#
# NOTE: control characters are not covered; the original author reported
# weird results when trying to pack them to unicode.
class BasicLatinTest < Test::Unit::TestCase
  # Mixed into the test case itself rather than at the top level, so the
  # helper methods are not added to Object (and therefore to every object).
  include CodepointTestHelper

  def test_spaces
    assert_equal_encoded " ", %w[0020 00a0]
    assert_equal_encoded "", %w[200b 2060]
  end

  def test_exclamation_marks
    assert_equal_encoded "!", %w[0021 2762]
    assert_equal_encoded "!!", "203c"
    assert_equal_encoded "", "00a1"
    assert_equal_encoded "?!", "203d"
  end

  def test_quotation_marks
    assert_equal_encoded "\"", %w[0022 02ba 2033 3003]
  end

  def test_apostrophes
    assert_equal_encoded "'", %w[0027 02b9 02bc 02c8 2032]
  end

  def test_asterisks
    assert_equal_encoded "*", %w[002a 066d 204e 2217 26b9 2731]
  end

  def test_commas
    assert_equal_encoded ",", %w[002c 060c]
  end

  def test_periods
    assert_equal_encoded ".", %w[002e 06d4]
  end

  def test_hyphens
    assert_equal_encoded "-", %w[002d 2010 2011 2012 2212]
  end

  def test_endash
    assert_equal_encoded "--", %w[2013 2015]
  end

  def test_emdash
    assert_equal_encoded "---", %w[2014]
  end

  def test_dotleader
    assert_equal_encoded "..", %w[2025]
  end

  def test_ellipsis
    assert_equal_encoded "...", %w[2026]
  end

  def test_slashes
    assert_equal_encoded "/", %w[002f 2044 2215]
    assert_equal_encoded "\\", %w[005c 2216]
  end

  def test_colons
    assert_equal_encoded ":", %w[003a 2236]
  end

  def test_semicolons
    assert_equal_encoded ";", %w[003b 061b]
  end

  def test_less_thans
    assert_equal_encoded "<", %w[003c 2039 2329 27e8 3008]
  end

  def test_equals
    assert_equal_encoded "=", "003d"
  end

  def test_greater_thans
    assert_equal_encoded ">", %w[003e 203a 232a 27e9 3009]
  end

  def test_question_marks
    assert_equal_encoded "?", %w[003f 061f]
    assert_equal_encoded "", "00bf"
    assert_equal_encoded "?!", %w[203d 2048]
    assert_equal_encoded "!?", "2049"
  end

  def test_circumflexes
    assert_equal_encoded "^", %w[005e 2038 2303]
  end

  def test_underscores
    assert_equal_encoded "_", %w[005f 02cd 2017]
  end

  def test_grave_accents
    assert_equal_encoded "`", %w[0060 02cb 2035]
  end

  def test_bars
    assert_equal_encoded "|", %w[007c 2223 2758]
  end

  def test_tildes
    assert_equal_encoded "~", %w[007e 02dc 2053 223c ff5e]
  end

  # Codepoints outside Basic Latin that are expected to come out as a single
  # plain capital letter. Keys are the expected output; values are the
  # codepoint(s) that must produce it.
  def test_related_letters
    {
      "B" => "212c",
      "C" => %w[2102 212d],
      "E" => %w[2107 2130],
      "F" => "2131",
      "H" => %w[210b 210c 210d],
      "I" => %w[0130 0406 04c0 2110 2111 2160],
      "K" => "212a",
      "L" => "2112",
      "M" => "2133",
      "N" => "2115",
      "P" => "2119",
      "Q" => "211a",
      "R" => %w[211b 211c 211d],
      "Z" => %w[2124 2128]
    }.each do |expected, encode_mes|
      assert_equal_encoded expected, encode_mes
    end
  end
end
@@ -0,0 +1,28 @@
1
# Shorthand assertions for the codepoint transliteration suites.
#
# The including object must provide +assert+ and +flunk+ (a test case does).
module CodepointTestHelper
  # Checks that each given codepoint transliterates to +expected+.
  #
  # expected   - the String that +to_ascii+ is expected to return.
  # encode_mes - a single codepoint String (e.g. "00e6") or an Array of them.
  #
  # Each codepoint is passed through Unidecoder.encode and the result is
  # converted with +to_ascii+. A match is recorded with +assert+; a mismatch
  # is reported with +flunk+, so it is counted as a test failure instead of
  # the error a bare +fail+ (Kernel#fail, i.e. +raise+) would produce. The
  # failure message includes what Unidecoder.in_yaml_file reports for the
  # offending character.
  def assert_equal_encoded(expected, encode_mes)
    # Array() wraps a lone String and leaves an Array as it is.
    Array(encode_mes).each do |encode_me|
      encoded = encode(encode_me)
      actual = encoded.to_ascii
      if expected == actual
        # Register the match as a passed assertion.
        assert true
      else
        flunk "<#{expected.inspect}> expected but was\n<#{actual.inspect}>\n" \
              " defined in #{Unidecoder.in_yaml_file(encoded)}"
      end
    end
  end

  private

  # Delegates to Unidecoder.encode for the given codepoint.
  def encode(codepoint)
    Unidecoder.encode(codepoint)
  end

  # Returns what Unidecoder.in_yaml_file reports for the encoded codepoint.
  def which_yaml(codepoint)
    Unidecoder.in_yaml_file(encode(codepoint))
  end
end
@@ -0,0 +1,114 @@
1
+ # encoding: utf-8
2
+ $:.unshift File.expand_path("../lib", File.dirname(__FILE__))
3
+ $:.unshift File.expand_path(File.dirname(__FILE__))
4
+ $:.uniq!
5
+ require "test/unit"
6
+ require "unidecoder"
7
+
8
+
9
# Tests for the module-level Unidecoder API: decode, encode, in_yaml_file and
# per-call overrides.
class UnidecoderTest < Test::Unit::TestCase
  # Silly phrases courtesy of Frank da Cruz (http://www.columbia.edu/kermit/utf8.html).

  # Plain-ASCII sentences that decode is expected to return unchanged.
  DONT_CONVERT = [
    "Vitrum edere possum; mihi non nocet.", # Latin
    "Je puis mangier del voirre. Ne me nuit.", # Old French
    "Kristala jan dezaket, ez dit minik ematen.", # Basque
    "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
    "Ich kann Glas essen, ohne mir weh zu tun.", # German
    "I can eat glass and it doesn't hurt me.", # English
  ].freeze

  # Unicode input => expected ASCII transliteration.
  CONVERT_PAIRS = {
    # French
    "Je peux manger du verre, ça ne me fait pas de mal." => "Je peux manger du verre, ca ne me fait pas de mal.",
    # Romanian
    "Pot să mănânc sticlă și ea nu mă rănește." => "Pot sa mananc sticla si ea nu ma raneste.",
    # Icelandic
    "Ég get etið gler án þess að meiða mig." => "Eg get etid gler an thess ad meida mig.",
    # Albanian
    "Unë mund të ha qelq dhe nuk më gjen gjë." => "Une mund te ha qelq dhe nuk me gjen gje.",
    # Polish
    "Mogę jeść szkło i mi nie szkodzi." => "Moge jesc szklo i mi nie szkodzi.",
    # Russian
    "Я могу есть стекло, оно мне не вредит." => "Ia moghu iest' stieklo, ono mnie nie vriedit.",
    # Bulgarian
    "Мога да ям стъкло, то не ми вреди." => "Mogha da iam stklo, to nie mi vriedi.",
    # Anglo-Saxon
    "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
    # Classical Greek
    "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => "ualon phagein dunamai; touto ou me blaptei",
    # Hindi
    "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
    # Thai
    "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => "chankinkracchkaid aetmanaimthamaihchanecchb",
    # Chinese
    "我能吞下玻璃而不伤身体。" => "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
    # Japanese
    "私はガラスを食べられます。それは私を傷つけません。" => "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
    "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
      "mn my twnm bdwni Hss drd shyshh bkhwrm",
    "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
      "'n qdr `l~ 'kl lzjj w hdh l yw'lmn",
    "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
      "ny ykvl lkvl zkvkyt vzh l mzyq ly",
  }.freeze

  # Each byte below is embedded between two "a" characters and passed to
  # decode, which is expected to raise ArgumentError.
  def test_should_raise_error_with_invalid_utf8
    [
      "\x80", # Continuation byte, low (cp125)
      "\x94", # Continuation byte, mid (cp125)
      "\x9F", # Continuation byte, high (cp125)
      "\xC0", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC1", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC2", # Start of 2-byte sequence, low
      "\xC8", # Start of 2-byte sequence, mid
      "\xDF", # Start of 2-byte sequence, high
      "\xE0", # Start of 3-byte sequence, low
      "\xE8", # Start of 3-byte sequence, mid
      "\xEF", # Start of 3-byte sequence, high
      "\xF0", # Start of 4-byte sequence
      "\xF1", # Unused byte
      "\xFF", # Restricted byte
    ].each do |byte|
      # each, not map: the loop runs only for its assertions.
      assert_raise ArgumentError, "#{byte.inspect} did not raise error" do
        Unidecoder.decode("a#{byte}a")
      end
    end
  end

  # ASCII input must pass through untouched; every CONVERT_PAIRS key must
  # decode to its paired value.
  def test_unidecoder_decode
    DONT_CONVERT.each do |ascii|
      assert_equal ascii, Unidecoder.decode(ascii)
    end
    CONVERT_PAIRS.each do |unicode, ascii|
      assert_equal ascii, Unidecoder.decode(unicode)
    end
  end

  # encode takes a hexadecimal codepoint String and returns the character.
  def test_unidecoder_encode
    {
      # Strings
      "0041" => "A",
      "00e6" => "æ",
      "042f" => "Я"
    }.each do |codepoint, unicode|
      assert_equal unicode, Unidecoder.encode(codepoint)
    end
  end

  # in_yaml_file reports the data file and line that hold a character's
  # transliteration.
  def test_unidecoder_in_yaml_file
    {
      "A" => "x00.yml (line 67)",
      "π" => "x03.yml (line 194)",
      "Я" => "x04.yml (line 49)"
    }.each do |character, output|
      assert_equal output, Unidecoder.in_yaml_file(character)
    end
  end

  # A mapping passed as the second argument replaces the built-in
  # transliteration for the listed characters.
  def test_override
    assert_equal "Juergen", Unidecoder.decode("Jürgen", "ü" => "ue")
  end
end