talentbox-unidecoder 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (189) hide show
  1. data/Changelog.md +16 -0
  2. data/README.md +51 -0
  3. data/Rakefile +28 -0
  4. data/lib/unidecoder.rb +98 -0
  5. data/lib/unidecoder/data/x00.yml +257 -0
  6. data/lib/unidecoder/data/x01.yml +257 -0
  7. data/lib/unidecoder/data/x02.yml +256 -0
  8. data/lib/unidecoder/data/x03.yml +256 -0
  9. data/lib/unidecoder/data/x04.yml +256 -0
  10. data/lib/unidecoder/data/x05.yml +256 -0
  11. data/lib/unidecoder/data/x06.yml +256 -0
  12. data/lib/unidecoder/data/x07.yml +256 -0
  13. data/lib/unidecoder/data/x09.yml +256 -0
  14. data/lib/unidecoder/data/x0a.yml +256 -0
  15. data/lib/unidecoder/data/x0b.yml +256 -0
  16. data/lib/unidecoder/data/x0c.yml +256 -0
  17. data/lib/unidecoder/data/x0d.yml +256 -0
  18. data/lib/unidecoder/data/x0e.yml +256 -0
  19. data/lib/unidecoder/data/x0f.yml +256 -0
  20. data/lib/unidecoder/data/x10.yml +256 -0
  21. data/lib/unidecoder/data/x11.yml +256 -0
  22. data/lib/unidecoder/data/x12.yml +257 -0
  23. data/lib/unidecoder/data/x13.yml +256 -0
  24. data/lib/unidecoder/data/x14.yml +257 -0
  25. data/lib/unidecoder/data/x15.yml +257 -0
  26. data/lib/unidecoder/data/x16.yml +256 -0
  27. data/lib/unidecoder/data/x17.yml +256 -0
  28. data/lib/unidecoder/data/x18.yml +256 -0
  29. data/lib/unidecoder/data/x1e.yml +256 -0
  30. data/lib/unidecoder/data/x1f.yml +256 -0
  31. data/lib/unidecoder/data/x20.yml +256 -0
  32. data/lib/unidecoder/data/x21.yml +256 -0
  33. data/lib/unidecoder/data/x22.yml +256 -0
  34. data/lib/unidecoder/data/x23.yml +256 -0
  35. data/lib/unidecoder/data/x24.yml +256 -0
  36. data/lib/unidecoder/data/x25.yml +256 -0
  37. data/lib/unidecoder/data/x26.yml +256 -0
  38. data/lib/unidecoder/data/x27.yml +256 -0
  39. data/lib/unidecoder/data/x28.yml +257 -0
  40. data/lib/unidecoder/data/x2e.yml +256 -0
  41. data/lib/unidecoder/data/x2f.yml +256 -0
  42. data/lib/unidecoder/data/x30.yml +256 -0
  43. data/lib/unidecoder/data/x31.yml +256 -0
  44. data/lib/unidecoder/data/x32.yml +256 -0
  45. data/lib/unidecoder/data/x33.yml +256 -0
  46. data/lib/unidecoder/data/x4d.yml +256 -0
  47. data/lib/unidecoder/data/x4e.yml +257 -0
  48. data/lib/unidecoder/data/x4f.yml +257 -0
  49. data/lib/unidecoder/data/x50.yml +257 -0
  50. data/lib/unidecoder/data/x51.yml +257 -0
  51. data/lib/unidecoder/data/x52.yml +257 -0
  52. data/lib/unidecoder/data/x53.yml +257 -0
  53. data/lib/unidecoder/data/x54.yml +257 -0
  54. data/lib/unidecoder/data/x55.yml +257 -0
  55. data/lib/unidecoder/data/x56.yml +257 -0
  56. data/lib/unidecoder/data/x57.yml +257 -0
  57. data/lib/unidecoder/data/x58.yml +257 -0
  58. data/lib/unidecoder/data/x59.yml +257 -0
  59. data/lib/unidecoder/data/x5a.yml +257 -0
  60. data/lib/unidecoder/data/x5b.yml +257 -0
  61. data/lib/unidecoder/data/x5c.yml +257 -0
  62. data/lib/unidecoder/data/x5d.yml +257 -0
  63. data/lib/unidecoder/data/x5e.yml +257 -0
  64. data/lib/unidecoder/data/x5f.yml +257 -0
  65. data/lib/unidecoder/data/x60.yml +257 -0
  66. data/lib/unidecoder/data/x61.yml +257 -0
  67. data/lib/unidecoder/data/x62.yml +257 -0
  68. data/lib/unidecoder/data/x63.yml +257 -0
  69. data/lib/unidecoder/data/x64.yml +257 -0
  70. data/lib/unidecoder/data/x65.yml +257 -0
  71. data/lib/unidecoder/data/x66.yml +257 -0
  72. data/lib/unidecoder/data/x67.yml +257 -0
  73. data/lib/unidecoder/data/x68.yml +257 -0
  74. data/lib/unidecoder/data/x69.yml +257 -0
  75. data/lib/unidecoder/data/x6a.yml +257 -0
  76. data/lib/unidecoder/data/x6b.yml +257 -0
  77. data/lib/unidecoder/data/x6c.yml +257 -0
  78. data/lib/unidecoder/data/x6d.yml +257 -0
  79. data/lib/unidecoder/data/x6e.yml +257 -0
  80. data/lib/unidecoder/data/x6f.yml +257 -0
  81. data/lib/unidecoder/data/x70.yml +257 -0
  82. data/lib/unidecoder/data/x71.yml +257 -0
  83. data/lib/unidecoder/data/x72.yml +257 -0
  84. data/lib/unidecoder/data/x73.yml +257 -0
  85. data/lib/unidecoder/data/x74.yml +257 -0
  86. data/lib/unidecoder/data/x75.yml +257 -0
  87. data/lib/unidecoder/data/x76.yml +257 -0
  88. data/lib/unidecoder/data/x77.yml +257 -0
  89. data/lib/unidecoder/data/x78.yml +257 -0
  90. data/lib/unidecoder/data/x79.yml +257 -0
  91. data/lib/unidecoder/data/x7a.yml +257 -0
  92. data/lib/unidecoder/data/x7b.yml +257 -0
  93. data/lib/unidecoder/data/x7c.yml +257 -0
  94. data/lib/unidecoder/data/x7d.yml +257 -0
  95. data/lib/unidecoder/data/x7e.yml +257 -0
  96. data/lib/unidecoder/data/x7f.yml +257 -0
  97. data/lib/unidecoder/data/x80.yml +257 -0
  98. data/lib/unidecoder/data/x81.yml +257 -0
  99. data/lib/unidecoder/data/x82.yml +257 -0
  100. data/lib/unidecoder/data/x83.yml +257 -0
  101. data/lib/unidecoder/data/x84.yml +257 -0
  102. data/lib/unidecoder/data/x85.yml +257 -0
  103. data/lib/unidecoder/data/x86.yml +257 -0
  104. data/lib/unidecoder/data/x87.yml +257 -0
  105. data/lib/unidecoder/data/x88.yml +257 -0
  106. data/lib/unidecoder/data/x89.yml +257 -0
  107. data/lib/unidecoder/data/x8a.yml +257 -0
  108. data/lib/unidecoder/data/x8b.yml +257 -0
  109. data/lib/unidecoder/data/x8c.yml +257 -0
  110. data/lib/unidecoder/data/x8d.yml +257 -0
  111. data/lib/unidecoder/data/x8e.yml +257 -0
  112. data/lib/unidecoder/data/x8f.yml +257 -0
  113. data/lib/unidecoder/data/x90.yml +257 -0
  114. data/lib/unidecoder/data/x91.yml +257 -0
  115. data/lib/unidecoder/data/x92.yml +257 -0
  116. data/lib/unidecoder/data/x93.yml +257 -0
  117. data/lib/unidecoder/data/x94.yml +257 -0
  118. data/lib/unidecoder/data/x95.yml +257 -0
  119. data/lib/unidecoder/data/x96.yml +257 -0
  120. data/lib/unidecoder/data/x97.yml +257 -0
  121. data/lib/unidecoder/data/x98.yml +257 -0
  122. data/lib/unidecoder/data/x99.yml +257 -0
  123. data/lib/unidecoder/data/x9a.yml +257 -0
  124. data/lib/unidecoder/data/x9b.yml +257 -0
  125. data/lib/unidecoder/data/x9c.yml +257 -0
  126. data/lib/unidecoder/data/x9d.yml +257 -0
  127. data/lib/unidecoder/data/x9e.yml +257 -0
  128. data/lib/unidecoder/data/x9f.yml +256 -0
  129. data/lib/unidecoder/data/xa0.yml +257 -0
  130. data/lib/unidecoder/data/xa1.yml +257 -0
  131. data/lib/unidecoder/data/xa2.yml +257 -0
  132. data/lib/unidecoder/data/xa3.yml +257 -0
  133. data/lib/unidecoder/data/xa4.yml +256 -0
  134. data/lib/unidecoder/data/xac.yml +257 -0
  135. data/lib/unidecoder/data/xad.yml +257 -0
  136. data/lib/unidecoder/data/xae.yml +257 -0
  137. data/lib/unidecoder/data/xaf.yml +257 -0
  138. data/lib/unidecoder/data/xb0.yml +257 -0
  139. data/lib/unidecoder/data/xb1.yml +257 -0
  140. data/lib/unidecoder/data/xb2.yml +257 -0
  141. data/lib/unidecoder/data/xb3.yml +257 -0
  142. data/lib/unidecoder/data/xb4.yml +257 -0
  143. data/lib/unidecoder/data/xb5.yml +257 -0
  144. data/lib/unidecoder/data/xb6.yml +257 -0
  145. data/lib/unidecoder/data/xb7.yml +257 -0
  146. data/lib/unidecoder/data/xb8.yml +257 -0
  147. data/lib/unidecoder/data/xb9.yml +257 -0
  148. data/lib/unidecoder/data/xba.yml +257 -0
  149. data/lib/unidecoder/data/xbb.yml +257 -0
  150. data/lib/unidecoder/data/xbc.yml +257 -0
  151. data/lib/unidecoder/data/xbd.yml +257 -0
  152. data/lib/unidecoder/data/xbe.yml +257 -0
  153. data/lib/unidecoder/data/xbf.yml +257 -0
  154. data/lib/unidecoder/data/xc0.yml +257 -0
  155. data/lib/unidecoder/data/xc1.yml +257 -0
  156. data/lib/unidecoder/data/xc2.yml +257 -0
  157. data/lib/unidecoder/data/xc3.yml +257 -0
  158. data/lib/unidecoder/data/xc4.yml +257 -0
  159. data/lib/unidecoder/data/xc5.yml +257 -0
  160. data/lib/unidecoder/data/xc6.yml +257 -0
  161. data/lib/unidecoder/data/xc7.yml +257 -0
  162. data/lib/unidecoder/data/xc8.yml +257 -0
  163. data/lib/unidecoder/data/xc9.yml +257 -0
  164. data/lib/unidecoder/data/xca.yml +257 -0
  165. data/lib/unidecoder/data/xcb.yml +257 -0
  166. data/lib/unidecoder/data/xcc.yml +257 -0
  167. data/lib/unidecoder/data/xcd.yml +257 -0
  168. data/lib/unidecoder/data/xce.yml +257 -0
  169. data/lib/unidecoder/data/xcf.yml +257 -0
  170. data/lib/unidecoder/data/xd0.yml +257 -0
  171. data/lib/unidecoder/data/xd1.yml +257 -0
  172. data/lib/unidecoder/data/xd2.yml +257 -0
  173. data/lib/unidecoder/data/xd3.yml +257 -0
  174. data/lib/unidecoder/data/xd4.yml +257 -0
  175. data/lib/unidecoder/data/xd5.yml +257 -0
  176. data/lib/unidecoder/data/xd6.yml +257 -0
  177. data/lib/unidecoder/data/xd7.yml +256 -0
  178. data/lib/unidecoder/data/xf9.yml +257 -0
  179. data/lib/unidecoder/data/xfa.yml +256 -0
  180. data/lib/unidecoder/data/xfb.yml +257 -0
  181. data/lib/unidecoder/data/xfc.yml +257 -0
  182. data/lib/unidecoder/data/xfd.yml +256 -0
  183. data/lib/unidecoder/data/xfe.yml +257 -0
  184. data/lib/unidecoder/data/xff.yml +257 -0
  185. data/lib/unidecoder/version.rb +9 -0
  186. data/test/unicode_point_suite/basic_latin_test.rb +144 -0
  187. data/test/unicode_point_suite/codepoint_test_helper.rb +28 -0
  188. data/test/unidecoder_test.rb +114 -0
  189. metadata +238 -0
@@ -0,0 +1,257 @@
1
+ ---
2
+ - '[?]'
3
+ - '!'
4
+ - '"'
5
+ - '#'
6
+ - $
7
+ - '%'
8
+ - '&'
9
+ - "'"
10
+ - (
11
+ - )
12
+ - '*'
13
+ - +
14
+ - ','
15
+ - -
16
+ - .
17
+ - /
18
+ - 0
19
+ - 1
20
+ - 2
21
+ - 3
22
+ - 4
23
+ - 5
24
+ - 6
25
+ - 7
26
+ - 8
27
+ - 9
28
+ - ':'
29
+ - ;
30
+ - <
31
+ - =
32
+ - '>'
33
+ - '?'
34
+ - '@'
35
+ - A
36
+ - B
37
+ - C
38
+ - D
39
+ - E
40
+ - F
41
+ - G
42
+ - H
43
+ - I
44
+ - J
45
+ - K
46
+ - L
47
+ - M
48
+ - N
49
+ - O
50
+ - P
51
+ - Q
52
+ - R
53
+ - S
54
+ - T
55
+ - U
56
+ - V
57
+ - W
58
+ - X
59
+ - Y
60
+ - Z
61
+ - '['
62
+ - \
63
+ - ']'
64
+ - '^'
65
+ - _
66
+ - '`'
67
+ - a
68
+ - b
69
+ - c
70
+ - d
71
+ - e
72
+ - f
73
+ - g
74
+ - h
75
+ - i
76
+ - j
77
+ - k
78
+ - l
79
+ - m
80
+ - n
81
+ - o
82
+ - p
83
+ - q
84
+ - r
85
+ - s
86
+ - t
87
+ - u
88
+ - v
89
+ - w
90
+ - x
91
+ - y
92
+ - z
93
+ - '{'
94
+ - '|'
95
+ - '}'
96
+ - '~'
97
+ - '[?]'
98
+ - '[?]'
99
+ - .
100
+ - '['
101
+ - ']'
102
+ - ','
103
+ - '*'
104
+ - wo
105
+ - a
106
+ - i
107
+ - u
108
+ - e
109
+ - o
110
+ - ya
111
+ - yu
112
+ - yo
113
+ - tu
114
+ - +
115
+ - a
116
+ - i
117
+ - u
118
+ - e
119
+ - o
120
+ - ka
121
+ - ki
122
+ - ku
123
+ - ke
124
+ - ko
125
+ - sa
126
+ - si
127
+ - su
128
+ - se
129
+ - so
130
+ - ta
131
+ - ti
132
+ - tu
133
+ - te
134
+ - to
135
+ - na
136
+ - ni
137
+ - nu
138
+ - ne
139
+ - no
140
+ - ha
141
+ - hi
142
+ - hu
143
+ - he
144
+ - ho
145
+ - ma
146
+ - mi
147
+ - mu
148
+ - me
149
+ - mo
150
+ - ya
151
+ - yu
152
+ - yo
153
+ - ra
154
+ - ri
155
+ - ru
156
+ - re
157
+ - ro
158
+ - wa
159
+ - n
160
+ - ':'
161
+ - ;
162
+ - ''
163
+ - g
164
+ - gg
165
+ - gs
166
+ - n
167
+ - nj
168
+ - nh
169
+ - d
170
+ - dd
171
+ - r
172
+ - lg
173
+ - lm
174
+ - lb
175
+ - ls
176
+ - lt
177
+ - lp
178
+ - rh
179
+ - m
180
+ - b
181
+ - bb
182
+ - bs
183
+ - s
184
+ - ss
185
+ - ''
186
+ - j
187
+ - jj
188
+ - c
189
+ - k
190
+ - t
191
+ - p
192
+ - h
193
+ - '[?]'
194
+ - '[?]'
195
+ - '[?]'
196
+ - a
197
+ - ae
198
+ - ya
199
+ - yae
200
+ - eo
201
+ - e
202
+ - '[?]'
203
+ - '[?]'
204
+ - yeo
205
+ - ye
206
+ - o
207
+ - wa
208
+ - wae
209
+ - oe
210
+ - '[?]'
211
+ - '[?]'
212
+ - yo
213
+ - u
214
+ - weo
215
+ - we
216
+ - wi
217
+ - yu
218
+ - '[?]'
219
+ - '[?]'
220
+ - eu
221
+ - yi
222
+ - i
223
+ - '[?]'
224
+ - '[?]'
225
+ - '[?]'
226
+ - /C
227
+ - PS
228
+ - '!'
229
+ - -
230
+ - '|'
231
+ - Y=
232
+ - W=
233
+ - '[?]'
234
+ - '|'
235
+ - -
236
+ - '|'
237
+ - -
238
+ - '|'
239
+ - '#'
240
+ - O
241
+ - '[?]'
242
+ - '[?]'
243
+ - '[?]'
244
+ - '[?]'
245
+ - '[?]'
246
+ - '[?]'
247
+ - '[?]'
248
+ - '[?]'
249
+ - '[?]'
250
+ - '[?]'
251
+ - '{'
252
+ - '|'
253
+ - '}'
254
+ - ''
255
+ - ''
256
+ - ''
257
+ - ''
@@ -0,0 +1,9 @@
1
module Unidecoder
  # Version segments of the gem and the dotted string assembled from them.
  module Version
    MAJOR = 1
    MINOR = 1
    TINY  = 2
    # Optional fourth segment; while it is nil it is left out of STRING.
    BUILD = nil

    # Dotted version string made of the non-nil segments above.
    # Frozen so the shared constant cannot be mutated in place by a caller.
    STRING = [MAJOR, MINOR, TINY, BUILD].compact.join('.').freeze
  end
end
@@ -0,0 +1,144 @@
1
+ $:.unshift File.expand_path("../../lib", File.dirname(__FILE__))
2
+ $:.unshift File.expand_path(File.dirname(__FILE__))
3
+ $:.uniq!
4
+ require "test/unit"
5
+ require "unidecoder"
6
+ require "codepoint_test_helper"
7
+
8
# Regression and debugging suite for transliterating characters to their
# Basic Latin equivalents.
#
# Every assertion names the expected ASCII output followed by one or more
# hexadecimal codepoints that must all transliterate to it.
#
# http://unicode.org/charts/
# http://unicode.org/charts/PDF/U0000.pdf
class BasicLatinTest < Test::Unit::TestCase
  # Mixed into this test case only: a top-level `include` would add the
  # helper methods to Object, and therefore to every object in the process.
  include CodepointTestHelper

  # NOTE: Control characters are not covered. Packing them to Unicode gave
  # weird results, so no way to test them has been found yet.

  def test_spaces
    assert_equal_encoded " ", %w{0020 00a0}
    assert_equal_encoded "", %w{200b 2060}
  end

  def test_exclamation_marks
    assert_equal_encoded "!", %w{0021 2762}
    assert_equal_encoded "!!", "203c"
    assert_equal_encoded "", "00a1"
    assert_equal_encoded "?!", "203d"
  end

  def test_quotation_marks
    assert_equal_encoded "\"", %w{0022 02ba 2033 3003}
  end

  def test_apostrophes
    assert_equal_encoded "'", %w{0027 02b9 02bc 02c8 2032}
  end

  def test_asterisks
    assert_equal_encoded "*", %w{002a 066d 204e 2217 26b9 2731}
  end

  def test_commas
    assert_equal_encoded ",", %w{002c 060c}
  end

  def test_periods
    assert_equal_encoded ".", %w{002e 06d4}
  end

  def test_hyphens
    assert_equal_encoded "-", %w{002d 2010 2011 2012 2212}
  end

  def test_endash
    assert_equal_encoded "--", %w{2013 2015}
  end

  def test_emdash
    assert_equal_encoded "---", %w{2014}
  end

  def test_dotleader
    assert_equal_encoded "..", %w{2025}
  end

  def test_ellipsis
    assert_equal_encoded "...", %w{2026}
  end

  def test_slashes
    assert_equal_encoded "/", %w{002f 2044 2215}
    assert_equal_encoded "\\", %w{005c 2216}
  end

  def test_colons
    assert_equal_encoded ":", %w{003a 2236}
  end

  def test_semicolons
    assert_equal_encoded ";", %w{003b 061b}
  end

  def test_less_thans
    assert_equal_encoded "<", %w{003c 2039 2329 27e8 3008}
  end

  def test_equals
    assert_equal_encoded "=", "003d"
  end

  def test_greater_thans
    assert_equal_encoded ">", %w{003e 203a 232a 27e9 3009}
  end

  def test_question_marks
    assert_equal_encoded "?", %w{003f 061f}
    assert_equal_encoded "", "00bf"
    assert_equal_encoded "?!", %w{203d 2048}
    assert_equal_encoded "!?", "2049"
  end

  def test_circumflexes
    assert_equal_encoded "^", %w{005e 2038 2303}
  end

  def test_underscores
    assert_equal_encoded "_", %w{005f 02cd 2017}
  end

  def test_grave_accents
    assert_equal_encoded "`", %w{0060 02cb 2035}
  end

  def test_bars
    assert_equal_encoded "|", %w{007c 2223 2758}
  end

  def test_tildes
    assert_equal_encoded "~", %w{007e 02dc 2053 223c ff5e}
  end

  # Maps each expected Latin capital to the codepoint(s) that must
  # transliterate to it.
  def test_related_letters
    {
      "B" => "212c",
      "C" => %w{2102 212d},
      "E" => %w{2107 2130},
      "F" => "2131",
      "H" => %w{210b 210c 210d},
      "I" => %w{0130 0406 04c0 2110 2111 2160},
      "K" => "212a",
      "L" => "2112",
      "M" => "2133",
      "N" => "2115",
      "P" => "2119",
      "Q" => "211a",
      "R" => %w{211b 211c 211d},
      "Z" => %w{2124 2128}
    }.each do |expected, encode_mes|
      assert_equal_encoded expected, encode_mes
    end
  end
end
@@ -0,0 +1,28 @@
1
# Shorthand assertions shared by the Unicode codepoint regression tests.
#
# The methods call +assert+ and +flunk+, so the module is meant to be mixed
# into an object that provides them (a Test::Unit::TestCase).
module CodepointTestHelper
  # Checks that every given codepoint transliterates to +expected+.
  #
  # expected   - the String that +to_ascii+ must return for each codepoint.
  # encode_mes - one codepoint String as accepted by Unidecoder.encode
  #              (e.g. "00a0"), or a collection of them responding to +each+.
  #
  # A mismatch is reported through +flunk+ so the test runner records it as
  # an assertion failure; Kernel#fail would raise a RuntimeError, which is
  # reported as an error in the test itself.
  def assert_equal_encoded(expected, encode_mes)
    # String is not Enumerable on Ruby 1.9+, so wrap a lone codepoint.
    codepoints = encode_mes.is_a?(String) ? [encode_mes] : encode_mes
    codepoints.each do |codepoint|
      encoded = encode(codepoint)
      actual = encoded.to_ascii
      if expected == actual
        # Record the match as a passed assertion without comparing again.
        assert true
      else
        message = "<#{expected.inspect}> expected but was\n<#{actual.inspect}>\n"
        # Only look up the YAML location when there is something to report.
        message << " defined in #{Unidecoder.in_yaml_file(encoded)}"
        flunk message
      end
    end
  end

  private

  # Returns whatever Unidecoder.encode produces for +codepoint+.
  def encode(codepoint)
    Unidecoder.encode(codepoint)
  end

  # Returns Unidecoder.in_yaml_file's description for the character encoded
  # from +codepoint+.
  def which_yaml(codepoint)
    Unidecoder.in_yaml_file(encode(codepoint))
  end
end
@@ -0,0 +1,114 @@
1
+ # encoding: utf-8
2
+ $:.unshift File.expand_path("../lib", File.dirname(__FILE__))
3
+ $:.unshift File.expand_path(File.dirname(__FILE__))
4
+ $:.uniq!
5
+ require "test/unit"
6
+ require "unidecoder"
7
+
8
+
9
# Tests for Unidecoder.decode, Unidecoder.encode and Unidecoder.in_yaml_file.
class UnidecoderTest < Test::Unit::TestCase
  # Silly phrases courtesy of Frank da Cruz (http://www.columbia.edu/kermit/utf8.html).

  # Plain ASCII phrases that decode must return unchanged.
  DONT_CONVERT = [
    "Vitrum edere possum; mihi non nocet.", # Latin
    "Je puis mangier del voirre. Ne me nuit.", # Old French
    "Kristala jan dezaket, ez dit minik ematen.", # Basque
    "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
    "Ich kann Glas essen, ohne mir weh zu tun.", # German
    "I can eat glass and it doesn't hurt me.", # English
  ].freeze

  # Unicode phrase => expected ASCII transliteration.
  CONVERT_PAIRS = {
    # French
    "Je peux manger du verre, ça ne me fait pas de mal." => "Je peux manger du verre, ca ne me fait pas de mal.",
    # Romanian
    "Pot să mănânc sticlă și ea nu mă rănește." => "Pot sa mananc sticla si ea nu ma raneste.",
    # Icelandic
    "Ég get etið gler án þess að meiða mig." => "Eg get etid gler an thess ad meida mig.",
    # Albanian
    "Unë mund të ha qelq dhe nuk më gjen gjë." => "Une mund te ha qelq dhe nuk me gjen gje.",
    # Polish
    "Mogę jeść szkło i mi nie szkodzi." => "Moge jesc szklo i mi nie szkodzi.",
    # Russian
    "Я могу есть стекло, оно мне не вредит." => "Ia moghu iest' stieklo, ono mnie nie vriedit.",
    # Bulgarian
    "Мога да ям стъкло, то не ми вреди." => "Mogha da iam stklo, to nie mi vriedi.",
    # Anglo-Saxon
    "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
    # Classical Greek
    "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => "ualon phagein dunamai; touto ou me blaptei",
    # Hindi
    "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
    # Thai
    "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => "chankinkracchkaid aetmanaimthamaihchanecchb",
    # Chinese
    "我能吞下玻璃而不伤身体。" => "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
    # Japanese
    "私はガラスを食べられます。それは私を傷つけません。" => "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
    "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
      "mn my twnm bdwni Hss drd shyshh bkhwrm",
    "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
      "'n qdr `l~ 'kl lzjj w hdh l yw'lmn",
    "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
      "ny ykvl lkvl zkvkyt vzh l mzyq ly",
  }.freeze

  # Each byte below is malformed UTF-8 when it stands alone between two
  # ASCII letters, so decode is expected to raise ArgumentError.
  def test_should_raise_error_with_invalid_utf8
    [
      "\x80", # Continuation byte, low (cp1252)
      "\x94", # Continuation byte, mid (cp1252)
      "\x9F", # Continuation byte, high (cp1252)
      "\xC0", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC1", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC2", # Start of 2-byte sequence, low
      "\xC8", # Start of 2-byte sequence, mid
      "\xDF", # Start of 2-byte sequence, high
      "\xE0", # Start of 3-byte sequence, low
      "\xE8", # Start of 3-byte sequence, mid
      "\xEF", # Start of 3-byte sequence, high
      "\xF0", # Start of 4-byte sequence
      "\xF1", # Start of 4-byte sequence (valid lead byte, but no continuation bytes follow)
      "\xFF", # Restricted byte
    ].each do |byte|
      assert_raise ArgumentError, "#{byte.inspect} did not raise error" do
        Unidecoder.decode("a#{byte}a")
      end
    end
  end

  def test_unidecoder_decode
    DONT_CONVERT.each do |ascii|
      assert_equal ascii, Unidecoder.decode(ascii)
    end
    CONVERT_PAIRS.each do |unicode, ascii|
      assert_equal ascii, Unidecoder.decode(unicode)
    end
  end

  def test_unidecoder_encode
    {
      # Strings
      "0041" => "A",
      "00e6" => "æ",
      "042f" => "Я"
    }.each do |codepoint, unicode|
      assert_equal unicode, Unidecoder.encode(codepoint)
    end
  end

  def test_unidecoder_in_yaml_file
    {
      "A" => "x00.yml (line 67)",
      "π" => "x03.yml (line 194)",
      "Я" => "x04.yml (line 49)"
    }.each do |character, output|
      assert_equal output, Unidecoder.in_yaml_file(character)
    end
  end

  # A per-call mapping passed as the second argument takes the place of the
  # default transliteration for that character.
  def test_override
    assert_equal "Juergen", Unidecoder.decode("Jürgen", "ü" => "ue")
  end
end