talentbox-unidecoder 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog.md +16 -0
- data/README.md +51 -0
- data/Rakefile +28 -0
- data/lib/unidecoder.rb +98 -0
- data/lib/unidecoder/data/x00.yml +257 -0
- data/lib/unidecoder/data/x01.yml +257 -0
- data/lib/unidecoder/data/x02.yml +256 -0
- data/lib/unidecoder/data/x03.yml +256 -0
- data/lib/unidecoder/data/x04.yml +256 -0
- data/lib/unidecoder/data/x05.yml +256 -0
- data/lib/unidecoder/data/x06.yml +256 -0
- data/lib/unidecoder/data/x07.yml +256 -0
- data/lib/unidecoder/data/x09.yml +256 -0
- data/lib/unidecoder/data/x0a.yml +256 -0
- data/lib/unidecoder/data/x0b.yml +256 -0
- data/lib/unidecoder/data/x0c.yml +256 -0
- data/lib/unidecoder/data/x0d.yml +256 -0
- data/lib/unidecoder/data/x0e.yml +256 -0
- data/lib/unidecoder/data/x0f.yml +256 -0
- data/lib/unidecoder/data/x10.yml +256 -0
- data/lib/unidecoder/data/x11.yml +256 -0
- data/lib/unidecoder/data/x12.yml +257 -0
- data/lib/unidecoder/data/x13.yml +256 -0
- data/lib/unidecoder/data/x14.yml +257 -0
- data/lib/unidecoder/data/x15.yml +257 -0
- data/lib/unidecoder/data/x16.yml +256 -0
- data/lib/unidecoder/data/x17.yml +256 -0
- data/lib/unidecoder/data/x18.yml +256 -0
- data/lib/unidecoder/data/x1e.yml +256 -0
- data/lib/unidecoder/data/x1f.yml +256 -0
- data/lib/unidecoder/data/x20.yml +256 -0
- data/lib/unidecoder/data/x21.yml +256 -0
- data/lib/unidecoder/data/x22.yml +256 -0
- data/lib/unidecoder/data/x23.yml +256 -0
- data/lib/unidecoder/data/x24.yml +256 -0
- data/lib/unidecoder/data/x25.yml +256 -0
- data/lib/unidecoder/data/x26.yml +256 -0
- data/lib/unidecoder/data/x27.yml +256 -0
- data/lib/unidecoder/data/x28.yml +257 -0
- data/lib/unidecoder/data/x2e.yml +256 -0
- data/lib/unidecoder/data/x2f.yml +256 -0
- data/lib/unidecoder/data/x30.yml +256 -0
- data/lib/unidecoder/data/x31.yml +256 -0
- data/lib/unidecoder/data/x32.yml +256 -0
- data/lib/unidecoder/data/x33.yml +256 -0
- data/lib/unidecoder/data/x4d.yml +256 -0
- data/lib/unidecoder/data/x4e.yml +257 -0
- data/lib/unidecoder/data/x4f.yml +257 -0
- data/lib/unidecoder/data/x50.yml +257 -0
- data/lib/unidecoder/data/x51.yml +257 -0
- data/lib/unidecoder/data/x52.yml +257 -0
- data/lib/unidecoder/data/x53.yml +257 -0
- data/lib/unidecoder/data/x54.yml +257 -0
- data/lib/unidecoder/data/x55.yml +257 -0
- data/lib/unidecoder/data/x56.yml +257 -0
- data/lib/unidecoder/data/x57.yml +257 -0
- data/lib/unidecoder/data/x58.yml +257 -0
- data/lib/unidecoder/data/x59.yml +257 -0
- data/lib/unidecoder/data/x5a.yml +257 -0
- data/lib/unidecoder/data/x5b.yml +257 -0
- data/lib/unidecoder/data/x5c.yml +257 -0
- data/lib/unidecoder/data/x5d.yml +257 -0
- data/lib/unidecoder/data/x5e.yml +257 -0
- data/lib/unidecoder/data/x5f.yml +257 -0
- data/lib/unidecoder/data/x60.yml +257 -0
- data/lib/unidecoder/data/x61.yml +257 -0
- data/lib/unidecoder/data/x62.yml +257 -0
- data/lib/unidecoder/data/x63.yml +257 -0
- data/lib/unidecoder/data/x64.yml +257 -0
- data/lib/unidecoder/data/x65.yml +257 -0
- data/lib/unidecoder/data/x66.yml +257 -0
- data/lib/unidecoder/data/x67.yml +257 -0
- data/lib/unidecoder/data/x68.yml +257 -0
- data/lib/unidecoder/data/x69.yml +257 -0
- data/lib/unidecoder/data/x6a.yml +257 -0
- data/lib/unidecoder/data/x6b.yml +257 -0
- data/lib/unidecoder/data/x6c.yml +257 -0
- data/lib/unidecoder/data/x6d.yml +257 -0
- data/lib/unidecoder/data/x6e.yml +257 -0
- data/lib/unidecoder/data/x6f.yml +257 -0
- data/lib/unidecoder/data/x70.yml +257 -0
- data/lib/unidecoder/data/x71.yml +257 -0
- data/lib/unidecoder/data/x72.yml +257 -0
- data/lib/unidecoder/data/x73.yml +257 -0
- data/lib/unidecoder/data/x74.yml +257 -0
- data/lib/unidecoder/data/x75.yml +257 -0
- data/lib/unidecoder/data/x76.yml +257 -0
- data/lib/unidecoder/data/x77.yml +257 -0
- data/lib/unidecoder/data/x78.yml +257 -0
- data/lib/unidecoder/data/x79.yml +257 -0
- data/lib/unidecoder/data/x7a.yml +257 -0
- data/lib/unidecoder/data/x7b.yml +257 -0
- data/lib/unidecoder/data/x7c.yml +257 -0
- data/lib/unidecoder/data/x7d.yml +257 -0
- data/lib/unidecoder/data/x7e.yml +257 -0
- data/lib/unidecoder/data/x7f.yml +257 -0
- data/lib/unidecoder/data/x80.yml +257 -0
- data/lib/unidecoder/data/x81.yml +257 -0
- data/lib/unidecoder/data/x82.yml +257 -0
- data/lib/unidecoder/data/x83.yml +257 -0
- data/lib/unidecoder/data/x84.yml +257 -0
- data/lib/unidecoder/data/x85.yml +257 -0
- data/lib/unidecoder/data/x86.yml +257 -0
- data/lib/unidecoder/data/x87.yml +257 -0
- data/lib/unidecoder/data/x88.yml +257 -0
- data/lib/unidecoder/data/x89.yml +257 -0
- data/lib/unidecoder/data/x8a.yml +257 -0
- data/lib/unidecoder/data/x8b.yml +257 -0
- data/lib/unidecoder/data/x8c.yml +257 -0
- data/lib/unidecoder/data/x8d.yml +257 -0
- data/lib/unidecoder/data/x8e.yml +257 -0
- data/lib/unidecoder/data/x8f.yml +257 -0
- data/lib/unidecoder/data/x90.yml +257 -0
- data/lib/unidecoder/data/x91.yml +257 -0
- data/lib/unidecoder/data/x92.yml +257 -0
- data/lib/unidecoder/data/x93.yml +257 -0
- data/lib/unidecoder/data/x94.yml +257 -0
- data/lib/unidecoder/data/x95.yml +257 -0
- data/lib/unidecoder/data/x96.yml +257 -0
- data/lib/unidecoder/data/x97.yml +257 -0
- data/lib/unidecoder/data/x98.yml +257 -0
- data/lib/unidecoder/data/x99.yml +257 -0
- data/lib/unidecoder/data/x9a.yml +257 -0
- data/lib/unidecoder/data/x9b.yml +257 -0
- data/lib/unidecoder/data/x9c.yml +257 -0
- data/lib/unidecoder/data/x9d.yml +257 -0
- data/lib/unidecoder/data/x9e.yml +257 -0
- data/lib/unidecoder/data/x9f.yml +256 -0
- data/lib/unidecoder/data/xa0.yml +257 -0
- data/lib/unidecoder/data/xa1.yml +257 -0
- data/lib/unidecoder/data/xa2.yml +257 -0
- data/lib/unidecoder/data/xa3.yml +257 -0
- data/lib/unidecoder/data/xa4.yml +256 -0
- data/lib/unidecoder/data/xac.yml +257 -0
- data/lib/unidecoder/data/xad.yml +257 -0
- data/lib/unidecoder/data/xae.yml +257 -0
- data/lib/unidecoder/data/xaf.yml +257 -0
- data/lib/unidecoder/data/xb0.yml +257 -0
- data/lib/unidecoder/data/xb1.yml +257 -0
- data/lib/unidecoder/data/xb2.yml +257 -0
- data/lib/unidecoder/data/xb3.yml +257 -0
- data/lib/unidecoder/data/xb4.yml +257 -0
- data/lib/unidecoder/data/xb5.yml +257 -0
- data/lib/unidecoder/data/xb6.yml +257 -0
- data/lib/unidecoder/data/xb7.yml +257 -0
- data/lib/unidecoder/data/xb8.yml +257 -0
- data/lib/unidecoder/data/xb9.yml +257 -0
- data/lib/unidecoder/data/xba.yml +257 -0
- data/lib/unidecoder/data/xbb.yml +257 -0
- data/lib/unidecoder/data/xbc.yml +257 -0
- data/lib/unidecoder/data/xbd.yml +257 -0
- data/lib/unidecoder/data/xbe.yml +257 -0
- data/lib/unidecoder/data/xbf.yml +257 -0
- data/lib/unidecoder/data/xc0.yml +257 -0
- data/lib/unidecoder/data/xc1.yml +257 -0
- data/lib/unidecoder/data/xc2.yml +257 -0
- data/lib/unidecoder/data/xc3.yml +257 -0
- data/lib/unidecoder/data/xc4.yml +257 -0
- data/lib/unidecoder/data/xc5.yml +257 -0
- data/lib/unidecoder/data/xc6.yml +257 -0
- data/lib/unidecoder/data/xc7.yml +257 -0
- data/lib/unidecoder/data/xc8.yml +257 -0
- data/lib/unidecoder/data/xc9.yml +257 -0
- data/lib/unidecoder/data/xca.yml +257 -0
- data/lib/unidecoder/data/xcb.yml +257 -0
- data/lib/unidecoder/data/xcc.yml +257 -0
- data/lib/unidecoder/data/xcd.yml +257 -0
- data/lib/unidecoder/data/xce.yml +257 -0
- data/lib/unidecoder/data/xcf.yml +257 -0
- data/lib/unidecoder/data/xd0.yml +257 -0
- data/lib/unidecoder/data/xd1.yml +257 -0
- data/lib/unidecoder/data/xd2.yml +257 -0
- data/lib/unidecoder/data/xd3.yml +257 -0
- data/lib/unidecoder/data/xd4.yml +257 -0
- data/lib/unidecoder/data/xd5.yml +257 -0
- data/lib/unidecoder/data/xd6.yml +257 -0
- data/lib/unidecoder/data/xd7.yml +256 -0
- data/lib/unidecoder/data/xf9.yml +257 -0
- data/lib/unidecoder/data/xfa.yml +256 -0
- data/lib/unidecoder/data/xfb.yml +257 -0
- data/lib/unidecoder/data/xfc.yml +257 -0
- data/lib/unidecoder/data/xfd.yml +256 -0
- data/lib/unidecoder/data/xfe.yml +257 -0
- data/lib/unidecoder/data/xff.yml +257 -0
- data/lib/unidecoder/version.rb +9 -0
- data/test/unicode_point_suite/basic_latin_test.rb +144 -0
- data/test/unicode_point_suite/codepoint_test_helper.rb +28 -0
- data/test/unidecoder_test.rb +114 -0
- metadata +238 -0
@@ -0,0 +1,257 @@
|
|
1
|
+
---
|
2
|
+
- '[?]'
|
3
|
+
- '!'
|
4
|
+
- '"'
|
5
|
+
- '#'
|
6
|
+
- $
|
7
|
+
- '%'
|
8
|
+
- '&'
|
9
|
+
- "'"
|
10
|
+
- (
|
11
|
+
- )
|
12
|
+
- '*'
|
13
|
+
- +
|
14
|
+
- ','
|
15
|
+
- -
|
16
|
+
- .
|
17
|
+
- /
|
18
|
+
- 0
|
19
|
+
- 1
|
20
|
+
- 2
|
21
|
+
- 3
|
22
|
+
- 4
|
23
|
+
- 5
|
24
|
+
- 6
|
25
|
+
- 7
|
26
|
+
- 8
|
27
|
+
- 9
|
28
|
+
- ':'
|
29
|
+
- ;
|
30
|
+
- <
|
31
|
+
- =
|
32
|
+
- '>'
|
33
|
+
- '?'
|
34
|
+
- '@'
|
35
|
+
- A
|
36
|
+
- B
|
37
|
+
- C
|
38
|
+
- D
|
39
|
+
- E
|
40
|
+
- F
|
41
|
+
- G
|
42
|
+
- H
|
43
|
+
- I
|
44
|
+
- J
|
45
|
+
- K
|
46
|
+
- L
|
47
|
+
- M
|
48
|
+
- N
|
49
|
+
- O
|
50
|
+
- P
|
51
|
+
- Q
|
52
|
+
- R
|
53
|
+
- S
|
54
|
+
- T
|
55
|
+
- U
|
56
|
+
- V
|
57
|
+
- W
|
58
|
+
- X
|
59
|
+
- Y
|
60
|
+
- Z
|
61
|
+
- '['
|
62
|
+
- \
|
63
|
+
- ']'
|
64
|
+
- '^'
|
65
|
+
- _
|
66
|
+
- '`'
|
67
|
+
- a
|
68
|
+
- b
|
69
|
+
- c
|
70
|
+
- d
|
71
|
+
- e
|
72
|
+
- f
|
73
|
+
- g
|
74
|
+
- h
|
75
|
+
- i
|
76
|
+
- j
|
77
|
+
- k
|
78
|
+
- l
|
79
|
+
- m
|
80
|
+
- n
|
81
|
+
- o
|
82
|
+
- p
|
83
|
+
- q
|
84
|
+
- r
|
85
|
+
- s
|
86
|
+
- t
|
87
|
+
- u
|
88
|
+
- v
|
89
|
+
- w
|
90
|
+
- x
|
91
|
+
- y
|
92
|
+
- z
|
93
|
+
- '{'
|
94
|
+
- '|'
|
95
|
+
- '}'
|
96
|
+
- '~'
|
97
|
+
- '[?]'
|
98
|
+
- '[?]'
|
99
|
+
- .
|
100
|
+
- '['
|
101
|
+
- ']'
|
102
|
+
- ','
|
103
|
+
- '*'
|
104
|
+
- wo
|
105
|
+
- a
|
106
|
+
- i
|
107
|
+
- u
|
108
|
+
- e
|
109
|
+
- o
|
110
|
+
- ya
|
111
|
+
- yu
|
112
|
+
- yo
|
113
|
+
- tu
|
114
|
+
- +
|
115
|
+
- a
|
116
|
+
- i
|
117
|
+
- u
|
118
|
+
- e
|
119
|
+
- o
|
120
|
+
- ka
|
121
|
+
- ki
|
122
|
+
- ku
|
123
|
+
- ke
|
124
|
+
- ko
|
125
|
+
- sa
|
126
|
+
- si
|
127
|
+
- su
|
128
|
+
- se
|
129
|
+
- so
|
130
|
+
- ta
|
131
|
+
- ti
|
132
|
+
- tu
|
133
|
+
- te
|
134
|
+
- to
|
135
|
+
- na
|
136
|
+
- ni
|
137
|
+
- nu
|
138
|
+
- ne
|
139
|
+
- no
|
140
|
+
- ha
|
141
|
+
- hi
|
142
|
+
- hu
|
143
|
+
- he
|
144
|
+
- ho
|
145
|
+
- ma
|
146
|
+
- mi
|
147
|
+
- mu
|
148
|
+
- me
|
149
|
+
- mo
|
150
|
+
- ya
|
151
|
+
- yu
|
152
|
+
- yo
|
153
|
+
- ra
|
154
|
+
- ri
|
155
|
+
- ru
|
156
|
+
- re
|
157
|
+
- ro
|
158
|
+
- wa
|
159
|
+
- n
|
160
|
+
- ':'
|
161
|
+
- ;
|
162
|
+
- ''
|
163
|
+
- g
|
164
|
+
- gg
|
165
|
+
- gs
|
166
|
+
- n
|
167
|
+
- nj
|
168
|
+
- nh
|
169
|
+
- d
|
170
|
+
- dd
|
171
|
+
- r
|
172
|
+
- lg
|
173
|
+
- lm
|
174
|
+
- lb
|
175
|
+
- ls
|
176
|
+
- lt
|
177
|
+
- lp
|
178
|
+
- rh
|
179
|
+
- m
|
180
|
+
- b
|
181
|
+
- bb
|
182
|
+
- bs
|
183
|
+
- s
|
184
|
+
- ss
|
185
|
+
- ''
|
186
|
+
- j
|
187
|
+
- jj
|
188
|
+
- c
|
189
|
+
- k
|
190
|
+
- t
|
191
|
+
- p
|
192
|
+
- h
|
193
|
+
- '[?]'
|
194
|
+
- '[?]'
|
195
|
+
- '[?]'
|
196
|
+
- a
|
197
|
+
- ae
|
198
|
+
- ya
|
199
|
+
- yae
|
200
|
+
- eo
|
201
|
+
- e
|
202
|
+
- '[?]'
|
203
|
+
- '[?]'
|
204
|
+
- yeo
|
205
|
+
- ye
|
206
|
+
- o
|
207
|
+
- wa
|
208
|
+
- wae
|
209
|
+
- oe
|
210
|
+
- '[?]'
|
211
|
+
- '[?]'
|
212
|
+
- yo
|
213
|
+
- u
|
214
|
+
- weo
|
215
|
+
- we
|
216
|
+
- wi
|
217
|
+
- yu
|
218
|
+
- '[?]'
|
219
|
+
- '[?]'
|
220
|
+
- eu
|
221
|
+
- yi
|
222
|
+
- i
|
223
|
+
- '[?]'
|
224
|
+
- '[?]'
|
225
|
+
- '[?]'
|
226
|
+
- /C
|
227
|
+
- PS
|
228
|
+
- '!'
|
229
|
+
- -
|
230
|
+
- '|'
|
231
|
+
- Y=
|
232
|
+
- W=
|
233
|
+
- '[?]'
|
234
|
+
- '|'
|
235
|
+
- -
|
236
|
+
- '|'
|
237
|
+
- -
|
238
|
+
- '|'
|
239
|
+
- '#'
|
240
|
+
- O
|
241
|
+
- '[?]'
|
242
|
+
- '[?]'
|
243
|
+
- '[?]'
|
244
|
+
- '[?]'
|
245
|
+
- '[?]'
|
246
|
+
- '[?]'
|
247
|
+
- '[?]'
|
248
|
+
- '[?]'
|
249
|
+
- '[?]'
|
250
|
+
- '[?]'
|
251
|
+
- '{'
|
252
|
+
- '|'
|
253
|
+
- '}'
|
254
|
+
- ''
|
255
|
+
- ''
|
256
|
+
- ''
|
257
|
+
- ''
|
@@ -0,0 +1,144 @@
|
|
1
|
+
$:.unshift File.expand_path("../../lib", File.dirname(__FILE__))
|
2
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
3
|
+
$:.uniq!
|
4
|
+
require "test/unit"
|
5
|
+
require "unidecoder"
|
6
|
+
require "codepoint_test_helper"
|
7
|
+
|
8
|
+
# Regression and debugging suite for characters that should transliterate to
# Basic Latin punctuation and letters.
#
# http://unicode.org/charts/
# http://unicode.org/charts/PDF/U0000.pdf
#
# Each assertion names the expected ASCII output followed by the code points
# (as hex strings) that must decode to it.
#
# NOTE: I can't figure out how to test control characters.
# Get weird results trying to pack them to unicode.
class BasicLatinTest < Test::Unit::TestCase
  # Mixed in here rather than at the top level so the helper methods are not
  # added to every object in the process.
  include CodepointTestHelper

  def test_spaces
    assert_equal_encoded " ", %w{0020 00a0}
    assert_equal_encoded "", %w{200b 2060}
  end

  def test_exclamation_marks
    assert_equal_encoded "!", %w{0021 2762}
    assert_equal_encoded "!!", "203c"
    assert_equal_encoded "", "00a1"
    assert_equal_encoded "?!", "203d"
  end

  def test_quotation_marks
    assert_equal_encoded "\"", %w{0022 02ba 2033 3003}
  end

  def test_apostrophes
    assert_equal_encoded "'", %w{0027 02b9 02bc 02c8 2032}
  end

  def test_asterisks
    assert_equal_encoded "*", %w{002a 066d 204e 2217 26b9 2731}
  end

  def test_commas
    assert_equal_encoded ",", %w{002c 060c}
  end

  def test_periods
    assert_equal_encoded ".", %w{002e 06d4}
  end

  def test_hyphens
    assert_equal_encoded "-", %w{002d 2010 2011 2012 2212}
  end

  def test_endash
    assert_equal_encoded "--", %w{2013 2015}
  end

  def test_emdash
    assert_equal_encoded "---", %w{2014}
  end

  def test_dotleader
    assert_equal_encoded "..", %w{2025}
  end

  def test_ellipsis
    assert_equal_encoded "...", %w{2026}
  end

  def test_slashes
    assert_equal_encoded "/", %w{002f 2044 2215}
    assert_equal_encoded "\\", %w{005c 2216}
  end

  def test_colons
    assert_equal_encoded ":", %w{003a 2236}
  end

  def test_semicolons
    assert_equal_encoded ";", %w{003b 061b}
  end

  def test_less_thans
    assert_equal_encoded "<", %w{003c 2039 2329 27e8 3008}
  end

  def test_equals
    assert_equal_encoded "=", "003d"
  end

  def test_greater_thans
    assert_equal_encoded ">", %w{003e 203a 232a 27e9 3009}
  end

  def test_question_marks
    assert_equal_encoded "?", %w{003f 061f}
    assert_equal_encoded "", "00bf"
    assert_equal_encoded "?!", %w{203d 2048}
    assert_equal_encoded "!?", "2049"
  end

  def test_circumflexes
    assert_equal_encoded "^", %w{005e 2038 2303}
  end

  def test_underscores
    assert_equal_encoded "_", %w{005f 02cd 2017}
  end

  def test_grave_accents
    assert_equal_encoded "`", %w{0060 02cb 2035}
  end

  def test_bars
    assert_equal_encoded "|", %w{007c 2223 2758}
  end

  def test_tildes
    assert_equal_encoded "~", %w{007e 02dc 2053 223c ff5e}
  end

  # Letter-like symbols outside Basic Latin that should decode to a plain
  # capital letter. Keys are the expected letter, values the code point(s).
  def test_related_letters
    {
      "B" => "212c",
      "C" => %w{2102 212d},
      "E" => %w{2107 2130},
      "F" => "2131",
      "H" => %w{210b 210c 210d},
      "I" => %w{0130 0406 04c0 2110 2111 2160},
      "K" => "212a",
      "L" => "2112",
      "M" => "2133",
      "N" => "2115",
      "P" => "2119",
      "Q" => "211a",
      "R" => %w{211b 211c 211d},
      "Z" => %w{2124 2128}
    }.each do |expected, encode_mes|
      assert_equal_encoded expected, encode_mes
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# 100% shorthand
|
2
|
+
# Assertion shorthand shared by the code point regression suites.
#
# The including object must provide +assert+ and +flunk+ (as a test-unit
# test case does).
module CodepointTestHelper
  # Checks that every given code point transliterates to +expected+.
  #
  # expected   - the String each code point should become after +to_ascii+.
  # encode_mes - a single code point String or a list of them, in whatever
  #              form Unidecoder.encode accepts (the suites pass four-digit
  #              hex strings such as "00a0").
  #
  # A match is counted as one passing assertion. A mismatch is reported via
  # +flunk+ so the test runner records an assertion failure instead of an
  # unexpected RuntimeError, and the message names the YAML file and line
  # holding the offending mapping.
  def assert_equal_encoded(expected, encode_mes)
    # Array() wraps a lone String and passes an Array through unchanged.
    Array(encode_mes).each do |encode_me|
      encoded = encode(encode_me)
      actual = encoded.to_ascii
      if expected == actual
        # Let's not retest it; just count the successful comparison.
        assert true
      else
        message = "<#{expected.inspect}> expected but was\n<#{actual.inspect}>\n"
        # The YAML location is only looked up when there is something to report.
        message << " defined in #{Unidecoder.in_yaml_file(encoded)}"
        flunk message
      end
    end
  end

  private

  # Turns a code point String into the corresponding character via Unidecoder.
  def encode(codepoint)
    Unidecoder.encode(codepoint)
  end

  # Reports which YAML data file (and line) defines the mapping for +codepoint+.
  def which_yaml(codepoint)
    Unidecoder.in_yaml_file(encode(codepoint))
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift File.expand_path("../lib", File.dirname(__FILE__))
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
4
|
+
$:.uniq!
|
5
|
+
require "test/unit"
|
6
|
+
require "unidecoder"
|
7
|
+
|
8
|
+
|
9
|
+
# End-to-end tests for Unidecoder.decode, Unidecoder.encode and
# Unidecoder.in_yaml_file.
class UnidecoderTest < Test::Unit::TestCase
  # Silly phrases courtesy of Frank da Cruz (http://www.columbia.edu/kermit/utf8.html).

  # Pure-ASCII sentences that decode must return unchanged.
  DONT_CONVERT = [
    "Vitrum edere possum; mihi non nocet.", # Latin
    "Je puis mangier del voirre. Ne me nuit.", # Old French
    "Kristala jan dezaket, ez dit minik ematen.", # Basque
    "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
    "Ich kann Glas essen, ohne mir weh zu tun.", # German
    "I can eat glass and it doesn't hurt me.", # English
  ].freeze

  # Non-ASCII sentences mapped to the transliteration decode must produce.
  CONVERT_PAIRS = {
    # French
    "Je peux manger du verre, ça ne me fait pas de mal." => "Je peux manger du verre, ca ne me fait pas de mal.",
    # Romanian
    "Pot să mănânc sticlă și ea nu mă rănește." => "Pot sa mananc sticla si ea nu ma raneste.",
    # Icelandic
    "Ég get etið gler án þess að meiða mig." => "Eg get etid gler an thess ad meida mig.",
    # Albanian
    "Unë mund të ha qelq dhe nuk më gjen gjë." => "Une mund te ha qelq dhe nuk me gjen gje.",
    # Polish
    "Mogę jeść szkło i mi nie szkodzi." => "Moge jesc szklo i mi nie szkodzi.",
    # Russian
    "Я могу есть стекло, оно мне не вредит." => "Ia moghu iest' stieklo, ono mnie nie vriedit.",
    # Bulgarian
    "Мога да ям стъкло, то не ми вреди." => "Mogha da iam stklo, to nie mi vriedi.",
    # Anglo-Saxon
    "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
    # Classical Greek
    "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => "ualon phagein dunamai; touto ou me blaptei",
    # Hindi
    "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
    # Thai
    "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => "chankinkracchkaid aetmanaimthamaihchanecchb",
    # Chinese
    "我能吞下玻璃而不伤身体。" => "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
    # Japanese
    "私はガラスを食べられます。それは私を傷つけません。" => "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
    # Persian
    "من می توانم بدونِ احساس درد شيشه بخورم" =>
      "mn my twnm bdwni Hss drd shyshh bkhwrm",
    # Arabic
    "أنا قادر على أكل الزجاج و هذا لا يؤلمن" =>
      "'n qdr `l~ 'kl lzjj w hdh l yw'lmn",
    # Hebrew
    "אני יכול לאכול זכוכית וזה לא מזיק לי" =>
      "ny ykvl lkvl zkvkyt vzh l mzyq ly",
  }.freeze

  # Every byte below is embedded between two ASCII letters, so it is never
  # followed by a continuation byte and the string is not valid UTF-8.
  def test_should_raise_error_with_invalid_utf8
    [
      "\x80", # Continuation byte, low (cp125)
      "\x94", # Continuation byte, mid (cp125)
      "\x9F", # Continuation byte, high (cp125)
      "\xC0", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC1", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC2", # Start of 2-byte sequence, low
      "\xC8", # Start of 2-byte sequence, mid
      "\xDF", # Start of 2-byte sequence, high
      "\xE0", # Start of 3-byte sequence, low
      "\xE8", # Start of 3-byte sequence, mid
      "\xEF", # Start of 3-byte sequence, high
      "\xF0", # Start of 4-byte sequence
      "\xF1", # Start of 4-byte sequence (a valid lead byte; invalid here without continuation bytes)
      "\xFF", # Restricted byte
    ].each do |byte|
      assert_raise ArgumentError, "#{byte.inspect} did not raise error" do
        Unidecoder.decode("a#{byte}a")
      end
    end
  end

  def test_unidecoder_decode
    DONT_CONVERT.each do |ascii|
      assert_equal ascii, Unidecoder.decode(ascii)
    end
    CONVERT_PAIRS.each do |unicode, ascii|
      assert_equal ascii, Unidecoder.decode(unicode)
    end
  end

  # encode takes a code point written as a hex string and returns the character.
  def test_unidecoder_encode
    {
      # Strings
      "0041" => "A",
      "00e6" => "æ",
      "042f" => "Я"
    }.each do |codepoint, unicode|
      assert_equal unicode, Unidecoder.encode(codepoint)
    end
  end

  # in_yaml_file reports the data file and line that hold a character's mapping.
  def test_unidecoder_in_yaml_file
    {
      "A" => "x00.yml (line 67)",
      "π" => "x03.yml (line 194)",
      "Я" => "x04.yml (line 49)"
    }.each do |character, output|
      assert_equal output, Unidecoder.in_yaml_file(character)
    end
  end

  # A second argument to decode overrides the built-in mapping per character.
  def test_override
    assert_equal "Juergen", Unidecoder.decode("Jürgen", "ü" => "ue")
  end
end
|