talentbox-unidecoder 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog.md +16 -0
- data/README.md +51 -0
- data/Rakefile +28 -0
- data/lib/unidecoder.rb +98 -0
- data/lib/unidecoder/data/x00.yml +257 -0
- data/lib/unidecoder/data/x01.yml +257 -0
- data/lib/unidecoder/data/x02.yml +256 -0
- data/lib/unidecoder/data/x03.yml +256 -0
- data/lib/unidecoder/data/x04.yml +256 -0
- data/lib/unidecoder/data/x05.yml +256 -0
- data/lib/unidecoder/data/x06.yml +256 -0
- data/lib/unidecoder/data/x07.yml +256 -0
- data/lib/unidecoder/data/x09.yml +256 -0
- data/lib/unidecoder/data/x0a.yml +256 -0
- data/lib/unidecoder/data/x0b.yml +256 -0
- data/lib/unidecoder/data/x0c.yml +256 -0
- data/lib/unidecoder/data/x0d.yml +256 -0
- data/lib/unidecoder/data/x0e.yml +256 -0
- data/lib/unidecoder/data/x0f.yml +256 -0
- data/lib/unidecoder/data/x10.yml +256 -0
- data/lib/unidecoder/data/x11.yml +256 -0
- data/lib/unidecoder/data/x12.yml +257 -0
- data/lib/unidecoder/data/x13.yml +256 -0
- data/lib/unidecoder/data/x14.yml +257 -0
- data/lib/unidecoder/data/x15.yml +257 -0
- data/lib/unidecoder/data/x16.yml +256 -0
- data/lib/unidecoder/data/x17.yml +256 -0
- data/lib/unidecoder/data/x18.yml +256 -0
- data/lib/unidecoder/data/x1e.yml +256 -0
- data/lib/unidecoder/data/x1f.yml +256 -0
- data/lib/unidecoder/data/x20.yml +256 -0
- data/lib/unidecoder/data/x21.yml +256 -0
- data/lib/unidecoder/data/x22.yml +256 -0
- data/lib/unidecoder/data/x23.yml +256 -0
- data/lib/unidecoder/data/x24.yml +256 -0
- data/lib/unidecoder/data/x25.yml +256 -0
- data/lib/unidecoder/data/x26.yml +256 -0
- data/lib/unidecoder/data/x27.yml +256 -0
- data/lib/unidecoder/data/x28.yml +257 -0
- data/lib/unidecoder/data/x2e.yml +256 -0
- data/lib/unidecoder/data/x2f.yml +256 -0
- data/lib/unidecoder/data/x30.yml +256 -0
- data/lib/unidecoder/data/x31.yml +256 -0
- data/lib/unidecoder/data/x32.yml +256 -0
- data/lib/unidecoder/data/x33.yml +256 -0
- data/lib/unidecoder/data/x4d.yml +256 -0
- data/lib/unidecoder/data/x4e.yml +257 -0
- data/lib/unidecoder/data/x4f.yml +257 -0
- data/lib/unidecoder/data/x50.yml +257 -0
- data/lib/unidecoder/data/x51.yml +257 -0
- data/lib/unidecoder/data/x52.yml +257 -0
- data/lib/unidecoder/data/x53.yml +257 -0
- data/lib/unidecoder/data/x54.yml +257 -0
- data/lib/unidecoder/data/x55.yml +257 -0
- data/lib/unidecoder/data/x56.yml +257 -0
- data/lib/unidecoder/data/x57.yml +257 -0
- data/lib/unidecoder/data/x58.yml +257 -0
- data/lib/unidecoder/data/x59.yml +257 -0
- data/lib/unidecoder/data/x5a.yml +257 -0
- data/lib/unidecoder/data/x5b.yml +257 -0
- data/lib/unidecoder/data/x5c.yml +257 -0
- data/lib/unidecoder/data/x5d.yml +257 -0
- data/lib/unidecoder/data/x5e.yml +257 -0
- data/lib/unidecoder/data/x5f.yml +257 -0
- data/lib/unidecoder/data/x60.yml +257 -0
- data/lib/unidecoder/data/x61.yml +257 -0
- data/lib/unidecoder/data/x62.yml +257 -0
- data/lib/unidecoder/data/x63.yml +257 -0
- data/lib/unidecoder/data/x64.yml +257 -0
- data/lib/unidecoder/data/x65.yml +257 -0
- data/lib/unidecoder/data/x66.yml +257 -0
- data/lib/unidecoder/data/x67.yml +257 -0
- data/lib/unidecoder/data/x68.yml +257 -0
- data/lib/unidecoder/data/x69.yml +257 -0
- data/lib/unidecoder/data/x6a.yml +257 -0
- data/lib/unidecoder/data/x6b.yml +257 -0
- data/lib/unidecoder/data/x6c.yml +257 -0
- data/lib/unidecoder/data/x6d.yml +257 -0
- data/lib/unidecoder/data/x6e.yml +257 -0
- data/lib/unidecoder/data/x6f.yml +257 -0
- data/lib/unidecoder/data/x70.yml +257 -0
- data/lib/unidecoder/data/x71.yml +257 -0
- data/lib/unidecoder/data/x72.yml +257 -0
- data/lib/unidecoder/data/x73.yml +257 -0
- data/lib/unidecoder/data/x74.yml +257 -0
- data/lib/unidecoder/data/x75.yml +257 -0
- data/lib/unidecoder/data/x76.yml +257 -0
- data/lib/unidecoder/data/x77.yml +257 -0
- data/lib/unidecoder/data/x78.yml +257 -0
- data/lib/unidecoder/data/x79.yml +257 -0
- data/lib/unidecoder/data/x7a.yml +257 -0
- data/lib/unidecoder/data/x7b.yml +257 -0
- data/lib/unidecoder/data/x7c.yml +257 -0
- data/lib/unidecoder/data/x7d.yml +257 -0
- data/lib/unidecoder/data/x7e.yml +257 -0
- data/lib/unidecoder/data/x7f.yml +257 -0
- data/lib/unidecoder/data/x80.yml +257 -0
- data/lib/unidecoder/data/x81.yml +257 -0
- data/lib/unidecoder/data/x82.yml +257 -0
- data/lib/unidecoder/data/x83.yml +257 -0
- data/lib/unidecoder/data/x84.yml +257 -0
- data/lib/unidecoder/data/x85.yml +257 -0
- data/lib/unidecoder/data/x86.yml +257 -0
- data/lib/unidecoder/data/x87.yml +257 -0
- data/lib/unidecoder/data/x88.yml +257 -0
- data/lib/unidecoder/data/x89.yml +257 -0
- data/lib/unidecoder/data/x8a.yml +257 -0
- data/lib/unidecoder/data/x8b.yml +257 -0
- data/lib/unidecoder/data/x8c.yml +257 -0
- data/lib/unidecoder/data/x8d.yml +257 -0
- data/lib/unidecoder/data/x8e.yml +257 -0
- data/lib/unidecoder/data/x8f.yml +257 -0
- data/lib/unidecoder/data/x90.yml +257 -0
- data/lib/unidecoder/data/x91.yml +257 -0
- data/lib/unidecoder/data/x92.yml +257 -0
- data/lib/unidecoder/data/x93.yml +257 -0
- data/lib/unidecoder/data/x94.yml +257 -0
- data/lib/unidecoder/data/x95.yml +257 -0
- data/lib/unidecoder/data/x96.yml +257 -0
- data/lib/unidecoder/data/x97.yml +257 -0
- data/lib/unidecoder/data/x98.yml +257 -0
- data/lib/unidecoder/data/x99.yml +257 -0
- data/lib/unidecoder/data/x9a.yml +257 -0
- data/lib/unidecoder/data/x9b.yml +257 -0
- data/lib/unidecoder/data/x9c.yml +257 -0
- data/lib/unidecoder/data/x9d.yml +257 -0
- data/lib/unidecoder/data/x9e.yml +257 -0
- data/lib/unidecoder/data/x9f.yml +256 -0
- data/lib/unidecoder/data/xa0.yml +257 -0
- data/lib/unidecoder/data/xa1.yml +257 -0
- data/lib/unidecoder/data/xa2.yml +257 -0
- data/lib/unidecoder/data/xa3.yml +257 -0
- data/lib/unidecoder/data/xa4.yml +256 -0
- data/lib/unidecoder/data/xac.yml +257 -0
- data/lib/unidecoder/data/xad.yml +257 -0
- data/lib/unidecoder/data/xae.yml +257 -0
- data/lib/unidecoder/data/xaf.yml +257 -0
- data/lib/unidecoder/data/xb0.yml +257 -0
- data/lib/unidecoder/data/xb1.yml +257 -0
- data/lib/unidecoder/data/xb2.yml +257 -0
- data/lib/unidecoder/data/xb3.yml +257 -0
- data/lib/unidecoder/data/xb4.yml +257 -0
- data/lib/unidecoder/data/xb5.yml +257 -0
- data/lib/unidecoder/data/xb6.yml +257 -0
- data/lib/unidecoder/data/xb7.yml +257 -0
- data/lib/unidecoder/data/xb8.yml +257 -0
- data/lib/unidecoder/data/xb9.yml +257 -0
- data/lib/unidecoder/data/xba.yml +257 -0
- data/lib/unidecoder/data/xbb.yml +257 -0
- data/lib/unidecoder/data/xbc.yml +257 -0
- data/lib/unidecoder/data/xbd.yml +257 -0
- data/lib/unidecoder/data/xbe.yml +257 -0
- data/lib/unidecoder/data/xbf.yml +257 -0
- data/lib/unidecoder/data/xc0.yml +257 -0
- data/lib/unidecoder/data/xc1.yml +257 -0
- data/lib/unidecoder/data/xc2.yml +257 -0
- data/lib/unidecoder/data/xc3.yml +257 -0
- data/lib/unidecoder/data/xc4.yml +257 -0
- data/lib/unidecoder/data/xc5.yml +257 -0
- data/lib/unidecoder/data/xc6.yml +257 -0
- data/lib/unidecoder/data/xc7.yml +257 -0
- data/lib/unidecoder/data/xc8.yml +257 -0
- data/lib/unidecoder/data/xc9.yml +257 -0
- data/lib/unidecoder/data/xca.yml +257 -0
- data/lib/unidecoder/data/xcb.yml +257 -0
- data/lib/unidecoder/data/xcc.yml +257 -0
- data/lib/unidecoder/data/xcd.yml +257 -0
- data/lib/unidecoder/data/xce.yml +257 -0
- data/lib/unidecoder/data/xcf.yml +257 -0
- data/lib/unidecoder/data/xd0.yml +257 -0
- data/lib/unidecoder/data/xd1.yml +257 -0
- data/lib/unidecoder/data/xd2.yml +257 -0
- data/lib/unidecoder/data/xd3.yml +257 -0
- data/lib/unidecoder/data/xd4.yml +257 -0
- data/lib/unidecoder/data/xd5.yml +257 -0
- data/lib/unidecoder/data/xd6.yml +257 -0
- data/lib/unidecoder/data/xd7.yml +256 -0
- data/lib/unidecoder/data/xf9.yml +257 -0
- data/lib/unidecoder/data/xfa.yml +256 -0
- data/lib/unidecoder/data/xfb.yml +257 -0
- data/lib/unidecoder/data/xfc.yml +257 -0
- data/lib/unidecoder/data/xfd.yml +256 -0
- data/lib/unidecoder/data/xfe.yml +257 -0
- data/lib/unidecoder/data/xff.yml +257 -0
- data/lib/unidecoder/version.rb +9 -0
- data/test/unicode_point_suite/basic_latin_test.rb +144 -0
- data/test/unicode_point_suite/codepoint_test_helper.rb +28 -0
- data/test/unidecoder_test.rb +114 -0
- metadata +238 -0
@@ -0,0 +1,257 @@
|
|
1
|
+
---
|
2
|
+
- '[?]'
|
3
|
+
- '!'
|
4
|
+
- '"'
|
5
|
+
- '#'
|
6
|
+
- $
|
7
|
+
- '%'
|
8
|
+
- '&'
|
9
|
+
- "'"
|
10
|
+
- (
|
11
|
+
- )
|
12
|
+
- '*'
|
13
|
+
- +
|
14
|
+
- ','
|
15
|
+
- -
|
16
|
+
- .
|
17
|
+
- /
|
18
|
+
- 0
|
19
|
+
- 1
|
20
|
+
- 2
|
21
|
+
- 3
|
22
|
+
- 4
|
23
|
+
- 5
|
24
|
+
- 6
|
25
|
+
- 7
|
26
|
+
- 8
|
27
|
+
- 9
|
28
|
+
- ':'
|
29
|
+
- ;
|
30
|
+
- <
|
31
|
+
- =
|
32
|
+
- '>'
|
33
|
+
- '?'
|
34
|
+
- '@'
|
35
|
+
- A
|
36
|
+
- B
|
37
|
+
- C
|
38
|
+
- D
|
39
|
+
- E
|
40
|
+
- F
|
41
|
+
- G
|
42
|
+
- H
|
43
|
+
- I
|
44
|
+
- J
|
45
|
+
- K
|
46
|
+
- L
|
47
|
+
- M
|
48
|
+
- N
|
49
|
+
- O
|
50
|
+
- P
|
51
|
+
- Q
|
52
|
+
- R
|
53
|
+
- S
|
54
|
+
- T
|
55
|
+
- U
|
56
|
+
- V
|
57
|
+
- W
|
58
|
+
- X
|
59
|
+
- Y
|
60
|
+
- Z
|
61
|
+
- '['
|
62
|
+
- \
|
63
|
+
- ']'
|
64
|
+
- '^'
|
65
|
+
- _
|
66
|
+
- '`'
|
67
|
+
- a
|
68
|
+
- b
|
69
|
+
- c
|
70
|
+
- d
|
71
|
+
- e
|
72
|
+
- f
|
73
|
+
- g
|
74
|
+
- h
|
75
|
+
- i
|
76
|
+
- j
|
77
|
+
- k
|
78
|
+
- l
|
79
|
+
- m
|
80
|
+
- n
|
81
|
+
- o
|
82
|
+
- p
|
83
|
+
- q
|
84
|
+
- r
|
85
|
+
- s
|
86
|
+
- t
|
87
|
+
- u
|
88
|
+
- v
|
89
|
+
- w
|
90
|
+
- x
|
91
|
+
- y
|
92
|
+
- z
|
93
|
+
- '{'
|
94
|
+
- '|'
|
95
|
+
- '}'
|
96
|
+
- '~'
|
97
|
+
- '[?]'
|
98
|
+
- '[?]'
|
99
|
+
- .
|
100
|
+
- '['
|
101
|
+
- ']'
|
102
|
+
- ','
|
103
|
+
- '*'
|
104
|
+
- wo
|
105
|
+
- a
|
106
|
+
- i
|
107
|
+
- u
|
108
|
+
- e
|
109
|
+
- o
|
110
|
+
- ya
|
111
|
+
- yu
|
112
|
+
- yo
|
113
|
+
- tu
|
114
|
+
- +
|
115
|
+
- a
|
116
|
+
- i
|
117
|
+
- u
|
118
|
+
- e
|
119
|
+
- o
|
120
|
+
- ka
|
121
|
+
- ki
|
122
|
+
- ku
|
123
|
+
- ke
|
124
|
+
- ko
|
125
|
+
- sa
|
126
|
+
- si
|
127
|
+
- su
|
128
|
+
- se
|
129
|
+
- so
|
130
|
+
- ta
|
131
|
+
- ti
|
132
|
+
- tu
|
133
|
+
- te
|
134
|
+
- to
|
135
|
+
- na
|
136
|
+
- ni
|
137
|
+
- nu
|
138
|
+
- ne
|
139
|
+
- no
|
140
|
+
- ha
|
141
|
+
- hi
|
142
|
+
- hu
|
143
|
+
- he
|
144
|
+
- ho
|
145
|
+
- ma
|
146
|
+
- mi
|
147
|
+
- mu
|
148
|
+
- me
|
149
|
+
- mo
|
150
|
+
- ya
|
151
|
+
- yu
|
152
|
+
- yo
|
153
|
+
- ra
|
154
|
+
- ri
|
155
|
+
- ru
|
156
|
+
- re
|
157
|
+
- ro
|
158
|
+
- wa
|
159
|
+
- n
|
160
|
+
- ':'
|
161
|
+
- ;
|
162
|
+
- ''
|
163
|
+
- g
|
164
|
+
- gg
|
165
|
+
- gs
|
166
|
+
- n
|
167
|
+
- nj
|
168
|
+
- nh
|
169
|
+
- d
|
170
|
+
- dd
|
171
|
+
- r
|
172
|
+
- lg
|
173
|
+
- lm
|
174
|
+
- lb
|
175
|
+
- ls
|
176
|
+
- lt
|
177
|
+
- lp
|
178
|
+
- rh
|
179
|
+
- m
|
180
|
+
- b
|
181
|
+
- bb
|
182
|
+
- bs
|
183
|
+
- s
|
184
|
+
- ss
|
185
|
+
- ''
|
186
|
+
- j
|
187
|
+
- jj
|
188
|
+
- c
|
189
|
+
- k
|
190
|
+
- t
|
191
|
+
- p
|
192
|
+
- h
|
193
|
+
- '[?]'
|
194
|
+
- '[?]'
|
195
|
+
- '[?]'
|
196
|
+
- a
|
197
|
+
- ae
|
198
|
+
- ya
|
199
|
+
- yae
|
200
|
+
- eo
|
201
|
+
- e
|
202
|
+
- '[?]'
|
203
|
+
- '[?]'
|
204
|
+
- yeo
|
205
|
+
- ye
|
206
|
+
- o
|
207
|
+
- wa
|
208
|
+
- wae
|
209
|
+
- oe
|
210
|
+
- '[?]'
|
211
|
+
- '[?]'
|
212
|
+
- yo
|
213
|
+
- u
|
214
|
+
- weo
|
215
|
+
- we
|
216
|
+
- wi
|
217
|
+
- yu
|
218
|
+
- '[?]'
|
219
|
+
- '[?]'
|
220
|
+
- eu
|
221
|
+
- yi
|
222
|
+
- i
|
223
|
+
- '[?]'
|
224
|
+
- '[?]'
|
225
|
+
- '[?]'
|
226
|
+
- /C
|
227
|
+
- PS
|
228
|
+
- '!'
|
229
|
+
- -
|
230
|
+
- '|'
|
231
|
+
- Y=
|
232
|
+
- W=
|
233
|
+
- '[?]'
|
234
|
+
- '|'
|
235
|
+
- -
|
236
|
+
- '|'
|
237
|
+
- -
|
238
|
+
- '|'
|
239
|
+
- '#'
|
240
|
+
- O
|
241
|
+
- '[?]'
|
242
|
+
- '[?]'
|
243
|
+
- '[?]'
|
244
|
+
- '[?]'
|
245
|
+
- '[?]'
|
246
|
+
- '[?]'
|
247
|
+
- '[?]'
|
248
|
+
- '[?]'
|
249
|
+
- '[?]'
|
250
|
+
- '[?]'
|
251
|
+
- '{'
|
252
|
+
- '|'
|
253
|
+
- '}'
|
254
|
+
- ''
|
255
|
+
- ''
|
256
|
+
- ''
|
257
|
+
- ''
|
@@ -0,0 +1,144 @@
|
|
1
|
+
$:.unshift File.expand_path("../../lib", File.dirname(__FILE__))
|
2
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
3
|
+
$:.uniq!
|
4
|
+
require "test/unit"
|
5
|
+
require "unidecoder"
|
6
|
+
require "codepoint_test_helper"
|
7
|
+
|
8
|
+
include CodepointTestHelper
|
9
|
+
|
10
|
+
# Regression and debugging suite for the transliteration of characters
# related to the Basic Latin Unicode block.
#
# Reference charts:
#   http://unicode.org/charts/
#   http://unicode.org/charts/PDF/U0000.pdf
#
# NOTE: control characters are not covered here; the original author
# reported getting weird results when trying to pack them to unicode.
#
# Every assertion goes through assert_equal_encoded, which takes the
# expected ASCII output followed by one codepoint or a list of codepoints.
class BasicLatinTest < Test::Unit::TestCase
  def test_spaces
    assert_equal_encoded(' ', %w[0020 00a0])
    assert_equal_encoded('', %w[200b 2060])
  end

  def test_exclamation_marks
    assert_equal_encoded('!', %w[0021 2762])
    assert_equal_encoded('!!', '203c')
    assert_equal_encoded('', '00a1')
    assert_equal_encoded('?!', '203d')
  end

  def test_quotation_marks
    assert_equal_encoded('"', %w[0022 02ba 2033 3003])
  end

  def test_apostrophes
    assert_equal_encoded("'", %w[0027 02b9 02bc 02c8 2032])
  end

  def test_asterisks
    assert_equal_encoded('*', %w[002a 066d 204e 2217 26b9 2731])
  end

  def test_commas
    assert_equal_encoded(',', %w[002c 060c])
  end

  def test_periods
    assert_equal_encoded('.', %w[002e 06d4])
  end

  def test_hyphens
    assert_equal_encoded('-', %w[002d 2010 2011 2012 2212])
  end

  def test_endash
    assert_equal_encoded('--', %w[2013 2015])
  end

  def test_emdash
    assert_equal_encoded('---', %w[2014])
  end

  def test_dotleader
    assert_equal_encoded('..', %w[2025])
  end

  def test_ellipsis
    assert_equal_encoded('...', %w[2026])
  end

  def test_slashes
    assert_equal_encoded('/', %w[002f 2044 2215])
    assert_equal_encoded('\\', %w[005c 2216])
  end

  def test_colons
    assert_equal_encoded(':', %w[003a 2236])
  end

  def test_semicolons
    assert_equal_encoded(';', %w[003b 061b])
  end

  def test_less_thans
    assert_equal_encoded('<', %w[003c 2039 2329 27e8 3008])
  end

  def test_equals
    assert_equal_encoded('=', '003d')
  end

  def test_greater_thans
    assert_equal_encoded('>', %w[003e 203a 232a 27e9 3009])
  end

  def test_question_marks
    assert_equal_encoded('?', %w[003f 061f])
    assert_equal_encoded('', '00bf')
    assert_equal_encoded('?!', %w[203d 2048])
    assert_equal_encoded('!?', '2049')
  end

  def test_circumflexes
    assert_equal_encoded('^', %w[005e 2038 2303])
  end

  def test_underscores
    assert_equal_encoded('_', %w[005f 02cd 2017])
  end

  def test_grave_accents
    assert_equal_encoded('`', %w[0060 02cb 2035])
  end

  def test_bars
    assert_equal_encoded('|', %w[007c 2223 2758])
  end

  def test_tildes
    assert_equal_encoded('~', %w[007e 02dc 2053 223c ff5e])
  end

  # Letter-like symbols that should collapse to a plain capital letter.
  def test_related_letters
    letter_lookalikes = [
      ['B', '212c'],
      ['C', %w[2102 212d]],
      ['E', %w[2107 2130]],
      ['F', '2131'],
      ['H', %w[210b 210c 210d]],
      ['I', %w[0130 0406 04c0 2110 2111 2160]],
      ['K', '212a'],
      ['L', '2112'],
      ['M', '2133'],
      ['N', '2115'],
      ['P', '2119'],
      ['Q', '211a'],
      ['R', %w[211b 211c 211d]],
      ['Z', %w[2124 2128]]
    ]

    letter_lookalikes.each do |letter, codepoints|
      assert_equal_encoded(letter, codepoints)
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Shorthand assertions for the codepoint regression suites.
#
# The including context must provide +assert+ and +flunk+ (as
# Test::Unit::TestCase does).
module CodepointTestHelper
  # Asserts that every given codepoint transliterates to +expected+.
  #
  # expected   - the String that +to_ascii+ is expected to return.
  # encode_mes - a single codepoint String in the form accepted by
  #              Unidecoder.encode (e.g. "203c"), or a list of them.
  #
  # Each codepoint is encoded with Unidecoder.encode and +to_ascii+ is
  # called on the result. On a mismatch the test is flunked with a
  # message that also names the YAML file reported by
  # Unidecoder.in_yaml_file for the offending character.
  def assert_equal_encoded(expected, encode_mes)
    # Array() wraps a lone String and leaves a list untouched, so callers
    # may pass either form.
    Array(encode_mes).each do |encode_me|
      encoded = encode(encode_me)
      actual = encoded.to_ascii
      if expected == actual
        # Record a passing assertion without comparing the values again.
        assert true
      else
        message = "<#{expected.inspect}> expected but was\n<#{actual.inspect}>\n"
        message << " defined in #{Unidecoder.in_yaml_file(encoded)}"
        # flunk (not Kernel#fail) so the mismatch is reported as an
        # assertion failure instead of an unexpected RuntimeError.
        flunk message
      end
    end
  end

  private

  # Turns a codepoint String into the corresponding character by
  # delegating to Unidecoder.encode.
  def encode(codepoint)
    Unidecoder.encode(codepoint)
  end

  # Returns whatever Unidecoder.in_yaml_file reports for the character
  # encoded from +codepoint+.
  def which_yaml(codepoint)
    Unidecoder.in_yaml_file(encode(codepoint))
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift File.expand_path("../lib", File.dirname(__FILE__))
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
4
|
+
$:.uniq!
|
5
|
+
require "test/unit"
|
6
|
+
require "unidecoder"
|
7
|
+
|
8
|
+
|
9
|
+
# Tests for the Unidecoder module functions exercised below: decode,
# encode, in_yaml_file, and decode with a per-call override table.
class UnidecoderTest < Test::Unit::TestCase
  # Silly phrases courtesy of Frank da Cruz (http://www.columbia.edu/kermit/utf8.html).

  # Phrases that are already plain ASCII; decode must return them unchanged.
  DONT_CONVERT = [
    "Vitrum edere possum; mihi non nocet.", # Latin
    "Je puis mangier del voirre. Ne me nuit.", # Old French
    "Kristala jan dezaket, ez dit minik ematen.", # Basque
    "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
    "Ich kann Glas essen, ohne mir weh zu tun.", # German
    "I can eat glass and it doesn't hurt me.", # English
  ].freeze

  # Unicode phrase => expected ASCII transliteration.
  CONVERT_PAIRS = {
    # French
    "Je peux manger du verre, ça ne me fait pas de mal." => "Je peux manger du verre, ca ne me fait pas de mal.",
    # Romanian
    "Pot să mănânc sticlă și ea nu mă rănește." => "Pot sa mananc sticla si ea nu ma raneste.",
    # Icelandic
    "Ég get etið gler án þess að meiða mig." => "Eg get etid gler an thess ad meida mig.",
    # Albanian
    "Unë mund të ha qelq dhe nuk më gjen gjë." => "Une mund te ha qelq dhe nuk me gjen gje.",
    # Polish
    "Mogę jeść szkło i mi nie szkodzi." => "Moge jesc szklo i mi nie szkodzi.",
    # Russian
    "Я могу есть стекло, оно мне не вредит." => "Ia moghu iest' stieklo, ono mnie nie vriedit.",
    # Bulgarian
    "Мога да ям стъкло, то не ми вреди." => "Mogha da iam stklo, to nie mi vriedi.",
    # Anglo-Saxon
    "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
    # Classical Greek
    "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => "ualon phagein dunamai; touto ou me blaptei",
    # Hindi
    "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
    # Thai
    "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => "chankinkracchkaid aetmanaimthamaihchanecchb",
    # Chinese
    "我能吞下玻璃而不伤身体。" => "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
    # Japanese
    "私はガラスを食べられます。それは私を傷つけません。" => "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
    "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
      "mn my twnm bdwni Hss drd shyshh bkhwrm",
    "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
      "'n qdr `l~ 'kl lzjj w hdh l yw'lmn",
    "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
      "ny ykvl lkvl zkvkyt vzh l mzyq ly",
  }.freeze

  # Each byte below, placed on its own between two ASCII letters, does not
  # form valid UTF-8; decode is expected to raise ArgumentError for it.
  def test_should_raise_error_with_invalid_utf8
    [
      "\x80", # Continuation byte, low (cp1252)
      "\x94", # Continuation byte, mid (cp1252)
      "\x9F", # Continuation byte, high (cp1252)
      "\xC0", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC1", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC2", # Start of 2-byte sequence, low
      "\xC8", # Start of 2-byte sequence, mid
      "\xDF", # Start of 2-byte sequence, high
      "\xE0", # Start of 3-byte sequence, low
      "\xE8", # Start of 3-byte sequence, mid
      "\xEF", # Start of 3-byte sequence, high
      "\xF0", # Start of 4-byte sequence
      "\xF1", # Start of 4-byte sequence (a valid lead byte, but truncated here)
      "\xFF", # Restricted byte
    ].each do |byte|
      assert_raise ArgumentError, "#{byte.inspect} did not raise error" do
        Unidecoder.decode("a#{byte}a")
      end
    end
  end

  # ASCII input must pass through untouched; every Unicode phrase must
  # decode to its listed transliteration.
  def test_unidecoder_decode
    DONT_CONVERT.each do |ascii|
      assert_equal ascii, Unidecoder.decode(ascii)
    end
    CONVERT_PAIRS.each do |unicode, ascii|
      assert_equal ascii, Unidecoder.decode(unicode)
    end
  end

  # encode maps a hexadecimal codepoint String to the character itself.
  def test_unidecoder_encode
    {
      # Strings
      "0041" => "A",
      "00e6" => "æ",
      "042f" => "Я"
    }.each do |codepoint, unicode|
      assert_equal unicode, Unidecoder.encode(codepoint)
    end
  end

  # in_yaml_file reports the data file and line that define a character.
  def test_unidecoder_in_yaml_file
    {
      "A" => "x00.yml (line 67)",
      "π" => "x03.yml (line 194)",
      "Я" => "x04.yml (line 49)"
    }.each do |character, output|
      assert_equal output, Unidecoder.in_yaml_file(character)
    end
  end

  # A per-call override table takes precedence over the default mapping.
  def test_override
    assert_equal "Juergen", Unidecoder.decode("Jürgen", "ü" => "ue")
  end
end
|