punycode4r 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/punycode.rb +565 -0
- data/test/test_punycode.rb +311 -0
- metadata +47 -0
data/lib/punycode.rb
ADDED
@@ -0,0 +1,565 @@
|
|
1
|
+
#!/usr/bin/ruby -Ku
|
2
|
+
#
|
3
|
+
# This is a pure Ruby implementation of Punycode (RFC 3492).
|
4
|
+
# (original ANSI C code (C89) implementing Punycode is in RFC 3492)
|
5
|
+
#
|
6
|
+
# copyright (c) 2005 Kazuhiro NISHIYAMA
|
7
|
+
# You can redistribute it and/or modify it under the same terms as Ruby.
|
8
|
+
#
|
9
|
+
=begin
|
10
|
+
= punycode4r
|
11
|
+
== usage
|
12
|
+
=== simple usage
|
13
|
+
require 'punycode'
|
14
|
+
utf8_string = "\346\226\207\345\255\227\345\210\227"
|
15
|
+
punycode_string = Punycode.encode(utf8_string)
|
16
|
+
p punycode_string #=> "1br58tspi"
|
17
|
+
p(Punycode.decode(punycode_string) == utf8_string) #=> true
|
18
|
+
|
19
|
+
== IDN (Internationalized Domain Name)
|
20
|
+
When you use punycode in IDN,
|
21
|
+
you must apply NAMEPREP (RFC 3491) before Punycode.encode,
|
22
|
+
and add ACE Prefix (defined in RFC 3490) after Punycode.encode.
|
23
|
+
|
24
|
+
This library supports punycode only.
|
25
|
+
NAMEPREP requires other libraries.
|
26
|
+
|
27
|
+
=end
|
28
|
+
|
29
|
+
module Punycode
|
30
|
+
# Status values used by the encoder and decoder. Failures are exception
# classes sharing the Error base; PunycodeSuccess is a plain marker class.
module Status
  # Common ancestor of every failure status.
  class Error < StandardError
  end

  # Marker for a completed conversion.
  class PunycodeSuccess
  end

  # Input is invalid.
  class PunycodeBadInput < Error
  end

  # Output would exceed the space provided.
  class PunycodeBigOutput < Error
  end

  # Input needs wider integers to process.
  class PunycodeOverflow < Error
  end
end
|
40
|
+
include Status
|
41
|
+
|
42
|
+
# *** Bootstring parameters for Punycode ***
|
43
|
+
|
44
|
+
BASE = 36; TMIN = 1; TMAX = 26; SKEW = 38; DAMP = 700
|
45
|
+
INITIAL_BIAS = 72; INITIAL_N = 0x80; DELIMITER = 0x2D
|
46
|
+
|
47
|
+
module_function
|
48
|
+
|
49
|
+
# basic(cp) tests whether cp is a basic code point:
|
50
|
+
def basic(cp)
  # Basic code points are those below 128 (0x80).
  cp < 128
end
|
53
|
+
|
54
|
+
# delim(cp) tests whether cp is a delimiter:
|
55
|
+
def delim(cp)
  # True only for the code point stored in DELIMITER.
  DELIMITER == cp
end
|
58
|
+
|
59
|
+
# decode_digit(cp) returns the numeric value of a basic code
|
60
|
+
# point (for use in representing integers) in the range 0 to
|
61
|
+
# base-1, or base if cp does not represent a value.
|
62
|
+
def decode_digit(cp)
  # The C sample code relies on unsigned wrap-around (cp - 48 < 10 is false
  # for cp < 48). Ruby integers are signed, so that form accepted '-', ' ',
  # control characters, etc. as digits (some with negative values). Explicit
  # ranges restore the intended mapping; everything else yields BASE, which
  # callers treat as "not a digit".
  case cp
  when 48..57  then cp - 22   # '0'..'9' -> 26..35
  when 65..90  then cp - 65   # 'A'..'Z' -> 0..25
  when 97..122 then cp - 97   # 'a'..'z' -> 0..25
  else BASE
  end
end
|
66
|
+
|
67
|
+
# encode_digit(d,flag) returns the basic code point whose value
|
68
|
+
# (when used for representing integers) is d, which needs to be in
|
69
|
+
# the range 0 to base-1. The lowercase form is used unless flag is
|
70
|
+
# nonzero, in which case the uppercase form is used. The behavior
|
71
|
+
# is undefined if flag is nonzero and digit d has no uppercase form.
|
72
|
+
def encode_digit(d, flag)
  # 0..25 map to ASCII a..z (offset 97); 26..35 map to ASCII 0..9 (offset 22).
  code = d < 26 ? d + 97 : d + 22
  # A truthy flag lowers the result by 32, i.e. selects the uppercase letter.
  flag ? code - 32 : code
end
|
77
|
+
|
78
|
+
# flagged(bcp) tests whether a basic code point is flagged
|
79
|
+
# (uppercase). The behavior is undefined if bcp is not a
|
80
|
+
# basic code point.
|
81
|
+
def flagged(bcp)
  # True exactly for 65..90, the ASCII uppercase letters.
  bcp >= 65 && bcp < 91
end
|
84
|
+
|
85
|
+
# encode_basic(bcp,flag) forces a basic code point to lowercase
|
86
|
+
# if flag is zero, uppercase if flag is nonzero, and returns
|
87
|
+
# the resulting code point. The code point is unchanged if it
|
88
|
+
# is caseless. The behavior is undefined if bcp is not a basic
|
89
|
+
# code point.
|
90
|
+
def encode_basic(bcp, flag)
  # Step 1: fold a lowercase ASCII letter (97..122) to uppercase.
  upper = (bcp >= 97 && bcp < 123) ? bcp - 32 : bcp
  # Step 2: a truthy flag keeps the uppercase form.
  return upper if flag

  # Otherwise fold an uppercase letter (65..90) back to lowercase; values
  # that are not letters pass through both steps unchanged.
  (upper >= 65 && upper < 91) ? upper + 32 : upper
end
|
101
|
+
|
102
|
+
# *** Platform-specific constants ***
|
103
|
+
|
104
|
+
# maxint is the maximum value of a punycode_uint variable:
|
105
|
+
MAXINT = 1 << 64
|
106
|
+
|
107
|
+
# *** Bias adaptation function ***
|
108
|
+
|
109
|
+
def adapt(delta, numpoints, firsttime)
  span = BASE - TMIN
  limit = (span * TMAX) / 2

  # Damp the very first delta; halve every later one.
  scaled = firsttime ? delta / DAMP : delta >> 1
  scaled += scaled / numpoints

  # Divide down until the value is at or below the limit, adding BASE to the
  # bias offset for every division performed.
  offset = 0
  until scaled <= limit
    scaled /= span
    offset += BASE
  end

  offset + (span + 1) * scaled / (scaled + SKEW)
end
|
122
|
+
|
123
|
+
# *** Main encode function ***
|
124
|
+
|
125
|
+
# punycode_encode() converts Unicode to Punycode. The input
|
126
|
+
# is represented as an array of Unicode code points (not code
|
127
|
+
# units; surrogate pairs are not allowed), and the output
|
128
|
+
# will be represented as an array of ASCII code points. The
|
129
|
+
# output string is *not* null-terminated; it will contain
|
130
|
+
# zeros if and only if the input contains zeros. (Of course
|
131
|
+
# the caller can leave room for a terminator and add one if
|
132
|
+
# needed.) The input_length is the number of code points in
|
133
|
+
# the input. The output_length is an in/out argument: the
|
134
|
+
# caller passes in the maximum number of code points that it
|
135
|
+
# can receive, and on successful return it will contain the
|
136
|
+
# number of code points actually output. The case_flags array
|
137
|
+
# holds input_length boolean values, where nonzero suggests that
|
138
|
+
# the corresponding Unicode character be forced to uppercase
|
139
|
+
# after being decoded (if possible), and zero suggests that
|
140
|
+
# it be forced to lowercase (if possible). ASCII code points
|
141
|
+
# are encoded literally, except that ASCII letters are forced
|
142
|
+
# to uppercase or lowercase according to the corresponding
|
143
|
+
# uppercase flags. If case_flags is a null pointer then ASCII
|
144
|
+
# letters are left as they are, and other code points are
|
145
|
+
# treated as if their uppercase flags were zero. The return
|
146
|
+
# value can be any of the punycode_status values defined above
|
147
|
+
# except punycode_bad_input; if not punycode_success, then
|
148
|
+
# output_size and output might contain garbage.
|
149
|
+
# Encodes the first input_length code points of input into Punycode.
#
# input         - array of integer code points.
# case_flags    - nil, or an array indexed like input; its entries are handed
#                 to encode_basic / encode_digit as the uppercase flag.
# output_length - one-element array: holds the output capacity on entry and
#                 is overwritten with the number of values written.
# output        - array that receives the encoded ASCII code points.
#
# Returns PunycodeSuccess; raises PunycodeBigOutput or PunycodeOverflow.
def punycode_encode(input_length, input, case_flags, output_length, output)
  # Initialize the state:
  n = INITIAL_N
  bias = INITIAL_BIAS
  delta = 0
  out = 0
  max_out = output_length[0]

  # Handle the basic code points: copy them straight to the output.
  0.upto(input_length - 1) do |idx|
    cp = input[idx]
    next unless basic(cp)

    raise PunycodeBigOutput if max_out - out < 2
    output[out] = case_flags ? encode_basic(cp, case_flags[idx]) : cp
    out += 1
  end

  # h counts the code points handled so far, b the basic code points, and
  # out the values written to output.
  b = out
  h = out

  unless b == 0
    output[out] = DELIMITER
    out += 1
  end

  # Main encoding loop:
  until h >= input_length
    # All code points below n have been handled already.
    # Find the smallest remaining one:
    m = MAXINT
    0.upto(input_length - 1) do |idx|
      cp = input[idx]
      m = cp if cp >= n && cp < m
    end

    # Increase delta enough to advance the decoder's <n,i> state to <m,0>,
    # but guard against overflow:
    raise PunycodeOverflow if m - n > (MAXINT - delta) / (h + 1)
    delta += (m - n) * (h + 1)
    n = m

    0.upto(input_length - 1) do |idx|
      cp = input[idx]
      if cp < n
        delta += 1
        raise PunycodeOverflow if delta == 0
      elsif cp == n
        # Represent delta as a generalized variable-length integer:
        q = delta
        k = BASE
        loop do
          raise PunycodeBigOutput if out >= max_out
          # Threshold: k - bias clamped into TMIN..TMAX.
          t = [[k - bias, TMIN].max, TMAX].min
          break if q < t

          radix = BASE - t
          output[out] = encode_digit(t + (q - t) % radix, false)
          out += 1
          q = (q - t) / radix
          k += BASE
        end

        # The final digit carries the case flag of this code point.
        output[out] = encode_digit(q, case_flags && case_flags[idx])
        out += 1
        bias = adapt(delta, h + 1, h == b)
        delta = 0
        h += 1
      end
    end

    delta += 1
    n += 1
  end

  output_length[0] = out
  PunycodeSuccess
end
|
247
|
+
|
248
|
+
# *** Main decode function ***
|
249
|
+
|
250
|
+
# punycode_decode() converts Punycode to Unicode. The input is
|
251
|
+
# represented as an array of ASCII code points, and the output
|
252
|
+
# will be represented as an array of Unicode code points. The
|
253
|
+
# input_length is the number of code points in the input. The
|
254
|
+
# output_length is an in/out argument: the caller passes in
|
255
|
+
# the maximum number of code points that it can receive, and
|
256
|
+
# on successful return it will contain the actual number of
|
257
|
+
# code points output. The case_flags array needs room for at
|
258
|
+
# least output_length values, or it can be a null pointer if the
|
259
|
+
# case information is not needed. A nonzero flag suggests that
|
260
|
+
# the corresponding Unicode character be forced to uppercase
|
261
|
+
# by the caller (if possible), while zero suggests that it be
|
262
|
+
# forced to lowercase (if possible). ASCII code points are
|
263
|
+
# output already in the proper case, but their flags will be set
|
264
|
+
# appropriately so that applying the flags would be harmless.
|
265
|
+
# The return value can be any of the punycode_status values
|
266
|
+
# defined above; if not punycode_success, then output_length,
|
267
|
+
# output, and case_flags might contain garbage. On success, the
|
268
|
+
# decoder will never need to write an output_length greater than
|
269
|
+
# input_length, because of how the encoding is defined.
|
270
|
+
# Decodes the first input_length ASCII code points of input from Punycode.
#
# output_length - one-element array: holds the output capacity on entry and
#                 is overwritten with the number of code points produced.
# output        - array that receives the decoded code points.
# case_flags    - nil, or an array that receives one flagged() result per
#                 output position.
#
# Returns PunycodeSuccess; raises PunycodeBadInput, PunycodeBigOutput or
# PunycodeOverflow.
def punycode_decode(input_length, input, output_length, output, case_flags)
  # Initialize the state:
  n = INITIAL_N
  bias = INITIAL_BIAS
  i = 0
  out = 0
  max_out = output_length[0]

  # Let b be the index of the last delimiter (0 if there is none), then copy
  # the first b code points to the output.
  b = 0
  0.upto(input_length - 1) { |idx| b = idx if delim(input[idx]) }
  raise PunycodeBigOutput if b > max_out

  0.upto(b - 1) do |idx|
    cp = input[idx]
    case_flags[out] = flagged(cp) if case_flags
    raise PunycodeBadInput unless basic(cp)

    output[out] = cp
    out += 1
  end

  # Main decoding loop: start just after the last delimiter if any basic
  # code points were copied; start at the beginning otherwise.
  in_ = b > 0 ? b + 1 : 0
  until in_ >= input_length
    # in_ is the index of the next value to consume; out is the number of
    # code points already in the output array.

    # Decode a generalized variable-length integer into i; the increment
    # relative to oldi is the delta fed to adapt below.
    oldi = i
    w = 1
    k = BASE
    loop do
      raise PunycodeBadInput if in_ >= input_length
      digit = decode_digit(input[in_])
      in_ += 1
      raise PunycodeBadInput if digit >= BASE
      raise PunycodeOverflow if digit > (MAXINT - i) / w
      i += digit * w

      # Threshold: k - bias clamped into TMIN..TMAX.
      t = [[k - bias, TMIN].max, TMAX].min
      break if digit < t

      radix = BASE - t
      raise PunycodeOverflow if w > MAXINT / radix
      w *= radix
      k += BASE
    end

    bias = adapt(i - oldi, out + 1, oldi == 0)

    # i was supposed to wrap around from out+1 to 0, incrementing n each
    # time, so split it into the wrap count and the insertion position:
    wraps, position = i.divmod(out + 1)
    raise PunycodeOverflow if wraps > MAXINT - n
    n += wraps
    i = position

    # Insert n at position i of the output:
    raise PunycodeBigOutput if out >= max_out

    tail = out - i
    if case_flags
      # Shift the flags at i.. one slot to the right (memmove equivalent).
      case_flags[i + 1, tail] = case_flags[i, tail]

      # Case of last character determines uppercase flag:
      case_flags[i] = flagged(input[in_ - 1])
    end

    # Shift the code points at i.. one slot to the right, then store n.
    output[i + 1, tail] = output[i, tail]
    output[i] = n
    i += 1
    out += 1
  end

  output_length[0] = out
  PunycodeSuccess
end
|
366
|
+
|
367
|
+
# Encodes a string (unpacked with 'U*' into code points) to Punycode.
#
# case_flags       - passed through to punycode_encode unchanged.
# print_ascii_only - when truthy, every output value is replaced by its
#                    PRINT_ASCII entry before the result string is built.
#
# Raises Error if punycode_encode produced a value outside 0..127 and
# PunycodeBadInput if PRINT_ASCII has no entry for a value.
def encode(unicode_string, case_flags=nil, print_ascii_only=false)
  code_points = unicode_string.unpack('U*')
  buffer = Array.new(ACE_MAX_LENGTH + 1, 0)
  length_box = [ACE_MAX_LENGTH]

  punycode_encode(code_points.size, code_points, case_flags, length_box, buffer)

  written = length_box[0]
  (0...written).each do |pos|
    value = buffer[pos]
    raise Error, "assertion error: invalid output char" if value < 0 || value > 127

    native = PRINT_ASCII[value]
    raise PunycodeBadInput unless native

    buffer[pos] = native if print_ascii_only
  end

  # The inclusive slice takes one slot past the written values; trailing NUL
  # characters are stripped from the joined string.
  buffer[0..written].map { |x| x.chr }.join('').sub(/\0+\z/, '')
end
|
388
|
+
|
389
|
+
# Decodes a Punycode string and returns the code points packed with 'U*'.
#
# case_flags - array handed to Punycode.punycode_decode unchanged.
#
# Raises PunycodeBigOutput when the input is longer than ACE_MAX_LENGTH * 2
# and PunycodeBadInput when it contains a byte above 127.
def decode(punycode, case_flags=[])
  raise PunycodeBigOutput if punycode.size > ACE_MAX_LENGTH * 2

  ascii = []
  punycode.each_byte do |byte|
    raise PunycodeBadInput if byte < 0 || byte > 127

    ascii << byte
  end

  code_points = []
  Punycode.punycode_decode(ascii.length, ascii, [UNICODE_MAX_LENGTH],
                           code_points, case_flags)
  code_points.pack('U*')
end
|
408
|
+
|
409
|
+
UNICODE_MAX_LENGTH = 256
|
410
|
+
ACE_MAX_LENGTH = 256
|
411
|
+
|
412
|
+
# The following string is used to convert printable
|
413
|
+
# characters between ASCII and the native charset:
|
414
|
+
|
415
|
+
PRINT_ASCII =
|
416
|
+
"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
|
417
|
+
"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
|
418
|
+
" !\"\#$%&'()*+,-./" \
|
419
|
+
"0123456789:;<=>?" \
|
420
|
+
"@ABCDEFGHIJKLMNO" \
|
421
|
+
"PQRSTUVWXYZ[\\]^_" \
|
422
|
+
"`abcdefghijklmno" \
|
423
|
+
"pqrstuvwxyz{|}~\n"
|
424
|
+
end
|
425
|
+
|
426
|
+
if __FILE__ == $0
|
427
|
+
UNICODE_MAX_LENGTH = Punycode::UNICODE_MAX_LENGTH
|
428
|
+
ACE_MAX_LENGTH = Punycode::ACE_MAX_LENGTH
|
429
|
+
|
430
|
+
def usage(argv)
|
431
|
+
STDERR.puts <<-USAGE
|
432
|
+
#{argv[0]} -e reads code points and writes a Punycode string.
|
433
|
+
#{argv[0]} -d reads a Punycode string and writes code points.
|
434
|
+
|
435
|
+
Input and output are plain text in the native character set.
|
436
|
+
Code points are in the form u+hex separated by whitespace.
|
437
|
+
Although the specification allows Punycode strings to contain
|
438
|
+
any characters from the ASCII repertoire, this test code
|
439
|
+
supports only the printable characters, and needs the Punycode
|
440
|
+
string to be followed by a newline.
|
441
|
+
The case of the u in u+hex is the force-to-uppercase flag.
|
442
|
+
USAGE
|
443
|
+
exit(false)
|
444
|
+
end
|
445
|
+
|
446
|
+
TOO_BIG = "input or output is too large, recompile with larger limits"
|
447
|
+
INVALID_INPUT = "invalid input"
|
448
|
+
OVERFLOW = "arithmetic overflow"
|
449
|
+
IO_ERROR = "I/O error"
|
450
|
+
|
451
|
+
PRINT_ASCII = Punycode::PRINT_ASCII
|
452
|
+
|
453
|
+
def main(argv)
|
454
|
+
case_flags = [0] * UNICODE_MAX_LENGTH
|
455
|
+
|
456
|
+
usage(argv) if argv.size != 2
|
457
|
+
usage(argv) if /\A-[de]\z/ !~ argv[1]
|
458
|
+
|
459
|
+
if argv[1][1] == ?e
|
460
|
+
input = [0] * UNICODE_MAX_LENGTH
|
461
|
+
output = [0] * (ACE_MAX_LENGTH+1)
|
462
|
+
|
463
|
+
# Read the input code points:
|
464
|
+
|
465
|
+
input_length = 0
|
466
|
+
|
467
|
+
STDIN.read.scan(/([uU]\+)([0-9a-fA-F]+)/) do |uplus, codept|
|
468
|
+
codept = codept.hex
|
469
|
+
if uplus[1] != ?+ || codept > Punycode::MAXINT
|
470
|
+
fail(INVALID_INPUT)
|
471
|
+
end
|
472
|
+
|
473
|
+
fail(TOO_BIG) if input_length == UNICODE_MAX_LENGTH
|
474
|
+
|
475
|
+
if uplus[0] == ?u
|
476
|
+
case_flags[input_length] = false
|
477
|
+
elsif uplus[0] == ?U
|
478
|
+
case_flags[input_length] = true
|
479
|
+
else
|
480
|
+
fail(INVALID_INPUT)
|
481
|
+
end
|
482
|
+
|
483
|
+
input[input_length] = codept
|
484
|
+
input_length+=1
|
485
|
+
end
|
486
|
+
|
487
|
+
# Encode:
|
488
|
+
|
489
|
+
output_length = [ACE_MAX_LENGTH]
|
490
|
+
begin
|
491
|
+
status = Punycode.punycode_encode(input_length, input, case_flags,
|
492
|
+
output_length, output)
|
493
|
+
rescue Punycode::Status::PunycodeBadInput
|
494
|
+
fail(INVALID_INPUT)
|
495
|
+
rescue Punycode::Status::PunycodeBigOutput
|
496
|
+
fail(TOO_BIG)
|
497
|
+
rescue Punycode::Status::PunycodeOverflow
|
498
|
+
fail(OVERFLOW)
|
499
|
+
end
|
500
|
+
if status != Punycode::Status::PunycodeSuccess
|
501
|
+
fail("assertion error: unknown status")
|
502
|
+
end
|
503
|
+
|
504
|
+
# Convert to native charset and output:
|
505
|
+
|
506
|
+
outlen = output_length[0]
|
507
|
+
outlen.times do |j|
|
508
|
+
c = output[j]
|
509
|
+
raise "assertion error: invalid output char" unless c >= 0 && c <= 127
|
510
|
+
unless PRINT_ASCII[c]
|
511
|
+
fail(INVALID_INPUT)
|
512
|
+
end
|
513
|
+
output[j] = PRINT_ASCII[c]
|
514
|
+
end
|
515
|
+
|
516
|
+
output = output[0..outlen].map{|x|x.chr}.join('').sub(/\0+\z/, '')
|
517
|
+
puts(output)
|
518
|
+
exit(true)
|
519
|
+
end
|
520
|
+
|
521
|
+
if argv[1][1] == ?d
|
522
|
+
#input = [0] * ACE_MAX_LENGTH*2
|
523
|
+
#output = [0] * UNICODE_MAX_LENGTH
|
524
|
+
output = []
|
525
|
+
|
526
|
+
input = STDIN.gets.split(//)[0,ACE_MAX_LENGTH*2]
|
527
|
+
fail(TOO_BIG) if input[-1] != "\n"
|
528
|
+
input = input[0...-1]
|
529
|
+
input.each_with_index do |c, i|
|
530
|
+
print_ascii_index = PRINT_ASCII.index(c)
|
531
|
+
fail(INVALID_INPUT) unless print_ascii_index
|
532
|
+
input[i] = print_ascii_index
|
533
|
+
end
|
534
|
+
|
535
|
+
# Decode:
|
536
|
+
|
537
|
+
output_length = [UNICODE_MAX_LENGTH]
|
538
|
+
begin
|
539
|
+
status = Punycode.punycode_decode(input.length, input, output_length,
|
540
|
+
output, case_flags)
|
541
|
+
rescue Punycode::Status::PunycodeBadInput
|
542
|
+
fail(INVALID_INPUT)
|
543
|
+
rescue Punycode::Status::PunycodeBigOutput
|
544
|
+
fail(TOO_BIG)
|
545
|
+
rescue Punycode::Status::PunycodeOverflow
|
546
|
+
fail(OVERFLOW)
|
547
|
+
end
|
548
|
+
if status != Punycode::Status::PunycodeSuccess
|
549
|
+
fail("assertion error: unknown status")
|
550
|
+
end
|
551
|
+
|
552
|
+
# Output the result:
|
553
|
+
|
554
|
+
output_length[0].times do |j|
|
555
|
+
printf("%s+%04X\n", case_flags[j] ? "U" : "u", output[j])
|
556
|
+
end
|
557
|
+
|
558
|
+
exit(true)
|
559
|
+
end
|
560
|
+
|
561
|
+
usage(argv)
|
562
|
+
raise "not reached"
|
563
|
+
end
|
564
|
+
main([$0]+ARGV)
|
565
|
+
end
|
@@ -0,0 +1,311 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
# test of punycode.rb
|
4
|
+
#
|
5
|
+
# copyright (c) 2005 Kazuhiro NISHIYAMA
|
6
|
+
# You can redistribute it and/or modify it under the same terms as Ruby.
|
7
|
+
#
|
8
|
+
require 'test/unit'
|
9
|
+
|
10
|
+
module AssertPunycode
|
11
|
+
# Splits an example text into its parts and forwards them to
# assert_punycode_main: the first line is the description, the last line
# (stripped, without the "Punycode: " prefix) is the expected Punycode, and
# the lines in between are joined into the code point text. A backslash
# followed by a newline and any whitespace is removed before splitting.
def assert_punycode(example)
  lines = example.gsub(/\\\n\s*/, "").split(/\n/)
  title = lines[0]
  codepoint_text = lines[1...-1].join("")
  expected = lines[-1].strip.sub(/^Punycode: /, "")

  assert_punycode_main(title, codepoint_text, expected)
end
|
19
|
+
|
20
|
+
def test_rfc3492_7_1_A
|
21
|
+
assert_punycode(<<-EXAMPLE)
|
22
|
+
(A) Arabic (Egyptian):
|
23
|
+
u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644
|
24
|
+
u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F
|
25
|
+
Punycode: egbpdaj6bu4bxfgehfvwxn
|
26
|
+
EXAMPLE
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_rfc3492_7_1_B
|
30
|
+
assert_punycode(<<-EXAMPLE)
|
31
|
+
(B) Chinese (simplified):
|
32
|
+
u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587
|
33
|
+
Punycode: ihqwcrb4cv8a8dqg056pqjye
|
34
|
+
EXAMPLE
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_rfc3492_7_1_C
|
38
|
+
assert_punycode(<<-EXAMPLE)
|
39
|
+
(C) Chinese (traditional):
|
40
|
+
u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587
|
41
|
+
Punycode: ihqwctvzc91f659drss3x8bo0yb
|
42
|
+
EXAMPLE
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_rfc3492_7_1_D
|
46
|
+
assert_punycode(<<-EXAMPLE)
|
47
|
+
(D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
|
48
|
+
U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074
|
49
|
+
u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D
|
50
|
+
u+0065 u+0073 u+006B u+0079
|
51
|
+
Punycode: Proprostnemluvesky-uyb24dma41a
|
52
|
+
EXAMPLE
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_rfc3492_7_1_E
|
56
|
+
assert_punycode(<<-EXAMPLE)
|
57
|
+
(E) Hebrew:
|
58
|
+
u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8
|
59
|
+
u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2
|
60
|
+
u+05D1 u+05E8 u+05D9 u+05EA
|
61
|
+
Punycode: 4dbcagdahymbxekheh6e0a7fei0b
|
62
|
+
EXAMPLE
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_rfc3492_7_1_F
|
66
|
+
assert_punycode(<<-EXAMPLE)
|
67
|
+
(F) Hindi (Devanagari):
|
68
|
+
u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D
|
69
|
+
u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939
|
70
|
+
u+0940 u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947
|
71
|
+
u+0939 u+0948 u+0902
|
72
|
+
Punycode: i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd
|
73
|
+
EXAMPLE
|
74
|
+
end
|
75
|
+
|
76
|
+
def test_rfc3492_7_1_G
|
77
|
+
assert_punycode(<<-EXAMPLE)
|
78
|
+
(G) Japanese (kanji and hiragana):
|
79
|
+
u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092
|
80
|
+
u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B
|
81
|
+
Punycode: n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa
|
82
|
+
EXAMPLE
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_rfc3492_7_1_H
|
86
|
+
assert_punycode(<<-EXAMPLE)
|
87
|
+
(H) Korean (Hangul syllables):
|
88
|
+
u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774
|
89
|
+
u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74
|
90
|
+
u+C5BC u+B9C8 u+B098 u+C88B u+C744 u+AE4C
|
91
|
+
Punycode: 989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j\\
|
92
|
+
psd879ccm6fea98c
|
93
|
+
EXAMPLE
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_rfc3492_7_1_I
|
97
|
+
if self.class.to_s == 'TestPunycodeEncodeLib'
|
98
|
+
if __FILE__ == $0 || $VERBOSE || $DEBUG
|
99
|
+
STDERR.puts "SKIP KNOWN BUG: downcase D in Punycode in encode test without case_flags."
|
100
|
+
end
|
101
|
+
return
|
102
|
+
end
|
103
|
+
assert_punycode(<<-EXAMPLE)
|
104
|
+
KNOWN BUG: downcase D in Punycode in encode test without case_flags. \\
|
105
|
+
(I) Russian (Cyrillic):
|
106
|
+
U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E
|
107
|
+
u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440
|
108
|
+
u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A
|
109
|
+
u+0438
|
110
|
+
Punycode: b1abfaaepdrnnbgefbaDotcwatmq2g4l
|
111
|
+
EXAMPLE
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_rfc3492_7_1_I_downcase
|
115
|
+
assert_punycode(<<-EXAMPLE)
|
116
|
+
(I) Russian (Cyrillic): (downcase first U in Codepoints and D in Punycode)
|
117
|
+
u+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E
|
118
|
+
u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440
|
119
|
+
u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A
|
120
|
+
u+0438
|
121
|
+
Punycode: b1abfaaepdrnnbgefbadotcwatmq2g4l
|
122
|
+
EXAMPLE
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_rfc3492_7_1_J
|
126
|
+
assert_punycode(<<-EXAMPLE)
|
127
|
+
(J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
|
128
|
+
U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070
|
129
|
+
u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070
|
130
|
+
u+006C u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061
|
131
|
+
u+0062 u+006C u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070
|
132
|
+
u+0061 u+00F1 u+006F u+006C
|
133
|
+
Punycode: PorqunopuedensimplementehablarenEspaol-fmd56a
|
134
|
+
EXAMPLE
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_rfc3492_7_1_K
|
138
|
+
assert_punycode(<<-EXAMPLE)
|
139
|
+
(K) Vietnamese:\\
|
140
|
+
T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\\
|
141
|
+
<ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
|
142
|
+
U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B
|
143
|
+
u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068
|
144
|
+
u+1EC9 u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067
|
145
|
+
U+0056 u+0069 u+1EC7 u+0074
|
146
|
+
Punycode: TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g
|
147
|
+
EXAMPLE
|
148
|
+
end
|
149
|
+
|
150
|
+
def test_rfc3492_7_1_L
|
151
|
+
assert_punycode(<<-EXAMPLE)
|
152
|
+
(L) 3<nen>B<gumi><kinpachi><sensei>
|
153
|
+
u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F
|
154
|
+
Punycode: 3B-ww4c5e180e575a65lsy2b
|
155
|
+
EXAMPLE
|
156
|
+
end
|
157
|
+
|
158
|
+
def test_rfc3492_7_1_M
|
159
|
+
assert_punycode(<<-EXAMPLE)
|
160
|
+
(M) <amuro><namie>-with-SUPER-MONKEYS
|
161
|
+
u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074
|
162
|
+
u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D
|
163
|
+
U+004F U+004E U+004B U+0045 U+0059 U+0053
|
164
|
+
Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
|
165
|
+
EXAMPLE
|
166
|
+
end
|
167
|
+
|
168
|
+
def test_rfc3492_7_1_N
|
169
|
+
assert_punycode(<<-EXAMPLE)
|
170
|
+
(N) Hello-Another-Way-<sorezore><no><basho>
|
171
|
+
U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F
|
172
|
+
u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D
|
173
|
+
u+305D u+308C u+305E u+308C u+306E u+5834 u+6240
|
174
|
+
Punycode: Hello-Another-Way--fc4qua05auwb3674vfr0b
|
175
|
+
EXAMPLE
|
176
|
+
end
|
177
|
+
|
178
|
+
def test_rfc3492_7_1_O
|
179
|
+
assert_punycode(<<-EXAMPLE)
|
180
|
+
(O) <hitotsu><yane><no><shita>2
|
181
|
+
u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032
|
182
|
+
Punycode: 2-u9tlzr9756bt3uc0v
|
183
|
+
EXAMPLE
|
184
|
+
end
|
185
|
+
|
186
|
+
def test_rfc3492_7_1_P
|
187
|
+
assert_punycode(<<-EXAMPLE)
|
188
|
+
(P) Maji<de>Koi<suru>5<byou><mae>
|
189
|
+
U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059
|
190
|
+
u+308B u+0035 u+79D2 u+524D
|
191
|
+
Punycode: MajiKoi5-783gue6qz075azm5e
|
192
|
+
EXAMPLE
|
193
|
+
end
|
194
|
+
|
195
|
+
def test_rfc3492_7_1_Q
|
196
|
+
assert_punycode(<<-EXAMPLE)
|
197
|
+
(Q) <pafii>de<runba>
|
198
|
+
u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0
|
199
|
+
Punycode: de-jg4avhby1noc0d
|
200
|
+
EXAMPLE
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_rfc3492_7_1_R
|
204
|
+
assert_punycode(<<-EXAMPLE)
|
205
|
+
(R) <sono><supiido><de>
|
206
|
+
u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067
|
207
|
+
Punycode: d9juau41awczczp
|
208
|
+
EXAMPLE
|
209
|
+
end
|
210
|
+
|
211
|
+
def test_rfc3492_7_1_S
|
212
|
+
assert_punycode(<<-EXAMPLE)
|
213
|
+
(S) -> $1.00 <-
|
214
|
+
u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020
|
215
|
+
u+003C u+002D
|
216
|
+
Punycode: -> $1.00 <--
|
217
|
+
EXAMPLE
|
218
|
+
end
|
219
|
+
|
220
|
+
# Path of the Ruby interpreter used to run punycode.rb as a subprocess,
# built from the interpreter's own build configuration. Falls back to plain
# "ruby" (resolved through PATH) when rbconfig cannot be loaded.
RUBY_BIN =
  begin
    require "rbconfig"
    # RbConfig is the supported name for the build configuration. The legacy
    # top-level Config alias is not available on current Rubies, where
    # referencing it raises NameError (which `rescue LoadError` would not
    # catch). Keep Config only as a fallback for very old interpreters.
    ruby_config = defined?(RbConfig) ? RbConfig::CONFIG : Config::CONFIG
    File.join(
      ruby_config["bindir"],
      ruby_config["ruby_install_name"] + ruby_config["EXEEXT"]
    )
  rescue LoadError
    "ruby"
  end
|
230
|
+
PUNYCODE_RB =
|
231
|
+
if File.exist?('punycode.rb')
|
232
|
+
'punycode.rb'
|
233
|
+
else
|
234
|
+
File.expand_path(File.join('..', 'lib', 'punycode.rb'),
|
235
|
+
File.dirname(__FILE__))
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
class TestPunycodeEncode < Test::Unit::TestCase
|
240
|
+
include AssertPunycode
|
241
|
+
|
242
|
+
def assert_punycode_main(description, codepoints, punycode)
|
243
|
+
IO.popen("#{RUBY_BIN} '#{PUNYCODE_RB}' -e", "r+") do |io|
|
244
|
+
io.puts codepoints
|
245
|
+
io.close_write
|
246
|
+
assert_equal(punycode, io.gets.chomp, description)
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
class TestPunycodeDecode < Test::Unit::TestCase
|
252
|
+
include AssertPunycode
|
253
|
+
|
254
|
+
def assert_punycode_main(description, codepoints, punycode)
|
255
|
+
IO.popen("#{RUBY_BIN} '#{PUNYCODE_RB}' -d", "r+") do |io|
|
256
|
+
io.puts punycode
|
257
|
+
io.close_write
|
258
|
+
assert_equal(codepoints.strip.gsub(/\s+/, "\n"),
|
259
|
+
io.read.strip, description)
|
260
|
+
end
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
if File.executable?("./punycode")
|
265
|
+
class TestPunycodeEncodeBin < Test::Unit::TestCase
|
266
|
+
include AssertPunycode
|
267
|
+
|
268
|
+
def assert_punycode_main(description, codepoints, punycode)
|
269
|
+
IO.popen("./punycode -e", "r+") do |io|
|
270
|
+
io.puts codepoints
|
271
|
+
io.close_write
|
272
|
+
assert_equal(punycode, io.gets.chomp, description)
|
273
|
+
end
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
class TestPunycodeDecodeBin < Test::Unit::TestCase
|
278
|
+
include AssertPunycode
|
279
|
+
|
280
|
+
def assert_punycode_main(description, codepoints, punycode)
|
281
|
+
IO.popen("./punycode -d", "r+") do |io|
|
282
|
+
io.puts punycode
|
283
|
+
io.close_write
|
284
|
+
assert_equal(codepoints.strip.gsub(/\s+/, "\n"),
|
285
|
+
io.read.strip, description)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
end
|
289
|
+
end
|
290
|
+
|
291
|
+
begin
|
292
|
+
require 'punycode'
|
293
|
+
class TestPunycodeEncodeLib < Test::Unit::TestCase
|
294
|
+
include AssertPunycode
|
295
|
+
|
296
|
+
def assert_punycode_main(description, codepoints, punycode)
|
297
|
+
unistring = codepoints.scan(/[0-9a-fA-F]+/).map{|x|x.hex}.pack('U*')
|
298
|
+
assert_equal(punycode, Punycode.encode(unistring), description)
|
299
|
+
end
|
300
|
+
end
|
301
|
+
|
302
|
+
class TestPunycodeDecodeLib < Test::Unit::TestCase
|
303
|
+
include AssertPunycode
|
304
|
+
|
305
|
+
def assert_punycode_main(description, codepoints, punycode)
|
306
|
+
unistring = codepoints.scan(/[0-9a-fA-F]+/).map{|x|x.hex}.pack('U*')
|
307
|
+
assert_equal(unistring, Punycode.decode(punycode), description)
|
308
|
+
end
|
309
|
+
end
|
310
|
+
rescue LoadError
|
311
|
+
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.2
|
3
|
+
specification_version: 1
|
4
|
+
name: punycode4r
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2007-02-21 00:00:00 +09:00
|
8
|
+
summary: pure Ruby implementing Punycode (RFC 3492)
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: zn@mbf.nifty.com
|
12
|
+
homepage:
|
13
|
+
rubyforge_project: rwiki
|
14
|
+
description:
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Kazuhiro NISHIYAMA
|
31
|
+
files:
|
32
|
+
- lib/punycode.rb
|
33
|
+
- test/test_punycode.rb
|
34
|
+
test_files: []
|
35
|
+
|
36
|
+
rdoc_options: []
|
37
|
+
|
38
|
+
extra_rdoc_files: []
|
39
|
+
|
40
|
+
executables: []
|
41
|
+
|
42
|
+
extensions: []
|
43
|
+
|
44
|
+
requirements: []
|
45
|
+
|
46
|
+
dependencies: []
|
47
|
+
|