uri-idna 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +184 -0
- data/lib/uri/idna/data/idna.rb +4692 -0
- data/lib/uri/idna/data/uts46.rb +8190 -0
- data/lib/uri/idna/intranges.rb +49 -0
- data/lib/uri/idna/process.rb +139 -0
- data/lib/uri/idna/punycode.rb +174 -0
- data/lib/uri/idna/uts46.rb +60 -0
- data/lib/uri/idna/validation/bidi.rb +93 -0
- data/lib/uri/idna/validation.rb +199 -0
- data/lib/uri/idna/version.rb +7 -0
- data/lib/uri/idna.rb +60 -0
- metadata +62 -0
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    # Compact representation of a set of integers as a sorted list of
    # half-open ranges. Each range [start, finish) is packed into a single
    # Integer: start in the bits above bit 31, finish in the low 32 bits.
    module Intranges
      class << self
        # Builds the packed range list for the given integers.
        #
        # Duplicate values are ignored so that every run of consecutive
        # integers yields exactly one range (previously a repeated value
        # produced redundant, overlapping ranges).
        #
        # @param list [Array<Integer>] integers in any order
        # @return [Array<Integer>] packed ranges in ascending order
        def from_list(list)
          list.sort.uniq
              .slice_when { |previous, current| current != previous + 1 }
              .map { |run| encode_range(run.first, run.last + 1) }
        end

        # Tests whether +int+ falls inside one of the packed +ranges+.
        #
        # @param int [Integer] value to look up
        # @param ranges [Array<Integer>] sorted packed ranges, as produced by from_list
        # @return [Boolean]
        def contain?(int, ranges)
          tuple = encode_range(int, 0)
          # Index of the first range that sorts after (int, 0); a range that
          # starts at +int+ always sorts after it.
          pos = ranges.bsearch_index { |x| x > tuple } || ranges.length
          # we could be immediately ahead of a tuple (start, end)
          # with start <= int < end
          if pos > 0
            left, right = decode_range(ranges[pos - 1])
            return true if left <= int && int < right
          end
          # or we could be immediately behind a tuple (int, end)
          if pos < ranges.length
            left, = decode_range(ranges[pos])
            return true if left == int
          end
          false
        end

        private

        # Packs a half-open range into one Integer.
        def encode_range(start, finish)
          (start << 32) | finish
        end

        # Unpacks an Integer produced by encode_range into [start, finish].
        def decode_range(r)
          [(r >> 32), (r & ((1 << 32) - 1))]
        end
      end
    end
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "punycode"
|
4
|
+
require_relative "validation"
|
5
|
+
|
6
|
+
require_relative "uts46"
|
7
|
+
|
8
|
+
module URI
  module IDNA
    # Converts a domain name between its Unicode and ASCII (A-label) forms,
    # one label at a time.
    #
    # Options read directly by this class: :uts46, :uts46_std3,
    # :uts46_transitional, :dns_length and :bidi. The whole options hash
    # (with :bidi replaced by the computed bidi decision) is handed to
    # Validation.
    class Process
      # Label separators recognised when UTS46 mapping is not applied.
      UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze

      # @param options [Hash] processing options (see class comment)
      def initialize(**options)
        @options = options
      end

      # Registration-style conversion.
      #
      # With only +ulabel+ given this is the same as #encode. With +alabel+
      # given, every label is converted with round-trip checking and
      # compared against +ulabel+ (when provided).
      #
      # @raise [ArgumentError] when neither argument is given or alabel is not a String
      # @raise [Error] when alabel does not start with ALABEL_PREFIX or conversion fails
      # @return [String] the ASCII form of the domain
      def register(alabel: nil, ulabel: nil)
        raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?

        return encode(ulabel) if alabel.nil?

        raise ArgumentError, "String expected" unless alabel.is_a?(String)
        raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)

        process_labels(alabel) do |l|
          to_alabel(l, roundtrip: true, ulabel: ulabel)
        end
      end

      # Lookup-style conversion to ASCII with round-trip checking of labels
      # that are already ASCII.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @return [String]
      def lookup(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) do |l|
          to_alabel(l, roundtrip: true)
        end
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts a domain to its ASCII form without round-trip checking.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @return [String]
      def encode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) { |l| to_alabel(l) }
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts a domain to its Unicode form.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @return [String]
      def decode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        process_labels(s) { |l| to_ulabel(l) }
      end

      private

      attr_reader :labels, :options

      # After UTS46 mapping a plain "." is used as the separator; otherwise
      # all of UNICODE_DOTS_REGEX is accepted.
      def splitter
        @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
      end

      # Splits +s+ into labels, yields each one to the block and joins the
      # results with ".". A single trailing dot is preserved.
      #
      # @raise [Error] on an empty domain or when the block returns an empty label
      def process_labels(s)
        s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional)) if options.fetch(:uts46, false)
        # Per-call state: both values are derived from the current input, so
        # they must not survive from an earlier call on the same instance
        # (memoizing them made a reused instance process stale labels).
        @labels = s.split(splitter, -1)
        @validation = nil
        trailing_dot = labels[-1] && labels[-1].empty? ? labels.pop : false

        raise Error, "Empty domain" if labels.empty? || labels == [""]

        result = []
        labels.each do |label|
          str = yield(label)
          raise Error, "Empty label" if str.empty?

          result << str
        end

        result << "" if trailing_dot
        result.join(".")
      end

      # Converts one label to its ASCII form.
      #
      # @param roundtrip [Boolean] when true, an ASCII input label must equal the result
      # @param ulabel [String, nil] expected Unicode form to compare against
      # @raise [Error] on mismatch, round-trip failure or an over-long label
      def to_alabel(label, roundtrip: false, ulabel: nil)
        orig_label = label
        # validate label is a valid U-label
        label = to_ulabel(label)
        if ulabel && ulabel != label
          raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
        end

        label = encode_punycode_label(label) unless label.ascii_only?
        validate_label_length(label)

        if roundtrip && orig_label.ascii_only? && orig_label != label
          raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
        end

        label
      end

      # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
      #
      # Decodes an ASCII label (if it is an A-label) and runs validation on
      # the resulting label.
      def to_ulabel(label)
        decoded = false
        label, decoded = decode_punycode_label(label) if label.ascii_only?
        validation.call(label, decoded: decoded)
        label
      end

      def encode_punycode_label(label)
        ALABEL_PREFIX + Punycode.encode(label)
      end

      # @return [Array(String, Boolean)] the (lower-cased) label and whether
      #   Punycode decoding was applied
      def decode_punycode_label(label)
        label = label.downcase
        return [label, false] unless label.start_with?(ALABEL_PREFIX)

        code = label[ALABEL_PREFIX.length..]
        raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
        raise Error, "A-label must not end with a hyphen" if code[-1] == "-"

        [URI::IDNA::Punycode.decode(code), true]
      end

      def validate_label_length(label)
        raise Error, "Label too long" unless label.length < 64
      end

      # One extra character is allowed when the domain ends with a dot.
      def validate_domain_length(s)
        raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
      end

      # Validator for the labels of the current call (rebuilt per call by
      # process_labels because the bidi decision depends on the labels).
      def validation
        @validation ||= Validation.new(options.merge(bidi: check_bidi?))
      end

      def check_bidi?
        options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
      end
    end
  end
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    # Punycode implementation based on a simplified version of RFC 3492
    # https://datatracker.ietf.org/doc/html/rfc3492#appendix-C
    module Punycode
      class << self
        BASE = 36
        TMIN = 1
        TMAX = 26
        SKEW = 38
        DAMP = 700
        INITIAL_BIAS = 72
        INITIAL_N = 0x80

        DELIMITER = 0x2D
        MAXINT = 0x7FFFFFFF

        # Maps a basic code point to its digit value.
        #
        # The RFC's C code relies on unsigned wrap-around (`cp - 48 < 10`);
        # Ruby integers are signed, so the ranges are tested explicitly.
        # Otherwise code points below "0" or between the digit/letter ranges
        # would be accepted as (possibly negative) digits.
        #
        # @param cp [Integer] code point
        # @return [Integer] 0..35 for a valid digit, BASE for anything else
        def decode_digit(cp)
          case cp
          when 48..57 then cp - 22  # "0".."9" => 26..35
          when 65..90 then cp - 65  # "A".."Z" => 0..25
          when 97..122 then cp - 97 # "a".."z" => 0..25
          else BASE
          end
        end

        # Maps a digit value (0..35) to its code point: 0..25 => "a".."z",
        # 26..35 => "0".."9".
        def encode_digit(d)
          d + 22 + 75 * (d < 26 ? 1 : 0)
        end

        # Bias adaptation function (RFC 3492, section 6.1).
        def adapt(delta, num_points, first_time)
          delta = first_time ? (delta / DAMP) : (delta >> 1)
          delta += (delta / num_points)

          k = 0
          while delta > (((BASE - TMIN) * TMAX) / 2)
            delta /= BASE - TMIN
            k += BASE
          end
          k + ((BASE - TMIN + 1) * delta / (delta + SKEW))
        end

        # Encodes a Unicode string as Punycode (without any ACE prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on arithmetic overflow
        def encode(input)
          input = input.codepoints

          n = INITIAL_N
          delta = 0
          bias = INITIAL_BIAS

          # Basic code points are copied through verbatim, in order.
          output = input.select { |char| basic?(char) }
          h = b = output.length

          output << DELIMITER if b > 0

          while h < input.length
            # Smallest code point not yet handled.
            m = MAXINT
            input.each do |char|
              m = char if char >= n && char < m
            end

            raise PunycodeError, "Arithmetic overflow" if m - n > (MAXINT - delta) / (h + 1)

            delta += (m - n) * (h + 1)
            n = m

            input.each do |char|
              if char < n
                delta += 1
                raise PunycodeError, "Arithmetic overflow" if delta > MAXINT
              end
              next unless char == n

              # Emit delta as a generalized variable-length integer.
              q = delta
              k = BASE
              loop do
                t = threshold(k, bias)
                break if q < t

                output << encode_digit(t + ((q - t) % (BASE - t)))
                q = (q - t) / (BASE - t)
                k += BASE
              end

              output << encode_digit(q)
              bias = adapt(delta, h + 1, h == b)
              delta = 0
              h += 1
            end

            delta += 1
            n += 1
          end
          output.pack("U*")
        end

        # Decodes a Punycode string (without any ACE prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on invalid digits, truncated input or overflow
        def decode(input)
          input = input.codepoints
          output = []

          n = INITIAL_N
          i = 0
          bias = INITIAL_BIAS

          # Everything before the last delimiter is literal basic code points.
          b = input.rindex(DELIMITER) || 0

          input[0, b].each do |char|
            raise PunycodeError, "Invalid input" unless basic?(char)

            output << char
          end

          inc = b > 0 ? b + 1 : 0
          while inc < input.length
            old_i = i
            w = 1
            k = BASE
            # Read one generalized variable-length integer into i.
            loop do
              raise PunycodeError, "Invalid input" if inc >= input.length

              digit = decode_digit(input[inc])
              inc += 1
              raise PunycodeError, "Invalid input" if digit >= BASE
              raise PunycodeError, "Arithmetic overflow" if digit > (MAXINT - i) / w

              i += digit * w
              t = threshold(k, bias)
              break if digit < t
              raise PunycodeError, "Arithmetic overflow" if w > MAXINT / (BASE - t)

              w *= BASE - t
              k += BASE
            end
            out = output.length
            bias = adapt(i - old_i, out + 1, old_i == 0)
            raise PunycodeError, "Arithmetic overflow" if (i / (out + 1)) > MAXINT - n

            n += i / (out + 1)
            i %= (out + 1)

            output.insert(i, n)

            i += 1
          end

          output.pack("U*")
        end

        private

        def basic?(codepoint)
          codepoint < 0x80
        end

        # Threshold t for position k: TMIN when k <= bias, TMAX when
        # k >= bias + TMAX, otherwise k - bias.
        def threshold(k, bias)
          (k - bias).clamp(TMIN, TMAX)
        end
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "data/uts46"
|
4
|
+
|
5
|
+
module URI
  module IDNA
    # UTS #46 mapping of domain strings, driven by the UTS46_DATA table.
    # Each table row is read as [_, status, replacement].
    module UTS46
      class << self
        # https://unicode.org/reports/tr46/#ProcessingStepMap
        #
        # Maps every character of +domain+ according to its status
        # (I = ignored, V = valid, M = mapped, D = deviation,
        # 3 = disallowed under STD3 rules) and returns the NFC-normalized
        # result.
        #
        # @param domain [String]
        # @param uts46_std3 [Boolean] reject characters with status "3"
        # @param uts46_transitional [Boolean] replace deviation characters
        # @return [String]
        # @raise [InvalidCodepointError] for a disallowed code point
        def map_string(domain, uts46_std3: true, uts46_transitional: false)
          # Single mutable buffer instead of a new String per character.
          output = +""
          domain.each_char do |char|
            code_point = char.ord
            _, status, replacement = uts46_status(code_point)
            case status
            when "I"
              next
            when "V"
              output << char
            when "M"
              output << replacement
            when "D"
              output << (uts46_transitional ? replacement : char)
            when "3"
              if uts46_std3
                raise InvalidCodepointError,
                      "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
              end

              output << (replacement || char)
            else
              raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
            end
          end
          output.unicode_normalize(:nfc)
        end

        # Whether +char+ has an acceptable status per the UTS #46 validity
        # criteria: "valid" always; "deviation" only for non-transitional
        # processing (transitional processing requires every value to be
        # valid). The previous implementation had this condition inverted.
        #
        # @param char [String] a single character
        # @param uts46_transitional [Boolean]
        # @return [Boolean]
        def valid?(char, uts46_transitional: false)
          _, status, = uts46_status(char.ord)
          return true if status == "V"
          return true if !uts46_transitional && status == "D"

          false
        end

        private

        # Looks up the table row for a code point. The first 256 rows are
        # indexed directly by code point; above that the row is the last one
        # whose first element is <= code_point.
        def uts46_status(code_point)
          index =
            if code_point < 256
              code_point
            else
              (UTS46_DATA.bsearch_index { |x| x[0] > code_point } || UTS46_DATA.length) - 1
            end
          UTS46_DATA[index] || []
        end
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    class Validation
      # 4.2.3.4. Labels Containing Characters Written Right to Left
      # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.4
      # https://datatracker.ietf.org/doc/html/rfc5893#section-2
      module Bidi
        class << self
          # Applies Bidi rules 1-6 to a single label.
          #
          # @param label [String]
          # @return [true] when every rule passes
          # @raise [BidiError] on the first rule that fails
          def call(label)
            first = label[0]
            # Bidi rule 1: the first character decides the label direction.
            right_to_left =
              if bidi_class(first, "RTL")
                true
              elsif bidi_class(first, "L")
                false
              else
                raise BidiError, "First codepoint in label #{label} must be directionality L, R or AL"
              end

            ends_ok = false
            numerals = nil
            label.each_char.with_index(1) do |char, position|
              unless right_to_left
                # Bidi rule 5
                if bidi_class(char, "RTL") || bidi_class(char, "AN")
                  raise BidiError, "Invalid direction for codepoint at position #{position} in a left-to-right label"
                end

                # Bidi rule 6: NSM characters leave the ending state untouched.
                if bidi_class(char, "L") || bidi_class(char, "EN")
                  ends_ok = true
                elsif !bidi_class(char, "NSM")
                  ends_ok = false
                end
                next
              end

              # Bidi rule 2
              if bidi_class(char, "L") || bidi_class(char, "UNUSED")
                raise BidiError, "Invalid direction for codepoint at position #{position} in a right-to-left label"
              end

              # Bidi rule 3: NSM characters leave the ending state untouched.
              direction = bidi_class(char, "RTL") || bidi_class(char, "EN") || bidi_class(char, "AN")
              if direction
                ends_ok = true
              elsif !bidi_class(char, "NSM")
                ends_ok = false
              end

              # Bidi rule 4: the first numeral class seen is the only one allowed.
              next unless direction == "EN" || direction == "AN"

              numerals ||= direction
              raise BidiError, "Can not mix numeral types in a right-to-left label" if numerals != direction
            end

            raise BidiError, "Label ends with illegal codepoint directionality" unless ends_ok

            true
          end

          # https://www.rfc-editor.org/rfc/rfc5891.html#section-4.2.3.4
          #
          # Whether any label (A-labels are Punycode-decoded first) contains a
          # character of class RTL or AN. A-labels that fail to decode count
          # as empty.
          #
          # @param labels [Array<String>]
          # @return [Boolean]
          def check?(labels)
            plain = labels.map { |label| unicode_form(label) }
            plain.join(".").each_char.any? do |char|
              bidi_class(char, "RTL") || bidi_class(char, "AN")
            end
          end

          private

          # Punycode-decoded form of an A-label ("" when decoding fails);
          # any other label is returned as is.
          def unicode_form(label)
            return label unless label.start_with?(ALABEL_PREFIX)

            begin
              Punycode.decode(label[ALABEL_PREFIX.length..])
            rescue StandardError
              ""
            end
          end

          # Returns the class name when the character's code point is in the
          # BIDI_CLASSES ranges for that class, false otherwise.
          def bidi_class(cp, bidi_class)
            Intranges.contain?(cp.ord, BIDI_CLASSES[bidi_class]) ? bidi_class : false
          end
        end
      end
    end
  end
end
|