uri-idna 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +184 -0
- data/lib/uri/idna/data/idna.rb +4692 -0
- data/lib/uri/idna/data/uts46.rb +8190 -0
- data/lib/uri/idna/intranges.rb +49 -0
- data/lib/uri/idna/process.rb +139 -0
- data/lib/uri/idna/punycode.rb +174 -0
- data/lib/uri/idna/uts46.rb +60 -0
- data/lib/uri/idna/validation/bidi.rb +93 -0
- data/lib/uri/idna/validation.rb +199 -0
- data/lib/uri/idna/version.rb +7 -0
- data/lib/uri/idna.rb +60 -0
- metadata +62 -0
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    # Compact representation of integer sets. Every run of consecutive
    # integers is packed into one Integer: the first value of the run sits in
    # the bits above bit 32 and the (exclusive) end of the run in the low
    # 32 bits.
    module Intranges
      class << self
        # Packs +list+ into an ascending array of encoded ranges.
        #
        # The list is sorted first, then split wherever two neighbours are not
        # consecutive; each resulting run becomes one encoded range.
        def from_list(list)
          list.sort.chunk_while { |a, b| a + 1 == b }.map do |run|
            encode_range(run.first, run.last + 1)
          end
        end

        # Tells whether +int+ is covered by one of the encoded +ranges+
        # (as produced by +from_list+). Uses a binary search, so +ranges+
        # has to be sorted.
        def contain?(int, ranges)
          probe = encode_range(int, 0)
          idx = ranges.bsearch_index { |packed| packed > probe } || ranges.length

          # The range right before the insertion point may begin at or before
          # +int+ and extend past it (start <= int < end).
          unless idx.zero?
            lower, upper = decode_range(ranges[idx - 1])
            return true if int >= lower && int < upper
          end

          # The range at the insertion point may begin exactly at +int+.
          if idx < ranges.length
            lower, = decode_range(ranges[idx])
            return true if lower == int
          end

          false
        end

        private

        # Packs a (start, finish) pair into a single Integer.
        def encode_range(start, finish)
          (start << 32) | finish
        end

        # Splits a packed Integer back into its [start, finish] pair.
        def decode_range(r)
          low_mask = (1 << 32) - 1
          [r >> 32, r & low_mask]
        end
      end
    end
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "punycode"
|
4
|
+
require_relative "validation"
|
5
|
+
|
6
|
+
require_relative "uts46"
|
7
|
+
|
8
|
+
module URI
  module IDNA
    # Converts whole domain names between their Unicode (U-label) and ASCII
    # (A-label) forms, label by label.
    #
    # Options read by this class:
    #   :uts46              - run UTS46 mapping before splitting (default false)
    #   :uts46_std3         - forwarded to UTS46.map_string
    #   :uts46_transitional - forwarded to UTS46.map_string
    #   :dns_length         - enforce the total domain length limit (default true)
    #   :bidi               - allow the Bidi check to be enabled (default true)
    # The complete options hash is also handed to Validation.
    class Process
      UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze

      def initialize(**options)
        @options = options
      end

      # Registration protocol entry point. With only +ulabel+ it behaves like
      # #encode. With +alabel+ the input must start with the ACE prefix; each
      # label is round-tripped and, when +ulabel+ is given as well, compared
      # with the decoded form.
      #
      # Raises ArgumentError for missing or non-String input and Error for
      # invalid labels.
      def register(alabel: nil, ulabel: nil)
        raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?

        return encode(ulabel) if alabel.nil?

        raise ArgumentError, "String expected" unless alabel.is_a?(String)
        raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)

        process_labels(alabel) do |l|
          to_alabel(l, roundtrip: true, ulabel: ulabel)
        end
      end

      # Lookup protocol entry point: converts +s+ to A-labels, requiring ASCII
      # input labels to survive a decode/encode round trip unchanged.
      def lookup(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) do |l|
          to_alabel(l, roundtrip: true)
        end
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts every label of +s+ to its A-label form.
      def encode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) { |l| to_alabel(l) }
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts every label of +s+ to its U-label form.
      def decode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        process_labels(s) { |l| to_ulabel(l) }
      end

      private

      attr_reader :labels, :options

      # Label separator: a plain dot when UTS46 mapping is enabled, otherwise
      # any of the dot characters listed in UNICODE_DOTS_REGEX.
      def splitter
        @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
      end

      # Splits +s+ into labels, yields each one, and joins the results with
      # ".". A single trailing dot is preserved. Raises Error for an empty
      # domain or when the block returns an empty label.
      def process_labels(s)
        s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional)) if options.fetch(:uts46, false)

        # Both the label list and the validator (whose Bidi flag is derived
        # from the labels) describe one specific input. Rebuild them on every
        # call so that a reused instance never answers with the labels of an
        # earlier input.
        @labels = s.split(splitter, -1)
        @validation = nil

        trailing_dot = !labels.empty? && labels.last.empty?
        labels.pop if trailing_dot

        raise Error, "Empty domain" if labels.empty? || labels == [""]

        result = []
        labels.each do |label|
          str = yield(label)
          raise Error, "Empty label" if str.empty?

          result << str
        end

        result << "" if trailing_dot
        result.join(".")
      end

      # Validates +label+ as a U-label and returns its A-label form.
      #
      # roundtrip - when true, an ASCII input must equal the produced A-label.
      # ulabel    - when given, must equal the decoded form of +label+.
      def to_alabel(label, roundtrip: false, ulabel: nil)
        orig_label = label
        # validate label is a valid U-label
        label = to_ulabel(label)
        if ulabel && ulabel != label
          raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
        end

        label = encode_punycode_label(label) unless label.ascii_only?
        validate_label_length(label)

        if roundtrip && orig_label.ascii_only? && orig_label != label
          raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
        end

        label
      end

      # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
      #
      # Decodes an ASCII label (if it carries the ACE prefix), validates the
      # result and returns it.
      def to_ulabel(label)
        decoded = false
        label, decoded = decode_punycode_label(label) if label.ascii_only?
        validation.call(label, decoded: decoded)
        label
      end

      def encode_punycode_label(label)
        ALABEL_PREFIX + Punycode.encode(label)
      end

      # Returns [label, decoded?]. The label is downcased first; labels without
      # the ACE prefix are returned as they are (downcased) with decoded? false.
      def decode_punycode_label(label)
        label = label.downcase
        return [label, false] unless label.start_with?(ALABEL_PREFIX)

        code = label[ALABEL_PREFIX.length..]
        raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
        raise Error, "A-label must not end with a hyphen" if code[-1] == "-"

        [URI::IDNA::Punycode.decode(code), true]
      end

      # A label may hold at most 63 characters.
      def validate_label_length(label)
        raise Error, "Label too long" unless label.length < 64
      end

      # At most 253 characters, or 254 when the domain ends with a dot.
      def validate_domain_length(s)
        raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
      end

      # Validator for the labels currently being processed; built lazily so the
      # Bidi flag reflects the labels of the current input.
      def validation
        @validation ||= Validation.new(options.merge(bidi: check_bidi?))
      end

      def check_bidi?
        options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
      end
    end
  end
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    # Punycode implementation based on a simplified version of RFC 3492
    # https://datatracker.ietf.org/doc/html/rfc3492#appendix-C
    module Punycode
      class << self
        # Bootstring parameters used by the encoder and decoder below.
        BASE = 36
        TMIN = 1
        TMAX = 26
        SKEW = 38
        DAMP = 700
        INITIAL_BIAS = 72
        INITIAL_N = 0x80

        DELIMITER = 0x2D
        MAXINT = 0x7FFFFFFF

        # Maps a code point to its digit value: "A".."Z" and "a".."z" give
        # 0..25, "0".."9" give 26..35. Every other code point yields BASE,
        # which callers treat as "not a digit".
        #
        # Explicit ranges are required here: the reference C code relies on
        # unsigned wrap-around (cp - 48 < 10), which in Ruby would accept any
        # code point below "0" and produce negative digits for 58..64.
        def decode_digit(cp)
          case cp
          when 48..57 then cp - 22
          when 65..90 then cp - 65
          when 97..122 then cp - 97
          else BASE
          end
        end

        # Maps a digit value to a code point: 0..25 give "a".."z",
        # 26..35 give "0".."9".
        def encode_digit(d)
          d + 22 + 75 * (d < 26 ? 1 : 0)
        end

        # Bias adaptation: computes the bias for the next delta from the
        # current +delta+, the number of code points handled so far and
        # whether this is the first adaptation.
        def adapt(delta, num_points, first_time)
          delta = first_time ? (delta / DAMP) : (delta >> 1)
          delta += (delta / num_points)

          k = 0
          while delta > (((BASE - TMIN) * TMAX) / 2)
            delta /= BASE - TMIN
            k += BASE
          end
          k + ((BASE - TMIN + 1) * delta / (delta + SKEW))
        end

        # Encodes the string +input+ and returns the Punycode string (without
        # any ACE prefix). Raises PunycodeError on arithmetic overflow.
        def encode(input)
          input = input.codepoints

          n = INITIAL_N
          delta = 0
          bias = INITIAL_BIAS

          # Basic (ASCII) code points are copied first, in order.
          output = input.select { |char| basic?(char) }
          h = b = output.length

          output << DELIMITER if b > 0

          while h < input.length
            # Smallest code point not handled yet.
            m = MAXINT
            input.each do |char|
              m = char if char >= n && char < m
            end

            raise PunycodeError, "Arithmetic overflow" if m - n > (MAXINT - delta) / (h + 1)

            delta += (m - n) * (h + 1)
            n = m

            input.each do |char|
              if char < n
                delta += 1
                raise PunycodeError, "Arithmetic overflow" if delta > MAXINT
              end
              next unless char == n

              # Emit delta as a variable-length integer.
              q = delta
              k = BASE
              loop do
                t =
                  if k <= bias
                    TMIN
                  elsif k >= bias + TMAX
                    TMAX
                  else
                    k - bias
                  end
                break if q < t

                output << encode_digit(t + ((q - t) % (BASE - t)))
                q = (q - t) / (BASE - t)
                k += BASE
              end

              output << encode_digit(q)
              bias = adapt(delta, h + 1, h == b)
              delta = 0
              h += 1
            end

            delta += 1
            n += 1
          end
          output.pack("U*")
        end

        # Decodes the Punycode string +input+ (without any ACE prefix).
        # Raises PunycodeError for non-basic code points before the last
        # delimiter, for characters that are not Punycode digits, for
        # truncated input and on arithmetic overflow.
        def decode(input)
          input = input.codepoints
          output = []

          n = INITIAL_N
          i = 0
          bias = INITIAL_BIAS

          # Everything before the last delimiter is copied literally.
          b = input.rindex(DELIMITER) || 0

          input[0, b].each do |char|
            raise PunycodeError, "Invalid input" unless basic?(char)

            output << char
          end

          inc = b > 0 ? b + 1 : 0
          while inc < input.length
            # Read one variable-length integer into i.
            old_i = i
            w = 1
            k = BASE
            loop do
              raise PunycodeError, "Invalid input" if inc >= input.length

              digit = decode_digit(input[inc])
              inc += 1
              raise PunycodeError, "Invalid input" if digit >= BASE
              raise PunycodeError, "Arithmetic overflow" if digit > (MAXINT - i) / w

              i += digit * w
              t = if k <= bias
                    TMIN
                  elsif k >= bias + TMAX
                    TMAX
                  else
                    k - bias
                  end
              break if digit < t
              raise PunycodeError, "Arithmetic overflow" if w > MAXINT / (BASE - t)

              w *= BASE - t
              k += BASE
            end
            out = output.length
            bias = adapt(i - old_i, out + 1, old_i == 0)
            raise PunycodeError, "Arithmetic overflow" if (i / (out + 1)) > MAXINT - n

            # i wraps around the output length; each wrap advances n.
            n += i / (out + 1)
            i %= (out + 1)

            output.insert(i, n)

            i += 1
          end

          output.pack("U*")
        end

        private

        # True for code points below 0x80.
        def basic?(codepoint)
          codepoint < 0x80
        end
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "data/uts46"
|
4
|
+
|
5
|
+
module URI
  module IDNA
    # UTS #46 mapping and status lookup, backed by the UTS46_DATA table.
    # Each table row is read as [first_code_point, status, replacement].
    module UTS46
      class << self
        # https://unicode.org/reports/tr46/#ProcessingStepMap
        #
        # Maps every character of +domain+ according to its table status and
        # returns the NFC-normalized result:
        #   "I" - dropped
        #   "V" - kept
        #   "M" - replaced
        #   "D" - replaced only when +uts46_transitional+ is true
        #   "3" - rejected when +uts46_std3+ is true, otherwise replaced/kept
        # Any other status raises InvalidCodepointError.
        def map_string(domain, uts46_std3: true, uts46_transitional: false)
          # Unfrozen buffer appended to in place, instead of allocating a new
          # String for every character.
          output = +""
          domain.each_char do |char|
            code_point = char.ord
            _, status, replacement = uts46_status(code_point)
            case status
            when "I"
              next
            when "V"
              output << char
            when "M"
              output << replacement
            when "D"
              output << (uts46_transitional ? replacement : char)
            when "3"
              if uts46_std3
                raise InvalidCodepointError,
                      "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
              end

              output << (replacement || char)
            else
              raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
            end
          end
          output.unicode_normalize(:nfc)
        end

        # https://unicode.org/reports/tr46/#Validity_Criteria
        #
        # True when +char+ may appear in a label: status "V" always, and
        # status "D" (deviation) only for nontransitional processing. Under
        # transitional processing every code point must be plain valid.
        def valid?(char, uts46_transitional: false)
          _, status, = uts46_status(char.ord)
          return true if status == "V"
          return true if !uts46_transitional && status == "D"

          false
        end

        private

        # Returns the table row covering +code_point+, or [] when none exists.
        # Code points below 256 index the table directly; the rest are found
        # by binary search for the last row starting at or before the code
        # point.
        def uts46_status(code_point)
          index =
            if code_point < 256
              code_point
            else
              (UTS46_DATA.bsearch_index { |x| x[0] > code_point } || UTS46_DATA.length) - 1
            end
          UTS46_DATA[index] || []
        end
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module URI
  module IDNA
    class Validation
      # 4.2.3.4. Labels Containing Characters Written Right to Left
      # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.4
      # https://datatracker.ietf.org/doc/html/rfc5893#section-2
      module Bidi
        class << self
          # Checks +label+ against the Bidi rule. Returns true when the label
          # passes and raises BidiError otherwise.
          def call(label)
            # Bidi rule 1
            if bidi_class(label[0], "RTL")
              rtl = true
            elsif bidi_class(label[0], "L")
              rtl = false
            else
              raise BidiError, "First codepoint in label #{label} must be directionality L, R or AL"
            end

            # Tracks whether the characters seen so far end in an allowed
            # class; NSM characters leave the flag untouched.
            valid_ending = false
            number_type = nil
            label.each_char.with_index do |cp, idx|
              if rtl
                # Bidi rule 2
                if bidi_class(cp, "L") || bidi_class(cp, "UNUSED")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a right-to-left label"
                end

                # Bidi rule 3
                direction = bidi_class(cp, "RTL") || bidi_class(cp, "EN") || bidi_class(cp, "AN")
                if direction
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
                # Bidi rule 4
                if %w[EN AN].include?(direction)
                  number_type ||= direction
                  raise BidiError, "Can not mix numeral types in a right-to-left label" if number_type != direction
                end
              else
                # Bidi rule 5
                if bidi_class(cp, "RTL") || bidi_class(cp, "AN")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a left-to-right label"
                end

                # Bidi rule 6
                if bidi_class(cp, "L") || bidi_class(cp, "EN")
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
              end
            end

            raise BidiError, "Label ends with illegal codepoint directionality" unless valid_ending

            true
          end

          # https://www.rfc-editor.org/rfc/rfc5891.html#section-4.2.3.4
          #
          # True when any of +labels+ contains a character of class RTL or AN.
          # Labels carrying the ACE prefix are Punycode-decoded first; a label
          # that fails to decode contributes nothing to the check.
          def check?(labels)
            domain = labels.map do |label|
              # The prefix is matched case-insensitively so that upper-case
              # A-labels ("XN--...") are inspected as well.
              if label[0, ALABEL_PREFIX.length].casecmp(ALABEL_PREFIX)&.zero?
                begin
                  Punycode.decode(label[ALABEL_PREFIX.length..])
                rescue StandardError
                  ""
                end
              else
                label
              end
            end.join(".")

            domain.each_char do |cp|
              return true if bidi_class(cp, "RTL") || bidi_class(cp, "AN")
            end
            false
          end

          private

          # Returns +name+ when the character +cp+ belongs to the BIDI_CLASSES
          # entry called +name+, false otherwise.
          def bidi_class(cp, name)
            return name if Intranges.contain?(cp.ord, BIDI_CLASSES[name])

            false
          end
        end
      end
    end
  end
end
|