uri-idna 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # Stores a set of integers as a sorted array of packed half-open ranges.
    #
    # Each range [start, finish) is packed into a single Integer as
    # (start << 32) | finish, so comparing packed values orders ranges by
    # their start and the array can be binary-searched.
    module Intranges
      class << self
        # Packs a collection of integers into a sorted list of ranges.
        #
        # Duplicates are dropped before grouping; otherwise a repeated value
        # would close a run early and yield overlapping ranges.
        #
        # @param list [Array<Integer>] integers in any order
        # @return [Array<Integer>] packed ranges, ascending, one per run of
        #   consecutive integers
        def from_list(list)
          list.uniq.sort
              .slice_when { |previous, current| current != previous + 1 }
              .map { |run| encode_range(run.first, run.last + 1) }
        end

        # Tests whether +int+ falls inside any of the packed +ranges+.
        #
        # @param int [Integer] value to look up
        # @param ranges [Array<Integer>] packed ranges sorted ascending
        #   (binary search is used, so the order matters)
        # @return [Boolean]
        def contain?(int, ranges)
          probe = encode_range(int, 0)
          # Index of the first packed range that sorts after (int, 0).
          pos = ranges.bsearch_index { |packed| packed > probe } || ranges.length

          # The range just before +pos+ may start below +int+ and still cover it.
          if pos.positive?
            left, right = decode_range(ranges[pos - 1])
            return true if left <= int && int < right
          end

          # The range at +pos+ may start exactly at +int+.
          if pos < ranges.length
            left, = decode_range(ranges[pos])
            return true if left == int
          end

          false
        end

        private

        # Packs a half-open range into one Integer: start in the high bits,
        # finish in the low 32 bits.
        def encode_range(start, finish)
          (start << 32) | finish
        end

        # Inverse of encode_range; returns [start, finish].
        def decode_range(r)
          [r >> 32, r & ((1 << 32) - 1)]
        end
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "punycode"
4
+ require_relative "validation"
5
+
6
+ require_relative "uts46"
7
+
8
module URI
  module IDNA
    # Converts domain names between their Unicode and ASCII (A-label) forms,
    # one label at a time.
    #
    # Options read here (all optional; the whole hash is also passed on to
    # Validation):
    #   :uts46               run UTS46.map_string on the input first (default false)
    #   :uts46_std3,
    #   :uts46_transitional  forwarded to UTS46.map_string
    #   :dns_length          enforce the domain length limit in #lookup and
    #                        #encode (default true)
    #   :bidi                allow bidi validation when Validation::Bidi.check?
    #                        reports the domain needs it (default true)
    class Process
      # Label separators recognised when UTS46 mapping is off.
      UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze

      def initialize(**options)
        @options = options
      end

      # Registration-style conversion.
      #
      # With only +ulabel+ given this is the same as #encode. With +alabel+
      # given, every label is converted with a round-trip check and, when
      # +ulabel+ is also given, compared against it.
      #
      # @raise [ArgumentError] when both arguments are nil or +alabel+ is not a String
      # @raise [Error] when +alabel+ does not start with ALABEL_PREFIX or a label is rejected
      # @return [String] the ASCII form
      def register(alabel: nil, ulabel: nil)
        raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?

        return encode(ulabel) if alabel.nil?

        raise ArgumentError, "String expected" unless alabel.is_a?(String)
        raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)

        process_labels(alabel) do |l|
          to_alabel(l, roundtrip: true, ulabel: ulabel)
        end
      end

      # Lookup-style conversion: like #encode, but ASCII input labels must
      # survive the conversion unchanged.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @raise [Error] when a label or the whole domain is rejected
      # @return [String] the ASCII form
      def lookup(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) do |l|
          to_alabel(l, roundtrip: true)
        end
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts every label of +s+ to its ASCII form.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @raise [Error] when a label or the whole domain is rejected
      # @return [String]
      def encode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) { |l| to_alabel(l) }
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts every label of +s+ to its Unicode form.
      #
      # @raise [ArgumentError] when +s+ is not a String
      # @raise [Error] when a label is rejected
      # @return [String]
      def decode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        process_labels(s) { |l| to_ulabel(l) }
      end

      private

      attr_reader :labels, :options

      # After UTS46 mapping a plain "." is used as the separator; otherwise
      # all dots in UNICODE_DOTS_REGEX are accepted.
      def splitter
        @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
      end

      # Splits +s+ into labels, yields each one for conversion and joins the
      # results with ".". A single trailing dot is preserved.
      def process_labels(s)
        if options.fetch(:uts46, false)
          s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional))
        end

        # Per-call state: the labels, and the validator whose bidi flag is
        # derived from them, must be rebuilt for every input. Memoizing them
        # made a reused instance keep answering for its first domain.
        @labels = s.split(splitter, -1)
        @validation = nil

        trailing_dot = !labels.empty? && labels.last.empty?
        labels.pop if trailing_dot

        raise Error, "Empty domain" if labels.empty? || labels == [""]

        result = labels.map do |label|
          converted = yield(label)
          raise Error, "Empty label" if converted.empty?

          converted
        end

        result << "" if trailing_dot
        result.join(".")
      end

      # Validates +label+ as a U-label and returns its ASCII form.
      #
      # @param roundtrip [Boolean] reject ASCII input that changes during conversion
      # @param ulabel [String, nil] expected Unicode form, if the caller has one
      def to_alabel(label, roundtrip: false, ulabel: nil)
        orig_label = label
        # validate label is a valid U-label
        label = to_ulabel(label)
        if ulabel && ulabel != label
          raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
        end

        label = encode_punycode_label(label) unless label.ascii_only?
        validate_label_length(label)

        if roundtrip && orig_label.ascii_only? && orig_label != label
          raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
        end

        label
      end

      # Decodes an ASCII label if it is an A-label, then validates the result.
      # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
      def to_ulabel(label)
        decoded = false
        label, decoded = decode_punycode_label(label) if label.ascii_only?
        validation.call(label, decoded: decoded)
        label
      end

      def encode_punycode_label(label)
        ALABEL_PREFIX + Punycode.encode(label)
      end

      # Returns [label, decoded?]. The label is lower-cased first; labels
      # without ALABEL_PREFIX are returned as they are.
      def decode_punycode_label(label)
        label = label.downcase
        return [label, false] unless label.start_with?(ALABEL_PREFIX)

        code = label[ALABEL_PREFIX.length..]
        raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
        raise Error, "A-label must not end with a hyphen" if code[-1] == "-"

        [URI::IDNA::Punycode.decode(code), true]
      end

      # A label may be at most 63 characters long.
      def validate_label_length(label)
        raise Error, "Label too long" unless label.length < 64
      end

      # At most 253 characters, or 254 when the domain ends with a dot.
      def validate_domain_length(s)
        raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
      end

      # Validator for the labels currently being processed.
      def validation
        @validation ||= Validation.new(options.merge(bidi: check_bidi?))
      end

      def check_bidi?
        options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # Punycode implementation based on a simplified version of RFC 3492
    # https://datatracker.ietf.org/doc/html/rfc3492#appendix-C
    module Punycode
      class << self
        # Bootstring parameters used by the algorithm below.
        BASE = 36
        TMIN = 1
        TMAX = 26
        SKEW = 38
        DAMP = 700
        INITIAL_BIAS = 72
        INITIAL_N = 0x80

        DELIMITER = 0x2D
        MAXINT = 0x7FFFFFFF

        # Maps a code point to its digit value.
        #
        # Explicit ranges are required here: the "cp - 48 < 10" form only
        # works with unsigned arithmetic. With Ruby's signed Integers it
        # accepted code points below "0" (including "-") as digits and
        # produced negative digits for the punctuation between the ranges.
        #
        # @param cp [Integer] code point
        # @return [Integer] 0..25 for letters (either case), 26..35 for
        #   "0".."9", BASE for anything else
        def decode_digit(cp)
          case cp
          when 48..57 then cp - 22 # "0".."9"
          when 65..90 then cp - 65 # "A".."Z"
          when 97..122 then cp - 97 # "a".."z"
          else BASE
          end
        end

        # Maps a digit value to its code point: 0..25 => "a".."z",
        # 26..35 => "0".."9".
        def encode_digit(d)
          d + 22 + (d < 26 ? 75 : 0)
        end

        # Bias adaptation performed after each delta is encoded or decoded.
        def adapt(delta, num_points, first_time)
          delta = first_time ? (delta / DAMP) : (delta >> 1)
          delta += (delta / num_points)

          k = 0
          while delta > (((BASE - TMIN) * TMAX) / 2)
            delta /= BASE - TMIN
            k += BASE
          end
          k + ((BASE - TMIN + 1) * delta / (delta + SKEW))
        end

        # Encodes a Unicode string as Punycode (without any prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on arithmetic overflow
        def encode(input)
          input = input.codepoints

          n = INITIAL_N
          delta = 0
          bias = INITIAL_BIAS

          # Basic code points are copied first, followed by the delimiter.
          output = input.select { |char| basic?(char) }
          h = b = output.length

          output << DELIMITER if b > 0

          while h < input.length
            # Smallest code point not handled yet.
            m = MAXINT
            input.each { |char| m = char if char >= n && char < m }

            raise PunycodeError, "Arithmetic overflow" if m - n > (MAXINT - delta) / (h + 1)

            delta += (m - n) * (h + 1)
            n = m

            input.each do |char|
              if char < n
                delta += 1
                raise PunycodeError, "Arithmetic overflow" if delta > MAXINT
              end
              next unless char == n

              append_variable_length_integer(output, delta, bias)
              bias = adapt(delta, h + 1, h == b)
              delta = 0
              h += 1
            end

            delta += 1
            n += 1
          end
          output.pack("U*")
        end

        # Decodes a Punycode string (without any prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on invalid input or arithmetic overflow
        def decode(input)
          input = input.codepoints
          output = []

          n = INITIAL_N
          i = 0
          bias = INITIAL_BIAS

          # Everything before the last delimiter is copied literally.
          b = input.rindex(DELIMITER) || 0

          input[0, b].each do |char|
            raise PunycodeError, "Invalid input" unless basic?(char)

            output << char
          end

          inc = b > 0 ? b + 1 : 0
          while inc < input.length
            old_i = i
            w = 1
            k = BASE
            # Read one variable-length integer into +i+.
            loop do
              raise PunycodeError, "Invalid input" if inc >= input.length

              digit = decode_digit(input[inc])
              inc += 1
              raise PunycodeError, "Invalid input" if digit >= BASE
              raise PunycodeError, "Arithmetic overflow" if digit > (MAXINT - i) / w

              i += digit * w
              t = threshold(k, bias)
              break if digit < t
              raise PunycodeError, "Arithmetic overflow" if w > MAXINT / (BASE - t)

              w *= BASE - t
              k += BASE
            end
            out = output.length
            bias = adapt(i - old_i, out + 1, old_i == 0)
            raise PunycodeError, "Arithmetic overflow" if (i / (out + 1)) > MAXINT - n

            n += i / (out + 1)
            i %= (out + 1)

            output.insert(i, n)

            i += 1
          end

          output.pack("U*")
        end

        private

        def basic?(codepoint)
          codepoint < 0x80
        end

        # Digit threshold for position +k+ under the current +bias+.
        def threshold(k, bias)
          if k <= bias
            TMIN
          elsif k >= bias + TMAX
            TMAX
          else
            k - bias
          end
        end

        # Appends +value+ to +output+ as a variable-length integer.
        def append_variable_length_integer(output, value, bias)
          k = BASE
          loop do
            t = threshold(k, bias)
            break if value < t

            output << encode_digit(t + ((value - t) % (BASE - t)))
            value = (value - t) / (BASE - t)
            k += BASE
          end
          output << encode_digit(value)
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "data/uts46"
4
+
5
module URI
  module IDNA
    # Character mapping and status lookup backed by the UTS46_DATA table.
    #
    # Status codes handled below: "V" keep, "I" drop, "M" replace,
    # "D" deviation, "3" disallowed under STD3 rules. Any other status is
    # rejected.
    module UTS46
      class << self
        # Maps every character of +domain+ according to its table status and
        # returns the NFC-normalized result.
        # https://unicode.org/reports/tr46/#ProcessingStepMap
        #
        # @param domain [String]
        # @param uts46_std3 [Boolean] raise on "3" characters instead of
        #   mapping/keeping them
        # @param uts46_transitional [Boolean] replace "D" characters instead
        #   of keeping them
        # @return [String]
        # @raise [InvalidCodepointError] for characters that are not allowed
        def map_string(domain, uts46_std3: true, uts46_transitional: false)
          # Mutable buffer; appending avoids allocating a new String per character.
          output = +""
          domain.each_char do |char|
            code_point = char.ord
            _, status, replacement = uts46_status(code_point)
            case status
            when "I"
              next
            when "V"
              output << char
            when "M"
              output << replacement
            when "D"
              output << (uts46_transitional ? replacement : char)
            when "3"
              if uts46_std3
                raise InvalidCodepointError,
                      "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
              end

              output << (replacement || char)
            else
              raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
            end
          end
          output.unicode_normalize(:nfc)
        end

        # Tells whether +char+ may appear in a label.
        #
        # Per the UTS #46 validity criteria, transitional processing accepts
        # only "V" characters, while nontransitional processing accepts "V"
        # and "D" (deviation) characters.
        #
        # @param char [String] a single character
        # @param uts46_transitional [Boolean]
        # @return [Boolean]
        def valid?(char, uts46_transitional: false)
          _, status, = uts46_status(char.ord)
          return true if status == "V"
          return true if status == "D" && !uts46_transitional

          false
        end

        private

        # Returns the table row covering +code_point+, or [] when there is
        # none. The first 256 rows are indexed directly by code point; above
        # that, the row with the greatest start not exceeding the code point
        # is found by binary search.
        def uts46_status(code_point)
          index =
            if code_point < 256
              code_point
            else
              (UTS46_DATA.bsearch_index { |row| row[0] > code_point } || UTS46_DATA.length) - 1
            end
          UTS46_DATA[index] || []
        end
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    class Validation
      # 4.2.3.4. Labels Containing Characters Written Right to Left
      # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.4
      # https://datatracker.ietf.org/doc/html/rfc5893#section-2
      module Bidi
        class << self
          # Checks +label+ against the Bidi rule.
          #
          # @param label [String] a U-label
          # @return [true] when every rule passes
          # @raise [BidiError] when a rule is violated; an empty label fails
          #   rule 1 because it has no first code point
          def call(label)
            # Bidi rule 1
            rtl = rtl_label?(label)

            valid_ending = false
            number_type = nil
            label.each_char.with_index do |cp, idx|
              if rtl
                # Bidi rule 2
                if bidi_class(cp, "L") || bidi_class(cp, "UNUSED")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a right-to-left label"
                end

                # Bidi rule 3
                direction = bidi_class(cp, "RTL") || bidi_class(cp, "EN") || bidi_class(cp, "AN")
                if direction
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
                # Bidi rule 4
                if %w[EN AN].include?(direction)
                  number_type ||= direction
                  raise BidiError, "Can not mix numeral types in a right-to-left label" if number_type != direction
                end
              else
                # Bidi rule 5
                if bidi_class(cp, "RTL") || bidi_class(cp, "AN")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a left-to-right label"
                end

                # Bidi rule 6
                if bidi_class(cp, "L") || bidi_class(cp, "EN")
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
              end
            end

            raise BidiError, "Label ends with illegal codepoint directionality" unless valid_ending

            true
          end

          # Tells whether the domain made of +labels+ contains any "RTL" or
          # "AN" character, i.e. whether the Bidi rule has to be applied.
          # https://www.rfc-editor.org/rfc/rfc5891.html#section-4.2.3.4
          #
          # @param labels [Array<String>]
          # @return [Boolean]
          def check?(labels)
            domain = labels.map { |label| unicode_form(label) }.join(".")
            domain.each_char.any? { |cp| bidi_class(cp, "RTL") || bidi_class(cp, "AN") }
          end

          private

          # Bidi rule 1: the first code point decides the label direction.
          # Returns true for a right-to-left label, false for left-to-right.
          def rtl_label?(label)
            first = label[0]
            return true if first && bidi_class(first, "RTL")
            return false if first && bidi_class(first, "L")

            raise BidiError, "First codepoint in label #{label} must be directionality L, R or AL"
          end

          # Decodes +label+ when it is an A-label so its real characters can
          # be inspected. The prefix is matched on ASCII labels only and
          # regardless of case, like the label conversion does. Labels that
          # fail to decode contribute no characters.
          def unicode_form(label)
            return label unless label.ascii_only? && label.downcase.start_with?(ALABEL_PREFIX)

            Punycode.decode(label[ALABEL_PREFIX.length..])
          rescue StandardError
            ""
          end

          # Returns +name+ when +cp+ belongs to that bidi class, false otherwise.
          def bidi_class(cp, name)
            return name if Intranges.contain?(cp.ord, BIDI_CLASSES[name])

            false
          end
        end
      end
    end
  end
end