uri-idna 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # Compact representation of a set of integers as a sorted list of ranges.
    #
    # Each run of consecutive integers is packed into a single Integer as
    # (start << 32) | finish, where +start+ is inclusive and +finish+ is
    # exclusive. The packing assumes +finish+ fits in 32 bits.
    module Intranges
      class << self
        # Builds the packed range list for a collection of integers.
        #
        # Duplicate values are ignored so that every integer is covered by
        # exactly one range and no range is emitted twice.
        #
        # @param list [Array<Integer>] integers in any order
        # @return [Array<Integer>] packed ranges in ascending order
        def from_list(list)
          list.uniq.sort
              .slice_when { |prev, nxt| prev + 1 != nxt }
              .map { |run| encode_range(run.first, run.last + 1) }
        end

        # Tests whether +int+ is covered by one of the packed +ranges+.
        #
        # @param int [Integer] value to look up
        # @param ranges [Array<Integer>] packed ranges sorted ascending
        #   (binary search is used, so the order matters)
        # @return [Boolean]
        def contain?(int, ranges)
          # A probe with finish 0 sorts before any range starting at +int+.
          probe = encode_range(int, 0)
          pos = ranges.bsearch_index { |packed| packed > probe } || ranges.length

          # The range just before the probe may cover it:
          # start <= int < finish
          if pos > 0
            start, finish = decode_range(ranges[pos - 1])
            return true if start <= int && int < finish
          end

          # Or the range at the probe position may start exactly at +int+.
          if pos < ranges.length
            start, = decode_range(ranges[pos])
            return true if start == int
          end

          false
        end

        private

        # Packs a half-open range into one Integer: start in the high bits,
        # finish in the low 32 bits.
        def encode_range(start, finish)
          (start << 32) | finish
        end

        # Unpacks an Integer produced by encode_range into [start, finish].
        def decode_range(r)
          [(r >> 32), (r & ((1 << 32) - 1))]
        end
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "punycode"
4
+ require_relative "validation"
5
+
6
+ require_relative "uts46"
7
+
8
module URI
  module IDNA
    # Converts domain names label by label between their Unicode and ASCII
    # forms.
    #
    # Options read by this class (all optional):
    #   :uts46              - when true, the input is first run through
    #                         UTS46.map_string and split on "." only
    #                         (default false)
    #   :uts46_std3,
    #   :uts46_transitional - forwarded to UTS46.map_string when given
    #   :dns_length         - enforce the total length limit in #lookup and
    #                         #encode (default true)
    #   :bidi               - allow the bidi check to be enabled (default true)
    # The complete options hash is also handed to Validation.new.
    class Process
      # Characters accepted as label separators when UTS46 mapping is off.
      UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze

      # @param options [Hash] see the class documentation
      def initialize(**options)
        @options = options
      end

      # Registration-style conversion.
      #
      # With only +ulabel+ this is the same as #encode(ulabel). With +alabel+
      # every label is converted with the round-trip check enabled and, when
      # +ulabel+ is also given, compared against it.
      #
      # @param alabel [String, nil] must start with ALABEL_PREFIX
      # @param ulabel [String, nil]
      # @return [String]
      # @raise [ArgumentError] when neither argument is given or +alabel+ is
      #   not a String
      # @raise [Error] when +alabel+ lacks the prefix or conversion fails
      def register(alabel: nil, ulabel: nil)
        raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?

        return encode(ulabel) if alabel.nil?

        raise ArgumentError, "String expected" unless alabel.is_a?(String)
        raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)

        process_labels(alabel) do |l|
          to_alabel(l, roundtrip: true, ulabel: ulabel)
        end
      end

      # Converts a domain to ASCII form with the round-trip check enabled.
      #
      # @param s [String]
      # @return [String]
      # @raise [ArgumentError] when +s+ is not a String
      # @raise [Error] on empty domain/label, over-long label or domain, or
      #   failed round trip
      def lookup(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) do |l|
          to_alabel(l, roundtrip: true)
        end
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts a domain to ASCII form without the round-trip check.
      #
      # @param s [String]
      # @return [String]
      # @raise [ArgumentError] when +s+ is not a String
      # @raise [Error] on empty domain/label or over-long label or domain
      def encode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        s = process_labels(s) { |l| to_alabel(l) }
        validate_domain_length(s) if options.fetch(:dns_length, true)
        s
      end

      # Converts a domain to Unicode form.
      #
      # @param s [String]
      # @return [String]
      # @raise [ArgumentError] when +s+ is not a String
      def decode(s)
        raise ArgumentError, "String expected" unless s.is_a?(String)

        process_labels(s) { |l| to_ulabel(l) }
      end

      private

      attr_reader :labels, :options

      # Separator used to split a domain into labels. It depends only on the
      # options, so it is safe to memoize for the lifetime of the instance.
      def splitter
        @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
      end

      # Splits +s+ into labels, yields each one for conversion and joins the
      # results with ".". A single trailing dot is preserved.
      #
      # @raise [Error] when the domain has no labels or a converted label is
      #   empty
      def process_labels(s)
        s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional)) if options.fetch(:uts46, false)

        # Per-call state: the labels, and the validator whose bidi setting is
        # derived from them. Both are reset here so that reusing one instance
        # for several domains never processes a previous call's labels.
        @labels = s.split(splitter, -1)
        @validation = nil

        trailing_dot = labels[-1] && labels[-1].empty? ? labels.pop : false

        raise Error, "Empty domain" if labels.empty? || labels == [""]

        result = labels.map do |label|
          str = yield(label)
          raise Error, "Empty label" if str.empty?

          str
        end

        result << "" if trailing_dot
        result.join(".")
      end

      # Converts one label to its ASCII form.
      #
      # @param roundtrip [Boolean] when true, an ASCII-only input must come
      #   back unchanged
      # @param ulabel [String, nil] expected Unicode form, if any
      def to_alabel(label, roundtrip: false, ulabel: nil)
        orig_label = label
        # validate label is a valid U-label
        label = to_ulabel(label)
        if ulabel && ulabel != label
          raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
        end

        label = encode_punycode_label(label) unless label.ascii_only?
        validate_label_length(label)

        if roundtrip && orig_label.ascii_only? && orig_label != label
          raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
        end

        label
      end

      # Converts one label to its Unicode form and validates it.
      # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
      def to_ulabel(label)
        decoded = false
        label, decoded = decode_punycode_label(label) if label.ascii_only?
        validation.call(label, decoded: decoded)
        label
      end

      def encode_punycode_label(label)
        ALABEL_PREFIX + Punycode.encode(label)
      end

      # Lower-cases an ASCII label and, when it carries ALABEL_PREFIX, decodes
      # the Punycode part.
      #
      # @return [Array(String, Boolean)] the label and whether it was decoded
      def decode_punycode_label(label)
        label = label.downcase
        return [label, false] unless label.start_with?(ALABEL_PREFIX)

        code = label[ALABEL_PREFIX.length..]
        raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
        raise Error, "A-label must not end with a hyphen" if code[-1] == "-"

        [URI::IDNA::Punycode.decode(code), true]
      end

      # A label may be at most 63 characters long.
      def validate_label_length(label)
        raise Error, "Label too long" unless label.length < 64
      end

      # A domain may be at most 253 characters, or 254 with a trailing dot.
      def validate_domain_length(s)
        raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
      end

      # Validator for the labels of the current call; built lazily because
      # its bidi setting depends on those labels.
      def validation
        @validation ||= Validation.new(options.merge(bidi: check_bidi?))
      end

      def check_bidi?
        options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # Punycode implementation based on a simplified version of RFC 3492
    # https://datatracker.ietf.org/doc/html/rfc3492#appendix-C
    module Punycode
      class << self
        # Bootstring parameters for Punycode (RFC 3492, section 5).
        BASE = 36
        TMIN = 1
        TMAX = 26
        SKEW = 38
        DAMP = 700
        INITIAL_BIAS = 72
        INITIAL_N = 0x80

        DELIMITER = 0x2D
        MAXINT = 0x7FFFFFFF

        # Maps the code point of a basic character to its digit value.
        #
        # "a".."z" and "A".."Z" map to 0..25, "0".."9" to 26..35. Any other
        # code point yields BASE, which callers treat as "not a digit".
        # Explicit ranges are used because Ruby integers are signed: a
        # subtraction-based test would accept code points below each range.
        #
        # @param cp [Integer]
        # @return [Integer] 0..35, or BASE for an invalid digit
        def decode_digit(cp)
          case cp
          when 48..57 then cp - 22
          when 65..90 then cp - 65
          when 97..122 then cp - 97
          else BASE
          end
        end

        # Maps a digit value 0..35 to the code point of its basic character:
        # 0..25 become "a".."z", 26..35 become "0".."9".
        def encode_digit(d)
          d + 22 + 75 * (d < 26 ? 1 : 0)
        end

        # Bias adaptation function (RFC 3492, section 6.1).
        def adapt(delta, num_points, first_time)
          delta = first_time ? (delta / DAMP) : (delta >> 1)
          delta += (delta / num_points)

          k = 0
          while delta > (((BASE - TMIN) * TMAX) / 2)
            delta /= BASE - TMIN
            k += BASE
          end
          k + ((BASE - TMIN + 1) * delta / (delta + SKEW))
        end

        # Encodes a string as Punycode (without any ACE prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on arithmetic overflow
        def encode(input)
          input = input.codepoints

          n = INITIAL_N
          delta = 0
          bias = INITIAL_BIAS

          # Basic code points are copied through first, in order.
          output = input.select { |char| basic?(char) }
          h = b = output.length

          output << DELIMITER if b > 0

          while h < input.length
            # Smallest code point not yet handled.
            m = MAXINT
            input.each do |char|
              m = char if char >= n && char < m
            end

            raise PunycodeError, "Arithmetic overflow" if m - n > (MAXINT - delta) / (h + 1)

            delta += (m - n) * (h + 1)
            n = m

            input.each do |char|
              if char < n
                delta += 1
                raise PunycodeError, "Arithmetic overflow" if delta > MAXINT
              end
              next unless char == n

              # Emit delta as a generalized variable-length integer.
              q = delta
              k = BASE
              loop do
                t = threshold(k, bias)
                break if q < t

                output << encode_digit(t + ((q - t) % (BASE - t)))
                q = (q - t) / (BASE - t)
                k += BASE
              end

              output << encode_digit(q)
              bias = adapt(delta, h + 1, h == b)
              delta = 0
              h += 1
            end

            delta += 1
            n += 1
          end
          output.pack("U*")
        end

        # Decodes a Punycode string (without any ACE prefix).
        #
        # @param input [String]
        # @return [String]
        # @raise [PunycodeError] on a non-basic code point before the last
        #   delimiter, an invalid or truncated digit sequence, or overflow
        def decode(input)
          input = input.codepoints
          output = []

          n = INITIAL_N
          i = 0
          bias = INITIAL_BIAS

          # Everything before the last delimiter is copied literally.
          b = input.rindex(DELIMITER) || 0

          input[0, b].each do |char|
            raise PunycodeError, "Invalid input" unless basic?(char)

            output << char
          end

          # Skip the delimiter only when something preceded it.
          inc = b > 0 ? b + 1 : 0
          while inc < input.length
            old_i = i
            w = 1
            k = BASE
            # Decode one generalized variable-length integer into i.
            loop do
              raise PunycodeError, "Invalid input" if inc >= input.length

              digit = decode_digit(input[inc])
              inc += 1
              raise PunycodeError, "Invalid input" if digit >= BASE
              raise PunycodeError, "Arithmetic overflow" if digit > (MAXINT - i) / w

              i += digit * w
              t = threshold(k, bias)
              break if digit < t
              raise PunycodeError, "Arithmetic overflow" if w > MAXINT / (BASE - t)

              w *= BASE - t
              k += BASE
            end
            out = output.length
            bias = adapt(i - old_i, out + 1, old_i == 0)
            raise PunycodeError, "Arithmetic overflow" if (i / (out + 1)) > MAXINT - n

            # i encodes both the code point increment and the insert position.
            n += i / (out + 1)
            i %= (out + 1)

            output.insert(i, n)

            i += 1
          end

          output.pack("U*")
        end

        private

        def basic?(codepoint)
          codepoint < 0x80
        end

        # Threshold t for position k: k - bias limited to TMIN..TMAX.
        def threshold(k, bias)
          (k - bias).clamp(TMIN, TMAX)
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "data/uts46"
4
+
5
module URI
  module IDNA
    # UTS #46 mapping step, driven by the UTS46_DATA table. Each table row is
    # read as [first_code_point, status, replacement].
    module UTS46
      class << self
        # Maps every character of +domain+ according to its UTS #46 status and
        # returns the result in NFC.
        # https://unicode.org/reports/tr46/#ProcessingStepMap
        #
        # Status handling:
        #   "I" - dropped
        #   "V" - kept
        #   "M" - replaced
        #   "D" - replaced only when +uts46_transitional+ is true
        #   "3" - rejected when +uts46_std3+ is true, otherwise replaced
        #         (or kept when the row has no replacement)
        # Any other status is rejected.
        #
        # @param domain [String]
        # @param uts46_std3 [Boolean]
        # @param uts46_transitional [Boolean]
        # @return [String]
        # @raise [InvalidCodepointError] for a rejected code point
        def map_string(domain, uts46_std3: true, uts46_transitional: false)
          # Mutable buffer: appending avoids allocating a new string per char.
          output = +""
          domain.each_char do |char|
            code_point = char.ord
            _, status, replacement = uts46_status(code_point)
            case status
            when "I"
              next
            when "V"
              output << char
            when "M"
              output << replacement
            when "D"
              output << (uts46_transitional ? replacement : char)
            when "3"
              if uts46_std3
                raise InvalidCodepointError,
                      "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
              end

              output << (replacement || char)
            else
              raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
            end
          end
          output.unicode_normalize(:nfc)
        end

        # Tells whether +char+ has status "V", or status "D" when
        # +uts46_transitional+ is true.
        #
        # @param char [String] only the first character is inspected
        # @return [Boolean]
        def valid?(char, uts46_transitional: false)
          _, status, = uts46_status(char.ord)
          return true if status == "V"
          return true if uts46_transitional && status == "D"

          false
        end

        private

        # Finds the table row for +code_point+; returns [] when there is none.
        #
        # Code points below 256 index the table directly (this presumes the
        # first 256 rows are one per code point - confirm against the data
        # file). Others use the last row whose first element is <= code_point.
        def uts46_status(code_point)
          index =
            if code_point < 256
              code_point
            else
              (UTS46_DATA.bsearch_index { |x| x[0] > code_point } || UTS46_DATA.length) - 1
            end
          UTS46_DATA[index] || []
        end
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    class Validation
      # 4.2.3.4. Labels Containing Characters Written Right to Left
      # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.4
      # https://datatracker.ietf.org/doc/html/rfc5893#section-2
      #
      # Character classes are looked up in BIDI_CLASSES under the keys
      # "RTL", "L", "EN", "AN", "NSM" and "UNUSED".
      module Bidi
        class << self
          # Checks a single label against the Bidi rules.
          #
          # @param label [String]
          # @return [true]
          # @raise [BidiError] when the label is empty or breaks a rule
          def call(label)
            # Without a first character there is no direction to derive.
            raise BidiError, "Label must not be empty" if label.empty?

            # Bidi rule 1
            if bidi_class(label[0], "RTL")
              rtl = true
            elsif bidi_class(label[0], "L")
              rtl = false
            else
              raise BidiError, "First codepoint in label #{label} must be directionality L, R or AL"
            end

            # valid_ending tracks whether the last character seen that is not
            # NSM belongs to a class the label is allowed to end with.
            valid_ending = false
            number_type = nil
            label.each_char.with_index do |cp, idx|
              if rtl
                # Bidi rule 2
                if bidi_class(cp, "L") || bidi_class(cp, "UNUSED")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a right-to-left label"
                end

                # Bidi rule 3
                direction = bidi_class(cp, "RTL") || bidi_class(cp, "EN") || bidi_class(cp, "AN")
                if direction
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
                # Bidi rule 4
                if %w[EN AN].include?(direction)
                  number_type ||= direction
                  raise BidiError, "Can not mix numeral types in a right-to-left label" if number_type != direction
                end
              else
                # Bidi rule 5
                if bidi_class(cp, "RTL") || bidi_class(cp, "AN")
                  raise BidiError, "Invalid direction for codepoint at position #{idx + 1} in a left-to-right label"
                end

                # Bidi rule 6
                if bidi_class(cp, "L") || bidi_class(cp, "EN")
                  valid_ending = true
                elsif !bidi_class(cp, "NSM")
                  valid_ending = false
                end
              end
            end

            raise BidiError, "Label ends with illegal codepoint directionality" unless valid_ending

            true
          end

          # Tells whether the domain made of +labels+ contains any "RTL" or
          # "AN" character, i.e. whether the Bidi rules need to be applied.
          # Labels starting with ALABEL_PREFIX are Punycode-decoded first; a
          # label that fails to decode is treated as empty.
          # https://www.rfc-editor.org/rfc/rfc5891.html#section-4.2.3.4
          #
          # @param labels [Array<String>]
          # @return [Boolean]
          def check?(labels)
            domain = labels.map do |label|
              if label.start_with?(ALABEL_PREFIX)
                begin
                  Punycode.decode(label[ALABEL_PREFIX.length..])
                rescue StandardError
                  ""
                end
              else
                label
              end
            end.join(".")

            domain.each_char do |cp|
              return true if bidi_class(cp, "RTL") || bidi_class(cp, "AN")
            end
            false
          end

          private

          # Returns +class_name+ when the character belongs to that class,
          # false otherwise. Returning the name lets callers chain lookups
          # with || and learn which class matched.
          def bidi_class(cp, class_name)
            return class_name if Intranges.contain?(cp.ord, BIDI_CLASSES[class_name])

            false
          end
        end
      end
    end
  end
end