uri-idna 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../intranges"
4
+ require_relative "../data/idna"
5
+
6
module URI
  module IDNA
    module Validation
      # Stateless validators for a single label, plus one whole-domain length check.
      #
      # Every check returns nil when the input is acceptable and raises Error
      # (resolved from the enclosing namespaces) when it is not.
      module Label
        class << self
          # 4.1. Input to IDNA Registration
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.1
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the label is not in Normalization Form C
          def check_nfc(label)
            raise Error, "Label must be in Unicode Normalization Form NFC" unless label.unicode_normalized?(:nfc)
          end

          # 4.2.3.1. Hyphen Restrictions
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if both the third and fourth characters are "-"
          def check_hyphen34(label)
            # label[2..3] is nil or shorter than two characters for short labels,
            # so it can never equal "--" there.
            return unless label[2..3] == "--"

            raise Error, "Label must not contain a U+002D HYPHEN-MINUS character in both the third and fourth positions"
          end

          # 4.2.3.1. Hyphen Restrictions
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the label starts or ends with "-"
          def check_hyphen_sides(label)
            return if !label.start_with?("-") && !label.end_with?("-")

            raise Error, "Label must neither begin nor end with a U+002D HYPHEN-MINUS character"
          end

          # Rejects labels that start with ACE_PREFIX (the comparison is case-sensitive).
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the label starts with ACE_PREFIX
          def check_ace_prefix(label)
            raise Error, "Label must not begin with `xn--`" if label.start_with?(ACE_PREFIX)
          end

          # 4.2.3.2. Leading Combining Marks
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the first code point is listed in INITIAL_COMBINERS
          def check_leading_combining(label)
            # An empty label has no first character; without this guard
            # `label[0].ord` would fail with NoMethodError on nil.
            return if label.empty?
            return unless Intranges.contain?(label[0].ord, INITIAL_COMBINERS)

            raise Error, "Label begins with an illegal combining character"
          end

          # Rejects labels that still contain a U+002E FULL STOP.
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the label contains "."
          def check_dot(label)
            raise Error, "Label must not contain a U+002E ( . ) FULL STOP" if label.include?(".")
          end

          # DNS label size limit
          # See Processing step 4.2
          # https://www.unicode.org/reports/tr46/#ToASCII
          #
          # @param label [String]
          # @return [nil]
          # @raise [Error] if the label is longer than 63 characters
          def check_length(label)
            raise Error, "Label too long" if label.length > 63
          end

          # DNS name size limit
          # See Processing step 4.1
          # https://www.unicode.org/reports/tr46/#ToASCII
          #
          # @param domain_name [String]
          # @return [nil]
          # @raise [Error] if the name is longer than 253 characters
          #   (254 when it ends with a trailing dot)
          def check_domain_length(domain_name)
            # A trailing dot is allowed one extra character.
            max_length = domain_name.end_with?(".") ? 254 : 253
            raise Error, "Domain too long" if domain_name.length > max_length
          end
        end
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module URI
4
4
  module IDNA
5
- VERSION = "0.1.0"
5
+ VERSION = "0.2.0"
6
6
  end
7
7
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # UTS46 processors preconfigured with a fixed set of flags.
    # https://url.spec.whatwg.org/#idna
    module WHATWG
      # UTS46::ToASCII with hyphen checks off, bidi and joiner checks on,
      # and non-transitional processing. `be_strict` drives both the STD3
      # ASCII rules and the DNS length verification.
      class ToASCII < UTS46::ToASCII
        def initialize(domain_name, be_strict: true)
          super(domain_name,
                use_std3_ascii_rules: be_strict,
                check_hyphens: false,
                check_bidi: true,
                check_joiners: true,
                transitional_processing: false,
                verify_dns_length: be_strict)
        end
      end

      # UTS46::ToUnicode with the same fixed flags; `be_strict` only drives
      # the STD3 ASCII rules here.
      class ToUnicode < UTS46::ToUnicode
        def initialize(domain_name, be_strict: true)
          super(domain_name,
                use_std3_ascii_rules: be_strict,
                check_hyphens: false,
                check_bidi: true,
                check_joiners: true,
                transitional_processing: false)
        end
      end
    end
  end
end
data/lib/uri/idna.rb CHANGED
@@ -1,11 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "idna/version"
4
- require_relative "idna/process"
4
+ require_relative "idna/punycode"
5
+ require_relative "idna/base_processing"
6
+ require_relative "idna/idna2008/processing"
7
+ require_relative "idna/uts46/processing"
8
+ require_relative "idna/whatwg/processing"
5
9
 
6
10
  module URI
7
11
  module IDNA
8
- ALABEL_PREFIX = "xn--"
12
+ ACE_PREFIX = "xn--"
9
13
 
10
14
  class Error < StandardError; end
11
15
 
@@ -22,38 +26,40 @@ module URI
22
26
  class PunycodeError < Error; end
23
27
 
24
28
  class << self
25
- UTS46_PARAMS = {
26
- check_dot: true,
27
- idna_validity: false,
28
- uts46: true,
29
- uts46_std3: true,
30
- uts46_transitional: false,
31
- contexto: false,
32
- }.freeze
33
-
34
- LOOKUP_PARAMS = {
35
- hyphen_sides: false,
36
- leading_combining: false,
37
- }.freeze
38
-
39
- def lookup(s, **params)
40
- Process.new(**LOOKUP_PARAMS.merge(params)).lookup(s)
29
+ # IDNA2008 Lookup protocol defined in RFC 5891
30
+ # https://datatracker.ietf.org/doc/html/rfc5891#section-5
31
def lookup(domain_name, **options)
  # Build the lookup processor with the caller's options, then run it.
  processor = IDNA2008::Lookup.new(domain_name, **options)
  processor.call
end
42
34
 
43
- def register(alabel: nil, ulabel: nil, **params)
44
- Process.new(**params).register(alabel: alabel, ulabel: ulabel)
35
+ # IDNA2008 Registration protocol defined in RFC 5891
36
+ # https://datatracker.ietf.org/doc/html/rfc5891#section-4
37
def register(alabel: nil, ulabel: nil, **options)
  # Both labels are forwarded as given (either may be nil), along with any extra options.
  registration = IDNA2008::Registration.new(alabel: alabel, ulabel: ulabel, **options)
  registration.call
end
46
40
 
47
41
  # UTS46 ToUnicode process
48
42
  # https://unicode.org/reports/tr46/#ToUnicode
49
- def to_unicode(s, **params)
50
- Process.new(**UTS46_PARAMS.merge(params)).decode(s)
43
def to_unicode(domain_name, **options)
  # Build the UTS46 ToUnicode processor with the caller's options, then run it.
  converter = UTS46::ToUnicode.new(domain_name, **options)
  converter.call
end
52
46
 
53
47
  # UTS46 ToASCII process
54
48
  # https://unicode.org/reports/tr46/#ToASCII
55
- def to_ascii(s, **params)
56
- Process.new(**UTS46_PARAMS.merge(params)).encode(s)
49
def to_ascii(domain_name, **options)
  # Build the UTS46 ToASCII processor with the caller's options, then run it.
  converter = UTS46::ToASCII.new(domain_name, **options)
  converter.call
end
52
+
53
+ # WHATWG URL Standard domain to Unicode algorithm
54
+ # https://url.spec.whatwg.org/#idna
55
def whatwg_to_unicode(domain_name, **options)
  # Delegates to the WHATWG-preconfigured ToUnicode processor.
  converter = WHATWG::ToUnicode.new(domain_name, **options)
  converter.call
end
58
+
59
+ # WHATWG URL Standard domain to ASCII algorithm
60
+ # https://url.spec.whatwg.org/#idna
61
def whatwg_to_ascii(domain_name, **options)
  # Delegates to the WHATWG-preconfigured ToASCII processor.
  converter = WHATWG::ToASCII.new(domain_name, **options)
  converter.call
end
58
64
  end
59
65
  end
data/lib/uri-idna.rb ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "uri/idna"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: uri-idna
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Svyatoslav Kryukov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-05 00:00:00.000000000 Z
11
+ date: 2023-11-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Internationalized Domain Names in Applications (IDNA)
14
14
  email:
@@ -20,16 +20,23 @@ files:
20
20
  - CHANGELOG.md
21
21
  - LICENSE.txt
22
22
  - README.md
23
+ - lib/uri-idna.rb
23
24
  - lib/uri/idna.rb
25
+ - lib/uri/idna/base_processing.rb
24
26
  - lib/uri/idna/data/idna.rb
25
27
  - lib/uri/idna/data/uts46.rb
28
+ - lib/uri/idna/idna2008/options.rb
29
+ - lib/uri/idna/idna2008/processing.rb
26
30
  - lib/uri/idna/intranges.rb
27
- - lib/uri/idna/process.rb
28
31
  - lib/uri/idna/punycode.rb
29
- - lib/uri/idna/uts46.rb
30
- - lib/uri/idna/validation.rb
32
+ - lib/uri/idna/uts46/mapping.rb
33
+ - lib/uri/idna/uts46/options.rb
34
+ - lib/uri/idna/uts46/processing.rb
31
35
  - lib/uri/idna/validation/bidi.rb
36
+ - lib/uri/idna/validation/codepoint.rb
37
+ - lib/uri/idna/validation/label.rb
32
38
  - lib/uri/idna/version.rb
39
+ - lib/uri/idna/whatwg/processing.rb
33
40
  homepage: https://github.com/skryukov/uri-idna
34
41
  licenses:
35
42
  - MIT
@@ -1,139 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "punycode"
4
- require_relative "validation"
5
-
6
- require_relative "uts46"
7
-
8
- module URI
9
- module IDNA
10
- class Process
11
- UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze
12
-
13
- def initialize(**options)
14
- @options = options
15
- end
16
-
17
- def register(alabel: nil, ulabel: nil)
18
- raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?
19
-
20
- return encode(ulabel) if alabel.nil?
21
-
22
- raise ArgumentError, "String expected" unless alabel.is_a?(String)
23
- raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)
24
-
25
- process_labels(alabel) do |l|
26
- to_alabel(l, roundtrip: true, ulabel: ulabel)
27
- end
28
- end
29
-
30
- def lookup(s)
31
- raise ArgumentError, "String expected" unless s.is_a?(String)
32
-
33
- s = process_labels(s) do |l|
34
- to_alabel(l, roundtrip: true)
35
- end
36
- validate_domain_length(s) if options.fetch(:dns_length, true)
37
- s
38
- end
39
-
40
- def encode(s)
41
- raise ArgumentError, "String expected" unless s.is_a?(String)
42
-
43
- s = process_labels(s) { |l| to_alabel(l) }
44
- validate_domain_length(s) if options.fetch(:dns_length, true)
45
- s
46
- end
47
-
48
- def decode(s)
49
- raise ArgumentError, "String expected" unless s.is_a?(String)
50
-
51
- process_labels(s) { |l| to_ulabel(l) }
52
- end
53
-
54
- private
55
-
56
- attr_reader :labels, :options
57
-
58
- def splitter
59
- @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
60
- end
61
-
62
- def process_labels(s)
63
- s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional)) if options.fetch(:uts46, false)
64
- @labels ||= s.split(splitter, -1)
65
- trailing_dot = labels[-1] && labels[-1].empty? ? labels.pop : false
66
-
67
- raise Error, "Empty domain" if labels.empty? || labels == [""]
68
-
69
- result = []
70
- labels.each do |label|
71
- str = yield(label)
72
- raise Error, "Empty label" if str.empty?
73
-
74
- result << str
75
- end
76
-
77
- result << "" if trailing_dot
78
- result.join(".")
79
- end
80
-
81
- def to_alabel(label, roundtrip: false, ulabel: nil)
82
- orig_label = label
83
- # validate label is a valid U-label
84
- label = to_ulabel(label)
85
- if ulabel && ulabel != label
86
- raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
87
- end
88
-
89
- label = encode_punycode_label(label) unless label.ascii_only?
90
- validate_label_length(label)
91
-
92
- if roundtrip && orig_label.ascii_only? && orig_label != label
93
- raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
94
- end
95
-
96
- label
97
- end
98
-
99
- # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
100
- def to_ulabel(label)
101
- decoded = false
102
- label, decoded = decode_punycode_label(label) if label.ascii_only?
103
- validation.call(label, decoded: decoded)
104
- label
105
- end
106
-
107
- def encode_punycode_label(label)
108
- ALABEL_PREFIX + Punycode.encode(label)
109
- end
110
-
111
- def decode_punycode_label(label)
112
- label = label.downcase
113
- return [label, false] unless label.start_with?(ALABEL_PREFIX)
114
-
115
- code = label[ALABEL_PREFIX.length..]
116
- raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
117
- raise Error, "A-label must not end with a hyphen" if code[-1] == "-"
118
-
119
- [URI::IDNA::Punycode.decode(code), true]
120
- end
121
-
122
- def validate_label_length(label)
123
- raise Error, "Label too long" unless label.length < 64
124
- end
125
-
126
- def validate_domain_length(s)
127
- raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
128
- end
129
-
130
- def validation
131
- @validation ||= Validation.new(options.merge(bidi: check_bidi?))
132
- end
133
-
134
- def check_bidi?
135
- options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
136
- end
137
- end
138
- end
139
- end
@@ -1,60 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "data/uts46"
4
-
5
- module URI
6
- module IDNA
7
- module UTS46
8
- class << self
9
- # https://unicode.org/reports/tr46/#ProcessingStepMap
10
- def map_string(domain, uts46_std3: true, uts46_transitional: false)
11
- output = ""
12
- domain.each_char do |char|
13
- code_point = char.ord
14
- _, status, replacement = uts46_status(code_point)
15
- case status
16
- when "I"
17
- next
18
- when "V"
19
- output += char
20
- when "M"
21
- output += replacement
22
- when "D"
23
- output += uts46_transitional ? replacement : char
24
- when "3"
25
- if uts46_std3
26
- raise InvalidCodepointError,
27
- "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
28
- end
29
-
30
- output += replacement || char
31
- else
32
- raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
33
- end
34
- end
35
- output.unicode_normalize(:nfc)
36
- end
37
-
38
- def valid?(char, uts46_transitional: false)
39
- _, status, = uts46_status(char.ord)
40
- return true if status == "V"
41
- return true if uts46_transitional && status == "D"
42
-
43
- false
44
- end
45
-
46
- private
47
-
48
- def uts46_status(code_point)
49
- index =
50
- if code_point < 256
51
- code_point
52
- else
53
- (UTS46_DATA.bsearch_index { |x| x[0] > code_point } || UTS46_DATA.length) - 1
54
- end
55
- UTS46_DATA[index] || []
56
- end
57
- end
58
- end
59
- end
60
- end
@@ -1,199 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "intranges"
4
- require_relative "data/idna"
5
- require_relative "validation/bidi"
6
-
7
- module URI
8
- module IDNA
9
- # U-label domain validation for IDNA and UTS46.
10
- class Validation
11
- # @param [Hash] params
12
- # @option params [Boolean] :nfc Normalize to NFC (true by default)
13
- # @option params [Boolean] :hyphen34 Hyphen restrictions (true by default)
14
- # @option params [Boolean] :hyphen_sides Hyphen restrictions (true for the register protocol and UTS46)
15
- # @option params [Boolean] :leading_combining Leading combining marks (true for the register protocol and UTS46)
16
- # @option params [Boolean] :contextj Contextual rules CONTEXTJ (true by default)
17
- # @option params [Boolean] :contexto Contextual rules CONTEXTO (true for IDNA2008 protocols)
18
- # @option params [Boolean] :bidi Bidi rules (true by default)
19
- # @option params [Boolean] :idna_validity IDNA2008 validity (true for IDNA2008 protocols)
20
- # @option params [Boolean] :uts46 UTS46 validity (true for UTS46)
21
- # @option params [Boolean] :uts46_transitional UTS46 transitional validity (false by default)
22
- # @option params [Boolean] :check_dot Check for dots (true for UTS46)
23
- #
24
- def initialize(params)
25
- @nfc = params.fetch(:nfc, true)
26
- @hyphen34 = params.fetch(:hyphen34, true)
27
- @hyphen_sides = params.fetch(:hyphen_sides, true)
28
-
29
- # Contextual rules
30
- @leading_combining = params.fetch(:leading_combining, true)
31
- @contextj = params.fetch(:contextj, true)
32
- @contexto = params.fetch(:contexto, true)
33
- @bidi = params.fetch(:bidi, true)
34
- # IDNA2008 specific
35
- @idna_validity = params.fetch(:idna_validity, true)
36
-
37
- # UTS46 specific
38
- @uts46 = params.fetch(:uts46, false)
39
- @uts46_transitional = params.fetch(:uts46_transitional, false)
40
- @check_dot = params.fetch(:check_dot, false)
41
- end
42
-
43
- def call(label, decoded: false)
44
- raise Error, "Empty label" if label.empty?
45
-
46
- check_nfc(label) if @nfc
47
- check_hyphen34(label) if @hyphen34
48
- check_hyphen_sides(label) if @hyphen_sides
49
- check_leading_combining(label) if @leading_combining
50
- check_dot(label) if @check_dot
51
- label.each_char.with_index do |cp, pos|
52
- next if codepoint?(cp, "PVALID")
53
-
54
- if @contextj && codepoint?(cp, "CONTEXTJ")
55
- next if valid_contextj?(label, pos)
56
-
57
- raise InvalidCodepointContextError, cp_error_message(cp, label, pos)
58
- end
59
-
60
- if @contexto && codepoint?(cp, "CONTEXTO")
61
- next if valid_contexto?(label, pos)
62
-
63
- raise InvalidCodepointContextError, cp_error_message(cp, label, pos)
64
- end
65
-
66
- # 4.2.2. Rejection of Characters That Are Not Permitted
67
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.2
68
- raise InvalidCodepointError, cp_error_message(cp, label, pos) if @idna_validity
69
-
70
- if @uts46 && !UTS46.valid?(cp, uts46_transitional: @uts46_transitional && !decoded)
71
- raise InvalidCodepointError, cp_error_message(cp, label, pos)
72
- end
73
- end
74
- Bidi.call(label) if @bidi
75
- end
76
-
77
- private
78
-
79
- # 4.1. Input to IDNA Registration
80
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.1
81
- def check_nfc(label)
82
- return true if label.unicode_normalized?(:nfc)
83
-
84
- raise Error, "Label must be in Normalization Form C"
85
- end
86
-
87
- # 4.2.3.1. Hyphen Restrictions
88
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
89
- def check_hyphen34(label)
90
- return unless label[2..3] == "--"
91
-
92
- raise Error, "Label has disallowed hyphens in 3rd and 4th position"
93
- end
94
-
95
- # 4.2.3.1. Hyphen Restrictions
96
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
97
- def check_hyphen_sides(label)
98
- return unless label[0] == "-" || label[-1] == "-"
99
-
100
- raise Error, "Label must not start or end with a hyphen"
101
- end
102
-
103
- # 4.2.3.2. Leading Combining Marks
104
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2
105
- def check_leading_combining(label)
106
- return unless Intranges.contain?(label[0].ord, INITIAL_COMBINERS)
107
-
108
- raise Error, "Label begins with an illegal combining character"
109
- end
110
-
111
- def check_dot(label)
112
- raise Error, "Label must not contain dots" if label.include?(".")
113
- end
114
-
115
- def valid_contexto?(label, pos)
116
- cp_value = label[pos].ord
117
- case cp_value
118
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.3
119
- when 0x00b7
120
- pos > 0 && pos < label.length - 1 ? (label[pos - 1].ord == 0x006c && label[pos + 1].ord == 0x006c) : false
121
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.4
122
- when 0x0375
123
- pos < label.length - 1 ? script?(label[pos + 1], "Greek") : false
124
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.5
125
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.6
126
- when 0x05f3, 0x05f4
127
- pos > 0 ? script?(label[pos - 1], "Hebrew") : false
128
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.7
129
- when 0x30fb
130
- label.each_char do |cp|
131
- next if cp.ord == 0x30fb
132
- return true if script?(cp, "Hiragana") || script?(cp, "Katakana") || script?(cp, "Han")
133
- end
134
- false
135
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.8
136
- when 0x0660..0x0669
137
- label.each_char do |cp|
138
- return false if cp.ord >= 0x06f0 && cp.ord <= 0x06f9
139
- end
140
- true
141
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.9
142
- when 0x06f0..0x06f9
143
- label.each_char do |cp|
144
- return false if cp.ord >= 0x0660 && cp.ord <= 0x0669
145
- end
146
- true
147
- else
148
- false
149
- end
150
- end
151
-
152
- def valid_contextj?(label, pos)
153
- case label[pos].ord
154
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1
155
- when 0x200c
156
- return true if pos > 0 && virama_combining_class?(label[pos - 1])
157
-
158
- ok = false
159
- (pos - 1).downto(0) do |i|
160
- joining_type = JOINING_TYPES[label[i].ord]
161
- next if joining_type == 0x54
162
-
163
- if [0x4c, 0x44].include?(joining_type)
164
- ok = true
165
- break
166
- end
167
- end
168
- return false unless ok
169
-
170
- (pos + 1).upto(label.length - 1) do |i|
171
- joining_type = JOINING_TYPES[label[i].ord]
172
- next if joining_type == 0x54
173
- return true if [0x52, 0x44].include?(joining_type)
174
- end
175
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.2
176
- when 0x200d
177
- return virama_combining_class?(label[pos - 1]) if pos > 0
178
- end
179
- false
180
- end
181
-
182
- def codepoint?(cp, class_name)
183
- Intranges.contain?(cp.ord, CODEPOINT_CLASSES[class_name])
184
- end
185
-
186
- def script?(cp, script)
187
- Intranges.contain?(cp.ord, SCRIPTS[script])
188
- end
189
-
190
- def virama_combining_class?(cp)
191
- Intranges.contain?(cp.ord, VIRAMA_COMBINING_CLASSES)
192
- end
193
-
194
- def cp_error_message(cp, label, pos)
195
- format("Codepoint U+%04X at position %d of %p not allowed", cp.ord, pos + 1, label)
196
- end
197
- end
198
- end
199
- end