uri-idna 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../intranges"
4
+ require_relative "../data/idna"
5
+
6
module URI
  module IDNA
    module Validation
      # Stateless checks applied to a single label (plus one whole-domain
      # length check). Every method returns nil when the check passes and
      # raises Error when it fails; none of them modifies its argument.
      module Label
        class << self
          # 4.1. Input to IDNA Registration
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.1
          #
          # @param label [String]
          # @return [nil] when the label is already NFC-normalized
          # @raise [Error] when the label is not in NFC
          def check_nfc(label)
            return if label.unicode_normalized?(:nfc)

            raise Error, "Label must be in Unicode Normalization Form NFC"
          end

          # 4.2.3.1. Hyphen Restrictions
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
          #
          # @param label [String]
          # @return [nil] when the third and fourth characters are not both "-"
          #   (labels shorter than four characters always pass)
          # @raise [Error] when the label has "--" in positions three and four
          def check_hyphen34(label)
            return unless label[2..3] == "--"

            raise Error, "Label must not contain a U+002D HYPHEN-MINUS character in both the third and fourth positions"
          end

          # 4.2.3.1. Hyphen Restrictions
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
          #
          # @param label [String]
          # @return [nil] when neither the first nor the last character is "-"
          # @raise [Error] when the label starts or ends with "-"
          def check_hyphen_sides(label)
            return unless label[0] == "-" || label[-1] == "-"

            raise Error, "Label must neither begin nor end with a U+002D HYPHEN-MINUS character"
          end

          # Rejects labels that carry the ACE prefix (ACE_PREFIX).
          #
          # @param label [String]
          # @return [nil] when the label does not start with ACE_PREFIX
          # @raise [Error] when the label starts with ACE_PREFIX
          def check_ace_prefix(label)
            return unless label.start_with?(ACE_PREFIX)

            raise Error, "Label must not begin with `xn--`"
          end

          # 4.2.3.2. Leading Combining Marks
          # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2
          #
          # @param label [String]
          # @return [nil] when the label is empty or its first code point is
          #   not listed in INITIAL_COMBINERS
          # @raise [Error] when the first code point is an initial combiner
          def check_leading_combining(label)
            # An empty label has no first character: `label[0]` would be nil
            # and `.ord` would raise NoMethodError instead of Error. Emptiness
            # is presumably reported by the caller, so it is not an error here.
            return if label.empty?
            return unless Intranges.contain?(label[0].ord, INITIAL_COMBINERS)

            raise Error, "Label begins with an illegal combining character"
          end

          # Rejects labels that still contain a label separator.
          #
          # @param label [String]
          # @return [nil] when the label contains no "."
          # @raise [Error] when the label contains a U+002E FULL STOP
          def check_dot(label)
            raise Error, "Label must not contain a U+002E ( . ) FULL STOP" if label.include?(".")
          end

          # DNS label size limit
          # See Processing step 4.2
          # https://www.unicode.org/reports/tr46/#ToASCII
          #
          # The limit is counted in characters (String#length); this equals the
          # octet count only for ASCII input, so the label is presumably
          # expected to be in its ASCII form at this point.
          #
          # @param label [String]
          # @return [nil] when the label has at most 63 characters
          # @raise [Error] when the label has 64 or more characters
          def check_length(label)
            raise Error, "Label too long" unless label.length < 64
          end

          # DNS name size limit
          # See Processing step 4.1
          # https://www.unicode.org/reports/tr46/#ToASCII
          #
          # A trailing "." (root label) is allowed one extra character, so the
          # limit is 253 characters without it and 254 with it.
          #
          # @param domain_name [String]
          # @return [nil] when the domain name is within the limit
          # @raise [Error] when the domain name is too long
          def check_domain_length(domain_name)
            raise Error, "Domain too long" unless domain_name.length < (domain_name[-1] == "." ? 255 : 254)
          end
        end
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module URI
4
4
  module IDNA
5
- VERSION = "0.1.0"
5
+ VERSION = "0.2.0"
6
6
  end
7
7
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module URI
  module IDNA
    # UTS46 processing with the flag values used by the WHATWG URL Standard.
    # https://url.spec.whatwg.org/#idna
    module WHATWG
      # UTS46 ToASCII with a fixed flag set; only +be_strict+ is tunable.
      # It drives both the STD3 ASCII rules and DNS length verification.
      class ToASCII < UTS46::ToASCII
        # @param domain_name [String] passed through to UTS46::ToASCII
        # @param be_strict [Boolean] forwarded as +use_std3_ascii_rules+ and
        #   +verify_dns_length+
        def initialize(domain_name, be_strict: true)
          flags = {
            use_std3_ascii_rules: be_strict,
            check_hyphens: false,
            check_bidi: true,
            check_joiners: true,
            transitional_processing: false,
            verify_dns_length: be_strict
          }
          super(domain_name, **flags)
        end
      end

      # UTS46 ToUnicode with a fixed flag set; only +be_strict+ is tunable.
      # It drives the STD3 ASCII rules.
      class ToUnicode < UTS46::ToUnicode
        # @param domain_name [String] passed through to UTS46::ToUnicode
        # @param be_strict [Boolean] forwarded as +use_std3_ascii_rules+
        def initialize(domain_name, be_strict: true)
          flags = {
            use_std3_ascii_rules: be_strict,
            check_hyphens: false,
            check_bidi: true,
            check_joiners: true,
            transitional_processing: false
          }
          super(domain_name, **flags)
        end
      end
    end
  end
end
data/lib/uri/idna.rb CHANGED
@@ -1,11 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "idna/version"
4
- require_relative "idna/process"
4
+ require_relative "idna/punycode"
5
+ require_relative "idna/base_processing"
6
+ require_relative "idna/idna2008/processing"
7
+ require_relative "idna/uts46/processing"
8
+ require_relative "idna/whatwg/processing"
5
9
 
6
10
  module URI
7
11
  module IDNA
8
- ALABEL_PREFIX = "xn--"
12
+ ACE_PREFIX = "xn--"
9
13
 
10
14
  class Error < StandardError; end
11
15
 
@@ -22,38 +26,40 @@ module URI
22
26
  class PunycodeError < Error; end
23
27
 
24
28
  class << self
25
- UTS46_PARAMS = {
26
- check_dot: true,
27
- idna_validity: false,
28
- uts46: true,
29
- uts46_std3: true,
30
- uts46_transitional: false,
31
- contexto: false,
32
- }.freeze
33
-
34
- LOOKUP_PARAMS = {
35
- hyphen_sides: false,
36
- leading_combining: false,
37
- }.freeze
38
-
39
- def lookup(s, **params)
40
- Process.new(**LOOKUP_PARAMS.merge(params)).lookup(s)
29
+ # IDNA2008 Lookup protocol defined in RFC 5891
30
+ # https://datatracker.ietf.org/doc/html/rfc5891#section-5
31
+ def lookup(domain_name, **options)
32
+ IDNA2008::Lookup.new(domain_name, **options).call
41
33
  end
42
34
 
43
- def register(alabel: nil, ulabel: nil, **params)
44
- Process.new(**params).register(alabel: alabel, ulabel: ulabel)
35
+ # IDNA2008 Registration protocol defined in RFC 5891
36
+ # https://datatracker.ietf.org/doc/html/rfc5891#section-4
37
+ def register(alabel: nil, ulabel: nil, **options)
38
+ IDNA2008::Registration.new(alabel: alabel, ulabel: ulabel, **options).call
45
39
  end
46
40
 
47
41
  # UTS46 ToUnicode process
48
42
  # https://unicode.org/reports/tr46/#ToUnicode
49
- def to_unicode(s, **params)
50
- Process.new(**UTS46_PARAMS.merge(params)).decode(s)
43
+ def to_unicode(domain_name, **options)
44
+ UTS46::ToUnicode.new(domain_name, **options).call
51
45
  end
52
46
 
53
47
  # UTS46 ToASCII process
54
48
  # https://unicode.org/reports/tr46/#ToASCII
55
- def to_ascii(s, **params)
56
- Process.new(**UTS46_PARAMS.merge(params)).encode(s)
49
+ def to_ascii(domain_name, **options)
50
+ UTS46::ToASCII.new(domain_name, **options).call
51
+ end
52
+
53
+ # WHATWG URL Standard domain to Unicode algorithm
54
+ # https://url.spec.whatwg.org/#idna
55
+ def whatwg_to_unicode(domain_name, **options)
56
+ WHATWG::ToUnicode.new(domain_name, **options).call
57
+ end
58
+
59
+ # WHATWG URL Standard domain to ASCII algorithm
60
+ # https://url.spec.whatwg.org/#idna
61
+ def whatwg_to_ascii(domain_name, **options)
62
+ WHATWG::ToASCII.new(domain_name, **options).call
57
63
  end
58
64
  end
59
65
  end
data/lib/uri-idna.rb ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "uri/idna"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: uri-idna
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Svyatoslav Kryukov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-05 00:00:00.000000000 Z
11
+ date: 2023-11-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Internationalized Domain Names in Applications (IDNA)
14
14
  email:
@@ -20,16 +20,23 @@ files:
20
20
  - CHANGELOG.md
21
21
  - LICENSE.txt
22
22
  - README.md
23
+ - lib/uri-idna.rb
23
24
  - lib/uri/idna.rb
25
+ - lib/uri/idna/base_processing.rb
24
26
  - lib/uri/idna/data/idna.rb
25
27
  - lib/uri/idna/data/uts46.rb
28
+ - lib/uri/idna/idna2008/options.rb
29
+ - lib/uri/idna/idna2008/processing.rb
26
30
  - lib/uri/idna/intranges.rb
27
- - lib/uri/idna/process.rb
28
31
  - lib/uri/idna/punycode.rb
29
- - lib/uri/idna/uts46.rb
30
- - lib/uri/idna/validation.rb
32
+ - lib/uri/idna/uts46/mapping.rb
33
+ - lib/uri/idna/uts46/options.rb
34
+ - lib/uri/idna/uts46/processing.rb
31
35
  - lib/uri/idna/validation/bidi.rb
36
+ - lib/uri/idna/validation/codepoint.rb
37
+ - lib/uri/idna/validation/label.rb
32
38
  - lib/uri/idna/version.rb
39
+ - lib/uri/idna/whatwg/processing.rb
33
40
  homepage: https://github.com/skryukov/uri-idna
34
41
  licenses:
35
42
  - MIT
@@ -1,139 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "punycode"
4
- require_relative "validation"
5
-
6
- require_relative "uts46"
7
-
8
- module URI
9
- module IDNA
10
- class Process
11
- UNICODE_DOTS_REGEX = /[\u002e\u3002\uff0e\uff61]/.freeze
12
-
13
- def initialize(**options)
14
- @options = options
15
- end
16
-
17
- def register(alabel: nil, ulabel: nil)
18
- raise ArgumentError, "Provide alabel or ulabel" if alabel.nil? && ulabel.nil?
19
-
20
- return encode(ulabel) if alabel.nil?
21
-
22
- raise ArgumentError, "String expected" unless alabel.is_a?(String)
23
- raise Error, "Invalid alabel #{alabel}" unless alabel.start_with?(ALABEL_PREFIX)
24
-
25
- process_labels(alabel) do |l|
26
- to_alabel(l, roundtrip: true, ulabel: ulabel)
27
- end
28
- end
29
-
30
- def lookup(s)
31
- raise ArgumentError, "String expected" unless s.is_a?(String)
32
-
33
- s = process_labels(s) do |l|
34
- to_alabel(l, roundtrip: true)
35
- end
36
- validate_domain_length(s) if options.fetch(:dns_length, true)
37
- s
38
- end
39
-
40
- def encode(s)
41
- raise ArgumentError, "String expected" unless s.is_a?(String)
42
-
43
- s = process_labels(s) { |l| to_alabel(l) }
44
- validate_domain_length(s) if options.fetch(:dns_length, true)
45
- s
46
- end
47
-
48
- def decode(s)
49
- raise ArgumentError, "String expected" unless s.is_a?(String)
50
-
51
- process_labels(s) { |l| to_ulabel(l) }
52
- end
53
-
54
- private
55
-
56
- attr_reader :labels, :options
57
-
58
- def splitter
59
- @splitter ||= options.fetch(:uts46, false) ? "." : UNICODE_DOTS_REGEX
60
- end
61
-
62
- def process_labels(s)
63
- s = UTS46.map_string(s, **options.slice(:uts46_std3, :uts46_transitional)) if options.fetch(:uts46, false)
64
- @labels ||= s.split(splitter, -1)
65
- trailing_dot = labels[-1] && labels[-1].empty? ? labels.pop : false
66
-
67
- raise Error, "Empty domain" if labels.empty? || labels == [""]
68
-
69
- result = []
70
- labels.each do |label|
71
- str = yield(label)
72
- raise Error, "Empty label" if str.empty?
73
-
74
- result << str
75
- end
76
-
77
- result << "" if trailing_dot
78
- result.join(".")
79
- end
80
-
81
- def to_alabel(label, roundtrip: false, ulabel: nil)
82
- orig_label = label
83
- # validate label is a valid U-label
84
- label = to_ulabel(label)
85
- if ulabel && ulabel != label
86
- raise Error, "Provided ulabel does not match conversion of alabel, #{ulabel.inspect} != #{label.inspect}"
87
- end
88
-
89
- label = encode_punycode_label(label) unless label.ascii_only?
90
- validate_label_length(label)
91
-
92
- if roundtrip && orig_label.ascii_only? && orig_label != label
93
- raise Error, "Roundtrip encoding failed, #{orig_label.inspect} != #{label.inspect}"
94
- end
95
-
96
- label
97
- end
98
-
99
- # https://datatracker.ietf.org/doc/html/rfc5891#section-5.3
100
- def to_ulabel(label)
101
- decoded = false
102
- label, decoded = decode_punycode_label(label) if label.ascii_only?
103
- validation.call(label, decoded: decoded)
104
- label
105
- end
106
-
107
- def encode_punycode_label(label)
108
- ALABEL_PREFIX + Punycode.encode(label)
109
- end
110
-
111
- def decode_punycode_label(label)
112
- label = label.downcase
113
- return [label, false] unless label.start_with?(ALABEL_PREFIX)
114
-
115
- code = label[ALABEL_PREFIX.length..]
116
- raise Error, "Malformed A-label, no Punycode eligible content found" if code.empty?
117
- raise Error, "A-label must not end with a hyphen" if code[-1] == "-"
118
-
119
- [URI::IDNA::Punycode.decode(code), true]
120
- end
121
-
122
- def validate_label_length(label)
123
- raise Error, "Label too long" unless label.length < 64
124
- end
125
-
126
- def validate_domain_length(s)
127
- raise Error, "Domain too long" unless s.length < (s[-1] == "." ? 255 : 254)
128
- end
129
-
130
- def validation
131
- @validation ||= Validation.new(options.merge(bidi: check_bidi?))
132
- end
133
-
134
- def check_bidi?
135
- options.fetch(:bidi, true) && Validation::Bidi.check?(labels)
136
- end
137
- end
138
- end
139
- end
@@ -1,60 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "data/uts46"
4
-
5
- module URI
6
- module IDNA
7
- module UTS46
8
- class << self
9
- # https://unicode.org/reports/tr46/#ProcessingStepMap
10
- def map_string(domain, uts46_std3: true, uts46_transitional: false)
11
- output = ""
12
- domain.each_char do |char|
13
- code_point = char.ord
14
- _, status, replacement = uts46_status(code_point)
15
- case status
16
- when "I"
17
- next
18
- when "V"
19
- output += char
20
- when "M"
21
- output += replacement
22
- when "D"
23
- output += uts46_transitional ? replacement : char
24
- when "3"
25
- if uts46_std3
26
- raise InvalidCodepointError,
27
- "Codepoint #{code_point} not allowed in #{domain} via STD3 rules"
28
- end
29
-
30
- output += replacement || char
31
- else
32
- raise InvalidCodepointError, "Codepoint #{code_point} not allowed in #{domain}"
33
- end
34
- end
35
- output.unicode_normalize(:nfc)
36
- end
37
-
38
- def valid?(char, uts46_transitional: false)
39
- _, status, = uts46_status(char.ord)
40
- return true if status == "V"
41
- return true if uts46_transitional && status == "D"
42
-
43
- false
44
- end
45
-
46
- private
47
-
48
- def uts46_status(code_point)
49
- index =
50
- if code_point < 256
51
- code_point
52
- else
53
- (UTS46_DATA.bsearch_index { |x| x[0] > code_point } || UTS46_DATA.length) - 1
54
- end
55
- UTS46_DATA[index] || []
56
- end
57
- end
58
- end
59
- end
60
- end
@@ -1,199 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "intranges"
4
- require_relative "data/idna"
5
- require_relative "validation/bidi"
6
-
7
- module URI
8
- module IDNA
9
- # U-label domain validation for IDNA and UTS46.
10
- class Validation
11
- # @param [Hash] params
12
- # @option params [Boolean] :nfc Normalize to NFC (true by default)
13
- # @option params [Boolean] :hyphen34 Hyphen restrictions (true by default)
14
- # @option params [Boolean] :hyphen_sides Hyphen restrictions (true for the register protocol and UTS46)
15
- # @option params [Boolean] :leading_combining Leading combining marks (true for the register protocol and UTS46)
16
- # @option params [Boolean] :contextj Contextual rules CONTEXTJ (true by default)
17
- # @option params [Boolean] :contexto Contextual rules CONTEXTO (true for IDNA2008 protocols)
18
- # @option params [Boolean] :bidi Bidi rules (true by default)
19
- # @option params [Boolean] :idna_validity IDNA2008 validity (true for IDNA2008 protocols)
20
- # @option params [Boolean] :uts46 UTS46 validity (true for UTS46)
21
- # @option params [Boolean] :uts46_transitional UTS46 transitional validity (false by default)
22
- # @option params [Boolean] :check_dot Check for dots (true for UTS46)
23
- #
24
- def initialize(params)
25
- @nfc = params.fetch(:nfc, true)
26
- @hyphen34 = params.fetch(:hyphen34, true)
27
- @hyphen_sides = params.fetch(:hyphen_sides, true)
28
-
29
- # Contextual rules
30
- @leading_combining = params.fetch(:leading_combining, true)
31
- @contextj = params.fetch(:contextj, true)
32
- @contexto = params.fetch(:contexto, true)
33
- @bidi = params.fetch(:bidi, true)
34
- # IDNA2008 specific
35
- @idna_validity = params.fetch(:idna_validity, true)
36
-
37
- # UTS46 specific
38
- @uts46 = params.fetch(:uts46, false)
39
- @uts46_transitional = params.fetch(:uts46_transitional, false)
40
- @check_dot = params.fetch(:check_dot, false)
41
- end
42
-
43
- def call(label, decoded: false)
44
- raise Error, "Empty label" if label.empty?
45
-
46
- check_nfc(label) if @nfc
47
- check_hyphen34(label) if @hyphen34
48
- check_hyphen_sides(label) if @hyphen_sides
49
- check_leading_combining(label) if @leading_combining
50
- check_dot(label) if @check_dot
51
- label.each_char.with_index do |cp, pos|
52
- next if codepoint?(cp, "PVALID")
53
-
54
- if @contextj && codepoint?(cp, "CONTEXTJ")
55
- next if valid_contextj?(label, pos)
56
-
57
- raise InvalidCodepointContextError, cp_error_message(cp, label, pos)
58
- end
59
-
60
- if @contexto && codepoint?(cp, "CONTEXTO")
61
- next if valid_contexto?(label, pos)
62
-
63
- raise InvalidCodepointContextError, cp_error_message(cp, label, pos)
64
- end
65
-
66
- # 4.2.2. Rejection of Characters That Are Not Permitted
67
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.2
68
- raise InvalidCodepointError, cp_error_message(cp, label, pos) if @idna_validity
69
-
70
- if @uts46 && !UTS46.valid?(cp, uts46_transitional: @uts46_transitional && !decoded)
71
- raise InvalidCodepointError, cp_error_message(cp, label, pos)
72
- end
73
- end
74
- Bidi.call(label) if @bidi
75
- end
76
-
77
- private
78
-
79
- # 4.1. Input to IDNA Registration
80
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.1
81
- def check_nfc(label)
82
- return true if label.unicode_normalized?(:nfc)
83
-
84
- raise Error, "Label must be in Normalization Form C"
85
- end
86
-
87
- # 4.2.3.1. Hyphen Restrictions
88
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
89
- def check_hyphen34(label)
90
- return unless label[2..3] == "--"
91
-
92
- raise Error, "Label has disallowed hyphens in 3rd and 4th position"
93
- end
94
-
95
- # 4.2.3.1. Hyphen Restrictions
96
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1
97
- def check_hyphen_sides(label)
98
- return unless label[0] == "-" || label[-1] == "-"
99
-
100
- raise Error, "Label must not start or end with a hyphen"
101
- end
102
-
103
- # 4.2.3.2. Leading Combining Marks
104
- # https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2
105
- def check_leading_combining(label)
106
- return unless Intranges.contain?(label[0].ord, INITIAL_COMBINERS)
107
-
108
- raise Error, "Label begins with an illegal combining character"
109
- end
110
-
111
- def check_dot(label)
112
- raise Error, "Label must not contain dots" if label.include?(".")
113
- end
114
-
115
- def valid_contexto?(label, pos)
116
- cp_value = label[pos].ord
117
- case cp_value
118
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.3
119
- when 0x00b7
120
- pos > 0 && pos < label.length - 1 ? (label[pos - 1].ord == 0x006c && label[pos + 1].ord == 0x006c) : false
121
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.4
122
- when 0x0375
123
- pos < label.length - 1 ? script?(label[pos + 1], "Greek") : false
124
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.5
125
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.6
126
- when 0x05f3, 0x05f4
127
- pos > 0 ? script?(label[pos - 1], "Hebrew") : false
128
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.7
129
- when 0x30fb
130
- label.each_char do |cp|
131
- next if cp.ord == 0x30fb
132
- return true if script?(cp, "Hiragana") || script?(cp, "Katakana") || script?(cp, "Han")
133
- end
134
- false
135
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.8
136
- when 0x0660..0x0669
137
- label.each_char do |cp|
138
- return false if cp.ord >= 0x06f0 && cp.ord <= 0x06f9
139
- end
140
- true
141
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.9
142
- when 0x06f0..0x06f9
143
- label.each_char do |cp|
144
- return false if cp.ord >= 0x0660 && cp.ord <= 0x0669
145
- end
146
- true
147
- else
148
- false
149
- end
150
- end
151
-
152
- def valid_contextj?(label, pos)
153
- case label[pos].ord
154
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1
155
- when 0x200c
156
- return true if pos > 0 && virama_combining_class?(label[pos - 1])
157
-
158
- ok = false
159
- (pos - 1).downto(0) do |i|
160
- joining_type = JOINING_TYPES[label[i].ord]
161
- next if joining_type == 0x54
162
-
163
- if [0x4c, 0x44].include?(joining_type)
164
- ok = true
165
- break
166
- end
167
- end
168
- return false unless ok
169
-
170
- (pos + 1).upto(label.length - 1) do |i|
171
- joining_type = JOINING_TYPES[label[i].ord]
172
- next if joining_type == 0x54
173
- return true if [0x52, 0x44].include?(joining_type)
174
- end
175
- # https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.2
176
- when 0x200d
177
- return virama_combining_class?(label[pos - 1]) if pos > 0
178
- end
179
- false
180
- end
181
-
182
- def codepoint?(cp, class_name)
183
- Intranges.contain?(cp.ord, CODEPOINT_CLASSES[class_name])
184
- end
185
-
186
- def script?(cp, script)
187
- Intranges.contain?(cp.ord, SCRIPTS[script])
188
- end
189
-
190
- def virama_combining_class?(cp)
191
- Intranges.contain?(cp.ord, VIRAMA_COMBINING_CLASSES)
192
- end
193
-
194
- def cp_error_message(cp, label, pos)
195
- format("Codepoint U+%04X at position %d of %p not allowed", cp.ord, pos + 1, label)
196
- end
197
- end
198
- end
199
- end