uri-whatwg_parser 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -2
- data/lib/uri/whatwg_parser/host_parser.rb +31 -17
- data/lib/uri/whatwg_parser/version.rb +1 -1
- data/lib/uri/whatwg_parser.rb +164 -25
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1836e023e3a156fed4ff343555eb8562c358aea0040f644e1239739f83eff9ee
|
4
|
+
data.tar.gz: c529afc3fb9bc883857312156ed3628ce5d1978cb30edc041a8b7800d4da269c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfb4edce987f84e43c1ac6aa5c89b7efca25c65d399caa2635faeedead1706fe55f498cd2d6e65c979c326fc7e80bf78910442fea417836fc497f9c625ab42ed
|
7
|
+
data.tar.gz: d916826be29b6668f8820376a5eb6ebf2545aa8e82945eb4075b1c8acd4e10ba9dbe285c26296f27d0ebc5da41aad7f390148611b786ca4319c5a188e54b8bc5
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# URI::WhatwgParser
|
2
2
|
|
3
3
|
Ruby implementation of the [WHATWG URL Living Standard](https://url.spec.whatwg.org/).
|
4
4
|
|
@@ -29,7 +29,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
29
29
|
|
30
30
|
## TODO
|
31
31
|
|
32
|
-
* Support passing `base`
|
33
32
|
* Support state override
|
34
33
|
* Support validations
|
35
34
|
* Support encodings other than UTF-8
|
data/lib/uri/whatwg_parser/host_parser.rb
CHANGED
@@ -9,45 +9,44 @@ class URI::WhatwgParser
|
|
9
9
|
include ParserHelper
|
10
10
|
|
11
11
|
FORBIDDEN_HOST_CODE_POINT = ["\x00", "\t", "\x0a", "\x0d", " ", "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|"]
|
12
|
-
FORBIDDEN_DOMAIN_CODE_POINT = FORBIDDEN_HOST_CODE_POINT +
|
12
|
+
FORBIDDEN_DOMAIN_CODE_POINT = FORBIDDEN_HOST_CODE_POINT + C0_CONTROL_PERCENT_ENCODE_SET + ["%", "\x7f"]
|
13
13
|
|
14
14
|
def parse(input, opaque = false) # :nodoc:
|
15
15
|
return if input&.empty?
|
16
16
|
|
17
17
|
if input.start_with?("[")
|
18
|
-
raise ParseError unless input.end_with?("]")
|
18
|
+
raise ParseError, "invalid IPv6 format" unless input.end_with?("]")
|
19
19
|
return parse_ipv6(input)
|
20
20
|
end
|
21
21
|
|
22
22
|
return parse_opaque_host(input) if opaque
|
23
23
|
|
24
24
|
domain = percent_decode(input)
|
25
|
-
ascii_domain =
|
25
|
+
ascii_domain = domain_to_ascii(domain)
|
26
26
|
if ends_in_number?(ascii_domain)
|
27
27
|
ipv4 = parse_ipv4(ascii_domain)
|
28
28
|
return serialize_ipv4(ipv4)
|
29
29
|
end
|
30
30
|
|
31
|
-
raise ParseError if include_forbidden_domain_code_point?(ascii_domain)
|
32
31
|
ascii_domain
|
33
32
|
rescue URI::IDNA::Error, Encoding::CompatibilityError, ArgumentError => _e
|
34
|
-
raise ParseError
|
33
|
+
raise ParseError, "invalid host value"
|
35
34
|
end
|
36
35
|
|
37
36
|
private
|
38
37
|
|
39
38
|
def parse_ipv4(host)
|
40
39
|
parts = host.split(".")
|
41
|
-
raise URI::WhatwgParser::ParseError if parts.size > 4
|
40
|
+
raise URI::WhatwgParser::ParseError, "invalid IPv4 format" if parts.size > 4
|
42
41
|
numbers = []
|
43
42
|
parts.each do |part|
|
44
43
|
value, _validation_error = parse_ipv4_number(part)
|
45
44
|
numbers << value
|
46
45
|
end
|
47
46
|
|
48
|
-
(numbers.size-1).times {|i| raise URI::WhatwgParser::ParseError if numbers[i] > 255 }
|
47
|
+
(numbers.size-1).times {|i| raise URI::WhatwgParser::ParseError, "invalid IPv4 format" if numbers[i] > 255 }
|
49
48
|
|
50
|
-
raise ParseError if numbers.last >= 256 ** (5 - numbers.size)
|
49
|
+
raise ParseError, "invalid IPv4 format" if numbers.last >= 256 ** (5 - numbers.size)
|
51
50
|
|
52
51
|
ipv4 = numbers.pop
|
53
52
|
numbers.each_with_index do |number, index|
|
@@ -68,9 +67,12 @@ class URI::WhatwgParser
|
|
68
67
|
end
|
69
68
|
|
70
69
|
def parse_ipv6(host)
|
71
|
-
|
72
|
-
|
73
|
-
raise ParseError
|
70
|
+
addr = IPAddr.new(host)
|
71
|
+
# NOTE: URL Standard doesn't support `zone_id`.
|
72
|
+
raise ParseError, "invalid IPv6 format" unless addr.zone_id.nil?
|
73
|
+
"[#{addr}]"
|
74
|
+
rescue IPAddr::InvalidAddressError
|
75
|
+
raise ParseError, "invalid IPv6 format"
|
74
76
|
end
|
75
77
|
|
76
78
|
def parse_opaque_host(host)
|
@@ -83,15 +85,18 @@ class URI::WhatwgParser
|
|
83
85
|
m[1..2].to_i(16).chr
|
84
86
|
end
|
85
87
|
rescue ArgumentError
|
86
|
-
raise ParseError
|
88
|
+
raise ParseError, "including invalid value in host"
|
87
89
|
end
|
88
90
|
|
89
91
|
def ends_in_number?(domain)
|
90
|
-
parts = domain.split(".")
|
91
|
-
|
92
|
+
parts = domain.split(".", -1)
|
93
|
+
if parts.last == ""
|
94
|
+
return false if parts.size == 1
|
95
|
+
parts.pop
|
96
|
+
end
|
92
97
|
|
93
98
|
last = parts.last
|
94
|
-
return true if last.chars.all? { |c| ascii_digit?(c) }
|
99
|
+
return true if last != "" && last.chars.all? { |c| ascii_digit?(c) }
|
95
100
|
|
96
101
|
begin
|
97
102
|
parse_ipv4_number(last)
|
@@ -103,7 +108,7 @@ class URI::WhatwgParser
|
|
103
108
|
end
|
104
109
|
|
105
110
|
def parse_ipv4_number(str)
|
106
|
-
raise ParseError if str&.empty?
|
111
|
+
raise ParseError, "invalid IPv4 format" if str&.empty?
|
107
112
|
|
108
113
|
validation_error = false
|
109
114
|
r = 10
|
@@ -124,10 +129,19 @@ class URI::WhatwgParser
|
|
124
129
|
output = Integer(str, r)
|
125
130
|
return output, validation_error
|
126
131
|
rescue ArgumentError
|
127
|
-
raise ParseError
|
132
|
+
raise ParseError, "invalid IPv4 format"
|
128
133
|
end
|
129
134
|
end
|
130
135
|
|
136
|
+
def domain_to_ascii(domain)
|
137
|
+
ascii_domain = URI::IDNA.whatwg_to_ascii(domain.force_encoding(Encoding::UTF_8), be_strict: false)
|
138
|
+
|
139
|
+
raise ParseError, "including invalid value in host" if include_forbidden_domain_code_point?(ascii_domain)
|
140
|
+
raise ParseError, "host can't be empty" if ascii_domain.empty?
|
141
|
+
|
142
|
+
ascii_domain
|
143
|
+
end
|
144
|
+
|
131
145
|
def include_forbidden_domain_code_point?(str)
|
132
146
|
str.chars.intersect?(FORBIDDEN_DOMAIN_CODE_POINT)
|
133
147
|
end
|
data/lib/uri/whatwg_parser.rb
CHANGED
@@ -37,12 +37,20 @@ module URI
|
|
37
37
|
{}
|
38
38
|
end
|
39
39
|
|
40
|
-
def parse(uri) # :nodoc:
|
40
|
+
def parse(uri, base = nil) # :nodoc:
|
41
41
|
reset
|
42
|
-
URI.for(*self.split(uri))
|
42
|
+
URI.for(*self.split(uri, base))
|
43
43
|
end
|
44
44
|
|
45
|
-
def split(uri) # :nodoc:
|
45
|
+
def split(uri, base = nil) # :nodoc:
|
46
|
+
@base = nil
|
47
|
+
if base != nil
|
48
|
+
ary = split(base)
|
49
|
+
@base = { scheme: ary[0], userinfo: ary[1], host: ary[2], port: ary[3], registry: ary[4], path: ary[5], opaque: ary[6], query: ary[7], fragment: ary[8]}
|
50
|
+
@base_paths = @paths
|
51
|
+
reset
|
52
|
+
end
|
53
|
+
|
46
54
|
uri = uri.dup
|
47
55
|
uri.gsub!(/\A[\u0000-\u0020]*/, "")
|
48
56
|
uri.gsub!(/[\u0000-\u0020]*\z/, "")
|
@@ -50,7 +58,7 @@ module URI
|
|
50
58
|
uri.delete!("\n")
|
51
59
|
uri.delete!("\r")
|
52
60
|
|
53
|
-
raise ParseError if uri.empty?
|
61
|
+
raise ParseError, "uri can't be empty" if uri.empty? && @base.nil?
|
54
62
|
|
55
63
|
@scanner = StringScanner.new(uri)
|
56
64
|
|
@@ -58,6 +66,11 @@ module URI
|
|
58
66
|
c = @scanner.getch
|
59
67
|
send("on_#{@state}", c)
|
60
68
|
|
69
|
+
if @force_continue
|
70
|
+
@force_continue = false
|
71
|
+
next
|
72
|
+
end
|
73
|
+
|
61
74
|
break if c.nil? && @scanner.eos?
|
62
75
|
end
|
63
76
|
|
@@ -67,10 +80,21 @@ module URI
|
|
67
80
|
@parse_result.values
|
68
81
|
end
|
69
82
|
|
83
|
+
def join(*uris)
|
84
|
+
return parse(uris[0]) if uris.size == 1
|
85
|
+
|
86
|
+
base, input = uris.shift(2)
|
87
|
+
uri = parse(input.to_s, base.to_s)
|
88
|
+
uris.each do |input|
|
89
|
+
uri = parse(input.to_s, uri.to_s)
|
90
|
+
end
|
91
|
+
|
92
|
+
uri
|
93
|
+
end
|
94
|
+
|
70
95
|
private
|
71
96
|
|
72
97
|
def reset
|
73
|
-
@state = nil
|
74
98
|
@scanner = nil
|
75
99
|
@buffer = +""
|
76
100
|
@at_sign_seen = nil
|
@@ -80,6 +104,7 @@ module URI
|
|
80
104
|
@username = nil
|
81
105
|
@password = nil
|
82
106
|
@parse_result = { scheme: nil, userinfo: nil, host: nil, port: nil, registry: nil, path: nil, opaque: nil, query: nil, fragment: nil }
|
107
|
+
@force_continue = false
|
83
108
|
@state = :scheme_start_state
|
84
109
|
end
|
85
110
|
|
@@ -88,7 +113,11 @@ module URI
|
|
88
113
|
@buffer += c.downcase
|
89
114
|
@state = :scheme_state
|
90
115
|
else
|
91
|
-
|
116
|
+
if c.nil?
|
117
|
+
@force_continue = true
|
118
|
+
else
|
119
|
+
@scanner.pos -= c.bytesize
|
120
|
+
end
|
92
121
|
@state = :no_scheme_state
|
93
122
|
end
|
94
123
|
end
|
@@ -102,6 +131,8 @@ module URI
|
|
102
131
|
|
103
132
|
if @parse_result[:scheme] == "file"
|
104
133
|
@state = :file_state
|
134
|
+
elsif special_url? && !@base.nil? && @parse_result[:scheme] == @base[:scheme]
|
135
|
+
@state = :special_relative_or_authority_state
|
105
136
|
elsif special_url?
|
106
137
|
@state = :special_authority_slashes_state
|
107
138
|
elsif @scanner.rest.start_with?("/")
|
@@ -113,22 +144,36 @@ module URI
|
|
113
144
|
end
|
114
145
|
else
|
115
146
|
@buffer = +""
|
116
|
-
|
147
|
+
decrease_pos(c)
|
117
148
|
@state = :no_scheme_state
|
118
149
|
end
|
119
150
|
end
|
120
151
|
|
121
152
|
def on_no_scheme_state(c)
|
122
|
-
raise ParseError
|
153
|
+
raise ParseError, "scheme is missing" if @base.nil? || !@base[:opaque].nil? && c != "#"
|
154
|
+
|
155
|
+
if !@base[:opaque].nil? && c == "#"
|
156
|
+
@parse_result[:scheme] = @base[:scheme]
|
157
|
+
@parse_result[:path] = @base[:path]
|
158
|
+
@parse_result[:query] = @base[:query]
|
159
|
+
@parse_result[:fragment] = ""
|
160
|
+
@state = :fragment_state
|
161
|
+
elsif @base[:scheme] != "file"
|
162
|
+
@state = :relative_state
|
163
|
+
decrease_pos(c)
|
164
|
+
else
|
165
|
+
@state = :file_state
|
166
|
+
decrease_pos(c)
|
167
|
+
end
|
123
168
|
end
|
124
169
|
|
125
170
|
def on_special_relative_or_authority_state(c)
|
126
171
|
if c == "/" && @scanner.rest.start_with?("/")
|
127
172
|
@state = :special_authority_ignore_slashes_state
|
128
|
-
|
173
|
+
decrease_pos(c)
|
129
174
|
else
|
130
175
|
@state = :relative_state
|
131
|
-
|
176
|
+
decrease_pos(c)
|
132
177
|
end
|
133
178
|
end
|
134
179
|
|
@@ -137,14 +182,66 @@ module URI
|
|
137
182
|
@state = :authority_state
|
138
183
|
else
|
139
184
|
@state = :path_state
|
140
|
-
|
185
|
+
decrease_pos(c)
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
def on_relative_state(c)
|
190
|
+
@parse_result[:scheme] = @base[:scheme]
|
191
|
+
if c == "/"
|
192
|
+
@state = :relative_slash_state
|
193
|
+
elsif special_url? && c == "\\"
|
194
|
+
@state = :relative_slash_state
|
195
|
+
else
|
196
|
+
@parse_result[:userinfo] = @base[:userinfo]
|
197
|
+
@parse_result[:host] = @base[:host]
|
198
|
+
@parse_result[:port] = @base[:port]
|
199
|
+
@parse_result[:path] = @base[:path]
|
200
|
+
@parse_result[:query] = @base[:query]
|
201
|
+
|
202
|
+
if c == "?"
|
203
|
+
@parse_result[:query] = ""
|
204
|
+
@state = :query_state
|
205
|
+
elsif c == "#"
|
206
|
+
@parse_result[:fragment] = ""
|
207
|
+
@state = :fragment_state
|
208
|
+
elsif !c.nil?
|
209
|
+
@parse_result[:query] = nil
|
210
|
+
shorten_url_path
|
211
|
+
@state = :path_state
|
212
|
+
@scanner.pos -= c.bytesize
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
def on_relative_slash_state(c)
|
218
|
+
if special_url? && (c == "/" || c == "\\")
|
219
|
+
@state = :special_authority_ignore_slashes_state
|
220
|
+
elsif c == "/"
|
221
|
+
@state = :authority_state
|
222
|
+
else
|
223
|
+
@parse_result[:userinfo] = @base[:userinfo]
|
224
|
+
@parse_result[:host] = @base[:host]
|
225
|
+
@parse_result[:port] = @base[:port]
|
226
|
+
@state = :path_state
|
227
|
+
decrease_pos(c)
|
141
228
|
end
|
142
229
|
end
|
143
230
|
|
144
231
|
def on_special_authority_slashes_state(c)
|
145
|
-
if c
|
232
|
+
if c == "/" && @scanner.rest.start_with?("/")
|
233
|
+
@state = :special_authority_ignore_slashes_state
|
234
|
+
@scanner.pos += c.bytesize
|
235
|
+
else
|
236
|
+
@state = :special_authority_ignore_slashes_state
|
237
|
+
decrease_pos(c)
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
def on_special_authority_ignore_slashes_state(c)
|
242
|
+
if c != "/" && c != "\\"
|
146
243
|
@state = :authority_state
|
147
|
-
|
244
|
+
decrease_pos(c)
|
148
245
|
end
|
149
246
|
end
|
150
247
|
|
@@ -169,8 +266,13 @@ module URI
|
|
169
266
|
|
170
267
|
@buffer = +""
|
171
268
|
elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
|
172
|
-
raise ParseError if @at_sign_seen && @buffer.empty?
|
173
|
-
|
269
|
+
raise ParseError, "host is missing" if @at_sign_seen && @buffer.empty?
|
270
|
+
if c.nil?
|
271
|
+
@force_continue = true
|
272
|
+
@scanner.pos -= @buffer.bytesize
|
273
|
+
else
|
274
|
+
@scanner.pos -= (@buffer.bytesize + c.bytesize.to_i)
|
275
|
+
end
|
174
276
|
@buffer = +""
|
175
277
|
@state = :host_state
|
176
278
|
else
|
@@ -180,15 +282,15 @@ module URI
|
|
180
282
|
|
181
283
|
def on_host_state(c)
|
182
284
|
if c == ":" && !@inside_brackets
|
183
|
-
raise ParseError if @buffer.empty?
|
285
|
+
raise ParseError, "host is missing" if @buffer.empty?
|
184
286
|
|
185
287
|
@parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
|
186
288
|
@buffer = +""
|
187
289
|
@state = :port_state
|
188
290
|
elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
|
189
|
-
|
291
|
+
decrease_pos(c)
|
190
292
|
if special_url? && @buffer.empty?
|
191
|
-
raise ParseError
|
293
|
+
raise ParseError, "host is missing"
|
192
294
|
else
|
193
295
|
@parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
|
194
296
|
@buffer = +""
|
@@ -207,20 +309,20 @@ module URI
|
|
207
309
|
elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
|
208
310
|
unless @buffer.empty?
|
209
311
|
begin
|
210
|
-
port = Integer(@buffer)
|
211
|
-
raise ParseError if port < 0 || port > 65535
|
312
|
+
port = Integer(@buffer, 10)
|
313
|
+
raise ParseError, "port is invalid value" if port < 0 || port > 65535
|
212
314
|
@parse_result[:port] = port unless SPECIAL_SCHEME[@parse_result[:scheme]] == port
|
213
315
|
rescue ArgumentError
|
214
|
-
raise ParseError
|
316
|
+
raise ParseError, "port is invalid value"
|
215
317
|
end
|
216
318
|
|
217
319
|
@buffer = +""
|
218
320
|
end
|
219
321
|
|
220
322
|
@state = :path_start_state
|
221
|
-
|
323
|
+
decrease_pos(c)
|
222
324
|
else
|
223
|
-
raise ParseError
|
325
|
+
raise ParseError, "port is invalid value"
|
224
326
|
end
|
225
327
|
end
|
226
328
|
|
@@ -230,9 +332,28 @@ module URI
|
|
230
332
|
|
231
333
|
if c == "/" || c == "\\"
|
232
334
|
@state = :file_slash_state
|
335
|
+
elsif !@base.nil? && @base[:scheme] == "file"
|
336
|
+
@parse_result[:host] = @base[:host]
|
337
|
+
@parse_result[:query] = @base[:query]
|
338
|
+
if c == "?"
|
339
|
+
@parse_result[:query] = ""
|
340
|
+
@state = :query_state
|
341
|
+
elsif c == "#"
|
342
|
+
@parse_result[:fragment] = ""
|
343
|
+
@state = :fragment_state
|
344
|
+
elsif !c.nil?
|
345
|
+
@parse_result[:query] = nil
|
346
|
+
if !starts_with_windows_drive_letter?(@scanner.rest)
|
347
|
+
shorten_url_path
|
348
|
+
else
|
349
|
+
@paths = []
|
350
|
+
end
|
351
|
+
@state = :path_state
|
352
|
+
decrease_pos(c)
|
353
|
+
end
|
233
354
|
else
|
234
|
-
@scanner.pos -= c.bytesize unless c.nil?
|
235
355
|
@state = :path_state
|
356
|
+
decrease_pos(c)
|
236
357
|
end
|
237
358
|
end
|
238
359
|
|
@@ -240,8 +361,14 @@ module URI
|
|
240
361
|
if c == "/" || c == "\\"
|
241
362
|
@state = :file_host_state
|
242
363
|
else
|
243
|
-
|
364
|
+
if !@base.nil? && @base[:scheme] == "file"
|
365
|
+
@parse_result[:host] = @base[:host]
|
366
|
+
if !starts_with_windows_drive_letter?(@scanner.rest) && normalized_windows_drive_letter?(@base_paths[0])
|
367
|
+
@paths[0] += @base_paths[0]
|
368
|
+
end
|
369
|
+
end
|
244
370
|
@state = :path_state
|
371
|
+
decrease_pos(c)
|
245
372
|
end
|
246
373
|
end
|
247
374
|
|
@@ -361,6 +488,10 @@ module URI
|
|
361
488
|
WINDOWS_DRIVE_LETTER.match?(str)
|
362
489
|
end
|
363
490
|
|
491
|
+
def starts_with_windows_drive_letter?(str)
|
492
|
+
STARTS_WITH_wINDOWS_DRIVE_LETTER.match?(str)
|
493
|
+
end
|
494
|
+
|
364
495
|
def normalized_windows_drive_letter?(str)
|
365
496
|
NORMALIZED_WINDOWS_DRIVE_LETTER.match?(str)
|
366
497
|
end
|
@@ -383,6 +514,14 @@ module URI
|
|
383
514
|
return true if @parse_result[:scheme] == "file" && @parse_result[:path]&.length == 1 && normalized_windows_drive_letter?(@parse_result[:path])
|
384
515
|
@parse_result[:path]&.chomp!
|
385
516
|
end
|
517
|
+
|
518
|
+
def decrease_pos(c)
|
519
|
+
if c.nil?
|
520
|
+
@force_continue = true
|
521
|
+
else
|
522
|
+
@scanner.pos -= c.bytesize
|
523
|
+
end
|
524
|
+
end
|
386
525
|
end
|
387
526
|
end
|
388
527
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: uri-whatwg_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yuji Yaginuma
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-05-
|
10
|
+
date: 2025-05-11 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: strscan
|