uri-whatwg_parser 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f268e45182363045ad2658cd458e9bd6c1309688889a056ab3c507c1805b41f6
4
- data.tar.gz: 842b5a01c0860e9293f5c1b58d71fbd22fbc38de8775488865f2ecd060f5d546
3
+ metadata.gz: 1836e023e3a156fed4ff343555eb8562c358aea0040f644e1239739f83eff9ee
4
+ data.tar.gz: c529afc3fb9bc883857312156ed3628ce5d1978cb30edc041a8b7800d4da269c
5
5
  SHA512:
6
- metadata.gz: df6b26333982d0ab865d4c3b436b6f0b61ba06e2e034c8e8453586bfd5dbffe0493be6bc36b295f4d7ca8511892c454c8b3b178beb387c7e8588290f25cbd462
7
- data.tar.gz: ad8179360bcd2b0e963841936978aa04f38075142762114c7fb0a7c7dae459ad54f2e31deaa8cd9e9a16820d34235d9ad89038c6677648fcc40135f0350e5867
6
+ metadata.gz: dfb4edce987f84e43c1ac6aa5c89b7efca25c65d399caa2635faeedead1706fe55f498cd2d6e65c979c326fc7e80bf78910442fea417836fc497f9c625ab42ed
7
+ data.tar.gz: d916826be29b6668f8820376a5eb6ebf2545aa8e82945eb4075b1c8acd4e10ba9dbe285c26296f27d0ebc5da41aad7f390148611b786ca4319c5a188e54b8bc5
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Uri::WhatwgParser
1
+ # URI::WhatwgParser
2
2
 
3
3
  Ruby implementation of the [WHATWG URL Living Standard](https://url.spec.whatwg.org/).
4
4
 
@@ -29,7 +29,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
29
29
 
30
30
  ## TODO
31
31
 
32
- * Support passing `base`
33
32
  * Support state override
34
33
  * Support validations
35
34
  * Support encodings other than UTF-8
@@ -9,45 +9,44 @@ class URI::WhatwgParser
9
9
  include ParserHelper
10
10
 
11
11
  FORBIDDEN_HOST_CODE_POINT = ["\x00", "\t", "\x0a", "\x0d", " ", "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|"]
12
- FORBIDDEN_DOMAIN_CODE_POINT = FORBIDDEN_HOST_CODE_POINT + C0_CONTROL + ["%"]
12
+ FORBIDDEN_DOMAIN_CODE_POINT = FORBIDDEN_HOST_CODE_POINT + C0_CONTROL_PERCENT_ENCODE_SET + ["%", "\x7f"]
13
13
 
14
14
  def parse(input, opaque = false) # :nodoc:
15
15
  return if input&.empty?
16
16
 
17
17
  if input.start_with?("[")
18
- raise ParseError unless input.end_with?("]")
18
+ raise ParseError, "invalid IPv6 format" unless input.end_with?("]")
19
19
  return parse_ipv6(input)
20
20
  end
21
21
 
22
22
  return parse_opaque_host(input) if opaque
23
23
 
24
24
  domain = percent_decode(input)
25
- ascii_domain = URI::IDNA.whatwg_to_ascii(domain.force_encoding(Encoding::UTF_8))
25
+ ascii_domain = domain_to_ascii(domain)
26
26
  if ends_in_number?(ascii_domain)
27
27
  ipv4 = parse_ipv4(ascii_domain)
28
28
  return serialize_ipv4(ipv4)
29
29
  end
30
30
 
31
- raise ParseError if include_forbidden_domain_code_point?(ascii_domain)
32
31
  ascii_domain
33
32
  rescue URI::IDNA::Error, Encoding::CompatibilityError, ArgumentError => _e
34
- raise ParseError
33
+ raise ParseError, "invalid host value"
35
34
  end
36
35
 
37
36
  private
38
37
 
39
38
  def parse_ipv4(host)
40
39
  parts = host.split(".")
41
- raise URI::WhatwgParser::ParseError if parts.size > 4
40
+ raise URI::WhatwgParser::ParseError, "invalid IPv4 format" if parts.size > 4
42
41
  numbers = []
43
42
  parts.each do |part|
44
43
  value, _validation_error = parse_ipv4_number(part)
45
44
  numbers << value
46
45
  end
47
46
 
48
- (numbers.size-1).times {|i| raise URI::WhatwgParser::ParseError if numbers[i] > 255 }
47
+ (numbers.size-1).times {|i| raise URI::WhatwgParser::ParseError, "invalid IPv4 format" if numbers[i] > 255 }
49
48
 
50
- raise ParseError if numbers.last >= 256 ** (5 - numbers.size)
49
+ raise ParseError, "invalid IPv4 format" if numbers.last >= 256 ** (5 - numbers.size)
51
50
 
52
51
  ipv4 = numbers.pop
53
52
  numbers.each_with_index do |number, index|
@@ -68,9 +67,12 @@ class URI::WhatwgParser
68
67
  end
69
68
 
70
69
  def parse_ipv6(host)
71
- "[#{IPAddr.new(host).to_s}]"
72
- rescue
73
- raise ParseError
70
+ addr = IPAddr.new(host)
71
+ # NOTE: URL Standard doesn't support `zone_id`.
72
+ raise ParseError, "invalid IPv6 format" unless addr.zone_id.nil?
73
+ "[#{addr}]"
74
+ rescue IPAddr::InvalidAddressError
75
+ raise ParseError, "invalid IPv6 format"
74
76
  end
75
77
 
76
78
  def parse_opaque_host(host)
@@ -83,15 +85,18 @@ class URI::WhatwgParser
83
85
  m[1..2].to_i(16).chr
84
86
  end
85
87
  rescue ArgumentError
86
- raise ParseError
88
+ raise ParseError, "including invalid value in host"
87
89
  end
88
90
 
89
91
  def ends_in_number?(domain)
90
- parts = domain.split(".")
91
- return false if parts.size == 0
92
+ parts = domain.split(".", -1)
93
+ if parts.last == ""
94
+ return false if parts.size == 1
95
+ parts.pop
96
+ end
92
97
 
93
98
  last = parts.last
94
- return true if last.chars.all? { |c| ascii_digit?(c) }
99
+ return true if last != "" && last.chars.all? { |c| ascii_digit?(c) }
95
100
 
96
101
  begin
97
102
  parse_ipv4_number(last)
@@ -103,7 +108,7 @@ class URI::WhatwgParser
103
108
  end
104
109
 
105
110
  def parse_ipv4_number(str)
106
- raise ParseError if str&.empty?
111
+ raise ParseError, "invalid IPv4 format" if str&.empty?
107
112
 
108
113
  validation_error = false
109
114
  r = 10
@@ -124,10 +129,19 @@ class URI::WhatwgParser
124
129
  output = Integer(str, r)
125
130
  return output, validation_error
126
131
  rescue ArgumentError
127
- raise ParseError
132
+ raise ParseError, "invalid IPv4 format"
128
133
  end
129
134
  end
130
135
 
136
+ def domain_to_ascii(domain)
137
+ ascii_domain = URI::IDNA.whatwg_to_ascii(domain.force_encoding(Encoding::UTF_8), be_strict: false)
138
+
139
+ raise ParseError, "including invalid value in host" if include_forbidden_domain_code_point?(ascii_domain)
140
+ raise ParseError, "host can't be empty" if ascii_domain.empty?
141
+
142
+ ascii_domain
143
+ end
144
+
131
145
  def include_forbidden_domain_code_point?(str)
132
146
  str.chars.intersect?(FORBIDDEN_DOMAIN_CODE_POINT)
133
147
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module URI
4
4
  class WhatwgParser
5
- VERSION = "0.1.0"
5
+ VERSION = "0.1.1"
6
6
  end
7
7
  end
@@ -37,12 +37,20 @@ module URI
37
37
  {}
38
38
  end
39
39
 
40
- def parse(uri) # :nodoc:
40
+ def parse(uri, base = nil) # :nodoc:
41
41
  reset
42
- URI.for(*self.split(uri))
42
+ URI.for(*self.split(uri, base))
43
43
  end
44
44
 
45
- def split(uri) # :nodoc:
45
+ def split(uri, base = nil) # :nodoc:
46
+ @base = nil
47
+ if base != nil
48
+ ary = split(base)
49
+ @base = { scheme: ary[0], userinfo: ary[1], host: ary[2], port: ary[3], registry: ary[4], path: ary[5], opaque: ary[6], query: ary[7], fragment: ary[8]}
50
+ @base_paths = @paths
51
+ reset
52
+ end
53
+
46
54
  uri = uri.dup
47
55
  uri.gsub!(/\A[\u0000-\u0020]*/, "")
48
56
  uri.gsub!(/[\u0000-\u0020]*\z/, "")
@@ -50,7 +58,7 @@ module URI
50
58
  uri.delete!("\n")
51
59
  uri.delete!("\r")
52
60
 
53
- raise ParseError if uri.empty?
61
+ raise ParseError, "uri can't be empty" if uri.empty? && @base.nil?
54
62
 
55
63
  @scanner = StringScanner.new(uri)
56
64
 
@@ -58,6 +66,11 @@ module URI
58
66
  c = @scanner.getch
59
67
  send("on_#{@state}", c)
60
68
 
69
+ if @force_continue
70
+ @force_continue = false
71
+ next
72
+ end
73
+
61
74
  break if c.nil? && @scanner.eos?
62
75
  end
63
76
 
@@ -67,10 +80,21 @@ module URI
67
80
  @parse_result.values
68
81
  end
69
82
 
83
+ def join(*uris)
84
+ return parse(uris[0]) if uris.size == 1
85
+
86
+ base, input = uris.shift(2)
87
+ uri = parse(input.to_s, base.to_s)
88
+ uris.each do |input|
89
+ uri = parse(input.to_s, uri.to_s)
90
+ end
91
+
92
+ uri
93
+ end
94
+
70
95
  private
71
96
 
72
97
  def reset
73
- @state = nil
74
98
  @scanner = nil
75
99
  @buffer = +""
76
100
  @at_sign_seen = nil
@@ -80,6 +104,7 @@ module URI
80
104
  @username = nil
81
105
  @password = nil
82
106
  @parse_result = { scheme: nil, userinfo: nil, host: nil, port: nil, registry: nil, path: nil, opaque: nil, query: nil, fragment: nil }
107
+ @force_continue = false
83
108
  @state = :scheme_start_state
84
109
  end
85
110
 
@@ -88,7 +113,11 @@ module URI
88
113
  @buffer += c.downcase
89
114
  @state = :scheme_state
90
115
  else
91
- @scanner.pos -= c.bytesize unless c.nil?
116
+ if c.nil?
117
+ @force_continue = true
118
+ else
119
+ @scanner.pos -= c.bytesize
120
+ end
92
121
  @state = :no_scheme_state
93
122
  end
94
123
  end
@@ -102,6 +131,8 @@ module URI
102
131
 
103
132
  if @parse_result[:scheme] == "file"
104
133
  @state = :file_state
134
+ elsif special_url? && !@base.nil? && @parse_result[:scheme] == @base[:scheme]
135
+ @state = :special_relative_or_authority_state
105
136
  elsif special_url?
106
137
  @state = :special_authority_slashes_state
107
138
  elsif @scanner.rest.start_with?("/")
@@ -113,22 +144,36 @@ module URI
113
144
  end
114
145
  else
115
146
  @buffer = +""
116
- @scanner.pos = 0
147
+ decrease_pos(c)
117
148
  @state = :no_scheme_state
118
149
  end
119
150
  end
120
151
 
121
152
  def on_no_scheme_state(c)
122
- raise ParseError
153
+ raise ParseError, "scheme is missing" if @base.nil? || !@base[:opaque].nil? && c != "#"
154
+
155
+ if !@base[:opaque].nil? && c == "#"
156
+ @parse_result[:scheme] = @base[:scheme]
157
+ @parse_result[:path] = @base[:path]
158
+ @parse_result[:query] = @base[:query]
159
+ @parse_result[:fragment] = ""
160
+ @state = :fragment_state
161
+ elsif @base[:scheme] != "file"
162
+ @state = :relative_state
163
+ decrease_pos(c)
164
+ else
165
+ @state = :file_state
166
+ decrease_pos(c)
167
+ end
123
168
  end
124
169
 
125
170
  def on_special_relative_or_authority_state(c)
126
171
  if c == "/" && @scanner.rest.start_with?("/")
127
172
  @state = :special_authority_ignore_slashes_state
128
- @scanner.pos += c.bytesize
173
+ decrease_pos(c)
129
174
  else
130
175
  @state = :relative_state
131
- @scanner.pos -= c.bytesize
176
+ decrease_pos(c)
132
177
  end
133
178
  end
134
179
 
@@ -137,14 +182,66 @@ module URI
137
182
  @state = :authority_state
138
183
  else
139
184
  @state = :path_state
140
- @scanner.pos -= c.bytesize
185
+ decrease_pos(c)
186
+ end
187
+ end
188
+
189
+ def on_relative_state(c)
190
+ @parse_result[:scheme] = @base[:scheme]
191
+ if c == "/"
192
+ @state = :relative_slash_state
193
+ elsif special_url? && c == "\\"
194
+ @state = :relative_slash_state
195
+ else
196
+ @parse_result[:userinfo] = @base[:userinfo]
197
+ @parse_result[:host] = @base[:host]
198
+ @parse_result[:port] = @base[:port]
199
+ @parse_result[:path] = @base[:path]
200
+ @parse_result[:query] = @base[:query]
201
+
202
+ if c == "?"
203
+ @parse_result[:query] = ""
204
+ @state = :query_state
205
+ elsif c == "#"
206
+ @parse_result[:fragment] = ""
207
+ @state = :fragment_state
208
+ elsif !c.nil?
209
+ @parse_result[:query] = nil
210
+ shorten_url_path
211
+ @state = :path_state
212
+ @scanner.pos -= c.bytesize
213
+ end
214
+ end
215
+ end
216
+
217
+ def on_relative_slash_state(c)
218
+ if special_url? && (c == "/" || c == "\\")
219
+ @state = :special_authority_ignore_slashes_state
220
+ elsif c == "/"
221
+ @state = :authority_state
222
+ else
223
+ @parse_result[:userinfo] = @base[:userinfo]
224
+ @parse_result[:host] = @base[:host]
225
+ @parse_result[:port] = @base[:port]
226
+ @state = :path_state
227
+ decrease_pos(c)
141
228
  end
142
229
  end
143
230
 
144
231
  def on_special_authority_slashes_state(c)
145
- if c != "\\" && c != "/"
232
+ if c == "/" && @scanner.rest.start_with?("/")
233
+ @state = :special_authority_ignore_slashes_state
234
+ @scanner.pos += c.bytesize
235
+ else
236
+ @state = :special_authority_ignore_slashes_state
237
+ decrease_pos(c)
238
+ end
239
+ end
240
+
241
+ def on_special_authority_ignore_slashes_state(c)
242
+ if c != "/" && c != "\\"
146
243
  @state = :authority_state
147
- @scanner.pos -= c.bytesize
244
+ decrease_pos(c)
148
245
  end
149
246
  end
150
247
 
@@ -169,8 +266,13 @@ module URI
169
266
 
170
267
  @buffer = +""
171
268
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
172
- raise ParseError if @at_sign_seen && @buffer.empty?
173
- @scanner.pos -= (@buffer.bytesize + c&.bytesize.to_i)
269
+ raise ParseError, "host is missing" if @at_sign_seen && @buffer.empty?
270
+ if c.nil?
271
+ @force_continue = true
272
+ @scanner.pos -= @buffer.bytesize
273
+ else
274
+ @scanner.pos -= (@buffer.bytesize + c.bytesize.to_i)
275
+ end
174
276
  @buffer = +""
175
277
  @state = :host_state
176
278
  else
@@ -180,15 +282,15 @@ module URI
180
282
 
181
283
  def on_host_state(c)
182
284
  if c == ":" && !@inside_brackets
183
- raise ParseError if @buffer.empty?
285
+ raise ParseError, "host is missing" if @buffer.empty?
184
286
 
185
287
  @parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
186
288
  @buffer = +""
187
289
  @state = :port_state
188
290
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
189
- @scanner.pos -= c.bytesize unless c.nil?
291
+ decrease_pos(c)
190
292
  if special_url? && @buffer.empty?
191
- raise ParseError
293
+ raise ParseError, "host is missing"
192
294
  else
193
295
  @parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
194
296
  @buffer = +""
@@ -207,20 +309,20 @@ module URI
207
309
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
208
310
  unless @buffer.empty?
209
311
  begin
210
- port = Integer(@buffer)
211
- raise ParseError if port < 0 || port > 65535
312
+ port = Integer(@buffer, 10)
313
+ raise ParseError, "port is invalid value" if port < 0 || port > 65535
212
314
  @parse_result[:port] = port unless SPECIAL_SCHEME[@parse_result[:scheme]] == port
213
315
  rescue ArgumentError
214
- raise ParseError
316
+ raise ParseError, "port is invalid value"
215
317
  end
216
318
 
217
319
  @buffer = +""
218
320
  end
219
321
 
220
322
  @state = :path_start_state
221
- @scanner.pos -= c.bytesize unless c.nil?
323
+ decrease_pos(c)
222
324
  else
223
- raise ParseError
325
+ raise ParseError, "port is invalid value"
224
326
  end
225
327
  end
226
328
 
@@ -230,9 +332,28 @@ module URI
230
332
 
231
333
  if c == "/" || c == "\\"
232
334
  @state = :file_slash_state
335
+ elsif !@base.nil? && @base[:scheme] == "file"
336
+ @parse_result[:host] = @base[:host]
337
+ @parse_result[:query] = @base[:query]
338
+ if c == "?"
339
+ @parse_result[:query] = ""
340
+ @state = :query_state
341
+ elsif c == "#"
342
+ @parse_result[:fragment] = ""
343
+ @state = :fragment_state
344
+ elsif !c.nil?
345
+ @parse_result[:query] = nil
346
+ if !starts_with_windows_drive_letter?(@scanner.rest)
347
+ shorten_url_path
348
+ else
349
+ @paths = []
350
+ end
351
+ @state = :path_state
352
+ decrease_pos(c)
353
+ end
233
354
  else
234
- @scanner.pos -= c.bytesize unless c.nil?
235
355
  @state = :path_state
356
+ decrease_pos(c)
236
357
  end
237
358
  end
238
359
 
@@ -240,8 +361,14 @@ module URI
240
361
  if c == "/" || c == "\\"
241
362
  @state = :file_host_state
242
363
  else
243
- @scanner.pos -= c.bytesize unless c.nil?
364
+ if !@base.nil? && @base[:scheme] == "file"
365
+ @parse_result[:host] = @base[:host]
366
+ if !starts_with_windows_drive_letter?(@scanner.rest) && normalized_windows_drive_letter?(@base_paths[0])
367
+ @paths[0] += @base_paths[0]
368
+ end
369
+ end
244
370
  @state = :path_state
371
+ decrease_pos(c)
245
372
  end
246
373
  end
247
374
 
@@ -361,6 +488,10 @@ module URI
361
488
  WINDOWS_DRIVE_LETTER.match?(str)
362
489
  end
363
490
 
491
+ def starts_with_windows_drive_letter?(str)
492
+ STARTS_WITH_wINDOWS_DRIVE_LETTER.match?(str)
493
+ end
494
+
364
495
  def normalized_windows_drive_letter?(str)
365
496
  NORMALIZED_WINDOWS_DRIVE_LETTER.match?(str)
366
497
  end
@@ -383,6 +514,14 @@ module URI
383
514
  return true if @parse_result[:scheme] == "file" && @parse_result[:path]&.length == 1 && normalized_windows_drive_letter?(@parse_result[:path])
384
515
  @parse_result[:path]&.chomp!
385
516
  end
517
+
518
+ def decrease_pos(c)
519
+ if c.nil?
520
+ @force_continue = true
521
+ else
522
+ @scanner.pos -= c.bytesize
523
+ end
524
+ end
386
525
  end
387
526
  end
388
527
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: uri-whatwg_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yuji Yaginuma
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-05-04 00:00:00.000000000 Z
10
+ date: 2025-05-11 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: strscan