uri-whatwg_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +17 -0
- data/lib/uri/whatwg_parser/error.rb +5 -0
- data/lib/uri/whatwg_parser/host_parser.rb +139 -0
- data/lib/uri/whatwg_parser/parser_helper.rb +27 -0
- data/lib/uri/whatwg_parser/version.rb +7 -0
- data/lib/uri/whatwg_parser.rb +391 -0
- metadata +105 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f268e45182363045ad2658cd458e9bd6c1309688889a056ab3c507c1805b41f6
|
4
|
+
data.tar.gz: 842b5a01c0860e9293f5c1b58d71fbd22fbc38de8775488865f2ecd060f5d546
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: df6b26333982d0ab865d4c3b436b6f0b61ba06e2e034c8e8453586bfd5dbffe0493be6bc36b295f4d7ca8511892c454c8b3b178beb387c7e8588290f25cbd462
|
7
|
+
data.tar.gz: ad8179360bcd2b0e963841936978aa04f38075142762114c7fb0a7c7dae459ad54f2e31deaa8cd9e9a16820d34235d9ad89038c6677648fcc40135f0350e5867
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2025 Yuji Yaginuma
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# URI::WhatwgParser
|
2
|
+
|
3
|
+
Ruby implementation of the [WHATWG URL Living Standard](https://url.spec.whatwg.org/).
|
4
|
+
|
5
|
+
This package implements the standard as of the [24 March 2025](https://url.spec.whatwg.org/commit-snapshots/cc8b776b89a6d92b5cc74581c8d90450d3c1e762/) revision.
|
6
|
+
|
7
|
+
NOTE: Some features haven't been implemented yet. Please see the TODO for details.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```bash
|
12
|
+
gem install uri-whatwg_parser
|
13
|
+
```
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
This gem is compatible with the [`uri`](https://github.com/ruby/uri) gem and automatically replaces its default parser when required, so no additional setup is needed.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
URI.parse("http://日本語.jp")
|
21
|
+
# => #<URI::HTTP http://xn--wgv71a119e.jp>
|
22
|
+
```
|
23
|
+
|
24
|
+
## Development
|
25
|
+
|
26
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
27
|
+
|
28
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
29
|
+
|
30
|
+
## TODO
|
31
|
+
|
32
|
+
* Support passing `base`
|
33
|
+
* Support state override
|
34
|
+
* Support validations
|
35
|
+
* Support encodings other than UTF-8
|
36
|
+
|
37
|
+
## Contributing
|
38
|
+
|
39
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/y-yagi/uri-whatwg_parser.
|
40
|
+
|
41
|
+
## License
|
42
|
+
|
43
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bundler/gem_tasks"
|
4
|
+
require "rake/testtask"
|
5
|
+
|
6
|
+
# `rake test`: adds test/ to the load path and runs every test_*.rb file beneath it.
Rake::TestTask.new(:test) do |task|
  task.libs.push("test")
  task.test_files = FileList["test/**/test_*.rb"]
end
|
10
|
+
|
11
|
+
# `rake download_wpt_resources`: downloads the web-platform-tests URL test
# data (urltestdata.json) into test/resources.
task :download_wpt_resources do
  wpt_url = "https://raw.githubusercontent.com/web-platform-tests/wpt/master/url/resources/urltestdata.json"

  Dir.chdir "test/resources" do
    # The argument-list form runs curl directly instead of through a shell.
    # --fail makes curl exit non-zero on an HTTP error, so `exception: true`
    # aborts the task instead of leaving an error page saved as the JSON file.
    system("curl", "--fail", "-O", wpt_url, exception: true)
  end
end
|
16
|
+
|
17
|
+
# Running `rake` with no arguments runs the test task.
task default: %i[test]
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "uri/idna"
|
4
|
+
require "ipaddr"
|
5
|
+
require_relative "parser_helper"
|
6
|
+
|
7
|
+
class URI::WhatwgParser
|
8
|
+
class HostParser
|
9
|
+
include ParserHelper
|
10
|
+
|
11
|
+
# Characters that may not appear in any host (compared against String#chars).
FORBIDDEN_HOST_CODE_POINT = ["\x00", "\t", "\x0a", "\x0d", " ", "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|"].freeze

# Characters that may not appear in a domain: every forbidden host code point,
# the C0 controls (U+0000..U+001F), "%" and U+007F DELETE.
# The C0 controls are built here as one-character Strings; adding them as
# Integers would never match the characters produced by String#chars.
FORBIDDEN_DOMAIN_CODE_POINT = (FORBIDDEN_HOST_CODE_POINT | (0x00..0x1f).map(&:chr) | ["%", "\x7f"]).freeze
|
13
|
+
|
14
|
+
# Parses a host string and returns its serialized form.
#
# input  - host text taken from the URL.
# opaque - when true, a non-bracketed host is handled by parse_opaque_host
#          instead of being treated as a domain / IPv4 address.
#
# Returns nil for an empty input, otherwise a String.
# Raises ParseError when the host is rejected; IDNA, encoding and argument
# errors raised while processing are converted to ParseError as well.
def parse(input, opaque = false) # :nodoc:
  return if input&.empty?

  # A leading "[" announces an IPv6 literal, which must be closed by "]".
  if input.start_with?("[")
    return parse_ipv6(input) if input.end_with?("]")

    raise ParseError
  end

  if opaque
    parse_opaque_host(input)
  else
    decoded = percent_decode(input).force_encoding(Encoding::UTF_8)
    ascii_domain = URI::IDNA.whatwg_to_ascii(decoded)

    if ends_in_number?(ascii_domain)
      serialize_ipv4(parse_ipv4(ascii_domain))
    elsif include_forbidden_domain_code_point?(ascii_domain)
      raise ParseError
    else
      ascii_domain
    end
  end
rescue URI::IDNA::Error, Encoding::CompatibilityError, ArgumentError
  raise ParseError
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
# Converts a dotted IPv4 host such as "127.0.0.1" or "127.1" to its integer value.
#
# Each part is parsed with parse_ipv4_number. Every part except the last
# must be at most 255; the last part fills all remaining bytes of the address.
#
# Raises ParseError when there are more than four parts or a part is too large.
def parse_ipv4(host)
  segments = host.split(".")
  raise URI::WhatwgParser::ParseError if segments.size > 4

  values = segments.map { |segment| parse_ipv4_number(segment).first }
  *leading, final = values

  raise URI::WhatwgParser::ParseError if leading.any? { |value| value > 255 }
  raise ParseError if final >= 256 ** (5 - values.size)

  # Leading parts occupy the most significant bytes, in order.
  leading.each_with_index.reduce(final) do |address, (value, index)|
    address + value * (256 ** (3 - index))
  end
end
|
59
|
+
|
60
|
+
# Formats an integer IPv4 address as dotted decimal, e.g. 16909060 -> "1.2.3.4".
def serialize_ipv4(ipv4)
  # Peel off the least significant byte four times, then restore the order.
  octets = Array.new(4) do
    ipv4, octet = ipv4.divmod(256)
    octet.to_s
  end

  octets.reverse.join(".")
end
|
69
|
+
|
70
|
+
# Parses a bracketed IPv6 literal with IPAddr and returns it re-wrapped in
# brackets. Any error raised while parsing is reported as ParseError.
def parse_ipv6(host)
  address = IPAddr.new(host)
  "[#{address}]"
rescue
  raise ParseError
end
|
75
|
+
|
76
|
+
# Validates and encodes the host of a non-special URL.
#
# Raises ParseError when the host contains a forbidden host code point;
# otherwise percent-encodes each character with the C0 control set.
def parse_opaque_host(host)
  raise ParseError if include_forbidden_host_code_point?(host)

  host.each_char.map { |char| percent_encode(char, C0_CONTROL_PERCENT_ENCODE_SET) }.join
end
|
80
|
+
|
81
|
+
# Replaces every "%XX" escape in +str+ with the byte it denotes.
#
# The work is done on a binary copy of the input: substituting raw bytes
# >= 0x80 into a UTF-8 string that also holds non-ASCII characters raises
# Encoding::CompatibilityError, which made hosts mixing raw and
# percent-encoded non-ASCII text unparseable.
#
# Returns a new binary (ASCII-8BIT) String; the caller re-tags it as UTF-8.
# Raises ParseError if the substitution raises ArgumentError.
def percent_decode(str)
  str.b.gsub(/%[0-9A-Fa-f]{2}/) { |escape| escape[1, 2].hex.chr }
rescue ArgumentError
  raise ParseError
end
|
88
|
+
|
89
|
+
# Reports whether the last dot-separated label of +domain+ is numeric, in
# which case the host is to be treated as an IPv4 address.
#
# The label counts as numeric when it consists only of ASCII digits or when
# parse_ipv4_number accepts it.
def ends_in_number?(domain)
  final_label = domain.split(".").last
  return false if final_label.nil?
  return true if final_label.each_char.all? { |char| ascii_digit?(char) }

  parse_ipv4_number(final_label)
  true
rescue ParseError
  false
end
|
104
|
+
|
105
|
+
# Parses one part of an IPv4 address.
#
# A "0x"/"0X" prefix selects hexadecimal and a leading "0" selects octal;
# anything else is decimal. A part consisting only of the prefix is 0.
#
# The digits are validated explicitly before conversion. Kernel#Integer is
# more lenient than wanted here: it accepts signs, underscores, surrounding
# whitespace and a repeated radix prefix, so inputs such as "-1", "1_0" or
# "0x0x1" used to be read as numbers.
#
# Returns [value, validation_error], where validation_error is true when a
# radix prefix was present.
# Raises ParseError for an empty part or one with characters outside the radix.
def parse_ipv4_number(str)
  raise ParseError if str.nil? || str.empty?

  validation_error = false
  radix = 10
  digits = /\A[0-9]+\z/

  if str.size >= 2 && str.start_with?("0x", "0X")
    validation_error = true
    str = str[2..]
    radix = 16
    digits = /\A[0-9A-Fa-f]+\z/
  elsif str.size >= 2 && str.start_with?("0")
    validation_error = true
    str = str[1..]
    radix = 8
    digits = /\A[0-7]+\z/
  end

  return 0, true if str.empty?
  raise ParseError unless digits.match?(str)

  return str.to_i(radix), validation_error
end
|
130
|
+
|
131
|
+
# True when any character of +str+ is listed in FORBIDDEN_DOMAIN_CODE_POINT.
def include_forbidden_domain_code_point?(str)
  str.each_char.any? { |char| FORBIDDEN_DOMAIN_CODE_POINT.include?(char) }
end
|
134
|
+
|
135
|
+
# True when any character of +str+ is listed in FORBIDDEN_HOST_CODE_POINT.
def include_forbidden_host_code_point?(str)
  str.each_char.any? { |char| FORBIDDEN_HOST_CODE_POINT.include?(char) }
end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class URI::WhatwgParser
|
4
|
+
module ParserHelper
|
5
|
+
# Code points U+0000..U+001F as Integers.
C0_CONTROL = [*0x00..0x1f]
# The same code points as one-character Strings, for use with percent_encode.
C0_CONTROL_PERCENT_ENCODE_SET = C0_CONTROL.map { |code_point| code_point.chr }
|
7
|
+
|
8
|
+
# True when +c+ is one of the entries of ASCII_ALPHA.
def ascii_alpha?(c)
  ASCII_ALPHA.any? { |letter| letter == c }
end
|
11
|
+
|
12
|
+
# True when +c+ is accepted by ascii_digit? or by ascii_alpha?.
def ascii_alphanumerica?(c)
  ascii_digit?(c) || ascii_alpha?(c)
end
|
15
|
+
|
16
|
+
# True when +c+ is one of the entries of ASCII_DIGIT.
def ascii_digit?(c)
  ASCII_DIGIT.any? { |digit| digit == c }
end
|
19
|
+
|
20
|
+
# Percent-encodes a single character when required.
#
# c          - one-character String.
# encode_set - collection of characters that must be encoded.
#
# Characters in +encode_set+ and all code points above U+007E are returned
# as "%XX" for each of their bytes (uppercase hex); any other character is
# returned unchanged.
def percent_encode(c, encode_set)
  return c unless encode_set.include?(c) || c.ord > 0x7e

  c.bytes.map { |byte| format("%%%02X", byte) }.join
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,391 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "strscan"
|
4
|
+
require "uri"
|
5
|
+
require_relative "whatwg_parser/error"
|
6
|
+
require_relative "whatwg_parser/version"
|
7
|
+
require_relative "whatwg_parser/parser_helper"
|
8
|
+
require_relative "whatwg_parser/host_parser"
|
9
|
+
|
10
|
+
module URI
|
11
|
+
class WhatwgParser
|
12
|
+
include ParserHelper
|
13
|
+
|
14
|
+
# Special schemes mapped to their default port (nil when there is none).
SPECIAL_SCHEME = {
  "ftp" => 21,
  "file" => nil,
  "http" => 80,
  "https" => 443,
  "ws" => 80,
  "wss" => 443,
}
ASCII_ALPHA = [*"a".."z", *"A".."Z"]
ASCII_DIGIT = [*"0".."9"]

# Percent-encode sets; each one extends a smaller set with more characters.
FRAGMENT_PERCENT_ENCODE_SET = [*C0_CONTROL_PERCENT_ENCODE_SET, " ", "\"", "<", ">", "`"]
QUERY_PERCENT_ENCODE_SET = [*C0_CONTROL_PERCENT_ENCODE_SET, " ", "\"", "#", "<", ">"]
SPECIAL_QUERY_PERCENT_ENCODE_SET = [*QUERY_PERCENT_ENCODE_SET, "'"]
PATH_PERCENT_ENCODE_SET = [*QUERY_PERCENT_ENCODE_SET, "?", "^", "`", "{", "}"]
USERINFO_PERCENT_ENCODE_SET = [*PATH_PERCENT_ENCODE_SET, "/", ":", ";", "=", "@", "[", "\\", "]", "|"]

# Spellings of the "." and ".." path segments, including percent-encoded dots.
SINGLE_DOT_PATH_SEGMENTS = [".", "%2e", "%2E"]
DOUBLE_DOT_PATH_SEGMENTS = [
  "..", ".%2e", ".%2E",
  "%2e.", "%2e%2e", "%2e%2E",
  "%2E.", "%2E%2e", "%2E%2E",
]
|
26
|
+
|
27
|
+
# An ASCII letter followed by ":" or "|", and nothing else.
WINDOWS_DRIVE_LETTER = /\A([a-zA-Z][:|])\z/
# An ASCII letter followed by ":", and nothing else.
NORMALIZED_WINDOWS_DRIVE_LETTER = /\A([a-zA-Z][:])\z/
# A string that begins with a drive letter which is either the whole string
# or is followed by "/", "\", "?" or "#". The previous pattern was anchored
# with \z after the optional delimiter, so it only matched strings of two or
# three characters, and its delimiter class did not contain the backslash.
STARTS_WITH_WINDOWS_DRIVE_LETTER = %r{\A([a-zA-Z][:|])(?:[/\\?#]|\z)}
# Original (misspelled) constant name, kept so existing references still resolve.
STARTS_WITH_wINDOWS_DRIVE_LETTER = STARTS_WITH_WINDOWS_DRIVE_LETTER
|
30
|
+
|
31
|
+
# Creates the host parser used for all parses and puts the state machine
# into its initial state.
def initialize
  @host_parser = HostParser.new
  reset
end
|
35
|
+
|
36
|
+
# Always returns a new empty Hash.
# NOTE(review): presumably present so callers that ask the default URI
# parser for its regexp table keep working; confirm against the uri gem.
def regexp
  Hash.new
end
|
39
|
+
|
40
|
+
# Parses +uri+ and returns the URI object built by URI.for from the
# components produced by split. Parser state is reset first.
def parse(uri) # :nodoc:
  reset
  components = split(uri)
  URI.for(*components)
end
|
44
|
+
|
45
|
+
# Splits +uri+ into the nine components
# [scheme, userinfo, host, port, registry, path, opaque, query, fragment].
#
# Leading and trailing characters in U+0000..U+0020 are stripped and every
# tab, LF and CR is removed before parsing. The input is then fed one
# character at a time (followed by nil for end of input) to the handler
# named after the current state.
#
# State is reset on entry so that calling split directly (without going
# through parse) never continues from the leftovers of an earlier parse.
#
# Raises ParseError when nothing remains after stripping, or when a state
# handler rejects the input.
def split(uri) # :nodoc:
  reset

  uri = uri.sub(/\A[\u0000-\u0020]+/, "").sub(/[\u0000-\u0020]+\z/, "").delete("\t\n\r")
  raise ParseError if uri.empty?

  @scanner = StringScanner.new(uri)

  loop do
    c = @scanner.getch
    send("on_#{@state}", c)

    break if c.nil? && @scanner.eos?
  end

  if @username || @password
    # Only add the ":" separator when there is a password to follow it, so
    # "user@host" yields "user" rather than "user:".
    @parse_result[:userinfo] = @password.to_s.empty? ? @username.to_s : "#{@username}:#{@password}"
  end
  @parse_result[:path] = "/#{@paths.join("/")}" unless @paths.empty?

  @parse_result.values
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
# Restores every piece of per-parse state to its initial value and makes
# :scheme_start_state the current state.
def reset
  @scanner = nil
  @buffer = +""
  @paths = []
  @at_sign_seen = @password_token_seen = @inside_brackets = nil
  @username = @password = nil
  # Key order matters: split returns these values positionally.
  @parse_result = {
    scheme: nil, userinfo: nil, host: nil, port: nil, registry: nil,
    path: nil, opaque: nil, query: nil, fragment: nil
  }
  @state = :scheme_start_state
end
|
85
|
+
|
86
|
+
# Scheme start state: an ASCII letter begins a scheme (stored lowercased);
# anything else is pushed back and handed to the no-scheme state.
def on_scheme_start_state(c)
  if ascii_alpha?(c)
    @buffer += c.downcase
    return @state = :scheme_state
  end

  @scanner.pos -= c.bytesize if c
  @state = :no_scheme_state
end
|
95
|
+
|
96
|
+
# Scheme state: collects scheme characters until ":" and then selects the
# state that handles what follows the scheme.
#
# ":" stores the buffered scheme and moves to
#   - the file state for "file",
#   - the special-authority-slashes state for other special schemes,
#   - the path-or-authority state when the next character is "/" (that
#     slash is skipped), or
#   - the opaque path state otherwise (path starts as "").
# Any character that cannot be part of a scheme rewinds to the start of the
# input and switches to the no-scheme state.
def on_scheme_state(c)
  if c == ":"
    @parse_result[:scheme] = @buffer
    @buffer = +""

    if @parse_result[:scheme] == "file"
      @state = :file_state
    elsif special_url?
      @state = :special_authority_slashes_state
    elsif @scanner.rest.start_with?("/")
      @scanner.pos += c.bytesize
      @state = :path_or_authority_state
    else
      @parse_result[:path] = ""
      @state = :opaque_path_state
    end
  elsif ascii_alphanumerica?(c) || ["+", "-", "."].include?(c)
    @buffer += c.downcase
  else
    @buffer = +""
    @scanner.pos = 0
    @state = :no_scheme_state
  end
end
|
120
|
+
|
121
|
+
# No-scheme state: input without a scheme is always rejected with
# ParseError (the README lists base URL support as a TODO).
def on_no_scheme_state(c)
  raise(ParseError)
end
|
124
|
+
|
125
|
+
# Special relative-or-authority state: "//" moves on to the
# special-authority-ignore-slashes state (skipping the second slash);
# anything else is pushed back for the relative state.
def on_special_relative_or_authority_state(c)
  if c == "/" && @scanner.rest.start_with?("/")
    @state = :special_authority_ignore_slashes_state
    @scanner.pos += c.bytesize
  else
    @state = :relative_state
    # c is nil at end of input; there is nothing to push back in that case.
    @scanner.pos -= c.bytesize unless c.nil?
  end
end
|
134
|
+
|
135
|
+
# Path-or-authority state: a "/" here (the second slash after the scheme)
# starts an authority; anything else is pushed back for the path state.
def on_path_or_authority_state(c)
  if c == "/"
    @state = :authority_state
  else
    @state = :path_state
    # c is nil when the input ends right after "scheme:/"; calling
    # bytesize on it raised NoMethodError.
    @scanner.pos -= c.bytesize unless c.nil?
  end
end
|
143
|
+
|
144
|
+
# Special-authority-slashes state: skips every "/" and "\" that follows the
# scheme of a special URL; the first other character is pushed back and
# handed to the authority state.
def on_special_authority_slashes_state(c)
  return if c == "/" || c == "\\"

  @state = :authority_state
  # c is nil when the input ends right after the scheme (e.g. "http:");
  # calling bytesize on it raised NoMethodError.
  @scanner.pos -= c.bytesize unless c.nil?
end
|
150
|
+
|
151
|
+
# Authority state: buffers characters until either "@" or the end of the
# authority is reached.
#
# "@"  - the buffer holds credentials. Text before the first ":" is added
#        to the username and the rest to the password, each character
#        percent-encoded with the userinfo set. If an "@" was already seen,
#        "%40" is put in front of the buffer first.
# end  - (nil, "/", "?", "#", or "\" for special URLs) the scanner is
#        rewound over the buffered text and the terminator so the host state
#        re-reads them. Raises ParseError when an "@" was seen but nothing
#        follows it.
def on_authority_state(c)
  if c == "@"
    @buffer.prepend("%40") if @at_sign_seen
    @at_sign_seen = true

    @buffer.each_char do |char|
      if !@password_token_seen && char == ":"
        @password_token_seen = true
      else
        encoded = percent_encode(char, USERINFO_PERCENT_ENCODE_SET)
        if @password_token_seen
          @password = "#{@password}#{encoded}"
        else
          @username = "#{@username}#{encoded}"
        end
      end
    end

    @buffer = +""
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
    raise ParseError if @at_sign_seen && @buffer.empty?

    rewind = @buffer.bytesize
    rewind += c.bytesize unless c.nil?
    @scanner.pos -= rewind
    @buffer = +""
    @state = :host_state
  else
    @buffer << c
  end
end
|
180
|
+
|
181
|
+
# Host state: buffers the host until a port separator or the end of the
# authority, then parses it with the host parser.
#
# ":" outside brackets - parse the buffered host and continue with the port.
#                        Raises ParseError when the buffer is empty.
# end of authority     - (nil, "/", "?", "#", or "\" for special URLs) push
#                        the terminator back, parse the buffered host and
#                        continue with the path start state. Raises
#                        ParseError for a special URL with an empty host.
# anything else        - appended to the buffer; "[" and "]" toggle the
#                        inside-brackets flag so ":" inside an IPv6 literal
#                        is not taken as the port separator.
def on_host_state(c)
  if c == ":" && !@inside_brackets
    raise ParseError if @buffer.empty?

    @parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
    @buffer = +""
    @state = :port_state
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
    @scanner.pos -= c.bytesize if c
    raise ParseError if special_url? && @buffer.empty?

    @parse_result[:host] = @host_parser.parse(@buffer, !special_url?)
    @buffer = +""
    @state = :path_start_state
  else
    case c
    when "[" then @inside_brackets = true
    when "]" then @inside_brackets = false
    end
    @buffer += c
  end
end
|
203
|
+
|
204
|
+
# Port state: buffers ASCII digits; at the end of the authority (nil, "/",
# "?", "#", or "\" for special URLs) the buffered digits become the port.
#
# The port is stored only when it differs from the scheme's default port.
# The terminator is pushed back and parsing continues in the path start state.
#
# Raises ParseError for a port above 65535 or for any other character.
def on_port_state(c)
  if ascii_digit?(c)
    @buffer += c
  elsif c.nil? || ["/", "?", "#"].include?(c) || (special_url? && c == "\\")
    unless @buffer.empty?
      begin
        # The radix is given explicitly: Integer("080") treats a leading
        # zero as an octal prefix (64) and rejects "08" altogether.
        port = Integer(@buffer, 10)
      rescue ArgumentError
        raise ParseError
      end

      raise ParseError if port < 0 || port > 65535
      @parse_result[:port] = port unless SPECIAL_SCHEME[@parse_result[:scheme]] == port

      @buffer = +""
    end

    @state = :path_start_state
    @scanner.pos -= c.bytesize unless c.nil?
  else
    raise ParseError
  end
end
|
226
|
+
|
227
|
+
# File state: records the "file" scheme with an empty host, then looks at
# the first character after "file:". A slash or backslash leads to the file
# slash state; anything else is pushed back for the path state.
def on_file_state(c)
  @parse_result.update(scheme: "file", host: "")

  if ["/", "\\"].include?(c)
    @state = :file_slash_state
  else
    @scanner.pos -= c.bytesize if c
    @state = :path_state
  end
end
|
238
|
+
|
239
|
+
# File slash state: a second slash or backslash announces a file host;
# anything else is pushed back for the path state.
def on_file_slash_state(c)
  if ["/", "\\"].include?(c)
    @state = :file_host_state
  else
    @scanner.pos -= c.bytesize if c
    @state = :path_state
  end
end
|
247
|
+
|
248
|
+
# File host state: buffers the host of a file URL until nil, "/", "\", "?"
# or "#" is seen.
#
# At the terminator (which is pushed back for the next state):
#   - a buffered Windows drive letter is not a host; the buffer is kept and
#     the path state takes over,
#   - an empty buffer means an empty host,
#   - otherwise the buffer is parsed as a host, and "localhost" leaves the
#     host as it was.
#
# Only non-terminator characters are buffered. The terminator used to be
# appended to the buffer as well, so the following path segment started
# with a stray "/" (for example "file:///a" produced the path "//a").
def on_file_host_state(c)
  unless c.nil? || ["/", "\\", "?", "#"].include?(c)
    @buffer += c
    return
  end

  @scanner.pos -= c.bytesize unless c.nil?

  if windows_drive_letter?(@buffer)
    @state = :path_state
  elsif @buffer.empty?
    @parse_result[:host] = ""
    @state = :path_start_state
  else
    host = @host_parser.parse(@buffer, !special_url?)
    @parse_result[:host] = host unless host == "localhost"

    @buffer = +""
    @state = :path_start_state
  end
end
|
270
|
+
|
271
|
+
# Path start state: decides how the part after the authority begins.
#
# Nothing happens at end of input. For special URLs the path state follows,
# and the character is pushed back unless it is the leading "/" or "\".
# For other URLs "?" starts the query, "#" starts the fragment, and
# anything else starts the path (pushed back unless it is the leading "/").
def on_path_start_state(c)
  return if c.nil?

  if special_url?
    @scanner.pos -= c.bytesize unless ["/", "\\"].include?(c)
    @state = :path_state
  else
    case c
    when "?"
      @state = :query_state
    when "#"
      @state = :fragment_state
    else
      @scanner.pos -= c.bytesize unless c == "/"
      @state = :path_state
    end
  end
end
|
286
|
+
|
287
|
+
# Path state: collects one path segment at a time.
#
# A segment ends at end of input, "/", "?", "#", or "\" in a special URL.
# When it ends:
#   - ".." (in any spelling) removes the previous segment,
#   - "." adds nothing,
#   - both add an empty segment when they are not followed by a separator,
#     so the serialized path keeps its trailing slash,
#   - any other segment is appended; in a file URL a leading Windows drive
#     letter written with "|" is normalized to ":".
# "?" and "#" then start the query and the fragment. Every other character
# is percent-encoded with the path set and added to the current segment.
#
# Fixes compared with the previous version: the backslash test compared
# against "\/" (which is just "/"), the "not followed by a separator" test
# was inverted for special URLs, and the fragment was stored under the
# misspelled key :frament, which added a tenth entry to the result hash.
def on_path_state(c)
  separator = c == "/" || (special_url? && c == "\\")

  if c.nil? || separator || c == "?" || c == "#"
    if double_dot_path_segments?(@buffer)
      shorten_url_path
      @paths << "" unless separator
    elsif single_dot_path_segments?(@buffer)
      @paths << "" unless separator
    else
      if @parse_result[:scheme] == "file" && @paths.empty? && windows_drive_letter?(@buffer)
        @buffer[1] = ":"
      end
      @paths << @buffer
    end

    @buffer = +""

    if c == "?"
      @parse_result[:query] = ""
      @state = :query_state
    elsif c == "#"
      @parse_result[:fragment] = ""
      @state = :fragment_state
    end
  else
    @buffer << percent_encode(c, PATH_PERCENT_ENCODE_SET)
  end
end
|
317
|
+
|
318
|
+
# Opaque path state: appends characters to the opaque path until "?" or "#".
#
# "?" and "#" start the query and the fragment. A space is written as "%20"
# when it is directly followed by "?" or "#" and kept as a space otherwise.
# Other characters are percent-encoded with the C0 control set. End of input
# changes nothing.
def on_opaque_path_state(c)
  case c
  when "?"
    @parse_result[:query] = ""
    @state = :query_state
  when "#"
    @parse_result[:fragment] = ""
    @state = :fragment_state
  when " "
    before_delimiter = @scanner.rest.start_with?("?") || @scanner.rest.start_with?("#")
    @parse_result[:path] = @parse_result[:path].to_s + (before_delimiter ? "%20" : " ")
  when nil
    # end of input: nothing to add
  else
    @parse_result[:path] = @parse_result[:path].to_s + percent_encode(c, C0_CONTROL_PERCENT_ENCODE_SET)
  end
end
|
335
|
+
|
336
|
+
# Query state: buffers characters until "#" or end of input, then stores
# the buffered text as the query, percent-encoded with the special-query
# set for special URLs and the query set otherwise. "#" continues with the
# fragment state.
def on_query_state(c)
  if !c.nil? && c != "#"
    @buffer << c
  else
    encode_set = special_url? ? SPECIAL_QUERY_PERCENT_ENCODE_SET : QUERY_PERCENT_ENCODE_SET
    @parse_result[:query] = @buffer.each_char.map { |char| percent_encode(char, encode_set) }.join
    @buffer = +""
    @state = :fragment_state if c == "#"
  end
end
|
346
|
+
|
347
|
+
# Fragment state: appends each character to the fragment, percent-encoded
# with the fragment set. End of input changes nothing.
def on_fragment_state(c)
  return if c.nil?

  encoded = percent_encode(c, FRAGMENT_PERCENT_ENCODE_SET)
  @parse_result[:fragment] = "#{@parse_result[:fragment]}#{encoded}"
end
|
351
|
+
|
352
|
+
# True when +c+ is a C0 control character or a space.
def c0_control_or_space?(c)
  # c0_control? takes the character as its argument; calling it without one
  # raised ArgumentError.
  c0_control?(c) || c == " "
end
|
355
|
+
|
356
|
+
# True when the code point of +c+ is listed in C0_CONTROL.
def c0_control?(c)
  code_point = c.ord
  C0_CONTROL.include?(code_point)
end
|
359
|
+
|
360
|
+
# True when +str+ matches WINDOWS_DRIVE_LETTER.
def windows_drive_letter?(str)
  pattern = WINDOWS_DRIVE_LETTER
  pattern.match?(str)
end
|
363
|
+
|
364
|
+
# True when +str+ matches NORMALIZED_WINDOWS_DRIVE_LETTER.
def normalized_windows_drive_letter?(str)
  pattern = NORMALIZED_WINDOWS_DRIVE_LETTER
  pattern.match?(str)
end
|
367
|
+
|
368
|
+
# True when the scheme parsed so far is a key of SPECIAL_SCHEME.
def special_url?
  SPECIAL_SCHEME.key?(@parse_result[:scheme])
end
|
371
|
+
|
372
|
+
# True when +c+ is one of the spellings in SINGLE_DOT_PATH_SEGMENTS.
def single_dot_path_segments?(c)
  SINGLE_DOT_PATH_SEGMENTS.any? { |segment| segment == c }
end
|
375
|
+
|
376
|
+
# True when +c+ is one of the spellings in DOUBLE_DOT_PATH_SEGMENTS.
def double_dot_path_segments?(c)
  DOUBLE_DOT_PATH_SEGMENTS.any? { |segment| segment == c }
end
|
379
|
+
|
380
|
+
# Removes the last collected path segment, if there is one.
#
# In a file URL a path consisting of a single normalized Windows drive
# letter is left alone.
#
# This works on @paths, the list of segments built by the path state. The
# previous version looked at @parse_result[:path], which is only filled in
# after parsing has finished, so ".." segments never removed anything.
def shorten_url_path
  return if @paths.empty?
  return true if @parse_result[:scheme] == "file" && @paths.length == 1 && normalized_windows_drive_letter?(@paths.first)

  @paths.pop
end
|
386
|
+
end
|
387
|
+
end
|
388
|
+
|
389
|
+
# Install a WhatwgParser instance as the default parser of the uri library:
# replace the URI::DEFAULT_PARSER constant (removing an existing definition
# first) and register the same instance through URI.parser=.
whatwg_parser = URI::WhatwgParser.new
URI.send(:remove_const, :DEFAULT_PARSER) if defined?(URI::DEFAULT_PARSER)
URI.const_set(:DEFAULT_PARSER, whatwg_parser)
URI.parser = whatwg_parser
|
metadata
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uri-whatwg_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yuji Yaginuma
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-05-04 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: strscan
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - ">="
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '0'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - ">="
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '0'
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: uri
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: uri-idna
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: debug
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
type: :development
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
68
|
+
email:
|
69
|
+
- yuuji.yaginuma@gmail.com
|
70
|
+
executables: []
|
71
|
+
extensions: []
|
72
|
+
extra_rdoc_files: []
|
73
|
+
files:
|
74
|
+
- LICENSE.txt
|
75
|
+
- README.md
|
76
|
+
- Rakefile
|
77
|
+
- lib/uri/whatwg_parser.rb
|
78
|
+
- lib/uri/whatwg_parser/error.rb
|
79
|
+
- lib/uri/whatwg_parser/host_parser.rb
|
80
|
+
- lib/uri/whatwg_parser/parser_helper.rb
|
81
|
+
- lib/uri/whatwg_parser/version.rb
|
82
|
+
homepage: https://github.com/y-yagi/uri-whatwg_parser
|
83
|
+
licenses:
|
84
|
+
- MIT
|
85
|
+
metadata:
|
86
|
+
homepage_uri: https://github.com/y-yagi/uri-whatwg_parser
|
87
|
+
rubygems_mfa_required: 'true'
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: 3.2.0
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubygems_version: 3.6.2
|
103
|
+
specification_version: 4
|
104
|
+
summary: Ruby implementation of the WHATWG URL Living Standard
|
105
|
+
test_files: []
|