uri_pattern 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +10 -0
- data/LICENSE.txt +21 -0
- data/README.md +109 -0
- data/Rakefile +11 -0
- data/lib/uri_pattern/canonicalization.rb +76 -0
- data/lib/uri_pattern/compiler.rb +380 -0
- data/lib/uri_pattern/component_pattern.rb +42 -0
- data/lib/uri_pattern/match_result.rb +25 -0
- data/lib/uri_pattern/pattern_string.rb +327 -0
- data/lib/uri_pattern/tokenizer.rb +170 -0
- data/lib/uri_pattern/url_parser.rb +487 -0
- data/lib/uri_pattern/version.rb +5 -0
- data/lib/uri_pattern.rb +378 -0
- metadata +68 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 6bb66ae7537c5bdaf93dfa53686ed10a17f773a41524312132e7b44912332a34
|
|
4
|
+
data.tar.gz: bbfccbdcc181b33bc8a1abdc13a21a1e87006c8c8df2a3c4b328d94ceba84910
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 67f62acc8dac39fe7528926aca10b230409d353fdc7ab5cb7130bd03b49fb17ccc48883aaa8652142610630234da77273d784955a7e110e88d7b6d991ca40bfb
|
|
7
|
+
data.tar.gz: b9b61835ebfcedc03f2832277507513d64b9cdbabf4876a5715839be11836fc2119219c18fd4850228cd34afa782d6d8fa8f3a16b3312039af7ff679243b8f7d
|
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
"uri_pattern" follows [The Ruby Community Conduct Guideline](https://www.ruby-lang.org/en/conduct) in all "collaborative space", which is defined as community communications channels (such as mailing lists, submitted patches, commit comments, etc.):
|
|
4
|
+
|
|
5
|
+
* Participants will be tolerant of opposing views.
|
|
6
|
+
* Participants must ensure that their language and actions are free of personal attacks and disparaging personal remarks.
|
|
7
|
+
* When interpreting the words and actions of others, participants should always assume good intentions.
|
|
8
|
+
* Behaviour which can be reasonably considered harassment will not be tolerated.
|
|
9
|
+
|
|
10
|
+
If you have any concerns about behaviour within this project, please contact us at [yuuji.yaginuma@gmail.com](mailto:yuuji.yaginuma@gmail.com).
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yuji Yaginuma
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# URIPattern
|
|
2
|
+
|
|
3
|
+
Ruby port of the [WHATWG URLPattern specification](https://urlpattern.spec.whatwg.org/).
|
|
4
|
+
|
|
5
|
+
It lets you match URLs against patterns that contain named groups, wildcards, optional segments, and custom regular expressions, and read the captured values back out — the same matching model the `URLPattern` API provides in browsers, adapted to Ruby conventions (snake_case, keyword arguments, and `nil` in place of `undefined`).
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Install the gem and add it to the application's Gemfile by executing:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
bundle add uri_pattern
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
gem install uri_pattern
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
### Constructing a pattern
|
|
24
|
+
|
|
25
|
+
A pattern can be built from a full URL pattern string, or from a hash of
|
|
26
|
+
per-component pattern strings. Any component you omit defaults to the wildcard
|
|
27
|
+
`*`.
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
require "uri_pattern"
|
|
31
|
+
|
|
32
|
+
# From a string
|
|
33
|
+
pattern = URIPattern.new("https://example.com/users/:id")
|
|
34
|
+
|
|
35
|
+
# From a hash of components
|
|
36
|
+
pattern = URIPattern.new({ hostname: "example.com", pathname: "/users/:id" })
|
|
37
|
+
|
|
38
|
+
# A relative pattern, resolved against a base URL
|
|
39
|
+
pattern = URIPattern.new("/users/:id", "https://example.com")
|
|
40
|
+
|
|
41
|
+
# Case-insensitive matching
|
|
42
|
+
pattern = URIPattern.new("https://example.com/Users/:id", ignore_case: true)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Testing for a match
|
|
46
|
+
|
|
47
|
+
`#match?` returns a boolean and is the fastest way to check a URL:
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
pattern = URIPattern.new("https://example.com/users/:id")
|
|
51
|
+
|
|
52
|
+
pattern.match?("https://example.com/users/42") # => true
|
|
53
|
+
pattern.match?("https://example.com/posts/42") # => false
|
|
54
|
+
pattern.match?("https://other.com/users/42") # => false
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Capturing values
|
|
58
|
+
|
|
59
|
+
`#match` returns a `URIPattern::MatchResult` on success, or `nil` when the URL
|
|
60
|
+
does not match. Each component exposes its matched `input` and named `groups`:
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
pattern = URIPattern.new("https://example.com/users/:id")
|
|
64
|
+
result = pattern.match("https://example.com/users/42")
|
|
65
|
+
|
|
66
|
+
result.pathname.input # => "/users/42"
|
|
67
|
+
result.pathname.groups # => { "id" => "42" }
|
|
68
|
+
result.hostname.input # => "example.com"
|
|
69
|
+
result.hostname.groups # => {}
|
|
70
|
+
|
|
71
|
+
pattern.match("https://other.com/users/42") # => nil
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Groups can be captured from any component, including the query string:
|
|
75
|
+
|
|
76
|
+
```ruby
|
|
77
|
+
pattern = URIPattern.new("https://example.com/search?q=:term")
|
|
78
|
+
result = pattern.match("https://example.com/search?q=ruby")
|
|
79
|
+
|
|
80
|
+
result.query.groups # => { "term" => "ruby" }
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Reading components back
|
|
84
|
+
|
|
85
|
+
Each component reader returns the pattern string for that component:
|
|
86
|
+
|
|
87
|
+
```ruby
|
|
88
|
+
pattern = URIPattern.new("https://*.example.com/books/:id?")
|
|
89
|
+
|
|
90
|
+
pattern.protocol # => "https"
|
|
91
|
+
pattern.hostname # => "*.example.com"
|
|
92
|
+
pattern.pathname # => "/books/:id?"
|
|
93
|
+
pattern.query # => "*"
|
|
94
|
+
pattern.fragment # => "*"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Development
|
|
98
|
+
|
|
99
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
100
|
+
|
|
101
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
102
|
+
|
|
103
|
+
## Contributing
|
|
104
|
+
|
|
105
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/y-yagi/uri_pattern.
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class URIPattern
  # Canonicalization of the fixed (literal) text of a single URL component.
  # Mixed into the regexp Compiler and the pattern-string generator so both
  # encode literal text identically. The including class must provide the
  # instance variables @component, @opaque_path and @ipv6.
  module Canonicalization
    # Canonicalizes one run of fixed text for the component named by @component.
    #
    # Hostname and port runs go through the dedicated helpers below; pathname,
    # query, fragment, username and password runs are delegated to the
    # URLParser canonicalizers; any other component (e.g. protocol) is
    # returned unchanged.
    def encode_run(run)
      case @component
      when :hostname then @ipv6 ? canonicalize_ipv6(run) : canonicalize_hostname(run)
      when :port     then canonicalize_port(run)
      when :pathname then URIPattern::URLParser.canonicalize_pathname_run(run, opaque_path: @opaque_path)
      when :query    then URIPattern::URLParser.canonicalize_search_run(run)
      when :fragment then URIPattern::URLParser.canonicalize_hash_run(run)
      when :username then URIPattern::URLParser.canonicalize_username_run(run)
      when :password then URIPattern::URLParser.canonicalize_password_run(run)
      else run
      end
    end

    # Port canonicalization: keep only the leading ASCII digits, reject a run
    # that has none or whose value exceeds 65535, and serialize the number
    # without leading zeros. An empty run is returned as-is.
    #
    # Raises URIPattern::Error for an invalid port.
    def canonicalize_port(run)
      return run if run.empty?

      leading_digits = run[/\A[0-9]*/]
      port = leading_digits.empty? ? nil : leading_digits.to_i
      raise URIPattern::Error, "Invalid port #{run.inspect}" if port.nil? || port > 65_535

      port.to_s
    end

    # Hostname canonicalization: drop tab/LF/CR, cut the value at the first
    # "/", "\\", "#" or "?", then hand what is left to the host parser.
    # An empty run is returned as-is; a run that becomes empty yields "".
    #
    # Raises URIPattern::Error (wrapping the underlying message) when the
    # host parser rejects the value.
    def canonicalize_hostname(run)
      return run if run.empty?

      stripped = run.delete("\t\n\r")
      host = stripped[/\A[^\/\\#?]*/]
      return "" if host.empty?

      URI::WhatwgParser::HostParser.new.parse(host)
    rescue => e
      raise URIPattern::Error, "Invalid hostname #{run.inspect}: #{e.message}"
    end

    # IPv6 hostname canonicalization: "[", "]" and ":" pass through, ASCII hex
    # digits are lowercased, and any other character raises URIPattern::Error.
    def canonicalize_ipv6(run)
      canonical_chars = run.each_char.map do |char|
        if "[]:".include?(char)
          char
        elsif char.match?(/[0-9a-fA-F]/)
          char.downcase
        else
          raise URIPattern::Error, "Invalid IPv6 hostname character #{char.inspect} in #{run.inspect}"
        end
      end
      canonical_chars.join
    end
  end
end
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class URIPattern
|
|
4
|
+
class Compiler
|
|
5
|
+
# Lazy "one segment" regexp source per component. segment_regexp looks the
# current component up here and falls back to DEFAULT_SEGMENT.
SEGMENT_REGEXPS = {
  pathname: "[^/]+?",
  hostname: "[^.]+?"
}.freeze
# Segment regexp source for every component without an entry above.
# NOTE(review): this excludes "#", "?", "{" and "}" from a named group's match;
# the URLPattern spec's segment wildcard for a component with no delimiter
# appears to accept any character — confirm the exclusion is intentional.
DEFAULT_SEGMENT = "[^#?{}]+?"

# Segment delimiter per component; components without an entry have none
# (delimiter_char returns "" for them).
DELIMITER_CHARS = {
  pathname: "/",
  hostname: "."
}.freeze

# Token types that carry literal text and are buffered (not turned into a
# capture). Shared by the top-level and in-group compile loops.
LITERAL_TOKEN_TYPES = %i[char escaped_char invalid_char].freeze
|
|
19
|
+
|
|
20
|
+
include URIPattern::Canonicalization
|
|
21
|
+
|
|
22
|
+
# Accumulate consecutive literal characters; flush_literals canonicalizes the
|
|
23
|
+
# whole run through the component's encode callback (which may raise) and
|
|
24
|
+
# appends the Regexp-escaped result. This mirrors the spec applying an
|
|
25
|
+
# encoding callback to each fixed-text part of a pattern.
|
|
26
|
+
def flush_literals(result, before_part: false)
  return if @literal_buf.empty?

  # Take the pending run and reset the buffer before encoding, so a raising
  # encode_run never leaves stale text behind.
  pending = @literal_buf
  @literal_buf = +""
  separator = delimiter_char

  # Directly before a part (name/group/wildcard) a trailing delimiter is that
  # part's prefix rather than fixed text: canonicalize the run without it and
  # re-append the delimiter verbatim so pull_delimiter_prefix can claim it.
  # A run that is nothing but the delimiter is encoded as a whole.
  detach_prefix = before_part &&
                  !separator.empty? &&
                  pending != separator &&
                  pending.end_with?(separator)

  if detach_prefix
    head = pending[0...-separator.length]
    result << Regexp.escape(encode_run(head)) << Regexp.escape(separator)
  else
    result << Regexp.escape(encode_run(pending))
  end
end
|
|
45
|
+
|
|
46
|
+
WILDCARD_PREFIX = "_w"
|
|
47
|
+
|
|
48
|
+
def initialize(tokens, component:, ignore_case: false, opaque_path: false, ipv6: false)
  # Inputs: the token stream plus the options that steer canonicalization
  # and regexp flags.
  @tokens, @component = tokens, component
  @ignore_case, @opaque_path, @ipv6 = ignore_case, opaque_path, ipv6

  # Mutable compile state.
  @literal_buf = +""        # pending literal text awaiting flush_literals
  @wildcard_index = 0       # next numeric name handed to an unnamed group
  @names_order = []         # group names in order of appearance
  @seen_names = {}          # names already registered (duplicate detection)
  @wildcard_name_map = {}   # internal capture name => external numeric name
end
|
|
60
|
+
|
|
61
|
+
def compile
  # Anchor the translated source so the component must match in full.
  anchored_source = "\\A#{translate_v_class_sets(build_regexp_string)}\\z"
  options = @ignore_case ? Regexp::IGNORECASE : 0

  compiled =
    begin
      Regexp.new(anchored_source, options)
    rescue RegexpError => error
      # Surface engine-level syntax errors as the library's own error type.
      raise URIPattern::Error, "Invalid pattern: #{error.message}"
    end

  { regexp: compiled, names: @names_order, wildcard_name_map: @wildcard_name_map }
end
|
|
71
|
+
|
|
72
|
+
private
|
|
73
|
+
|
|
74
|
+
# ECMAScript "v"-flag character classes support a "--" set-subtraction operator
|
|
75
|
+
# (e.g. "[[a-z]--a]" = "[a-z]" minus "a") that Ruby's regexp engine lacks. Ruby
|
|
76
|
+
# does support "&&" intersection, so rewrite "[A--B]" as "[A&&[^B]]".
|
|
77
|
+
V_CLASS_SUBTRACTION = /(\[[^\[\]]+\])--(\[[^\[\]]+\]|[^\]]+?)(?=\])/

def translate_v_class_sets(source)
  # Rewrite every "A--B" inside a character class as "A&&[^B]": subtracting B
  # is the same as intersecting with everything that is not B.
  source.gsub(V_CLASS_SUBTRACTION) do
    found = Regexp.last_match
    minuend = found[1]
    subtrahend = found[2]
    # A bracketed right-hand side contributes only its contents to "[^...]".
    subtrahend = subtrahend[1..-2] if subtrahend.start_with?("[")
    "#{minuend}&&[^#{subtrahend}]"
  end
end
|
|
86
|
+
|
|
87
|
+
def segment_regexp
  # Component-specific segment regexp, or the shared default when the
  # component has no dedicated entry.
  SEGMENT_REGEXPS.fetch(@component) { DEFAULT_SEGMENT }
end
|
|
90
|
+
|
|
91
|
+
def delimiter_char
  # An opaque pathname is not split into "/" segments, so it has no
  # delimiter; every other component uses its table entry, or "" when the
  # table has none.
  opaque_pathname = @component == :pathname && @opaque_path
  opaque_pathname ? "" : DELIMITER_CHARS.fetch(@component, "")
end
|
|
98
|
+
|
|
99
|
+
def pull_delimiter_prefix(result)
  # Detach a trailing (regexp-escaped) delimiter from +result+ and return it
  # so the caller can place it inside the next group as that group's prefix.
  # Returns "" and leaves +result+ alone when there is nothing to detach.
  separator = delimiter_char
  return "" if separator.empty?

  escaped_separator = Regexp.escape(separator)
  return "" unless result.end_with?(escaped_separator)

  result.slice!(-escaped_separator.length, escaped_separator.length)
  escaped_separator
end
|
|
110
|
+
|
|
111
|
+
# A duplicate group name is a spec-level error ("URLPattern" raises TypeError).
|
|
112
|
+
def register_name(name)
  # Each group name may appear only once per component pattern.
  raise URIPattern::Error, "Duplicate group name #{name.inspect} in pattern" if @seen_names[name]

  @seen_names[name] = true
  @names_order << name
end
|
|
119
|
+
|
|
120
|
+
def next_wildcard_name
  # Unnamed groups are exposed under their running index ("0", "1", ...), but
  # captured under a prefixed internal name that is mapped back afterwards.
  index = @wildcard_index
  @wildcard_index = index + 1

  public_name = index.to_s
  capture_name = "#{WILDCARD_PREFIX}#{index}"
  @wildcard_name_map[capture_name] = public_name
  @names_order << public_name
  capture_name
end
|
|
128
|
+
|
|
129
|
+
def modifier_regex(name, core, prefix, mod)
  case mod
  when "+", "*"
    # All repetitions land in ONE named capture. With both a delimiter and a
    # pulled prefix the repetitions are delimiter-separated
    # (core, then delimiter+core any number of times); otherwise the core is
    # simply repeated with the modifier itself.
    separator = delimiter_char
    repeated =
      if separator.empty? || prefix.empty?
        "(?:#{core})#{mod}"
      else
        "#{core}(?:#{Regexp.escape(separator)}#{core})*"
      end
    captured = "#{prefix}(?<#{name}>#{repeated})"
    # "+" requires the prefixed capture; "*" makes the whole thing optional.
    mod == "+" ? captured : "(?:#{captured})?"
  when "?"
    # Zero or one occurrence of prefix + core.
    "(?:#{prefix}(?<#{name}>#{core}))?"
  else
    # Any other modifier text is applied verbatim to the prefixed group.
    "(?:#{prefix}(?<#{name}>#{core}))#{mod}"
  end
end
|
|
155
|
+
|
|
156
|
+
def build_regexp_string
  # Walks the top-level token stream once and returns the (unanchored) regexp
  # source for this component. Literal text is buffered in @literal_buf and
  # only emitted by flush_literals; every other token appends a capture or
  # group directly. Raises URIPattern::Error for a modifier that does not
  # follow a name, regexp, wildcard or "{...}" group.
  result = +""
  i = 0

  while i < @tokens.length
    token = @tokens[i]

    # Literal tokens are only collected here; the whole run is canonicalized
    # and escaped later by flush_literals.
    if LITERAL_TOKEN_TYPES.include?(token.type)
      @literal_buf << token.value
      i += 1
      next
    end

    # A "{...}" group whose body is pure literal text and which carries no
    # modifier is a fixed-text part, not a group. Merge its text into the
    # literal run so adjacent literals canonicalize together (e.g. hostname
    # "example{.com/}foo" → the run "example.com/foo" → host-truncated at "/").
    if token.type == :open && (fixed = fixed_text_group(i))
      @literal_buf << fixed[:text]
      i = fixed[:next_index]
      next
    end

    # Emit the pending literal run. When the current token starts a part, a
    # trailing delimiter is kept separate so pull_delimiter_prefix can take it.
    flush_literals(result, before_part: %i[asterisk name open regexp].include?(token.type))

    case token.type
    when :end
      break

    when :asterisk
      # Bare "*": an unnamed wildcard capture, optionally followed by a modifier.
      internal_name = next_wildcard_name
      next_tok = @tokens[i + 1]
      if next_tok && next_tok.type == :other_modifier
        prefix = pull_delimiter_prefix(result)
        # The wildcard body itself is the greedy ".*". For "*" and "?" the
        # prefixed capture is wrapped in an optional group with a lazy "??"
        # quantifier (the "\?" escape below is just a literal "?"); for "+"
        # the prefixed capture is emitted as-is.
        case next_tok.value
        when "+"
          result << "#{prefix}(?<#{internal_name}>.*)"
        when "*", "?"
          result << "(?:#{prefix}(?<#{internal_name}>.*))?\?"
        else
          result << "#{prefix}(?<#{internal_name}>.*)#{next_tok.value}"
        end
        i += 2
      else
        result << "(?<#{internal_name}>.*)"
        i += 1
      end

    when :name
      # ":name", optionally followed by a custom "(regexp)" and/or a modifier.
      name = token.value
      register_name(name)
      next_tok = @tokens[i + 1]
      seg = segment_regexp
      if next_tok&.type == :other_modifier
        # ":name" + modifier → default segment regexp with the modifier applied.
        prefix = pull_delimiter_prefix(result)
        result << modifier_regex(name, seg, prefix, next_tok.value)
        i += 2
      elsif next_tok&.type == :regexp
        # ":name(regexp)" → the custom regexp replaces the default segment.
        inner = regexp_inner(next_tok)
        mod_tok = @tokens[i + 2]
        if mod_tok&.type == :other_modifier
          prefix = pull_delimiter_prefix(result)
          result << modifier_regex(name, inner, prefix, mod_tok.value)
          i += 3
        else
          result << "(?<#{name}>(?:#{inner}))"
          i += 2
        end
      else
        result << "(?<#{name}>#{seg})"
        i += 1
      end

    when :open
      # A real "{...}" group (fixed-text-only groups without a modifier were
      # merged into the literal run above). compile_group_inner returns the
      # group's regexp source and the index just past the closing "}".
      i += 1
      inner_result, i = compile_group_inner(i)
      mod_tok = @tokens[i]
      if mod_tok&.type == :other_modifier
        # NOTE(review): a delimiter directly before "{" is pulled inside the
        # modified group here (pathname "/a/{b}?" appears to compile to
        # "/a(?:/b)?"). The URLPattern pattern parser seems to derive an
        # implicit prefix only for name/regexp/wildcard tokens — confirm this
        # is intended for "{...}" groups.
        prefix = pull_delimiter_prefix(result)
        result << "(?:#{prefix}#{inner_result})#{mod_tok.value}"
        i += 1
      else
        result << "(?:#{inner_result})"
      end

    when :regexp
      # Bare "(regexp)": an unnamed capture exposed under a numeric name.
      inner = regexp_inner(token)
      internal_name = next_wildcard_name
      mod_tok = @tokens[i + 1]
      if mod_tok&.type == :other_modifier
        prefix = pull_delimiter_prefix(result)
        result << modifier_regex(internal_name, inner, prefix, mod_tok.value)
        i += 2
      else
        result << "(?<#{internal_name}>(?:#{inner}))"
        i += 1
      end

    when :other_modifier
      # A modifier here did not follow a group/name/regexp/wildcard.
      raise URIPattern::Error, "Dangling modifier #{token.value.inspect} in pattern"

    else
      # Any other token type is skipped.
      i += 1
    end
  end

  # Emit whatever literal text is still buffered when the loop ends.
  flush_literals(result)
  result
end
|
|
267
|
+
|
|
268
|
+
# If the "{" group starting at index i contains only literal characters and is
|
|
269
|
+
# not followed by a modifier, return its text and the index just past "}".
|
|
270
|
+
# Otherwise return nil (it is a real group with a capture/wildcard/modifier).
|
|
271
|
+
def fixed_text_group(i)
  # +i+ is the index of the "{" token. Scan forward collecting literal text:
  # the group qualifies only if nothing but literal tokens precede its "}"
  # and no modifier follows it. Returns { text:, next_index: } or nil.
  text = +""
  (i + 1...@tokens.length).each do |cursor|
    tok = @tokens[cursor]

    if tok.type == :close
      modified = @tokens[cursor + 1]&.type == :other_modifier
      return modified ? nil : { text: text, next_index: cursor + 1 }
    end

    # Anything that is not literal text makes this a real group.
    return nil unless %i[char escaped_char invalid_char].include?(tok.type)

    text << tok.value
  end

  # Ran out of tokens without finding "}".
  nil
end
|
|
289
|
+
|
|
290
|
+
def compile_group_inner(i)
  # Compiles the body of a "{...}" group. +i+ is the index of the first token
  # after "{". Returns [regexp_source, index_after_close]. Raises
  # URIPattern::Error for an unclosed group, a nested "{", or a modifier that
  # does not directly follow a name.
  result = +""
  while i < @tokens.length
    token = @tokens[i]

    # Literal text is buffered in the shared @literal_buf, exactly as at top level.
    if LITERAL_TOKEN_TYPES.include?(token.type)
      @literal_buf << token.value
      i += 1
      next
    end

    # Inside a group the run is flushed whole: no delimiter is split off as a
    # prefix (before_part keeps its default of false).
    flush_literals(result)

    case token.type
    when :close
      # End of the group: hand back the source and the index just past "}".
      i += 1
      return [result, i]
    when :end
      raise URIPattern::Error, "Unclosed '{' group in pattern"
    when :name
      name = token.value
      register_name(name)
      next_tok = @tokens[i + 1]
      seg = segment_regexp
      if next_tok&.type == :other_modifier
        # The modifier is appended verbatim to the capture; no prefix handling
        # or modifier_regex expansion happens inside a group.
        result << "(?<#{name}>#{seg})#{next_tok.value}"
        i += 2
      elsif next_tok&.type == :regexp
        inner = regexp_inner(next_tok)
        result << "(?<#{name}>(?:#{inner}))"
        i += 2
      else
        result << "(?<#{name}>#{seg})"
        i += 1
      end
    when :asterisk
      # Unnamed wildcard capture, exposed under the next numeric name.
      internal_name = next_wildcard_name
      result << "(?<#{internal_name}>.*)"
      i += 1
    when :regexp
      # Unnamed custom-regexp capture, exposed under the next numeric name.
      inner = regexp_inner(token)
      internal_name = next_wildcard_name
      result << "(?<#{internal_name}>(?:#{inner}))"
      i += 1
    when :open
      # A "{" group nested inside another "{" group is not allowed.
      raise URIPattern::Error, "Nested '{' group in pattern"
    when :other_modifier
      # Reached only when the modifier was not consumed by the :name branch.
      raise URIPattern::Error, "Dangling modifier #{token.value.inspect} in pattern"
    else
      # Any other token type is skipped.
      i += 1
    end
  end
  # Token stream ended without a closing "}".
  raise URIPattern::Error, "Unclosed '{' group in pattern"
end
|
|
345
|
+
|
|
346
|
+
# Prepare a :regexp token's raw inner source for embedding: validate its identity
|
|
347
|
+
# escapes (ECMAScript "u"-mode rules) and neutralize any author-written named
|
|
348
|
+
# captures so only URLPattern-level names surface. The tokenizer already enforced
|
|
349
|
+
# the structural rules (balance, no capturing sub-groups, ASCII, non-empty).
|
|
350
|
+
def regexp_inner(token)
  # Validate the raw source first (may raise), then return it with any
  # author-written named captures neutralized.
  checked = token.value.to_s.tap { |source| validate_regexp_escapes(source) }
  strip_named_captures(checked)
end
|
|
355
|
+
|
|
356
|
+
# Validate every "\X" identity escape in a regexp group's source.
|
|
357
|
+
def validate_regexp_escapes(inner)
  # Check the character following every backslash (newlines included).
  inner.scan(/\\(.)/m) { |(escaped)| validate_regexp_escape(escaped) }
end
|
|
360
|
+
|
|
361
|
+
# Named captures written inside a custom regexp group — "(?<x>...)" / "(?'x'...)"
|
|
362
|
+
# — must not surface in the match result's groups (only URLPattern-level names
|
|
363
|
+
# and wildcard indices do). Convert them to plain non-capturing groups, while
|
|
364
|
+
# leaving lookbehind assertions "(?<=...)" / "(?<!...)" untouched.
|
|
365
|
+
def strip_named_captures(inner)
  # First the "(?<name>" form — the negative lookahead keeps the lookbehind
  # assertions "(?<=" and "(?<!" intact — then the "(?'name'" form.
  without_angle_names = inner.gsub(/\(\?<(?![=!])[^>]*>/, "(?:")
  without_angle_names.gsub(/\(\?'[^']*'/, "(?:")
end
|
|
368
|
+
|
|
369
|
+
# The spec compiles each custom regexp group as a Unicode-mode ECMAScript
|
|
370
|
+
# regexp, where an identity escape ("\\x" for a literal x) is only valid for a
|
|
371
|
+
# SyntaxCharacter, "/", or a recognized escape class. Letters like "m" or "H"
|
|
372
|
+
# have no such escape and make the whole pattern invalid, even though Ruby
|
|
373
|
+
# would silently accept them.
|
|
374
|
+
VALID_REGEXP_ESCAPE = /\A[\^$\\.*+?()\[\]{}|\/dDsSwWbBfnrtvcxukpP0-9]\z/
def validate_regexp_escape(char)
  # +char+ is the single character that followed a backslash; anything not on
  # the allow-list above is rejected.
  raise URIPattern::Error, "Invalid regexp escape \\#{char}" unless char.match?(VALID_REGEXP_ESCAPE)
end
|
|
379
|
+
end
|
|
380
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class URIPattern
  # One compiled URL component pattern: the regexp used for matching plus the
  # canonicalized pattern string exposed through #pattern.
  class ComponentPattern
    # The canonicalized component pattern string (not the raw input).
    attr_reader :pattern

    # Tokenizes and compiles +pattern_string+ for +component+.
    #
    # pattern_string - the raw pattern text for this component
    # component      - the component symbol (e.g. :hostname, :pathname)
    # ignore_case    - compile the regexp case-insensitively
    # opaque_path    - forwarded to the compiler and pattern-string generator
    #
    # Errors raised by the tokenizer, compiler or generator propagate.
    def initialize(pattern_string, component:, ignore_case: false, opaque_path: false)
      tokens = Tokenizer.new(pattern_string, policy: :strict).tokenize
      ipv6 = component == :hostname && ipv6_hostname_pattern?(pattern_string)
      compiled = Compiler.new(tokens, component: component, ignore_case: ignore_case,
                                      opaque_path: opaque_path, ipv6: ipv6).compile
      @regexp = compiled[:regexp]
      @wildcard_name_map = compiled[:wildcard_name_map]
      # The getter exposes the canonicalized "component pattern string", not the
      # raw input (see PatternString).
      @pattern = PatternString.generate(pattern_string, component: component,
                                                        opaque_path: opaque_path, ipv6: ipv6)
    end

    # WHATWG "hostname pattern is an IPv6 address": true when the pattern is at
    # least two characters long and starts with "[", with "{[" (bracket wrapped
    # in a group) or with "\[" (escaped bracket). Such hostnames use the IPv6
    # encode callback (lowercase hex + char validation) instead of host parsing.
    def ipv6_hostname_pattern?(str)
      return false if str.length < 2

      first = str[0]
      return true if first == "["

      # "{[" opens a group around the address; "\[" is an escaped literal "[".
      str[1] == "[" && (first == "{" || first == "\\")
    end

    # Returns the MatchData for +string+ against the compiled regexp, or nil.
    def match(string)
      @regexp.match(string)
    end

    # Returns the named captures for +string+ as a Hash, with the internal
    # names of unnamed groups replaced by their external numeric names
    # ("0", "1", ...), or nil when +string+ does not match.
    def groups_for(string)
      md = @regexp.match(string)
      return nil unless md

      caps = md.named_captures
      @wildcard_name_map.each do |internal, external|
        caps[external] = caps.delete(internal) if caps.key?(internal)
      end
      caps
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class URIPattern
  # Outcome for a single component: the matched +input+ and its +groups+.
  ComponentResult = Struct.new(:input, :groups, keyword_init: true)

  # Plain value holder for the result of a successful match: the original
  # inputs plus one entry per URL component.
  class MatchResult
    # +inputs+ is the array of arguments passed to #match: [input] or
    # [input, base_url], mirroring URLPatternResult.inputs in the spec.
    attr_reader :inputs
    attr_reader :protocol, :username, :password, :hostname
    attr_reader :port, :pathname, :query, :fragment

    # Every keyword is required and stored as given.
    def initialize(inputs:, protocol:, username:, password:, hostname:,
                   port:, pathname:, query:, fragment:)
      @inputs = inputs
      @protocol, @username, @password, @hostname = protocol, username, password, hostname
      @port, @pathname, @query, @fragment = port, pathname, query, fragment
    end
  end
end
|