uri_pattern 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,378 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "uri_pattern/version"
4
+ require_relative "uri_pattern/tokenizer"
5
+ require_relative "uri_pattern/canonicalization"
6
+ require_relative "uri_pattern/compiler"
7
+ require_relative "uri_pattern/pattern_string"
8
+ require_relative "uri_pattern/component_pattern"
9
+ require_relative "uri_pattern/url_parser"
10
+ require_relative "uri_pattern/match_result"
11
+
12
+ class URIPattern
13
+ class Error < StandardError; end
14
+
15
# The eight URL components handled by a pattern. The order is significant:
# compile_components uses each key's index to decide which components may
# inherit a value from the base_url.
COMPONENT_KEYS = %i[
  protocol username password hostname
  port pathname query fragment
].freeze

# Pattern source used for a component that is neither specified nor inherited:
# the "*" wildcard for every component.
COMPONENT_DEFAULTS = COMPONENT_KEYS.to_h { |component| [component, "*"] }.freeze

# Scheme names tested against the protocol pattern when deciding whether the
# pathname is treated as opaque (see opaque_pathname_context?).
SPECIAL_SCHEMES = ["http", "https", "ws", "wss", "ftp", "file"].freeze

# Inheritance from a base_url differs by component group: the authority group
# takes the base_url value verbatim (it is already a literal string), while the
# path group takes it as an *escaped* pattern string.
ESCAPED_AUTHORITY = [:protocol, :hostname, :port].freeze
ESCAPED_PATH = [:pathname, :query, :fragment].freeze

# ignoreCase is honoured for these three components only (per the spec's create
# algorithm, which only mixes ignoreCaseOptions into pathname/search/hash).
IGNORE_CASE_COMPONENTS = [:pathname, :query, :fragment].freeze
37
+
38
# Builds the pattern from either a dictionary of components (Hash) or a
# pattern string. Any non-Hash input is converted with #to_s and treated as a
# pattern string.
#
# input       - Hash of component patterns, or an object whose #to_s is a pattern string.
# base_url    - optional base URL, forwarded unchanged to the init helper.
# ignore_case - forwarded to the init helper.
def initialize(input = {}, base_url = nil, ignore_case: false)
  case input
  when Hash then init_from_hash(input, base_url, ignore_case: ignore_case)
  else init_from_string(input.to_s, base_url, ignore_case: ignore_case)
  end
end
45
+
46
# Reports whether every component of +input+ matches its compiled pattern.
#
# Returns false when parse_input yields no components. A component missing
# from the parsed input is matched as the empty string. A RegexpError raised
# while matching is re-raised as URIPattern::Error with the same message.
def match?(input, base_url = nil)
  parsed = parse_input(input, base_url)
  if parsed
    COMPONENT_KEYS.all? do |name|
      candidate = parsed[name] || ""
      @patterns[name].match(candidate)
    end
  else
    false
  end
rescue RegexpError => error
  raise URIPattern::Error, error.message
end
56
+
57
# Matches +input+ against the pattern and returns the captured groups.
#
# Returns nil when parse_input yields no components or when any single
# component fails to match (groups_for returns a falsy value). Otherwise
# returns a URIPattern::MatchResult whose +inputs+ echoes the arguments
# ([input] or [input, base_url]) and which holds one
# URIPattern::ComponentResult (matched input + groups) per component.
#
# Note: unlike #match?, this method does not convert RegexpError.
def match(input, base_url = nil)
  parsed = parse_input(input, base_url)
  return nil unless parsed

  per_component = COMPONENT_KEYS.to_h do |name|
    text = parsed[name] || ""
    captured = @patterns[name].groups_for(text)
    # The first failing component aborts the whole match.
    return nil unless captured

    [name, URIPattern::ComponentResult.new(input: text, groups: captured)]
  end

  original_inputs = base_url.nil? ? [input] : [input, base_url]
  URIPattern::MatchResult.new(inputs: original_inputs, **per_component)
end
81
+
82
# Defines one public reader per component (protocol, username, ...), each
# returning the +pattern+ of the corresponding compiled component.
COMPONENT_KEYS.each do |component|
  define_method(component) do
    @patterns[component].pattern
  end
end
85
+
86
+ private
87
+
88
# Initializes the pattern from a pattern string, optionally relative to a base URL.
#
# Raises URIPattern::Error when a base_url is given but is not valid, or when no
# base_url is given and the pattern string does not determine a protocol.
def init_from_string(pattern_string, base_url, ignore_case:)
  # The base_url is validated before the pattern string is parsed.
  if base_url && !valid_base_url?(base_url)
    raise URIPattern::Error, "Invalid base_url: #{base_url.inspect}"
  end

  # The pattern is always parsed on its own, preserving pattern syntax such as
  # "{...}". Merging it into a URL string first would corrupt pattern-syntax
  # characters via percent-encoding.
  components = URIPattern::URLParser.split_pattern(pattern_string)

  # With a base_url, unspecified components fall back to it - the same
  # hierarchical fallback used for dictionary inputs.
  return build_patterns(components, ignore_case: ignore_case, base_url: base_url) if base_url

  # A relative URL pattern (one whose protocol is never determined - e.g. "/foo",
  # "example.com/foo", or "{https://}example.com" where the scheme is hidden
  # inside a group) is invalid without a base URL.
  raise URIPattern::Error, "Relative URL pattern requires a base URL" if components[:protocol].nil?

  build_patterns(components, ignore_case: ignore_case)
end
110
+
111
# Initializes the pattern from a dictionary of component patterns.
#
# The dictionary may carry its own base URL (key :base_url / "baseURL"); a
# separate base_url argument is rejected. Component values are converted with
# #to_s; components absent from the dictionary stay nil.
#
# Raises URIPattern::Error when base_url is passed alongside the dictionary or
# when the dictionary's base URL is not valid.
def init_from_hash(hash, base_url, ignore_case:)
  # A dictionary input must not be paired with a base_url argument.
  raise URIPattern::Error, "base_url cannot be provided when input is a dictionary" if base_url

  normalized = normalize_hash_keys(hash)
  dictionary_base = normalized[:base_url]
  if dictionary_base && !valid_base_url?(dictionary_base)
    raise URIPattern::Error, "Invalid base_url: #{dictionary_base.inspect}"
  end

  components = COMPONENT_KEYS.to_h { |name| [name, normalized[name]&.to_s] }
  # "process protocol for init": a protocol given via a dictionary may carry a
  # single trailing ":" (e.g. "http{s}?:"), which is removed before compiling.
  components[:protocol] = components[:protocol]&.delete_suffix(":")
  # "process search/hash for init": remove a single leading "?" / "#".
  components[:query] = components[:query]&.delete_prefix("?")
  components[:fragment] = components[:fragment]&.delete_prefix("#")

  build_patterns(components, ignore_case: ignore_case, base_url: dictionary_base)
end
138
+
139
# Returns a copy of +hash+ whose keys are symbols, with the WPT/WHATWG
# alternative names mapped onto the uri gem key names used internally:
# search -> query, hash -> fragment, baseURL -> base_url. Other keys are only
# symbolized.
def normalize_hash_keys(hash)
  aliases = { search: :query, hash: :fragment, baseURL: :base_url }
  hash.transform_keys do |raw_key|
    name = raw_key.to_sym
    aliases.fetch(name, name)
  end
end
151
+
152
# Validates and normalizes the component pattern strings in +parts+, compiles
# them, and stores the compiled components in @patterns.
#
# parts       - Hash of component name => pattern string (nil when unspecified).
# ignore_case - forwarded to compile_components.
# base_url    - optional base URL that unspecified components may inherit from.
def build_patterns(parts, ignore_case:, base_url: nil)
  inherited = base_url ? parse_base_url(base_url) : {}

  # The port is validated on the raw parts, before normalization.
  validate_port!(parts[:port])
  resolved = normalize_pattern_parts(parts, base_url)

  @patterns = compile_components(
    resolved,
    inherited,
    base_url: base_url,
    ignore_case: ignore_case,
    pathname_opaque: opaque_pathname_context?(resolved)
  )
end
162
+
163
# Raises URIPattern::Error when +port+ consists solely of digits and its value
# exceeds 65535. Anything else (nil, pattern syntax, in-range numbers) passes
# and the method returns nil.
def validate_port!(port)
  out_of_range = port && port.match?(/\A\d+\z/) && port.to_i > 65_535
  raise URIPattern::Error, "Invalid port: #{port.inspect}" if out_of_range
end
167
+
168
# Applies the pattern-level normalizations in order: first resolve a relative
# pathname against the base_url, then blank out a default port.
def normalize_pattern_parts(parts, base_url)
  suppress_default_port(resolve_pattern_pathname_part(parts, base_url))
end
173
+
174
# Resolves a relative pattern pathname against the base_url's path.
#
# Returns +parts+ itself when there is no base_url, no pathname, or the
# pathname is already absolute; otherwise returns a copy with :pathname replaced.
#
# Dot-segment collapsing of a pattern pathname is handled per fixed run by the
# component canonicalizer (URLParser.canonicalize_pathname_run), so it works
# even when pattern tokens are present. Only base_url-relative resolution
# remains here.
def resolve_pattern_pathname_part(parts, base_url)
  return parts unless base_url && parts[:pathname]
  return parts if absolute_pattern_pathname?(parts[:pathname])

  # Work on a copy so the caller's hash is left untouched.
  parts.dup.tap do |copy|
    copy[:pathname] = resolve_pattern_pathname(copy[:pathname], base_url)
  end
end
186
+
187
# Replaces the port with "" when it is the default port of the protocol.
#
# Suppression only happens when the protocol pattern is *exactly* a scheme name
# known to URLParser::DEFAULT_PORTS and the port equals that scheme's default.
# The lookup is an exact, case-sensitive string match (per the spec's create
# step / defaultPortForProtocol): a pattern such as "http{s}?" or "HTTPS" is not
# the concrete scheme "https" and therefore never triggers suppression.
#
# Returns +parts+ itself when nothing changes, otherwise a modified copy.
def suppress_default_port(parts)
  port, protocol = parts.values_at(:port, :protocol)
  return parts unless port && protocol

  default_port = URIPattern::URLParser::DEFAULT_PORTS[protocol]
  return parts if !default_port || default_port.to_s != port

  parts.dup.tap { |copy| copy[:port] = "" }
end
200
+
201
# Reports whether the pathname must be compiled as an opaque path.
#
# That is the case when the protocol is explicitly set, no authority component
# is present, and the compiled protocol pattern matches none of SPECIAL_SCHEMES.
def opaque_pathname_context?(parts)
  return false unless parts[:protocol] && authority_empty?(parts)

  protocol_matcher = URIPattern::ComponentPattern.new(parts[:protocol], component: :protocol)
  SPECIAL_SCHEMES.none? { |scheme| protocol_matcher.match(scheme) }
end
209
+
210
# True when none of hostname, username, password and port carries a non-empty
# value in +parts+ (nil and "" both count as absent).
def authority_empty?(parts)
  %i[hostname username password port].none? do |name|
    !parts[name].nil? && !parts[name].empty?
  end
end
213
+
214
# Compiles one URIPattern::ComponentPattern per component and returns them as a
# Hash keyed by component name (in COMPONENT_KEYS order).
#
# Hierarchical base_url fallback: components appearing *after* the last
# explicitly-specified component (in COMPONENT_KEYS order) do not inherit from
# the base - they are wildcarded. Only components at or before that boundary
# fall back to the base URL value. The decision itself is made by
# default_pattern; this method only locates the boundary.
def compile_components(parts, base_components, base_url:, ignore_case:, pathname_opaque:)
  # Index of the last explicitly specified component, or nil when there is none.
  boundary = COMPONENT_KEYS.rindex { |name| !parts[name].nil? }

  compiled = {}
  COMPONENT_KEYS.each_with_index do |name, position|
    source = parts[name] || default_pattern(name, position, base_components, base_url, boundary)
    compiled[name] = URIPattern::ComponentPattern.new(
      source,
      component: name,
      # ignore_case only reaches the components listed in IGNORE_CASE_COMPONENTS.
      ignore_case: ignore_case && IGNORE_CASE_COMPONENTS.include?(name),
      # Only the pathname can be compiled as an opaque path.
      opaque_path: name == :pathname && pathname_opaque
    )
  end
  compiled
end
228
+
229
# Pattern source for a component that was not explicitly specified.
#
# Resolution order:
# 1. With a base_url, a component positioned after the last specified one
#    (+idx+ > +last_specified+) is never inherited: it gets the wildcard.
# 2. Authority-group components (ESCAPED_AUTHORITY) present in the base take the
#    base value verbatim.
# 3. With a base_url, path-group components (ESCAPED_PATH) take the base value
#    escaped ("escape a pattern string"), so a base query like "q=*&v=?" is not
#    reinterpreted as pattern syntax; a missing base value becomes "".
# 4. Everything else gets the wildcard. In particular username/password are
#    never inherited from a base_url (spec "process a URLPatternInit" guards
#    them with "is not a pattern").
def default_pattern(key, idx, base_components, base_url, last_specified)
  wildcard = COMPONENT_DEFAULTS[key]
  return wildcard if base_url && last_specified && idx > last_specified

  inherited = base_components[key]
  return inherited if inherited && ESCAPED_AUTHORITY.include?(key)
  return escape_pattern_string(inherited || "") if base_url && ESCAPED_PATH.include?(key)

  wildcard
end
246
+
247
# WHATWG "is an absolute pathname" for a pattern: a leading "/", or (because this
# is a pattern, not a URL) an escaped "\\/" or a "{/" grouping that yields a
# leading slash. Such pathnames are NOT resolved against the base_url's path;
# every other pathname (including the empty string) is considered relative.
def absolute_pattern_pathname?(pathname)
  pathname.start_with?("/", "\\/", "{/")
end
260
+
261
# Resolves a relative pattern pathname against the base_url's path.
#
# This is pure string manipulation - the base path up to and including its last
# "/" is prepended and dot segments are collapsed - so pattern-syntax characters
# ("{", "}", ":", ...) are preserved instead of being percent-encoded by a URL
# parser. An empty base path, or one without any "/", counts as "/".
#
# Best effort: if anything raises, the pathname is simply rooted at "/".
def resolve_pattern_pathname(pathname, base_url)
  base_path = parse_base_url(base_url)[:pathname].to_s
  base_path = "/" if base_path.empty?

  # Keep everything through the last "/" of the base path.
  head, separator, = base_path.rpartition("/")
  directory = separator.empty? ? "/" : "#{head}#{separator}"

  remove_dot_segments("#{directory}#{pathname}")
rescue
  "/#{pathname}"
end
270
+
271
# RFC 3986 §5.2.4 "remove dot segments", applied to the raw string so that only
# whole "." / ".." path segments are collapsed and pattern syntax stays untouched.
#
# The input is consumed from the left: dot segments are dropped (".." also
# removes the most recently emitted segment), any other segment is moved to the
# result together with its leading "/".
def remove_dot_segments(path)
  pending = path.dup
  resolved = +""
  # Removes the last emitted segment together with its leading "/".
  drop_last = -> { resolved.sub!(%r{/?[^/]*\z}, "") }

  until pending.empty?
    case pending
    when %r{\A\.\.?/} # leading "../" or "./"
      pending = Regexp.last_match.post_match
    when %r{\A/\./} # "/./" collapses to "/"
      pending = "/#{Regexp.last_match.post_match}"
    when "/."
      pending = "/"
    when %r{\A/\.\./} # "/../" collapses to "/" and pops a segment
      pending = "/#{Regexp.last_match.post_match}"
      drop_last.call
    when "/.."
      pending = "/"
      drop_last.call
    when ".", ".."
      pending = ""
    else
      # Ordinary segment: move it (with its leading "/", if any) to the result.
      segment = pending[%r{\A/?[^/]*}]
      resolved << segment
      pending = pending[segment.length..]
    end
  end
  resolved
end
301
+
302
# A base_url must be a parseable absolute URL, i.e. it needs a scheme. nil, an
# empty string, a relative reference, or anything that makes the parser raise
# is not valid.
def valid_base_url?(base_url)
  return false if base_url.nil? || base_url.empty?

  fields = URI::WhatwgParser.new.split(base_url)
  scheme = fields[URIPattern::URLParser::WHATWG_SCHEME]
  !(scheme.nil? || scheme.empty?)
rescue StandardError
  false
end
312
+
313
# Code points that carry special meaning in pattern syntax.
PATTERN_ESCAPE_CHARS = "+*?:{}()\\"

# WHATWG "escape a pattern string": prefixes every code point listed in
# PATTERN_ESCAPE_CHARS with a backslash so the string matches literally.
def escape_pattern_string(str)
  str.each_char.flat_map do |char|
    PATTERN_ESCAPE_CHARS.include?(char) ? ["\\", char] : [char]
  end.join
end
319
+
320
# Splits +base_url+ into its components via URLParser.split_components.
# Returns an empty Hash when the parser raises URIPattern::Error.
def parse_base_url(base_url)
  components = URIPattern::URLParser.split_components(base_url)
rescue URIPattern::Error
  {}
else
  components
end
325
+
326
# Turns a match input into a Hash of component strings.
#
# Non-Hash inputs are converted with #to_s and parsed as a URL (relative to
# base_url when given); a URIPattern::Error from the parser yields nil.
# Hash inputs are delegated to parse_hash_input; pairing a Hash with a base_url
# is always an error and deliberately propagates instead of being silenced.
def parse_input(input, base_url)
  unless input.is_a?(Hash)
    # Only parser failures for string inputs are converted into "no match".
    return begin
      URIPattern::URLParser.split_components(input.to_s, base_url: base_url)
    rescue URIPattern::Error
      nil
    end
  end

  raise URIPattern::Error, "base_url must not be provided when input is a Hash" if base_url

  parse_hash_input(input)
end
339
+
340
# Converts a dictionary match input into normalized component strings.
#
# Keys are normalized first, the dictionary's own base URL (:base_url) is
# taken out, the raw components are assembled and finally handed to
# URLParser.normalize_hash_input. Returns nil when URIPattern::Error is raised
# along the way.
def parse_hash_input(input)
  members = normalize_hash_keys(input)
  dictionary_base = members.delete(:base_url)
  URIPattern::URLParser.normalize_hash_input(hash_input_components(members, dictionary_base))
rescue URIPattern::Error
  nil
end
348
+
349
# Builds the raw component strings (one per COMPONENT_KEYS entry) for a
# dictionary match input.
#
# Without a base URL every component is the given value converted with #to_s,
# or "" when absent. With a base URL, components whose key is absent from the
# dictionary are inherited from the base, and a given pathname that does not
# start with "/" is resolved against the base.
def hash_input_components(normalized, effective_base)
  unless effective_base
    return COMPONENT_KEYS.each_with_object({}) do |name, raw|
      raw[name] = normalized[name].nil? ? "" : normalized[name].to_s
    end
  end

  inherited = parse_base_url(effective_base)
  raw = COMPONENT_KEYS.each_with_object({}) do |name, acc|
    # A key that is present wins even when its value is nil (which becomes "").
    acc[name] = normalized.key?(name) ? normalized[name].to_s : (inherited[name] || "")
  end

  if normalized.key?(:pathname)
    given_path = normalized[:pathname].to_s
    raw[:pathname] = resolve_relative_pathname(given_path, effective_base) unless given_path.start_with?("/")
  end
  raw
end
366
+
367
# Resolves a relative pathname against the base_url using WHATWG relative-URL
# resolution (replace the base's last path segment, honour dot segments), the
# same algorithm node's URLPattern uses for a dictionary match input. A previous
# hand-rolled concatenation appended to the full base path and broke on a
# base_url carrying a query/fragment.
#
# An empty pathname is returned as is. Best effort: if resolution raises, the
# pathname is returned unchanged.
def resolve_relative_pathname(pathname, base_url)
  if pathname.empty?
    pathname
  else
    resolved = URIPattern::URLParser.split_components(pathname, base_url: base_url)
    resolved[:pathname]
  end
rescue StandardError
  pathname
end
378
+ end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uri_pattern
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Yuji Yaginuma
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: uri-whatwg_parser
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ email:
27
+ - yuuji.yaginuma@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - CODE_OF_CONDUCT.md
33
+ - LICENSE.txt
34
+ - README.md
35
+ - Rakefile
36
+ - lib/uri_pattern.rb
37
+ - lib/uri_pattern/canonicalization.rb
38
+ - lib/uri_pattern/compiler.rb
39
+ - lib/uri_pattern/component_pattern.rb
40
+ - lib/uri_pattern/match_result.rb
41
+ - lib/uri_pattern/pattern_string.rb
42
+ - lib/uri_pattern/tokenizer.rb
43
+ - lib/uri_pattern/url_parser.rb
44
+ - lib/uri_pattern/version.rb
45
+ homepage: https://github.com/y-yagi/uri_pattern
46
+ licenses:
47
+ - MIT
48
+ metadata:
49
+ homepage_uri: https://github.com/y-yagi/uri_pattern
50
+ source_code_uri: https://github.com/y-yagi/uri_pattern
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: 3.2.0
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubygems_version: 4.0.14
66
+ specification_version: 4
67
+ summary: Ruby implementation of the WHATWG URLPattern API
68
+ test_files: []