uri_pattern 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,487 @@
1
+ # frozen_string_literal: true
2
+
3
require "set"
require "uri"
require "uri/whatwg_parser"
5
+
6
+ class URIPattern
7
+ module URLParser
8
+ module_function
9
+
10
# Split a URL string into the eight match-input components.
#
# When +base_url+ is given and +url+ is non-empty, +url+ is first resolved
# against the base. The result is a hash with the keys :protocol, :username,
# :password, :hostname, :port, :pathname, :query and :fragment; every value is
# a String and absent components are "".
#
# Raises URIPattern::Error when resolving or parsing fails. An error that is
# already a URIPattern::Error is passed through unchanged.
def split_components(url, base_url: nil)
  needs_resolution = base_url && !url.empty?
  url = resolve(url, base_url) if needs_resolution

  parts = URI::WhatwgParser.new.split(url)

  # Userinfo is "user" or "user:password"; only the first ":" separates them.
  username, _separator, password = (parts[WHATWG_USERINFO] || "").partition(":")
  port = parts[WHATWG_PORT]

  {
    protocol: parts[WHATWG_SCHEME] || "",
    username: username,
    password: password,
    hostname: parts[WHATWG_HOST] || "",
    port: port ? port.to_s : "",
    pathname: parts[WHATWG_PATH] || parts[WHATWG_OPAQUE_PATH] || "",
    query: parts[WHATWG_QUERY] || "",
    fragment: parts[WHATWG_FRAGMENT] || ""
  }
rescue URIPattern::Error
  raise
rescue => e
  raise URIPattern::Error, "Failed to parse URL #{url.inspect}: #{e.message}"
end
30
+
31
# Resolve +relative+ against +base_url+ with the WHATWG parser and return the
# resulting URL as a String. Any failure is re-raised as URIPattern::Error.
def resolve(relative, base_url)
  parser = URI::WhatwgParser.new
  resolved = parser.parse(relative, base: base_url)
  resolved.to_s
rescue => e
  raise URIPattern::Error, "Failed to resolve URL: #{e.message}"
end
36
+
37
# Break a constructor string into its eight pattern components, following the
# WHATWG URLPattern "parse a constructor string" algorithm:
# https://urlpattern.spec.whatwg.org/#constructor-string-parsing
#
# The string is tokenized leniently and handed to ConstructorStringParser.
# The parser reports the last two components under the spec names :search and
# :hash; they are returned here as :query and :fragment. A component that the
# parser did not record stays nil so that defaults can be applied downstream.
def split_pattern(pattern)
  tokens = URIPattern::Tokenizer.new(pattern, policy: :lenient).tokenize
  parsed = ConstructorStringParser.new(pattern, tokens).parse

  # returned key => key used by the constructor string parser
  parser_keys = {
    protocol: :protocol,
    username: :username,
    password: :password,
    hostname: :hostname,
    port: :port,
    pathname: :pathname,
    query: :search,
    fragment: :hash
  }
  parser_keys.transform_values { |parser_key| parsed[parser_key] }
end
57
+
58
# Positions of the individual components inside the array returned by
# URI::WhatwgParser#split, which is laid out as
#   [scheme, userinfo, host, port, nil, path, opaque_path, query, fragment]
# Slot 4 is always nil in that layout, so it has no constant here.
WHATWG_SCHEME = 0
WHATWG_USERINFO = 1
WHATWG_HOST = 2
WHATWG_PORT = 3
WHATWG_PATH = 5
WHATWG_OPAQUE_PATH = 6
WHATWG_QUERY = 7
WHATWG_FRAGMENT = 8
68
+
69
# Default port of each special scheme that has one.
DEFAULT_PORTS = {
  "http" => 80,
  "https" => 443,
  "ws" => 80,
  "wss" => 443,
  "ftp" => 21
}.freeze

# Normalize a port string for use as a match input component.
#
# Tab and form-feed characters are removed, then the leading run of ASCII
# digits is read as a decimal integer; anything after the digits is ignored.
# The URL Standard treats a port as an integer, so leading zeros are dropped
# ("080" becomes "80") and the default-port check compares numbers rather
# than text ("0443" counts as the https default).
#
# port_str - the raw port value; converted with #to_s.
# protocol - scheme used to look up the default port (case-insensitive).
#
# Returns "" for an empty port or for the scheme's default port, the decimal
# port string otherwise, and nil when the value is non-empty but has no
# leading digits or the number exceeds 65535.
def normalize_port_input(port_str, protocol = "")
  port = port_str.to_s.delete("\t\f")
  digits = port[/\A\d*/]
  return nil if digits.empty? && !port.empty?
  return "" if digits.empty?

  number = digits.to_i
  return nil if number > 65_535

  number == DEFAULT_PORTS[protocol.to_s.downcase] ? "" : number.to_s
end
88
+
89
# Schemes that are treated as "special" (http, https, ws, wss, ftp, file).
SPECIAL_SCHEMES_SET = %w[http https ws wss ftp file].to_set.freeze
90
+
91
# Normalize a hostname used as match input.
#
# CR, LF and tab characters are removed first. The remainder is then parsed as
# the host of an "https://…/" URL, and the host reported by the parser is
# returned. This is best effort: if the parser yields no host, or anything in
# the method raises, the stripped hostname is returned instead.
# nil, "" and a hostname that is empty after stripping all yield "".
def normalize_hostname_input(hostname)
  return "" if hostname.nil? || hostname.empty?

  stripped = hostname.gsub(/[\r\n\t]/, "")
  if stripped.empty?
    ""
  else
    host = URI::WhatwgParser.new.split("https://#{stripped}/")[WHATWG_HOST]
    host || stripped
  end
rescue
  stripped
end
100
+
101
# Normalize every entry of a component hash used as match input.
#
# Each value is stringified and passed through the canonicalizer for its key;
# keys other than the eight known component symbols are only stringified.
# Returns a new hash with the same keys, or nil as soon as the :protocol or
# :port value fails normalization.
def normalize_hash_input(hash)
  protocol = hash[:protocol].to_s.downcase
  blank = ->(key) { hash[key].to_s.empty? }

  # The pathname is opaque when the scheme is present and non-special and none
  # of hostname/username/password/port carries a value.
  opaque_path = !protocol.empty? && !SPECIAL_SCHEMES_SET.include?(protocol) &&
                %i[hostname username password port].all?(&blank)

  hash.each_with_object({}) do |(key, value), normalized|
    text = value.to_s
    canonical =
      case key
      when :protocol then canonicalize_protocol_input(text)
      when :port     then normalize_port_input(text, protocol)
      when :pathname then canonicalize_pathname_run(text, opaque_path: opaque_path)
      when :hostname then normalize_hostname_input(text)
      when :username then canonicalize_username_run(text)
      when :password then canonicalize_password_run(text)
      when :query    then canonicalize_search_run(text)
      when :fragment then canonicalize_hash_run(text)
      else text
      end

    # Only protocol and port signal failure with nil; it aborts the whole input.
    return nil if canonical.nil? && %i[protocol port].include?(key)

    normalized[key] = canonical
  end
end
140
+
141
# "canonicalize a protocol" for a match input.
#
# A scheme must be ASCII, begin with a letter, and continue with letters,
# digits, "+", "-" or "." only. Returns "" for an empty value, the lower-cased
# scheme when it is well formed, and nil otherwise (e.g. "café"), in which
# case matching fails.
def canonicalize_protocol_input(value)
  if value.empty?
    ""
  elsif value.match?(/\A[a-zA-Z][a-zA-Z0-9+.\-]*\z/)
    value.downcase
  end
end
149
+
150
# --- "dummy URL" canonicalization of a fixed pattern run --------------------
#
# The WHATWG URLPattern spec canonicalizes each fixed-text part of a pattern by
# passing it through a throwaway ("dummy") URL, so that the URL parser itself
# applies the percent-encode set for the component and, for pathnames, the
# dot-segment handling. Delegating to the parser avoids hand-maintained
# encode-set tables.
#
# DUMMY_URL is the input the spec uses when it creates that dummy URL
# (https://urlpattern.spec.whatwg.org/ — "Let dummyInput be `https://dummy.invalid/`").
DUMMY_URL = "https://dummy.invalid/"

# Parse DUMMY_URL anew on every call, so each caller gets its own object to modify.
def dummy_url = URI::WhatwgParser.new.parse(DUMMY_URL)
165
+
166
# "canonicalize a search" / "...hash" / "...username" / "...password" all work
# the same way: assign the run to the matching component of a fresh dummy URL
# and read the component back. Per the original notes, the uri-whatwg_parser
# setters run the basic URL parser with the matching state override and apply
# the spec encode sets (special-query for search, userinfo for
# username/password, etc.).
#
# Canonicalize a search (query) run. Raises URIPattern::Error on failure.
def canonicalize_search_run(run)
  dummy_url.tap { |url| url.query = run }.query.to_s
rescue => e
  raise URIPattern::Error, "Invalid search #{run.inspect}: #{e.message}"
end
178
+
179
# Canonicalize a hash (fragment) run by assigning it to a fresh dummy URL's
# fragment and reading the fragment back as a String.
# Raises URIPattern::Error on failure.
def canonicalize_hash_run(run)
  dummy_url.tap { |url| url.fragment = run }.fragment.to_s
rescue => e
  raise URIPattern::Error, "Invalid hash #{run.inspect}: #{e.message}"
end
186
+
187
# Canonicalize a username run by assigning it to a fresh dummy URL's user and
# reading the user back as a String. Raises URIPattern::Error on failure.
def canonicalize_username_run(run)
  dummy_url.tap { |url| url.user = run }.user.to_s
rescue => e
  raise URIPattern::Error, "Invalid username #{run.inspect}: #{e.message}"
end
194
+
195
# Canonicalize a password run by assigning it to a fresh dummy URL's password
# and reading the password back as a String.
# Raises URIPattern::Error on failure.
def canonicalize_password_run(run)
  dummy_url.tap { |url| url.password = run }.password.to_s
rescue => e
  raise URIPattern::Error, "Invalid password #{run.inspect}: #{e.message}"
end
202
+
203
# "canonicalize a pathname" / "canonicalize an opaque pathname".
#
# The fixed text is pushed through a full URL parse, so that "#" and "?" end
# the path and dot segments collapse (matching the polyfill).
#
# * opaque_path: true  - the run is parsed as the body of a "data:" URL and the
#   opaque path (or, failing that, the path) is returned.
# * opaque_path: false - the run is appended to the dummy URL's origin. A run
#   that does not begin with "/" is temporarily prefixed with "/-" (the spec's
#   trick) so that a leading "../" is kept instead of being collapsed against
#   the root; the prefix is removed again from the result.
#
# An empty run is returned as is. Raises URIPattern::Error on failure.
def canonicalize_pathname_run(run, opaque_path: false)
  return run if run.empty?

  parser = URI::WhatwgParser.new
  if opaque_path
    parts = parser.split("data:#{run}")
    return (parts[WHATWG_OPAQUE_PATH] || parts[WHATWG_PATH]).to_s
  end

  rooted = run.start_with?("/")
  candidate = rooted ? run : "/-#{run}"

  # The candidate brings its own leading "/", so DUMMY_URL's trailing slash is
  # dropped before joining. Parsing the joined URL as a whole (rather than
  # resolving the run against DUMMY_URL as a base) keeps a leading "//" a path
  # instead of an authority, and lets "#"/"?" terminate the path.
  path = parser.split("#{DUMMY_URL.chomp("/")}#{candidate}")[WHATWG_PATH].to_s
  rooted ? path : path.sub(%r{\A/-}, "")
rescue => e
  raise URIPattern::Error, "Invalid pathname #{run.inspect}: #{e.message}"
end
227
+ end
228
+
229
# State machine for the WHATWG URLPattern "constructor string parser".
# https://urlpattern.spec.whatwg.org/#constructor-string-parsing
#
# The parser walks a token list (each token responds to #type, #value and
# #index, and the list ends with an :end token) and slices the original input
# into components whenever a delimiter is reached. Components are stored under
# the spec names, i.e. :search and :hash rather than :query and :fragment;
# URLParser.split_pattern translates them.
class ConstructorStringParser
  NON_SPECIAL_CHAR_TYPES = %i[char escaped_char invalid_char].freeze
  SEARCH_PREFIX_BLOCKERS = %i[name regexp close asterisk].freeze

  # input  - the constructor string the tokens were produced from.
  # tokens - the token list for +input+.
  def initialize(input, tokens)
    @input = input
    @tokens = tokens
    @result = {}
    @state = :init
    @protocol_special = false
    @token_index = 0
    @token_increment = 1
    @component_start = 0
    @group_depth = 0
    @ipv6_depth = 0
  end

  # Run the state machine over all tokens.
  #
  # Returns a hash that contains only the components found (or implied) in the
  # input. When a hostname was recorded without a port, :port is set to "".
  def parse
    while @token_index < @tokens.length
      @token_increment = 1

      if current.type == :end
        break if finish_at_end_token
      elsif group_open?
        @group_depth += 1
      elsif @group_depth.positive?
        # Inside a group only the closing token is of interest; it is also fed
        # to the state machine once the depth has been decremented.
        if group_close?
          @group_depth -= 1
          step_state
        end
      else
        step_state
      end

      @token_index += @token_increment
    end

    @result[:port] = "" if @result.key?(:hostname) && !@result.key?(:port)
    @result
  end

  private

  # React to the trailing :end token. Returns true once parsing is complete and
  # false when the parser rewound and has to walk the tokens again.
  def finish_at_end_token
    case @state
    when :init
      # No protocol was found: reinterpret the whole input as a relative
      # pattern starting at its first token.
      rewind
      if hash_prefix?
        change_state(:hash, 1)
      elsif search_prefix?
        change_state(:search, 1)
      else
        change_state(:pathname, 0)
      end
      false
    when :authority
      # No "@" was found, so the authority holds no credentials.
      rewind_and_set_state(:hostname)
      false
    else
      change_state(:done, 0)
      true
    end
  end

  # Feed the current token to the handler of the current state.
  # The :hash state has nothing left to detect and is not listed.
  def step_state
    case @state
    when :init      then rewind_and_set_state(:protocol) if protocol_suffix?
    when :protocol  then step_protocol
    when :authority then step_authority
    when :username  then step_username
    when :password  then change_state(:hostname, 1) if identity_terminator?
    when :hostname  then step_hostname
    when :port      then step_port_or_pathname
    when :pathname  then step_pathname
    when :search    then change_state(:hash, 1) if hash_prefix?
    end
  end

  # At the ":" ending the protocol, decide what follows: an authority when
  # "//" comes next or the protocol matches a special scheme, else a pathname.
  def step_protocol
    return unless protocol_suffix?

    compute_protocol_matches_special_scheme
    if next_is_authority_slashes?
      change_state(:authority, 3)
    elsif @protocol_special
      change_state(:authority, 1)
    else
      change_state(:pathname, 1)
    end
  end

  # Scan the authority: an "@" means credentials are present, whereas reaching
  # the path, search or hash first means there are none.
  def step_authority
    if identity_terminator?
      rewind_and_set_state(:username)
    elsif pathname_start? || search_prefix? || hash_prefix?
      rewind_and_set_state(:hostname)
    end
  end

  def step_username
    if password_prefix?
      change_state(:password, 1)
    elsif identity_terminator?
      change_state(:hostname, 1)
    end
  end

  # Track "[" / "]" nesting so that a ":" inside an IPv6 literal is not taken
  # for the port separator.
  def step_hostname
    if ipv6_open?
      @ipv6_depth += 1
    elsif ipv6_close?
      @ipv6_depth -= 1
    elsif port_prefix? && @ipv6_depth.zero?
      change_state(:port, 1)
    else
      step_port_or_pathname
    end
  end

  def step_port_or_pathname
    if pathname_start?
      change_state(:pathname, 0)
    else
      step_pathname
    end
  end

  def step_pathname
    if search_prefix?
      change_state(:search, 1)
    elsif hash_prefix?
      change_state(:hash, 1)
    end
  end

  def current
    @tokens[@token_index]
  end

  # "get a safe token": an index past the end yields the last token of the list.
  def safe_token(index)
    index < @tokens.length ? @tokens[index] : @tokens.last
  end

  # True when the token at +index+ is a plain character token equal to +value+.
  def non_special_pattern_char?(index, value)
    token = safe_token(index)
    token.value == value && NON_SPECIAL_CHAR_TYPES.include?(token.type)
  end

  def protocol_suffix? = non_special_pattern_char?(@token_index, ":")
  def identity_terminator? = non_special_pattern_char?(@token_index, "@")
  def password_prefix? = non_special_pattern_char?(@token_index, ":")
  def port_prefix? = non_special_pattern_char?(@token_index, ":")
  def pathname_start? = non_special_pattern_char?(@token_index, "/")
  def hash_prefix? = non_special_pattern_char?(@token_index, "#")
  def ipv6_open? = non_special_pattern_char?(@token_index, "[")
  def ipv6_close? = non_special_pattern_char?(@token_index, "]")
  def group_open? = current.type == :open
  def group_close? = current.type == :close

  # A "?" starts the search component when it is a plain character, or when it
  # is some other token with the value "?" that does not directly follow a
  # name, regexp, close or asterisk token.
  def search_prefix?
    return true if non_special_pattern_char?(@token_index, "?")
    return false if current.value != "?"

    @token_index < 1 || !SEARCH_PREFIX_BLOCKERS.include?(safe_token(@token_index - 1).type)
  end

  def next_is_authority_slashes?
    [1, 2].all? { |offset| non_special_pattern_char?(@token_index + offset, "/") }
  end

  # Store the component that just ended (unless the current state has none),
  # fill in defaults for components that were jumped over, and move on.
  def change_state(new_state, skip)
    @result[@state] = make_component_string unless %i[init authority done].include?(@state)
    apply_implicit_defaults(new_state) unless @state == :init || new_state == :done
    change_state_without_setting_component(new_state, skip)
  end

  # Switch to +new_state+, skip +skip+ tokens and mark the start of the new
  # component, without storing the current component or applying defaults.
  # Mirrors the spec/polyfill "change state without setting component" helper.
  def change_state_without_setting_component(new_state, skip)
    @state = new_state
    @token_index += skip
    @component_start = @token_index
    @token_increment = 0
  end

  # A transition may jump over components that never appear in the input. Per
  # the spec's constructor-string parser those slots still receive a default
  # (empty, or "/" for the pathname when the protocol is special). Which slots
  # are filled depends on the @state -> new_state pair; existing values win.
  def apply_implicit_defaults(new_state)
    before_hostname = %i[protocol authority username password]
    before_pathname = before_hostname + %i[hostname port]
    before_search = before_pathname + %i[pathname]

    if before_hostname.include?(@state) && %i[port pathname search hash].include?(new_state)
      default_component(:hostname, "")
    end
    if before_pathname.include?(@state) && %i[search hash].include?(new_state)
      default_component(:pathname, @protocol_special ? "/" : "")
    end
    default_component(:search, "") if before_search.include?(@state) && new_state == :hash
  end

  def default_component(name, value)
    @result[name] = value unless @result.key?(name)
  end

  # Go back to the first token of the current component; the main loop must not
  # advance afterwards, hence the zero increment.
  def rewind
    @token_index = @component_start
    @token_increment = 0
  end

  def rewind_and_set_state(new_state)
    rewind
    @state = new_state
  end

  # Slice of the input from the start of the current component up to, but not
  # including, the current token.
  def make_component_string
    first = safe_token(@component_start).index
    last = current.index
    @input[first...last]
  end

  # Compile the protocol component and remember whether it matches any special
  # scheme. A protocol that fails to compile counts as non-special.
  def compute_protocol_matches_special_scheme
    pattern = URIPattern::ComponentPattern.new(make_component_string, component: :protocol)
    @protocol_special = URLParser::SPECIAL_SCHEMES_SET.any? { |scheme| pattern.match(scheme) }
  rescue URIPattern::Error
    @protocol_special = false
  end
end
487
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class URIPattern
  # Version string of this release.
  VERSION = "0.1.0"
end