uri_pattern 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,327 @@
1
+ # frozen_string_literal: true
2
+
3
+ class URIPattern
4
+ # Generates the WHATWG "component pattern string" returned by the component
5
+ # getters (protocol, hostname, pathname, ...). It parses the raw component
6
+ # pattern into a part list — applying the same per-component canonicalization
7
+ # used for matching — and re-serialises it ("generate a pattern string"), so
8
+ # wildcards become "*", hostnames are punycoded, fixed text is percent-encoded,
9
+ # redundant "{}" groups are dropped, and so on.
10
+ #
11
+ # This is a port of the path-to-regexp-derived parse()/partsToPattern() used by
12
+ # the reference URLPattern implementation.
13
+ class PatternString
14
+ include URIPattern::Canonicalization
15
+
16
# Regexp source that denotes a full wildcard ("*"). A part whose regexp source
# equals this string is classified as :full_wildcard.
FULL_WILDCARD_REGEXP = ".*"

# Identifier continuation code points. The reference uses
# /[$_\u200C\u200D\p{ID_Continue}]/u. "_" is always part of \p{ID_Continue},
# so only "$" has to be added inside the class. ZWNJ (U+200C) and ZWJ (U+200D)
# became ID_Continue only with Unicode 15.1, so a Ruby built against older
# Unicode data does not match them through \p{ID_Continue}. They are listed in a
# separate alternative (not inside the same class) so every Ruby version
# behaves the same and newer ones emit no duplicate-range warning.
IDENTIFIER_PART = /[$\p{ID_Continue}]|[\u200C\u200D]/u
22
+
23
# One entry of the parsed part list: its type, name, prefix, value (regexp
# source or encoded fixed text), suffix and modifier.
Part = Struct.new(:type, :name, :prefix, :value, :suffix, :modifier) do
  # True only when the name is a non-empty String. Parts without a name token
  # get an Integer key or "" and therefore report false.
  def custom_name?
    return false unless name.is_a?(String)

    !name.empty?
  end
end
28
+
29
# Convenience entry point: builds a generator for the given component pattern
# and returns the result of its #generate call.
#
# pattern_string - raw component pattern.
# component:     - component identifier, forwarded to the constructor.
# opaque_path:   - forwarded to the constructor (default false).
# ipv6:          - forwarded to the constructor (default false).
def self.generate(pattern_string, component:, opaque_path: false, ipv6: false)
  generator = new(pattern_string, component: component, opaque_path: opaque_path, ipv6: ipv6)
  generator.generate
end
32
+
33
# Stores the raw pattern and the per-component parsing options.
#
# pattern_string - raw component pattern to parse.
# component:     - component identifier; selects delimiter/prefix options.
# opaque_path:   - passed to options_for together with the component.
# ipv6:          - stored as @ipv6; not read by the methods visible in this
#                  class (presumably used by the canonicalization mixin —
#                  TODO confirm).
def initialize(pattern_string, component:, opaque_path: false, ipv6: false)
  @input = pattern_string
  @component = component
  @opaque_path = opaque_path
  @ipv6 = ipv6

  @delimiter, @prefixes = options_for(component, opaque_path)

  # Regexp source matching one segment: any run of non-delimiter characters.
  escaped_delimiter = escape_regexp_string(@delimiter)
  @segment_wildcard_regexp = "[^#{escaped_delimiter}]+?"
end
41
+
42
# Parses the stored pattern into a part list and serialises that list back
# into a pattern string.
def generate
  parts = parse
  parts_to_pattern(parts)
end
45
+
46
+ private
47
+
48
# Returns the [delimiter, prefixes] pair for a component, matching the
# reference DEFAULT_OPTIONS / HOSTNAME_OPTIONS / PATHNAME_OPTIONS:
#   :hostname                 -> [".", ""]
#   :pathname (not opaque)    -> ["/", "/"]
#   anything else             -> ["", ""]
def options_for(component, opaque_path)
  return [".", ""] if component == :hostname
  return ["/", "/"] if component == :pathname && !opaque_path

  ["", ""]
end
57
+
58
# Encodes a run of fixed text (a fixed part's value, or a prefix/suffix) by
# delegating to encode_run.
# NOTE(review): encode_run is not defined in this class; presumably it comes
# from the included URIPattern::Canonicalization — confirm.
def encode_part(value)
  encode_run(value)
end
61
+
62
# --- parse: token list -> part list ------------------------------------

# Tokenizes @input (strict policy) and turns the token stream into a list of
# Part structs, which is returned.
#
# Parser state is reset on every call: @index (cursor into @tokens), @key
# (next numeric name for unnamed groups), @name_set (names seen so far),
# @pending (buffered fixed text) and @parts (the result).
#
# Raises URIPattern::Error when a required token is missing (via
# must_consume), on a duplicate name (via add_part), or when the strict
# tokenizer rejects the input.
def parse
  tokens = adapt_tokens(Tokenizer.new(@input, policy: :strict).tokenize)
  @tokens = tokens
  @index = 0
  @key = 0
  @name_set = {}
  @pending = +""
  @parts = []

  while @index < @tokens.length
    # Ungrouped form: an optional single CHAR followed by a name and/or a
    # regexp; a bare "*" is only tried when neither of those was found.
    char_token = try_consume(:CHAR)
    name_token = try_consume(:NAME)
    regexp_or_wildcard = try_consume(:REGEX)
    if !name_token && !regexp_or_wildcard
      regexp_or_wildcard = try_consume(:ASTERISK)
    end

    if name_token || regexp_or_wildcard
      prefix = char_token || ""
      # The char only counts as this part's prefix when it is one of the
      # component's prefix characters; otherwise it is ordinary fixed text.
      # The explicit empty check is needed because String#include?("") is
      # always true.
      unless @prefixes.include?(prefix) && !prefix.empty?
        @pending << prefix
        prefix = ""
      end
      maybe_add_pending_fixed
      modifier_token = try_consume_modifier
      add_part(prefix, name_token, regexp_or_wildcard, "", modifier_token)
      next
    end

    # Plain text: the CHAR consumed above, or an escaped character.
    value = char_token || try_consume(:ESCAPED_CHAR)
    if value
      @pending << value
      next
    end

    # Grouped form: "{" text [name] [regexp | "*"] text "}" [modifier].
    open_token = try_consume(:OPEN)
    if open_token
      prefix = consume_text
      name_token = try_consume(:NAME)
      regexp_or_wildcard = try_consume(:REGEX)
      if !name_token && !regexp_or_wildcard
        regexp_or_wildcard = try_consume(:ASTERISK)
      end
      suffix = consume_text
      must_consume(:CLOSE)
      modifier_token = try_consume_modifier
      add_part(prefix, name_token, regexp_or_wildcard, suffix, modifier_token)
      next
    end

    # Nothing else matched: the only acceptable token left is END. Any other
    # token (e.g. a stray "}" or a dangling modifier) makes must_consume raise.
    maybe_add_pending_fixed
    must_consume(:END)
  end

  @parts
end
120
+
121
# Consumes the token under the cursor when it has the requested type.
# Returns the token's value and advances @index, or returns nil (cursor
# untouched) when the stream is exhausted or the type differs.
def try_consume(type)
  return nil if @index >= @tokens.length

  token = @tokens[@index]
  return nil unless token.type == type

  @index += 1
  token.value
end
127
+
128
# Consumes a modifier token if one is next: an OTHER_MODIFIER is tried first,
# then an ASTERISK. Returns the token value, or nil when neither is present.
def try_consume_modifier
  modifier = try_consume(:OTHER_MODIFIER)
  return modifier if modifier

  try_consume(:ASTERISK)
end
131
+
132
# Like try_consume, but the token is mandatory.
# Returns the consumed value (which may be an empty string); raises
# URIPattern::Error when the next token is not of the given type.
def must_consume(type)
  consumed = try_consume(type)
  raise URIPattern::Error, "Unexpected token, expected #{type}" if consumed.nil?

  consumed
end
137
+
138
# Consumes a maximal run of CHAR / ESCAPED_CHAR tokens and returns their
# values concatenated into a new (mutable) string; "" when none follow.
def consume_text
  text = +""
  loop do
    piece = try_consume(:CHAR) || try_consume(:ESCAPED_CHAR)
    break unless piece

    text << piece
  end
  text
end
145
+
146
# Flushes the buffered fixed text: when @pending is non-empty it is encoded,
# appended to @parts as a :fixed part without modifier, and the buffer is
# replaced by a fresh empty string. Does nothing for an empty buffer.
def maybe_add_pending_fixed
  return if @pending.empty?

  encoded = encode_part(@pending)
  @parts << Part.new(:fixed, "", "", encoded, "", :none)
  @pending = +""
end
151
+
152
# Modifier token text -> modifier symbol; any other token means :none.
MODIFIER_MAP = { "?" => :optional, "*" => :zero_or_more, "+" => :one_or_more }.freeze

# Appends the part described by the consumed tokens to @parts.
#
# prefix             - text before the name/regexp (String).
# name_token         - name text, or nil.
# regexp_or_wildcard - regexp source, "*", or nil.
# suffix             - text after the name/regexp (String).
# modifier_token     - "?", "*", "+", or nil.
#
# Raises URIPattern::Error when the part's name was already used.
def add_part(prefix, name_token, regexp_or_wildcard, suffix, modifier_token)
  modifier = MODIFIER_MAP.fetch(modifier_token, :none)
  text_only = !name_token && !regexp_or_wildcard

  # A "{ ... }" group of only fixed text with no modifier: buffer it.
  if text_only && modifier == :none
    @pending << prefix
    return
  end

  maybe_add_pending_fixed

  # Fixed-string grouping such as "{foo}?": the text is the prefix.
  if text_only
    @parts << Part.new(:fixed, "", "", encode_part(prefix), "", modifier) unless prefix.empty?
    return
  end

  # Resolve the regexp source: no regexp token means "one segment".
  source =
    if !regexp_or_wildcard
      @segment_wildcard_regexp
    elsif regexp_or_wildcard == "*"
      FULL_WILDCARD_REGEXP
    else
      regexp_or_wildcard
    end

  # The two well-known sources get their own type and carry no value.
  type, regexp_value =
    if source == @segment_wildcard_regexp
      [:segment_wildcard, ""]
    elsif source == FULL_WILDCARD_REGEXP
      [:full_wildcard, ""]
    else
      [:regexp, source]
    end

  # Unnamed groups are numbered in order of appearance. A regexp/wildcard
  # token is guaranteed here because the text-only case returned above.
  name = name_token
  unless name
    name = @key
    @key += 1
  end

  raise URIPattern::Error, "Duplicate name #{name.inspect}" if @name_set.key?(name)

  @name_set[name] = true

  @parts << Part.new(type, name, encode_part(prefix), regexp_value, encode_part(suffix), modifier)
end
208
+
209
# --- generate: part list -> pattern string -----------------------------

# Serialises a list of Part structs into a pattern string and returns it.
#
# Fixed parts are emitted as escaped text (wrapped in "{...}" plus the
# modifier when they carry one). Every other part is emitted as
# [{] prefix [:name] [regexp | "*"] suffix [}] [modifier], where the braces
# are added only when needs_grouping ends up true.
def parts_to_pattern(parts)
  result = +""
  parts.each_with_index do |part, i|
    if part.type == :fixed
      if part.modifier == :none
        result << escape_pattern_string(part.value)
      else
        result << "{#{escape_pattern_string(part.value)}}#{modifier_to_string(part.modifier)}"
      end
      next
    end

    custom_name = part.custom_name?

    # Grouping is needed for any suffix, and for any prefix that is not
    # exactly one of the component's prefix characters.
    needs_grouping =
      !part.suffix.empty? ||
      (!part.prefix.empty? && (part.prefix.length != 1 || !@prefixes.include?(part.prefix)))

    last_part = i > 0 ? parts[i - 1] : nil
    next_part = i < parts.length - 1 ? parts[i + 1] : nil

    # A bare ":name" directly followed by fixed text that starts with an
    # identifier character, or by an unnamed part, is grouped so that what
    # follows is kept apart from the name.
    if !needs_grouping && custom_name &&
       part.type == :segment_wildcard && part.modifier == :none &&
       next_part && next_part.prefix.empty? && next_part.suffix.empty?
      if next_part.type == :fixed
        code = next_part.value.empty? ? "" : next_part.value[0]
        needs_grouping = IDENTIFIER_PART.match?(code)
      else
        needs_grouping = !next_part.custom_name?
      end
    end

    # A prefix-less part right after fixed text that ends in a prefix
    # character is grouped, so that character is not taken as its prefix.
    if !needs_grouping && part.prefix.empty? && last_part && last_part.type == :fixed
      code = last_part.value[-1]
      needs_grouping = !code.nil? && @prefixes.include?(code)
    end

    result << "{" if needs_grouping
    result << escape_pattern_string(part.prefix)
    result << ":#{part.name}" if custom_name

    case part.type
    when :regexp
      result << "(#{part.value})"
    when :segment_wildcard
      # A named segment wildcard is fully described by ":name".
      result << "(#{@segment_wildcard_regexp})" unless custom_name
    when :full_wildcard
      # The short "*" form is used only for unnamed wildcards in the
      # positions listed below; every other case spells out "(.*)".
      if !custom_name && (last_part.nil? ||
                          last_part.type == :fixed ||
                          last_part.modifier != :none ||
                          needs_grouping ||
                          !part.prefix.empty?)
        result << "*"
      else
        result << "(#{FULL_WILDCARD_REGEXP})"
      end
    end

    # Insert a backslash between ":name" and a suffix that begins with an
    # identifier character.
    if part.type == :segment_wildcard && custom_name && !part.suffix.empty? &&
       IDENTIFIER_PART.match?(part.suffix[0])
      result << "\\"
    end

    result << escape_pattern_string(part.suffix)
    result << "}" if needs_grouping
    result << modifier_to_string(part.modifier) if part.modifier != :none
  end
  result
end
280
+
281
# Maps a modifier symbol to its pattern-syntax character:
# :zero_or_more -> "*", :optional -> "?", :one_or_more -> "+".
# Any other value (including :none) yields "".
def modifier_to_string(modifier)
  symbols = { zero_or_more: "*", optional: "?", one_or_more: "+" }
  symbols.fetch(modifier, "")
end
289
+
290
# Returns a copy of value in which every pattern-syntax character
# (+ * ? : { } ( ) and backslash) is preceded by a backslash.
def escape_pattern_string(value)
  value.gsub(/([+*?:{}()\\])/) { "\\#{Regexp.last_match(1)}" }
end
293
+
294
# Returns a copy of value in which every regexp metacharacter
# (. + * ? ^ $ { } ( ) [ ] | / and backslash) is preceded by a backslash.
def escape_regexp_string(value)
  value.gsub(%r{([.+*?^${}()\[\]|/\\])}) { "\\#{Regexp.last_match(1)}" }
end
297
+
298
+ # --- token adaptation --------------------------------------------------
299
+
300
# Token shape consumed by the parser: an upper-case type symbol and a value.
AdaptedToken = Struct.new(:type, :value)

# Converts Tokenizer output into the flat token stream the parser expects.
# A "(...)" group already arrives as a single :regexp token carrying the raw
# regexp source, so it maps straight to :REGEX. :invalid_char is treated as
# :CHAR; structural tokens get their canonical text; tokens of any other type
# are dropped. The result always ends with exactly one trailing :END token.
def adapt_tokens(tokens)
  adapted = tokens.each_with_object([]) do |token, stream|
    case token.type
    when :regexp
      stream << AdaptedToken.new(:REGEX, token.value)
    when :char, :invalid_char
      stream << AdaptedToken.new(:CHAR, token.value)
    when :escaped_char
      stream << AdaptedToken.new(:ESCAPED_CHAR, token.value)
    when :name
      stream << AdaptedToken.new(:NAME, token.value)
    when :asterisk
      stream << AdaptedToken.new(:ASTERISK, "*")
    when :open
      stream << AdaptedToken.new(:OPEN, "{")
    when :close
      stream << AdaptedToken.new(:CLOSE, "}")
    when :other_modifier
      stream << AdaptedToken.new(:OTHER_MODIFIER, token.value)
    when :end
      stream << AdaptedToken.new(:END, "")
    end
  end

  adapted << AdaptedToken.new(:END, "") unless adapted.last&.type == :END
  adapted
end
326
+ end
327
+ end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ class URIPattern
4
# Splits a pattern string into a flat list of Token structs.
#
# Token types: :char, :escaped_char, :invalid_char, :name, :regexp,
# :asterisk, :other_modifier, :open, :close and one final :end.
#
# With policy :strict malformed input raises URIPattern::Error; with any other
# policy (default :lenient) the offending character is recorded as an
# :invalid_char token and scanning continues after it.
class Tokenizer
  # type  - token type symbol; value - token text; index - position in the
  # pattern at which the token starts.
  Token = Struct.new(:type, :value, :index, keyword_init: true)

  # A ":name" identifier follows the spec's "regexIdentifierStart" /
  # "regexIdentifierPart" (path-to-regex-modified):
  #   start = /[$_\p{ID_Start}]/u, part = /[$_\u200C\u200D\p{ID_Continue}]/u
  # "_" is in \p{ID_Continue} but not in \p{ID_Start}, and "$" is in neither,
  # so the start class adds "$" and "_" and the part class adds "$". ZWNJ
  # (U+200C) and ZWJ (U+200D) joined ID_Continue only in Unicode 15.1, so they
  # are matched through a separate alternative: that behaves identically on
  # Rubies with older Unicode data and avoids a duplicate-range warning on newer
  # ones. Matching the spec here (rather than a permissive
  # "[\u{80}-\u{10FFFF}]") makes e.g. ":$foo" a name and rejects a name starting
  # with a non-ID_Start code point, as the reference does.
  IDENTIFIER_RE = /\A[$_\p{ID_Start}](?:[$\p{ID_Continue}]|[\u200C\u200D])*/u

  # pattern - the pattern string to tokenize.
  # policy: - :strict to raise on malformed input; anything else is lenient.
  def initialize(pattern, policy: :lenient)
    @pattern = pattern
    @policy = policy
    @index = 0
    @tokens = []
  end

  # Tokenizes the whole pattern and returns the token list, always terminated
  # by an :end token.
  #
  # The scan state is reset first, so calling this again on the same instance
  # yields an equal, freshly built list instead of appending to (and returning)
  # the list handed out by the previous call.
  #
  # Raises URIPattern::Error on malformed input when the policy is :strict.
  def tokenize
    @index = 0
    @tokens = []

    while @index < @pattern.length
      ch = @pattern[@index]

      case ch
      when "\\"
        if @index + 1 < @pattern.length
          emit(:escaped_char, @pattern[@index + 1])
          @index += 2
        else
          handle_invalid("trailing backslash")
        end
      when "{"
        emit(:open, ch)
        @index += 1
      when "}"
        emit(:close, ch)
        @index += 1
      when "("
        # Lex the whole "(...)" group atomically into one :regexp token, as the
        # spec's tokenizer does (validating it during the scan).
        scan_regexp_group
      when ")"
        # A ")" not consumed by a group scan is a literal character (the spec's
        # tokenizer falls through to a CHAR token here).
        emit(:char, ch)
        @index += 1
      when "*"
        # Directly after a group, regexp, name or wildcard a "*" is a
        # modifier; anywhere else it is a wildcard.
        prev = @tokens.last
        if prev && %i[close regexp name asterisk].include?(prev.type)
          emit(:other_modifier, ch)
        else
          emit(:asterisk, ch)
        end
        @index += 1
      when "?", "+"
        # "?"/"+" are always modifier tokens. A modifier that does not follow a
        # group/name/regexp/wildcard is a dangling modifier; the compiler rejects
        # it. (A literal "?"/"+" must be escaped, e.g. "\\?".)
        emit(:other_modifier, ch)
        @index += 1
      when ":"
        rest = @pattern[(@index + 1)..]
        if (m = IDENTIFIER_RE.match(rest))
          emit(:name, m[0])
          @index += 1 + m[0].length
        else
          # ":" must be followed by a valid name. When it is not, the spec's
          # tokenizer reports "missing parameter name": strict tokenizing (used
          # when compiling a component) raises, while lenient tokenizing
          # (constructor string parsing) emits an :invalid_char so the ":" is
          # still recognized as a protocol/password/port delimiter by the
          # constructor string parser (which treats :invalid_char as a
          # non-special char, like :char).
          handle_invalid("missing parameter name")
        end
      else
        emit(:char, ch)
        @index += 1
      end
    end
    emit(:end, "")
    @tokens
  end

  private

  # Appends a token of the given type/value positioned at the current index.
  def emit(type, value)
    @tokens << Token.new(type: type, value: value, index: @index)
  end

  # Reports a malformed character at the current index: raises under the
  # :strict policy, otherwise emits it as :invalid_char and steps past it.
  def handle_invalid(reason)
    if @policy == :strict
      raise URIPattern::Error, "Invalid pattern at index #{@index}: #{reason}"
    else
      emit(:invalid_char, @pattern[@index])
      @index += 1
    end
  end

  # Scan a "(...)" regexp group starting at @index (the "("), following the
  # spec/path-to-regexp tokenizer. On success emits a single :regexp token whose
  # value is the raw inner regexp source and advances @index past the closing ")".
  # On a spec violation calls handle_invalid_group (strict raises; lenient emits an
  # :invalid_char for the "(" and re-scans the remainder).
  def scan_regexp_group
    start = @index
    j = start + 1

    # "Pattern cannot start with '?'": a top-level group may not open with "?".
    return handle_invalid_group(start, "regexp group cannot start with '?'") if @pattern[j] == "?"

    count = 1 # nesting depth; the group ends when it drops back to zero
    inner = +""
    while j < @pattern.length
      c = @pattern[j]
      # Inside a group only ASCII is allowed (the escaped char after "\" is exempt).
      return handle_invalid_group(start, "invalid character #{c.inspect} in regexp group") if c.ord >= 0x80

      if c == "\\"
        # Escaped pair: keep the backslash and the next char verbatim.
        return handle_invalid_group(start, "trailing backslash in regexp group") if j + 1 >= @pattern.length
        inner << c << @pattern[j + 1]
        j += 2
        next
      end

      if c == ")"
        count -= 1
        if count.zero?
          j += 1
          break
        end
        inner << c
        j += 1
        next
      elsif c == "("
        count += 1
        # A nested group must be non-capturing ("(?:...)" etc.); a bare "(" would
        # introduce a capturing group, which is not allowed.
        return handle_invalid_group(start, "capturing groups are not allowed") if @pattern[j + 1] != "?"
        inner << c
        j += 1
        next
      end

      inner << c
      j += 1
    end

    return handle_invalid_group(start, "unbalanced regexp group") unless count.zero?
    return handle_invalid_group(start, "missing pattern in regexp group") if inner.empty?

    @tokens << Token.new(type: :regexp, value: inner, index: start)
    @index = j
  end

  # Reports a malformed regexp group that opened at index `at`: raises under
  # the :strict policy, otherwise emits the opening character as :invalid_char
  # and resumes scanning right after it.
  def handle_invalid_group(at, reason)
    if @policy == :strict
      raise URIPattern::Error, "Invalid pattern at index #{at}: #{reason}"
    else
      @tokens << Token.new(type: :invalid_char, value: @pattern[at], index: at)
      @index = at + 1
    end
  end
end
170
+ end