wxpath 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/core/parser.py CHANGED
@@ -1,13 +1,7 @@
1
- """
2
- This module contains mainly two kinds of functions:
3
-
4
- 1. functions for parsing wxpath expressions.
5
- 2. functions for extracting information from wxpath expressions or subexpressions.
6
-
7
- """
8
1
  import re
9
- from dataclasses import dataclass, fields
10
- from typing import NamedTuple, Optional, TypeAlias
2
+ from dataclasses import dataclass
3
+ from itertools import pairwise
4
+ from typing import Iterable, Iterator, TypeAlias
11
5
 
12
6
  try:
13
7
  from enum import StrEnum
@@ -18,302 +12,587 @@ except ImportError:
18
12
  pass
19
13
 
20
14
 
21
- @dataclass(frozen=True, slots=True)
22
- class ValueBase:
23
- _value: str
15
# Lexer table: (token type, regex source) pairs.  The pairs are joined into a
# single alternation below, so ORDER MATTERS: at every position the first
# alternative that matches wins.  No NAME token type is defined here.
TOKEN_SPEC = [
    ("NUMBER", r"\d+(\.\d+)?"),
    ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""),  # TODO: Rename to URL Literal
    # The word `url` with zero to three leading slashes (and optional
    # whitespace before the word).  The lookbehind/lookahead keep it from
    # firing on `url` embedded in a longer name such as `@data-url`, `@url`,
    # `$url`, `curl` or `urlencode(`.
    ("WXPATH", r"/{0,3}\s*(?<![\w\-@$])url(?![\w\-])"),
    # Any other occurrence of the text `url`; it carries no wxpath meaning and
    # is only ever copied through as part of surrounding xpath text.
    ("URL", r"\s*url"),
    # Optional comma, optional whitespace, then the `follow=` keyword.
    ("FOLLOW", r",?\s*follow="),
    ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # includes || for string concat
    ("LPAREN", r"\("),
    ("RPAREN", r"\)"),
    ("LBRACE", r"\{"),
    ("RBRACE", r"\}"),
    ("COLON", r":"),
    ("COMMA", r","),
    ("WS", r"\s+"),
    ("DOT", r"\."),
    ("OTHER", r"."),  # Catch-all for xpath operators: /, @, [, ], etc.
]

# One compiled pattern with a named group per token type; `match.lastgroup`
# reports which alternative matched.
TOKEN_RE = re.compile(
    "|".join(f"(?P<{name}>{pattern})" for name, pattern in TOKEN_SPEC)
)
41
+
42
+
43
@dataclass
class Token:
    """One lexical token: its category, its text, and where it was found."""

    # Category name of the token.
    type: str
    # Exact source text covered by the token.
    value: str
    # Start position in the source string.
    start: int = 0
    # End position in the source string.
    end: int = 0
49
+
50
+
51
def tokenize(src: str):
    """Yield a Token for each lexer match in *src*, then a final EOF token.

    Whitespace matches are yielded like any other token: native XPath
    expressions can contain significant whitespace (for example
    ``and not(...)``), so it must not be skipped here.
    """
    for match in TOKEN_RE.finditer(src):
        begin, finish = match.span()
        yield Token(match.lastgroup, match.group(), begin, finish)

    # Zero-width end marker positioned just past the last character.
    end_of_input = len(src)
    yield Token("EOF", "", end_of_input, end_of_input)
60
+
61
+
62
@dataclass
class Number:
    """AST node holding a numeric literal."""

    value: float


@dataclass
class String:
    """AST node holding the text of a string literal."""

    value: str


@dataclass
class Name:
    """AST node holding a bare name."""

    value: str


@dataclass
class Xpath:
    """AST node holding a chunk of plain XPath source text."""

    value: str


@dataclass
class Wxpath:
    """AST node holding wxpath keyword text that was not followed by a call."""

    value: str
82
+
83
@dataclass
class Call:
    """AST node for a function call: the function name and its argument list."""

    func: str
    args: list


@dataclass
class Url(Call):
    """Common base for every flavour of ``url(...)`` call node."""


@dataclass
class UrlLiteral(Url):
    """A ``url(...)`` call whose single argument is a string literal."""


@dataclass
class UrlQuery(Url):
    """A ``url(...)`` call whose single argument is an XPath expression."""


# Alternative name for the same node type.
UrlSelect = UrlQuery


@dataclass
class UrlCrawl(Url):
    """A crawling ``url(...)`` call (``///url(...)`` or ``url('...', follow=...)``)."""


# Alternative name for the same node type.
UrlFollow = UrlCrawl
107
+
108
@dataclass
class Binary:
    """AST node for a binary operation: ``left <op> right``."""

    # Left-hand operand.
    left: object
    # Operator text, e.g. "||" or "!".
    op: str
    # Right-hand operand.
    right: object
24
113
 
114
# A single wxpath segment is either a url() node or a chunk of plain XPath.
Segment: TypeAlias = Url | Xpath
25
115
 
26
- @dataclass(frozen=True, slots=True)
27
- class UrlValue(ValueBase):
28
- target: str
29
- follow: str | None = None
116
class Segments(list):
    """A list of parsed segments whose text form is labelled ``Segments([...])``."""

    def __repr__(self) -> str:
        """Return ``Segments(<plain list repr>)``."""
        return f"Segments({list.__repr__(self)})"

    # list defines no __str__ of its own, so the inherited object.__str__
    # falls back to repr(self).  Wrapping that result a second time printed
    # "Segments(Segments([...]))"; sharing __repr__ keeps str(), format() and
    # f-strings consistent with repr().
    __str__ = __repr__
30
122
 
123
@dataclass
class Other:
    """AST node wrapping a piece of text that fits no other node type."""

    value: str
31
126
 
32
- @dataclass(frozen=True, slots=True)
33
- class XpathValue(ValueBase):
34
- expr: str
35
127
 
128
@dataclass
class ContextItem(Xpath):
    """Represents the XPath context item expression: ."""

    # Defaults to the context-item expression itself, so ContextItem() needs
    # no arguments.
    value: str = "."
36
132
 
37
- @dataclass(frozen=True, slots=True)
38
- class UrlInfAndXpathValue(ValueBase):
39
- target: str
40
- expr: str
41
133
 
134
# Binding power of each binary operator; a larger number binds tighter.
PRECEDENCE = {
    # String concatenation (lowest precedence)
    "||": 5,
    # Comparisons
    **dict.fromkeys(("=", "!=", "<", "<=", ">", ">="), 10),
    # Additive
    **dict.fromkeys(("+", "-"), 20),
    # Multiplicative
    **dict.fromkeys(("*", "/"), 30),
    # Simple map operator (highest precedence)
    "!": 40,
}
42
148
 
43
- Value: TypeAlias = UrlValue | XpathValue | UrlInfAndXpathValue
44
149
 
150
class Parser:
    """Pratt-style parser that produces wxpath AST nodes.

    The parser holds exactly one token of lookahead in ``self.token`` and
    pulls further tokens from ``self.tokens`` on demand.
    """

    def __init__(self, tokens: Iterable[Token]):
        """Prime the parser with the first token of *tokens*."""
        self.tokens: Iterator[Token] = iter(tokens)
        self.token: Token = next(self.tokens)

    def advance(self) -> None:
        """Replace the lookahead token with the next one from the stream."""
        self.token = next(self.tokens)

    def parse(self) -> object:
        """Parse the input tokens into an AST or raise on unexpected trailing tokens.

        Raises:
            SyntaxError: If parsing stops before the EOF token.
        """
        output = self.expression(0)
        if self.token.type != "EOF":
            raise SyntaxError(f"unexpected token: {self.token}")
        return output

    def expression(self, min_prec: int) -> object:
        """Parse an expression whose operators all bind at least as tightly as *min_prec*."""
        return self.parse_binary(min_prec)

    def parse_binary(self, min_prec: int) -> object:
        """Parse a binary expression chain honoring operator precedence."""
        if self.token.type == "WXPATH":
            left = self.parse_segments()
        else:
            left = self.nud()

        while (
            self.token.type == "OP"
            and PRECEDENCE.get(self.token.value, -1) >= min_prec
        ):
            op = self.token.value
            prec = PRECEDENCE[op]
            self.advance()
            if self.token.type == "WXPATH":
                right = self.parse_segments()
            else:
                # prec + 1 makes operators of equal precedence left-associative.
                right = self.parse_binary(prec + 1)
            left = Binary(left, op, right)

        return left

    @staticmethod
    def _validate_segments(func):
        """Decorator that validates segment invariants after parsing.

        Raises ValueError if the argument of a ``url(...)`` segment begins
        with ``/`` (which also covers ``//``) when it directly follows an
        Xpath segment.

        Args:
            func: The segment-parsing method to wrap; it must return the
                parsed segments.

        Returns:
            The wrapped function that performs validation.
        """
        from functools import wraps

        # wraps() keeps the wrapped method's __name__ and docstring visible.
        @wraps(func)
        def _func(self) -> Segments:
            segments = func(self)
            for seg1, seg2 in pairwise(segments):
                if isinstance(seg1, Xpath) and isinstance(seg2, Url):
                    if seg2.args[0].value.startswith("/"):
                        raise ValueError(
                            f"Invalid segments: {segments}. the <xpath> in url(<xpath>)"
                            " may not begin with / or // if following an Xpath segment."
                        )
            return segments
        return _func

    @_validate_segments
    def parse_segments(self) -> Segments:
        """Parse a sequence of wxpath segments: url() calls interspersed with xpath.

        Handles patterns like::

            url('...')
            url('...')//a/@href
            url('...')//a/url(@href)//b
            //a/@href
            //a/map { 'key': value }

        Parsing stops, without consuming the token, at EOF or at a ``)``,
        ``,`` or ``}`` that belongs to an enclosing context.

        Returns:
            A Segments list containing the parsed Url and Xpath nodes.
        """
        segments = []

        while self.token.type != "EOF":
            kind = self.token.type
            if kind == "WXPATH":
                # Parse url() call
                call = self.nud()
                if call is None:
                    continue
                if isinstance(call, list):
                    # A nested call may expand into several segments.
                    segments.extend(call)
                else:
                    segments.append(call)
            elif kind in ("RPAREN", "COMMA", "RBRACE"):
                # End of nested call / argument / map context.
                break
            else:
                # Capture xpath content until next url() or end
                xpath_content = self.capture_xpath_until_wxpath_or_end().strip()
                if xpath_content:
                    segments.append(Xpath(xpath_content))

        return Segments(segments)

    def nud(self) -> object | None:
        """Parse a null-denoting expression (nud).

        Null-denoting expressions include numbers, string literals, the
        context item ``.``, wxpath keywords (optionally followed by a call),
        names, and expressions enclosed in parentheses.

        Returns:
            The parsed AST node, or None if the token is unrecognized (the
            token is then left unconsumed for the caller to handle).

        Raises:
            SyntaxError: If a parenthesized expression is not closed by ``)``.
        """
        tok = self.token
        kind = tok.type

        if kind == "NUMBER":
            self.advance()
            return Number(float(tok.value))

        if kind == "STRING":
            self.advance()
            return String(tok.value[1:-1])  # strip quotes

        if kind == "DOT":
            self.advance()
            return ContextItem()

        if kind == "WXPATH":
            # The lexer allows whitespace between the slashes and `url`.
            value = tok.value.replace(" ", "").replace("\n", "")
            self.advance()
            if self.token.type == "LPAREN":
                return self.parse_call(value)
            return Wxpath(value)

        if kind == "NAME":
            self.advance()
            # function call
            if self.token.type == "LPAREN":
                return self.parse_call(tok.value)
            return Name(tok.value)

        if kind == "LPAREN":
            self.advance()
            expr = self.expression(0)
            if self.token.type != "RPAREN":
                raise SyntaxError("expected ')'")
            self.advance()
            return expr

        # For other tokens (xpath content), return None to signal caller to handle
        return None

    def capture_xpath_until_wxpath_or_end(self) -> str:
        """Capture xpath tokens until a WXPATH token, EOF, RPAREN, or COMMA.

        Balances parentheses and braces so that xpath functions like
        ``contains()`` and map constructors like ``map { ... }`` are captured
        correctly.  An unmatched ``)`` or ``}`` also ends the capture because
        it closes an enclosing context.  The terminating token is not consumed.

        Returns:
            The accumulated xpath content as a string.
        """
        parts = []
        paren_depth = 0
        brace_depth = 0

        while self.token.type != "EOF":
            kind = self.token.type

            # Stop conditions (only at depth 0 for both parens and braces)
            if (
                paren_depth == 0
                and brace_depth == 0
                and kind in ("WXPATH", "RPAREN", "COMMA")
            ):
                break

            # Track paren depth for xpath functions
            if kind == "LPAREN":
                paren_depth += 1
            elif kind == "RPAREN":
                paren_depth -= 1
                if paren_depth < 0:
                    # This RPAREN closes an outer context
                    break

            # Track brace depth for map constructors
            if kind == "LBRACE":
                brace_depth += 1
            elif kind == "RBRACE":
                brace_depth -= 1
                if brace_depth < 0:
                    # This RBRACE closes an outer context
                    break

            parts.append(self.token.value)
            self.advance()

        return "".join(parts)

    def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
        """Capture content inside a url() call, handling nested wxpath expressions.

        Must be called with the opening paren of ``url(`` already consumed.
        On success the lookahead is left on the matching closing paren.

        Supports patterns like::

            url('...' follow=//a/@href)   -> follow part captured as Xpath
            url(//a/@href)                -> [Xpath]
            url(.)                        -> [ContextItem]
            url( url('..')//a/@href )     -> [<nested call result>, Xpath]

        Text before a ``follow=`` keyword and text after it are collected
        separately; each non-empty part becomes its own element.

        Returns:
            A list of parsed elements: Xpath/ContextItem nodes for xpath
            content and whatever ``nud()`` produced for nested url() calls.

        Raises:
            SyntaxError: If input ends before the parentheses balance.
        """
        elements = []
        target_parts = []   # text seen before any follow= keyword
        follow_parts = []   # text seen after a follow= keyword
        active = target_parts
        paren_balance = 1   # We're already inside the opening paren of url()

        while paren_balance > 0 and self.token.type != "EOF":
            kind = self.token.type

            if kind == "WXPATH":
                # Found nested wxpath: save any accumulated xpath content first
                pending = "".join(target_parts).strip()
                if pending:
                    elements.append(Xpath(pending))
                target_parts.clear()

                # nud() recursively handles deeply nested url() calls.
                nested_call = self.nud()
                if nested_call is not None:
                    elements.append(nested_call)
                continue

            if kind == "FOLLOW":
                active = follow_parts
                self.advance()
                continue

            if kind == "LPAREN":
                # Paren of an xpath function such as contains(), not of url().
                paren_balance += 1
            elif kind == "RPAREN":
                paren_balance -= 1
                if paren_balance == 0:
                    # This is the closing paren of the outer url()
                    break

            # Every remaining token -- including parens and braces -- belongs
            # to whichever part (target or follow) is currently being read.
            active.append(self.token.value)
            self.advance()

        if paren_balance != 0:
            raise SyntaxError("unbalanced parentheses in url()")

        # Save any remaining xpath content
        target_text = "".join(target_parts).strip()
        if target_text:
            if target_text == ".":
                elements.append(ContextItem())
            else:
                elements.append(Xpath(target_text))

        follow_text = "".join(follow_parts).strip()
        if follow_text:
            elements.append(Xpath(follow_text))

        return elements

    def parse_call(self, func_name: str) -> Call | Segments:
        """Parse a function call (including url variants) and specialize node types.

        Must be called with the lookahead on the opening paren.

        Args:
            func_name: Name of the function being called; any name ending in
                ``url`` gets the special url() argument handling.

        Returns:
            Whatever ``_specify_call_types`` builds from the name and arguments.

        Raises:
            SyntaxError: If the argument list is not closed by ``)``.
        """
        self.advance()  # consume '('
        args = []

        if func_name.endswith("url"):
            if self.token.type == "STRING":
                # Simple case: url('literal string')
                args = [String(self.token.value[1:-1])]  # strip quotes
                self.advance()
                # Handle follow=...
                if self.token.type == "FOLLOW":
                    self.advance()
                    args.extend(self.capture_url_arg_content())
            else:
                # XPath argument, possibly with nested wxpath:
                # url(//a/@href) or url( url('...')//a/@href )
                args = self.capture_url_arg_content()

        # Handle any remaining comma-separated arguments
        if self.token.type != "RPAREN":
            args.append(self.expression(0))
            while self.token.type == "COMMA":
                self.advance()
                args.append(self.expression(0))

        if self.token.type != "RPAREN":
            raise SyntaxError("expected ')'")
        self.advance()

        return _specify_call_types(func_name, args)
491
+
492
+
493
def _specify_call_types(func_name: str, args: list) -> Call | Segments:
    """Turn a parsed call into the most specific node type for it.

    The input list and any nested segment list are left unmodified; results
    that combine them are built as fresh lists.

    Args:
        func_name: Name of the called function, including any leading
            slashes (``url``, ``/url``, ``//url``, ``///url`` or another name).
        args: Parsed argument nodes.

    Returns:
        * ``url('lit')`` -> UrlLiteral
        * ``url(<xpath>)``, ``/url(<xpath>)``, ``//url(<xpath>)`` -> UrlQuery
        * ``url('lit', <xpath>)``, ``///url(<xpath>)`` -> UrlCrawl
        * ``url(<UrlLiteral>, <xpath>)`` -> Segments of both plus ``url(.)``
        * ``url(<segment list>, <xpath>)`` -> Segments with the xpath appended
        * any other function name -> a generic Call

    Raises:
        ValueError: If a url variant gets an unsupported argument count or type.
    """
    if func_name == "url":
        if len(args) == 1:
            if isinstance(args[0], String):
                return UrlLiteral(func_name, args)
            if isinstance(args[0], (Xpath, ContextItem)):
                return UrlQuery(func_name, args)
            raise ValueError(f"Unknown argument type: {type(args[0])}")

        if len(args) == 2:
            first, second = args
            if isinstance(second, Xpath):
                if isinstance(first, String):
                    return UrlCrawl(func_name, args)
                if isinstance(first, UrlLiteral):
                    # Nested literal url + xpath: fetch each result via url(.)
                    return Segments([first, second, UrlQuery("url", [ContextItem()])])
                if isinstance(first, list):
                    # Nested segment list: extend a copy with the xpath.
                    return Segments([*first, second])
            raise ValueError(f"Unknown arguments: {args}")

        raise ValueError(f"Unknown arguments: {args}")

    if func_name in ("/url", "//url", "///url"):
        if len(args) != 1:
            raise ValueError(f"Unknown arguments: {args}")
        if not isinstance(args[0], (Xpath, ContextItem)):
            raise ValueError(f"Unknown argument type: {type(args[0])}")
        # Triple slash means crawl; one or two slashes mean a single query.
        node_type = UrlCrawl if func_name == "///url" else UrlQuery
        return node_type(func_name, args)

    return Call(func_name, args)
302
534
 
303
- return name, value
304
535
 
536
+ def find_wxpath_boundary(tokens: list[Token]) -> tuple[int, int] | None:
537
+ """Find the operator that connects pure xpath to wxpath.
305
538
 
306
- def _parse_url_target(src: str):
307
- src = src.strip()
308
- # string literal
309
- if (src.startswith("'") and src.endswith("'")) or \
310
- (src.startswith('"') and src.endswith('"')):
311
- return src[1:-1]
539
+ The boundary is the last operator at depth 0 before the first WXPATH token.
312
540
 
313
- return src
541
+ Args:
542
+ tokens: List of Token objects from the tokenizer.
314
543
 
544
+ Returns:
545
+ A tuple of (op_position, wxpath_position) or None if no boundary
546
+ exists.
547
+ """
548
+ # Find first WXPATH token position
549
+ wxpath_pos = None
550
+ for i, tok in enumerate(tokens):
551
+ if tok.type == "WXPATH":
552
+ wxpath_pos = i
553
+ break
554
+
555
+ if wxpath_pos is None:
556
+ return None
557
+
558
+ # Walk backwards from wxpath to find connecting operator at depth 0
559
+ paren_depth = 0
560
+ for i in range(wxpath_pos - 1, -1, -1):
561
+ tok = tokens[i]
562
+ if tok.type == "RPAREN":
563
+ paren_depth += 1
564
+ elif tok.type == "LPAREN":
565
+ paren_depth -= 1
566
+ elif paren_depth == 0 and tok.type == "OP":
567
+ return (i, wxpath_pos)
568
+
569
+ return None
315
570
 
316
- def _get_shallow_dict(instance: Value):
317
- return {field.name: getattr(instance, field.name)
318
- for field in fields(instance) if field.name not in {'_value'}}
319
571
 
572
def parse(src):
    """Parse a wxpath source string into an AST.

    Three outcomes are possible:

    * an operator joins leading xpath text to the first wxpath token: a
      Binary of ``Xpath(<text before the operator>)``, the operator, and the
      parsed remainder;
    * wxpath tokens are present but no such operator exists: the result of
      parsing the whole token list;
    * no wxpath token at all: the stripped source wrapped in a single Xpath.
    """
    tokens = list(tokenize(src))
    boundary = find_wxpath_boundary(tokens)

    if boundary is not None:
        op_pos, _wxpath_pos = boundary
        op_token = tokens[op_pos]
        # Slice the source by position (not by joining tokens) so the xpath
        # text keeps its original whitespace.
        left = Xpath(src[:op_token.start].strip())
        # Everything after the operator, including EOF, is the wxpath part.
        right = Parser(iter(tokens[op_pos + 1:])).parse()
        return Binary(left, op_token.value, right)

    if any(tok.type == "WXPATH" for tok in tokens):
        # Has wxpath but no boundary operator - parse normally
        return Parser(iter(tokens)).parse()

    # If no wxpath at all, return as pure xpath
    return Xpath(src.strip())