wxpath 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +92 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +406 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +231 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA +107 -129
- wxpath-0.3.0.dist-info/RECORD +33 -0
- wxpath-0.3.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
wxpath/core/parser.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from itertools import pairwise
|
|
4
|
+
from typing import Iterable, Iterator, TypeAlias
|
|
5
|
+
|
|
6
|
+
try:
    from enum import StrEnum
except ImportError:
    # enum.StrEnum only exists on newer interpreters (added in Python 3.11);
    # fall back to the classic ``str`` + ``Enum`` mixin under the same name.
    from enum import Enum

    class StrEnum(str, Enum):
        """Minimal stand-in used when ``enum.StrEnum`` cannot be imported."""
        pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Ordered (name, regex) token table. TOKEN_RE below joins the entries into one
# alternation, which is tried left to right, so an earlier entry wins whenever
# several entries could match at the same position.
TOKEN_SPEC = [
    ("NUMBER", r"\d+(\.\d+)?"),
    ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""),  # TODO: Rename to URL Literal
    # Up to three leading slashes, optional whitespace, then the text "url".
    # NOTE(review): nothing anchors "url" to a name boundary or to a following
    # "(", so this also matches the tail of longer names (e.g. "curl",
    # "@data-url") - confirm that is intended.
    ("WXPATH", r"/{0,3}\s*url"),  # Must come before NAME to match 'url' as WXPATH
    # ("///URL", r"/{3}\s*url"),
    # ("//URL", r"/{2}\s*url"),
    # ("/URL", r"/{1}\s*url"),
    # NOTE(review): this entry looks unreachable - WXPATH (with zero slashes)
    # already matches everything this pattern would, and it is listed first.
    ("URL", r"\s*url"),
    # NAME is disabled, so no token of type "NAME" is produced by this table.
    # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
    # Optional comma, optional whitespace ("{,}" is the same as "*"), "follow=".
    # Listed before COMMA and WS so the whole prefix becomes a single token.
    ("FOLLOW", r",?\s{,}follow="),
    ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # Added || for string concat
    ("LPAREN", r"\("),
    ("RPAREN", r"\)"),
    ("LBRACE", r"\{"),
    ("RBRACE", r"\}"),
    ("COLON", r":"),
    ("COMMA", r","),
    ("WS", r"\s+"),
    ("DOT", r"\."),
    ("OTHER", r"."),  # Catch-all for xpath operators: /, @, [, ], etc.
]

# One named group per TOKEN_SPEC entry; ``match.lastgroup`` then reports the
# name of the entry that matched.
TOKEN_RE = re.compile("|".join(
    f"(?P<{name}>{pattern})"
    for name, pattern in TOKEN_SPEC
))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Token:
    """A single lexical token produced by ``tokenize``."""
    type: str  # name of the TOKEN_SPEC entry that matched, or "EOF"
    value: str  # exact matched source text ("" for the EOF token)
    start: int = 0  # position in source string
    end: int = 0  # offset just past the matched text
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def tokenize(src: str):
    """Lex ``src`` into ``Token`` objects, ending with an ``EOF`` sentinel.

    Whitespace is emitted as ``WS`` tokens rather than dropped: native XPath
    such as ``and not(...)`` has to survive being re-assembled from token
    text, so nothing may be skipped here.

    Args:
        src: The wxpath expression source.

    Yields:
        One ``Token`` per ``TOKEN_RE`` match, in source order, followed by a
        zero-width ``EOF`` token positioned at ``len(src)``.
    """
    yield from (
        Token(match.lastgroup, match.group(), *match.span())
        for match in TOKEN_RE.finditer(src)
    )
    eof_at = len(src)
    yield Token("EOF", "", eof_at, eof_at)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class Number:
    """AST node for a numeric literal.

    ``Parser.nud`` builds it from a NUMBER token via ``float(tok.value)``.
    """
    value: float
|
|
65
|
+
|
|
66
|
+
@dataclass
class String:
    """AST node for a quoted string literal.

    The parser stores the token text with its first and last character (the
    quotes) sliced off; backslash escapes are kept exactly as written.
    """
    value: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class Name:
    """AST node for a bare name.

    ``Parser.nud`` returns it for NAME tokens that are not followed by ``(``.
    Note that the NAME entry in ``TOKEN_SPEC`` is currently commented out.
    """
    value: str
|
|
74
|
+
|
|
75
|
+
@dataclass
class Xpath:
    """AST node holding a fragment of XPath source text.

    The parser only captures and strips the text; it does not interpret it.
    """
    value: str
|
|
78
|
+
|
|
79
|
+
@dataclass
class Wxpath:
    """AST node for a WXPATH token that is not followed by ``(``.

    ``value`` is the token text as normalized by ``Parser.nud``.
    """
    value: str
|
|
82
|
+
|
|
83
|
+
@dataclass
class Call:
    """AST node for a function call: a function name plus parsed arguments."""
    func: str  # function name as passed to Parser.parse_call, e.g. "url" or "//url"
    args: list  # parsed argument nodes, in source order
|
|
87
|
+
|
|
88
|
+
@dataclass
class Url(Call):
    """Common base class for the ``url(...)`` call variants defined below."""
    pass
|
|
91
|
+
|
|
92
|
+
@dataclass
class UrlLiteral(Url):
    """``url('...')`` call whose single argument is a ``String`` literal."""
    pass
|
|
95
|
+
|
|
96
|
+
@dataclass
class UrlQuery(Url):
    """``url(<xpath>)``, ``/url(<xpath>)`` or ``//url(<xpath>)`` call.

    ``_specify_call_types`` creates it when the single argument is an
    ``Xpath`` (including ``ContextItem``).
    """
    pass

# Alternative name for the same node class.
UrlSelect = UrlQuery
|
|
101
|
+
|
|
102
|
+
@dataclass
class UrlCrawl(Url):
    """``url('...', follow=<xpath>)`` or ``///url(<xpath>)`` call.

    ``_specify_call_types`` creates it for ``url`` with ``(String, Xpath)``
    arguments and for ``///url`` with a single ``Xpath`` argument.
    """
    pass

# Alternative name for the same node class.
UrlFollow = UrlCrawl
|
|
107
|
+
|
|
108
|
+
@dataclass
class Binary:
    """AST node for a binary operation ``left <op> right``."""
    left: object  # left operand node
    op: str  # operator text taken from an OP token (a key of PRECEDENCE)
    right: object  # right operand node
|
|
113
|
+
|
|
114
|
+
# A path segment is either a url(...) call node or a raw XPath fragment.
Segment: TypeAlias = Url | Xpath
|
|
115
|
+
|
|
116
|
+
class Segments(list):
    """A ``list`` of parsed path segments with a tagged textual form.

    Behaves exactly like ``list``; only ``repr()`` and ``str()`` differ, both
    rendering as ``Segments([...])`` so segment lists are easy to tell apart
    from plain lists in logs and error messages.
    """

    def __repr__(self):
        """Return ``Segments(<list repr>)``."""
        return f"Segments({super().__repr__()})"

    def __str__(self):
        """Return the same text as ``repr()``.

        ``list`` does not define ``__str__``, so ``super().__str__()`` would
        resolve to ``object.__str__``, which calls ``repr(self)`` and would
        wrap the already-tagged text a second time
        (``Segments(Segments([...]))``). Delegate to ``repr`` directly.
        """
        return repr(self)
|
|
122
|
+
|
|
123
|
+
@dataclass
class Other:
    """AST node wrapping a piece of uncategorized text.

    NOTE(review): not referenced by the parser code visible in this file.
    """
    value: str
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass
class ContextItem(Xpath):
    """Represents the XPath context item expression: .

    Produced by ``Parser.nud`` for a DOT token and by
    ``Parser.capture_url_arg_content`` when the captured text is exactly ".".
    Being an ``Xpath`` subclass, it passes every ``isinstance(..., Xpath)`` check.
    """
    value: str = "."
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Binding power of each OP token value, consulted by Parser.parse_binary.
# A larger number binds tighter; operators sharing a number associate to the
# left because parse_binary parses the right operand with ``prec + 1``.
PRECEDENCE = {
    "||": 5,   # String concatenation (lowest precedence)
    # Comparison operators
    "=": 10,
    "!=": 10,
    "<": 10,
    "<=": 10,
    ">": 10,
    ">=": 10,
    # Additive operators
    "+": 20,
    "-": 20,
    # Multiplicative level
    "*": 30,
    "/": 30,
    "!": 40,   # Simple map operator (highest precedence)
}
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class Parser:
    """Pratt-style parser that produces wxpath AST nodes.

    The parser keeps a one-token lookahead in ``self.token`` and pulls further
    tokens on demand. Every loop stops on a token of type ``"EOF"``, so the
    token stream must end with one (``tokenize`` always appends it).
    """

    def __init__(self, tokens: Iterable[Token]):
        """Prime the parser with the first token of ``tokens``.

        Args:
            tokens: Token stream terminated by an ``EOF`` token.

        Raises:
            StopIteration: If ``tokens`` yields nothing at all.
        """
        self.tokens: Iterator[Token] = iter(tokens)
        self.token: Token = next(self.tokens)

    def advance(self) -> None:
        """Replace the current token with the next one from the stream."""
        self.token = next(self.tokens)

    def parse(self) -> object:
        """Parse the input tokens into an AST or raise on unexpected trailing tokens.

        Raises:
            SyntaxError: If tokens remain after a complete expression.
        """
        output = self.expression(0)
        if self.token.type != "EOF":
            raise SyntaxError(f"unexpected token: {self.token}")

        return output

    def expression(self, min_prec: int) -> object:
        """Parse an expression whose operators have at least ``min_prec``."""
        return self.parse_binary(min_prec)

    def parse_binary(self, min_prec: int) -> object:
        """Parse a binary expression chain honoring operator precedence.

        Args:
            min_prec: Lowest ``PRECEDENCE`` value an operator may have to be
                consumed at this level.

        Returns:
            The left operand on its own, or nested ``Binary`` nodes. The
            operand is ``None`` when ``nud`` does not recognize the token.
        """
        if self.token.type == "WXPATH":
            left = self.parse_segments()
        else:
            left = self.nud()

        while self.token.type == "OP" and PRECEDENCE.get(self.token.value, -1) >= min_prec:
            op = self.token.value
            prec = PRECEDENCE[op]
            self.advance()
            if self.token.type == "WXPATH":
                right = self.parse_segments()
            else:
                # prec + 1 keeps operators of equal precedence left-associative.
                right = self.parse_binary(prec + 1)
            left = Binary(left, op, right)

        return left

    @staticmethod
    def _validate_segments(func):
        """Decorator that validates segment invariants after parsing.

        Raises ValueError if the xpath in ``url(<xpath>)`` begins with ``/``
        or ``//`` when it follows an Xpath segment.

        Args:
            func: The segment-parsing method to wrap; it must return the
                parsed segments as a list.

        Returns:
            The wrapped function that performs validation.
        """
        # Imported here so this block does not depend on a module-level import.
        from functools import wraps

        @wraps(func)  # keep the wrapped method's name and docstring
        def _func(self) -> Segments:
            segments = func(self)
            for seg1, seg2 in pairwise(segments):
                if isinstance(seg1, Xpath) and isinstance(seg2, Url):
                    # A leading "/" also covers the "//" case.
                    if seg2.args[0].value.startswith("/"):
                        raise ValueError(
                            f"Invalid segments: {segments}. the <xpath> in url(<xpath>)"
                            " may not begin with / or // if following an Xpath segment."
                        )
            return segments
        return _func

    @_validate_segments
    def parse_segments(self) -> Segments:
        """Parse a sequence of wxpath segments: url() calls interspersed with xpath.

        Handles patterns like::

            url('...')
            url('...')//a/@href
            url('...')//a/url(@href)//b
            //a/@href
            //a/map { 'key': value }

        Parsing stops, without consuming the token, at EOF or at a ``)``,
        ``,`` or ``}`` that belongs to an enclosing construct.

        Returns:
            A Segments list containing the parsed Url and Xpath nodes.
        """
        segments = []

        while self.token.type != "EOF":
            if self.token.type == "WXPATH":
                # Parse url() call
                call = self.nud()
                if call is not None:
                    # A nested url() may itself expand into several segments.
                    if isinstance(call, (Segments, list)):
                        segments.extend(call)
                    else:
                        segments.append(call)
            elif self.token.type in ("RPAREN", "COMMA", "RBRACE"):
                # End of nested context / argument separator / end of map
                # context: the enclosing construct handles this token.
                break
            else:
                # Capture xpath content until next url() or end
                xpath_content = self.capture_xpath_until_wxpath_or_end().strip()
                if xpath_content:
                    segments.append(Xpath(xpath_content))

        return Segments(segments)

    def nud(self) -> object | None:
        """Parse a null-denoting expression (nud).

        Null-denoting expressions include numbers, strings, the context item
        ``.``, url-style (WXPATH) tokens, names, and expressions enclosed in
        parentheses.

        Returns:
            The parsed AST node, or None if the token is unrecognized. In the
            None case no token is consumed, so the caller decides what to do.

        Raises:
            SyntaxError: If a parenthesized expression is not closed by ``)``.
        """
        tok = self.token

        if tok.type == "NUMBER":
            self.advance()
            return Number(float(tok.value))

        if tok.type == "STRING":
            self.advance()
            return String(tok.value[1:-1])  # strip quotes

        if tok.type == "DOT":
            self.advance()
            return ContextItem()

        if tok.type == "WXPATH":
            # The token pattern admits any whitespace between the slashes and
            # "url" (tabs, carriage returns, ...), so strip all of it; otherwise
            # the name would not match "url" / "/url" / ... later on.
            value = re.sub(r"\s+", "", tok.value)
            self.advance()

            if self.token.type == "LPAREN":
                return self.parse_call(value)

            return Wxpath(value)

        if tok.type == "NAME":
            self.advance()

            # function call
            if self.token.type == "LPAREN":
                return self.parse_call(tok.value)

            return Name(tok.value)

        if tok.type == "LPAREN":
            self.advance()
            expr = self.expression(0)
            if self.token.type != "RPAREN":
                raise SyntaxError("expected ')'")
            self.advance()
            return expr

        # For other tokens (xpath content), return None to signal caller to handle
        return None

    def capture_xpath_until_wxpath_or_end(self) -> str:
        """Capture xpath tokens until a WXPATH token, EOF, RPAREN, or COMMA.

        Balances parentheses and braces so that xpath functions like
        ``contains()`` and map constructors like ``map { ... }`` are captured
        correctly. An unmatched ``)`` or ``}`` also ends the capture. The
        token that ends the capture is left unconsumed.

        Returns:
            The accumulated xpath content as a string.
        """
        parts = []
        paren_depth = 0
        brace_depth = 0

        while self.token.type != "EOF":
            kind = self.token.type

            # Stop conditions (only at depth 0 for both parens and braces)
            if paren_depth == 0 and brace_depth == 0:
                if kind in ("WXPATH", "RPAREN", "COMMA"):
                    break

            # Track paren depth for xpath functions
            if kind == "LPAREN":
                paren_depth += 1
            elif kind == "RPAREN":
                paren_depth -= 1
                if paren_depth < 0:
                    # This RPAREN closes an outer context
                    break

            # Track brace depth for map constructors
            if kind == "LBRACE":
                brace_depth += 1
            elif kind == "RBRACE":
                brace_depth -= 1
                if brace_depth < 0:
                    # This RBRACE closes an outer context
                    break

            parts.append(self.token.value)
            self.advance()

        return "".join(parts)

    def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
        """Capture content inside a url() call, handling nested wxpath expressions.

        Must be called with the opening ``(`` of the url() call already
        consumed. On return the current token is the matching ``)``, which is
        left for the caller to consume.

        Supports patterns like::

            url(//a/@href)                  -> [Xpath]
            url(.)                          -> [ContextItem]
            url( url('..')//a/@href )       -> [Call, Xpath]
            url( url( url('..')//a )//b )   -> [Call, Xpath]
            url(<xpath>, follow=<xpath>)    -> [Xpath, Xpath]

        Text that follows a ``follow=`` token, including its parentheses and
        braces, is collected separately and appended last.

        Returns:
            A list of parsed elements: Xpath nodes for xpath content and Call
            nodes for nested url() calls.

        Raises:
            SyntaxError: If the input ends before the closing ``)``.
        """
        elements = []
        current_parts = []
        follow_parts = []
        target = current_parts  # buffer that receives plain xpath text
        paren_balance = 1  # We're already inside the opening paren of url()

        while paren_balance > 0 and self.token.type != "EOF":
            kind = self.token.type

            if kind == "WXPATH":
                # Found nested wxpath: save any accumulated xpath content first
                pending = "".join(current_parts).strip()
                if pending:
                    elements.append(Xpath(pending))
                    current_parts.clear()

                # Parse the nested url() call using nud()
                # This recursively handles deeply nested wxpath
                nested_call = self.nud()
                if nested_call is not None:
                    elements.append(nested_call)
                continue

            if kind == "FOLLOW":
                # Everything from here on belongs to the follow expression.
                target = follow_parts
                self.advance()
                continue

            if kind == "LPAREN":
                # Opening paren that's NOT part of a url() call
                # (it's part of an xpath function like contains(), starts-with(), etc.)
                paren_balance += 1
            elif kind == "RPAREN":
                paren_balance -= 1
                if paren_balance == 0:
                    # This is the closing paren of the outer url()
                    break

            # Parens and braces are part of the xpath text too, so they go to
            # whichever buffer is active, like every other token.
            target.append(self.token.value)
            self.advance()

        if paren_balance != 0:
            raise SyntaxError("unbalanced parentheses in url()")

        # Save any remaining xpath content
        current_xpath = "".join(current_parts).strip()
        if current_xpath:
            if current_xpath == ".":
                elements.append(ContextItem())
            else:
                elements.append(Xpath(current_xpath))

        follow_xpath = "".join(follow_parts).strip()
        if follow_xpath:
            elements.append(Xpath(follow_xpath))

        return elements

    def parse_call(self, func_name: str) -> Call | Segments:
        """Parse a function call (including url variants) and specialize node types.

        Must be called with the current token on the opening ``(``.

        Args:
            func_name: Name of the function being called, e.g. ``"url"`` or
                ``"//url"``.

        Returns:
            The node (or Segments) chosen by ``_specify_call_types``.

        Raises:
            SyntaxError: If the argument list is not closed by ``)``.
            ValueError: If ``_specify_call_types`` rejects the arguments.
        """
        self.advance()  # consume '('
        args = []

        if func_name.endswith("url"):
            if self.token.type == "STRING":
                # Simple case: url('literal string')
                args = [String(self.token.value[1:-1])]  # strip quotes
                self.advance()
                # Handle follow=...
                if self.token.type == "FOLLOW":
                    self.advance()
                    args.extend(self.capture_url_arg_content())
            else:
                # Xpath argument, possibly containing nested url() calls:
                # url(//a/@href), url( url('...')//a/@href ), url( /url(...) )
                args = self.capture_url_arg_content()

        # Handle additional comma-separated arguments
        if self.token.type != "RPAREN":
            while True:
                args.append(self.expression(0))
                if self.token.type == "COMMA":
                    self.advance()
                    continue
                break

        if self.token.type != "RPAREN":
            raise SyntaxError("expected ')'")
        self.advance()

        return _specify_call_types(func_name, args)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _specify_call_types(func_name: str, args: list) -> Call | Segments:
    """Turn a parsed call into the most specific AST node for it.

    Unlike a plain in-place rewrite, this never modifies ``args`` or any list
    nested inside it; expanded segment lists are always built fresh.

    Args:
        func_name: Normalized function name (``"url"``, ``"/url"``,
            ``"//url"``, ``"///url"`` or anything else).
        args: Parsed argument nodes of the call.

    Returns:
        * ``url(String)`` -> ``UrlLiteral``
        * ``url(Xpath)``, ``/url(Xpath)``, ``//url(Xpath)`` -> ``UrlQuery``
        * ``url(String, Xpath)``, ``///url(Xpath)`` -> ``UrlCrawl``
        * ``url(UrlLiteral, Xpath)`` -> ``Segments`` of both arguments plus a
          trailing ``url(.)`` query
        * ``url(<segments>, Xpath)`` -> ``Segments`` with the xpath appended
        * any other function name -> generic ``Call``

    Raises:
        ValueError: If a url-style call has an unsupported number or type of
            arguments.
    """
    if func_name == "url":
        if len(args) == 1:
            if isinstance(args[0], String):
                return UrlLiteral(func_name, args)
            if isinstance(args[0], (Xpath, ContextItem)):
                return UrlQuery(func_name, args)
            raise ValueError(f"Unknown argument type: {type(args[0])}")

        if len(args) == 2:
            first, second = args
            if isinstance(first, String) and isinstance(second, Xpath):
                return UrlCrawl(func_name, args)
            if isinstance(first, UrlLiteral) and isinstance(second, Xpath):
                # Nested literal followed by xpath: fetch, select, then visit
                # each selected item via url(.).
                return Segments([first, second, UrlQuery('url', [ContextItem()])])
            if isinstance(first, (Segments, list)) and isinstance(second, Xpath):
                return Segments([*first, second])
            raise ValueError(f"Unknown arguments: {args}")

        raise ValueError(f"Unknown arguments: {args}")

    if func_name in ("/url", "//url", "///url"):
        # The three-slash form crawls; the shorter forms are plain queries.
        node_type = UrlCrawl if func_name == "///url" else UrlQuery
        if len(args) != 1:
            raise ValueError(f"Unknown arguments: {args}")
        if not isinstance(args[0], (Xpath, ContextItem)):
            raise ValueError(f"Unknown argument type: {type(args[0])}")
        return node_type(func_name, args)

    return Call(func_name, args)
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def find_wxpath_boundary(tokens: list[Token]) -> tuple[int, int] | None:
    """Locate the operator that joins a leading pure-xpath part to wxpath.

    Starting just before the first WXPATH token, the token list is scanned
    right to left; the first OP token met while the parenthesis count is
    balanced is the boundary.

    Args:
        tokens: List of Token objects from the tokenizer.

    Returns:
        A tuple of (op_position, wxpath_position), both indexes into
        ``tokens``, or None when there is no WXPATH token or no such
        operator in front of it.
    """
    first_wxpath = next(
        (idx for idx, token in enumerate(tokens) if token.type == "WXPATH"),
        None,
    )
    if first_wxpath is None:
        return None

    # Scanning backwards, a ")" opens a nested group and a "(" closes it.
    depth = 0
    for idx in reversed(range(first_wxpath)):
        kind = tokens[idx].type
        if kind == "RPAREN":
            depth += 1
        elif kind == "LPAREN":
            depth -= 1
        elif depth == 0 and kind == "OP":
            return (idx, first_wxpath)

    return None
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def parse(src):
    """Parse a wxpath expression string into an AST.

    Three outcomes are possible:

    * An operator joins a leading xpath part to the first url-style token:
      returns ``Binary(Xpath(<text before the operator>), <op>, <parsed rest>)``.
    * No url-style token at all: the whole (stripped) source is returned as a
      single ``Xpath`` node.
    * Otherwise the full token list is handed to ``Parser``.

    Args:
        src: The expression source text.

    Returns:
        The root AST node.
    """
    tokens = list(tokenize(src))
    boundary = find_wxpath_boundary(tokens)

    if boundary is not None:
        op_index, _ = boundary
        operator = tokens[op_index]
        # Slice the source by position (preserves whitespace inside the xpath).
        leading_xpath = src[:operator.start].strip()
        # Everything after the operator, EOF included, is the wxpath part.
        trailing_ast = Parser(iter(tokens[op_index + 1:])).parse()
        return Binary(Xpath(leading_xpath), operator.value, trailing_ast)

    if all(tok.type != "WXPATH" for tok in tokens):
        # No wxpath at all: pure xpath.
        return Xpath(src.strip())

    # Has wxpath but no boundary operator - parse normally
    return Parser(iter(tokens)).parse()
|