wxpath 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +52 -12
- wxpath/core/ops.py +163 -129
- wxpath/core/parser.py +559 -280
- wxpath/core/runtime/engine.py +133 -42
- wxpath/core/runtime/helpers.py +0 -7
- wxpath/hooks/registry.py +29 -17
- wxpath/http/client/crawler.py +46 -11
- wxpath/http/client/request.py +6 -3
- wxpath/http/client/response.py +1 -1
- wxpath/http/policy/robots.py +82 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/METADATA +84 -37
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/RECORD +16 -16
- wxpath/core/errors.py +0 -134
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/top_level.txt +0 -0
wxpath/core/parser.py
CHANGED
|
@@ -1,13 +1,7 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module contains mainly two kinds of functions:
|
|
3
|
-
|
|
4
|
-
1. functions for parsing wxpath expressions.
|
|
5
|
-
2. functions for extracting information from wxpath expressions or subexpressions.
|
|
6
|
-
|
|
7
|
-
"""
|
|
8
1
|
import re
|
|
9
|
-
from dataclasses import dataclass
|
|
10
|
-
from
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from itertools import pairwise
|
|
4
|
+
from typing import Iterable, Iterator, TypeAlias
|
|
11
5
|
|
|
12
6
|
try:
|
|
13
7
|
from enum import StrEnum
|
|
@@ -18,302 +12,587 @@ except ImportError:
|
|
|
18
12
|
pass
|
|
19
13
|
|
|
20
14
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
15
|
+
TOKEN_SPEC = [
|
|
16
|
+
("NUMBER", r"\d+(\.\d+)?"),
|
|
17
|
+
("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
|
|
18
|
+
("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
|
|
19
|
+
# ("///URL", r"/{3}\s*url"),
|
|
20
|
+
# ("//URL", r"/{2}\s*url"),
|
|
21
|
+
# ("/URL", r"/{1}\s*url"),
|
|
22
|
+
("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
|
|
23
|
+
# ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
|
|
24
|
+
("FOLLOW", r",?\s{,}follow="),
|
|
25
|
+
("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
|
|
26
|
+
("LPAREN", r"\("),
|
|
27
|
+
("RPAREN", r"\)"),
|
|
28
|
+
("LBRACE", r"\{"),
|
|
29
|
+
("RBRACE", r"\}"),
|
|
30
|
+
("COLON", r":"),
|
|
31
|
+
("COMMA", r","),
|
|
32
|
+
("WS", r"\s+"),
|
|
33
|
+
("DOT", r"\."),
|
|
34
|
+
("OTHER", r"."), # Catch-all for xpath operators: /, @, [, ], etc.
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
TOKEN_RE = re.compile("|".join(
|
|
38
|
+
f"(?P<{name}>{pattern})"
|
|
39
|
+
for name, pattern in TOKEN_SPEC
|
|
40
|
+
))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
|
|
44
|
+
class Token:
|
|
45
|
+
type: str
|
|
46
|
+
value: str
|
|
47
|
+
start: int = 0 # position in source string
|
|
48
|
+
end: int = 0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def tokenize(src: str):
|
|
52
|
+
for m in TOKEN_RE.finditer(src):
|
|
53
|
+
kind = m.lastgroup
|
|
54
|
+
# # NOTE: in order to preserve native XPath expressions that contain whitespace,
|
|
55
|
+
# # for example, "and not(...)", we can't skip whitespace
|
|
56
|
+
# if kind == "WS":
|
|
57
|
+
# continue
|
|
58
|
+
yield Token(kind, m.group(), m.start(), m.end())
|
|
59
|
+
yield Token("EOF", "", len(src), len(src))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
|
|
63
|
+
class Number:
|
|
64
|
+
value: float
|
|
65
|
+
|
|
66
|
+
@dataclass
|
|
67
|
+
class String:
|
|
68
|
+
value: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
|
|
72
|
+
class Name:
|
|
73
|
+
value: str
|
|
74
|
+
|
|
75
|
+
@dataclass
|
|
76
|
+
class Xpath:
|
|
77
|
+
value: str
|
|
78
|
+
|
|
79
|
+
@dataclass
|
|
80
|
+
class Wxpath:
|
|
81
|
+
value: str
|
|
82
|
+
|
|
83
|
+
@dataclass
|
|
84
|
+
class Call:
|
|
85
|
+
func: str
|
|
86
|
+
args: list
|
|
87
|
+
|
|
88
|
+
@dataclass
|
|
89
|
+
class Url(Call):
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
@dataclass
|
|
93
|
+
class UrlLiteral(Url):
|
|
94
|
+
pass
|
|
95
|
+
|
|
96
|
+
@dataclass
|
|
97
|
+
class UrlQuery(Url):
|
|
98
|
+
pass
|
|
99
|
+
|
|
100
|
+
UrlSelect = UrlQuery
|
|
101
|
+
|
|
102
|
+
@dataclass
|
|
103
|
+
class UrlCrawl(Url):
|
|
104
|
+
pass
|
|
105
|
+
|
|
106
|
+
UrlFollow = UrlCrawl
|
|
107
|
+
|
|
108
|
+
@dataclass
|
|
109
|
+
class Binary:
|
|
110
|
+
left: object
|
|
111
|
+
op: str
|
|
112
|
+
right: object
|
|
24
113
|
|
|
114
|
+
Segment: TypeAlias = Url | Xpath
|
|
25
115
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
116
|
+
class Segments(list):
|
|
117
|
+
def __repr__(self):
|
|
118
|
+
return f"Segments({super().__repr__()})"
|
|
119
|
+
|
|
120
|
+
def __str__(self):
|
|
121
|
+
return f"Segments({super().__str__()})"
|
|
30
122
|
|
|
123
|
+
@dataclass
|
|
124
|
+
class Other:
|
|
125
|
+
value: str
|
|
31
126
|
|
|
32
|
-
@dataclass(frozen=True, slots=True)
|
|
33
|
-
class XpathValue(ValueBase):
|
|
34
|
-
expr: str
|
|
35
127
|
|
|
128
|
+
@dataclass
|
|
129
|
+
class ContextItem(Xpath):
|
|
130
|
+
"""Represents the XPath context item expression: ."""
|
|
131
|
+
value: str = "."
|
|
36
132
|
|
|
37
|
-
@dataclass(frozen=True, slots=True)
|
|
38
|
-
class UrlInfAndXpathValue(ValueBase):
|
|
39
|
-
target: str
|
|
40
|
-
expr: str
|
|
41
133
|
|
|
134
|
+
PRECEDENCE = {
|
|
135
|
+
"||": 5, # String concatenation (lowest precedence)
|
|
136
|
+
"=": 10,
|
|
137
|
+
"!=": 10,
|
|
138
|
+
"<": 10,
|
|
139
|
+
"<=": 10,
|
|
140
|
+
">": 10,
|
|
141
|
+
">=": 10,
|
|
142
|
+
"+": 20,
|
|
143
|
+
"-": 20,
|
|
144
|
+
"*": 30,
|
|
145
|
+
"/": 30,
|
|
146
|
+
"!": 40, # Simple map operator (highest precedence)
|
|
147
|
+
}
|
|
42
148
|
|
|
43
|
-
Value: TypeAlias = UrlValue | XpathValue | UrlInfAndXpathValue
|
|
44
149
|
|
|
150
|
+
class Parser:
|
|
151
|
+
"""Pratt-style parser that produces wxpath AST nodes."""
|
|
45
152
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
153
|
+
def __init__(self, tokens: Iterable[Token]):
|
|
154
|
+
self.tokens: Iterator[Token] = iter(tokens)
|
|
155
|
+
self.token: Token = next(self.tokens)
|
|
49
156
|
|
|
157
|
+
def advance(self) -> None:
|
|
158
|
+
self.token = next(self.tokens)
|
|
50
159
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
XPATH = "xpath"
|
|
57
|
-
XPATH_FN_MAP_FRAG = "xpath_fn_map_frag" # XPath function ending with map operator '!'
|
|
58
|
-
INF_XPATH = "inf_xpath" # Experimental
|
|
59
|
-
OBJECT = "object" # Deprecated
|
|
60
|
-
URL_FROM_ATTR = "url_from_attr" # Deprecated
|
|
61
|
-
URL_OPR_AND_ARG = "url_opr_and_arg" # Deprecated
|
|
160
|
+
def parse(self) -> object:
|
|
161
|
+
"""Parse the input tokens into an AST or raise on unexpected trailing tokens."""
|
|
162
|
+
output = self.expression(0)
|
|
163
|
+
if self.token.type != "EOF":
|
|
164
|
+
raise SyntaxError(f"unexpected token: {self.token}")
|
|
62
165
|
|
|
166
|
+
return output
|
|
63
167
|
|
|
64
|
-
def
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
path_expr = path_expr.replace('\n', '')
|
|
72
|
-
partitions = [] # type: list[str]
|
|
73
|
-
i = 0
|
|
74
|
-
n = len(path_expr)
|
|
75
|
-
while i < n:
|
|
76
|
-
# Detect ///url(, //url(, /url(, or url(
|
|
77
|
-
match = re.match(r'/{0,3}url\(', path_expr[i:])
|
|
78
|
-
if match:
|
|
79
|
-
seg_start = i
|
|
80
|
-
i += match.end() # Move past the matched "url("
|
|
81
|
-
paren_depth = 1
|
|
82
|
-
while i < n and paren_depth > 0:
|
|
83
|
-
if path_expr[i] == '(':
|
|
84
|
-
paren_depth += 1
|
|
85
|
-
elif path_expr[i] == ')':
|
|
86
|
-
paren_depth -= 1
|
|
87
|
-
i += 1
|
|
88
|
-
partitions.append(path_expr[seg_start:i])
|
|
89
|
-
else:
|
|
90
|
-
# Grab until the next /url(
|
|
91
|
-
next_url = re.search(r'/{0,3}url\(', path_expr[i:])
|
|
92
|
-
next_pos = next_url.start() + i if next_url else n
|
|
93
|
-
if i != next_pos:
|
|
94
|
-
partitions.append(path_expr[i:next_pos])
|
|
95
|
-
i = next_pos
|
|
96
|
-
|
|
97
|
-
return partitions
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def parse_wxpath_expr(path_expr):
|
|
101
|
-
partitions = _scan_path_expr(path_expr)
|
|
102
|
-
|
|
103
|
-
# Lex and parse
|
|
104
|
-
segments = [] # type: list[Segment]
|
|
105
|
-
for s in partitions:
|
|
106
|
-
s = s.strip()
|
|
107
|
-
if not s:
|
|
108
|
-
continue
|
|
109
|
-
if s.startswith('url("') or s.startswith("url('"):
|
|
110
|
-
segments.append(
|
|
111
|
-
Segment(
|
|
112
|
-
OPS.URL_STR_LIT,
|
|
113
|
-
UrlValue(s, *parse_url_value(_extract_arg_from_url_op(s))),
|
|
114
|
-
)
|
|
115
|
-
)
|
|
116
|
-
elif s.startswith('///url('):
|
|
117
|
-
segments.append(
|
|
118
|
-
Segment(
|
|
119
|
-
OPS.URL_INF,
|
|
120
|
-
# XpathValue(extract_url_op_arg(s))
|
|
121
|
-
XpathValue(_value=s, expr=_extract_arg_from_url_xpath_op(s))
|
|
122
|
-
)
|
|
123
|
-
)
|
|
124
|
-
elif s.startswith('/url("') or s.startswith('//url("'):
|
|
125
|
-
raise ValueError("url() segment cannot have string literal "
|
|
126
|
-
f"argument and preceding navigation slashes (/|//): {s}")
|
|
127
|
-
elif s.startswith("/url('") or s.startswith("//url('"):
|
|
128
|
-
raise ValueError("url() segment cannot have string literal "
|
|
129
|
-
f"argument and preceding navigation slashes (/|//): {s}")
|
|
130
|
-
elif s.startswith('/url(') or s.startswith("//url("):
|
|
131
|
-
segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
|
|
132
|
-
elif s.startswith('url('):
|
|
133
|
-
segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
|
|
134
|
-
elif s.startswith('///'):
|
|
135
|
-
raise ValueError(f"xpath segment cannot have preceding triple slashes : {s}")
|
|
136
|
-
# segments.append(Segment(OPS.INF_XPATH, XpathValue(s, "//" + s[3:])))
|
|
137
|
-
elif s.endswith('!'):
|
|
138
|
-
segments.append(Segment(OPS.XPATH_FN_MAP_FRAG, XpathValue(s, s[:-1])))
|
|
168
|
+
def expression(self, min_prec: int) -> object:
|
|
169
|
+
return self.parse_binary(min_prec)
|
|
170
|
+
|
|
171
|
+
def parse_binary(self, min_prec: int) -> object:
|
|
172
|
+
"""Parse a binary expression chain honoring operator precedence."""
|
|
173
|
+
if self.token.type == "WXPATH":
|
|
174
|
+
left = self.parse_segments()
|
|
139
175
|
else:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
# _value='',
|
|
154
|
-
# expr=(f'{inf_xpath_value.expr}'
|
|
155
|
-
# f'{url_eval_traveral_fragment}'
|
|
156
|
-
# f'{url_eval_value.expr}')
|
|
157
|
-
# )
|
|
158
|
-
# )
|
|
159
|
-
# segments.pop(i)
|
|
160
|
-
|
|
161
|
-
#### RAISE ERRORS FROM INVALID SEGMENTS ####
|
|
162
|
-
# Raises if multiple ///url() are present
|
|
163
|
-
if len([op for op, val in segments if op == OPS.URL_INF]) > 1:
|
|
164
|
-
raise ValueError("Only one ///url() is allowed")
|
|
176
|
+
left = self.nud()
|
|
177
|
+
|
|
178
|
+
while self.token.type == "OP" and PRECEDENCE.get(self.token.value, -1) >= min_prec:
|
|
179
|
+
op = self.token.value
|
|
180
|
+
prec = PRECEDENCE[op]
|
|
181
|
+
self.advance()
|
|
182
|
+
if self.token.type == 'WXPATH':
|
|
183
|
+
right = self.parse_segments()
|
|
184
|
+
else:
|
|
185
|
+
right = self.parse_binary(prec + 1)
|
|
186
|
+
left = Binary(left, op, right)
|
|
187
|
+
|
|
188
|
+
return left
|
|
165
189
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
190
|
+
@staticmethod
|
|
191
|
+
def _validate_segments(func):
|
|
192
|
+
"""Decorator that validates segment invariants after parsing.
|
|
193
|
+
|
|
194
|
+
Raises ValueError if the xpath in ``url(<xpath>)`` begins with ``/``
|
|
195
|
+
or ``//`` when it follows an Xpath segment.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
func: A bound method that returns a list of segments.
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
The wrapped function that performs validation.
|
|
202
|
+
"""
|
|
203
|
+
def _func(self) -> Segments:
|
|
204
|
+
segments = func(self)
|
|
205
|
+
for seg1, seg2 in pairwise(segments):
|
|
206
|
+
if isinstance(seg1, Xpath) and isinstance(seg2, Url):
|
|
207
|
+
if seg2.args[0].value.startswith(("/", "//")):
|
|
208
|
+
raise ValueError(
|
|
209
|
+
f"Invalid segments: {segments}. the <xpath> in url(<xpath>)"
|
|
210
|
+
" may not begin with / or // if following an Xpath segment."
|
|
211
|
+
)
|
|
212
|
+
return segments
|
|
213
|
+
return _func
|
|
214
|
+
|
|
215
|
+
@_validate_segments
|
|
216
|
+
def parse_segments(self) -> Segments:
|
|
217
|
+
"""Parse a sequence of wxpath segments: url() calls interspersed with xpath.
|
|
218
|
+
|
|
219
|
+
Handles patterns like::
|
|
220
|
+
|
|
221
|
+
url('...')
|
|
222
|
+
url('...')//a/@href
|
|
223
|
+
url('...')//a/url(@href)//b
|
|
224
|
+
//a/@href
|
|
225
|
+
//a/map { 'key': value }
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
A Segments list containing the parsed Url and Xpath nodes.
|
|
229
|
+
"""
|
|
230
|
+
segments = []
|
|
231
|
+
|
|
232
|
+
while self.token.type != "EOF":
|
|
233
|
+
if self.token.type == "WXPATH":
|
|
234
|
+
# Parse url() call
|
|
235
|
+
call = self.nud()
|
|
236
|
+
if call is not None:
|
|
237
|
+
if isinstance(call, (Segments, list)):
|
|
238
|
+
segments.extend(call)
|
|
239
|
+
else:
|
|
240
|
+
segments.append(call)
|
|
241
|
+
elif self.token.type == "RPAREN":
|
|
242
|
+
# End of nested context
|
|
243
|
+
break
|
|
244
|
+
elif self.token.type == "COMMA":
|
|
245
|
+
# Argument separator - stop segment parsing
|
|
246
|
+
break
|
|
247
|
+
elif self.token.type == "RBRACE":
|
|
248
|
+
# End of map context - stop segment parsing
|
|
249
|
+
break
|
|
250
|
+
else:
|
|
251
|
+
# Capture xpath content until next url() or end
|
|
252
|
+
xpath_content = self.capture_xpath_until_wxpath_or_end()
|
|
253
|
+
if xpath_content.strip():
|
|
254
|
+
segments.append(Xpath(xpath_content.strip()))
|
|
255
|
+
|
|
256
|
+
return Segments(segments)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def nud(self) -> object | None:
|
|
260
|
+
"""Parse a null-denoting expression (nud).
|
|
261
|
+
|
|
262
|
+
Null-denoting expressions include numbers, names, or expressions
|
|
263
|
+
enclosed in parentheses.
|
|
264
|
+
|
|
265
|
+
Returns:
|
|
266
|
+
The parsed AST node, or None if the token is unrecognized.
|
|
267
|
+
|
|
268
|
+
Raises:
|
|
269
|
+
SyntaxError: If the token cannot form a valid expression.
|
|
270
|
+
"""
|
|
271
|
+
tok = self.token
|
|
272
|
+
|
|
273
|
+
if tok.type == "NUMBER":
|
|
274
|
+
self.advance()
|
|
275
|
+
return Number(float(tok.value))
|
|
276
|
+
|
|
277
|
+
if tok.type == "STRING":
|
|
278
|
+
self.advance()
|
|
279
|
+
return String(tok.value[1:-1]) # strip quotes
|
|
280
|
+
|
|
281
|
+
if tok.type == "DOT":
|
|
282
|
+
self.advance()
|
|
283
|
+
return ContextItem()
|
|
284
|
+
|
|
285
|
+
if tok.type == "WXPATH":
|
|
286
|
+
value = tok.value.replace(" ", "").replace("\n", "")
|
|
287
|
+
self.advance()
|
|
288
|
+
|
|
289
|
+
if self.token.type == "LPAREN":
|
|
290
|
+
return self.parse_call(value)
|
|
291
|
+
|
|
292
|
+
return Wxpath(value)
|
|
293
|
+
|
|
294
|
+
if tok.type == "NAME":
|
|
295
|
+
self.advance()
|
|
296
|
+
|
|
297
|
+
# function call
|
|
298
|
+
if self.token.type == "LPAREN":
|
|
299
|
+
return self.parse_call(tok.value)
|
|
300
|
+
|
|
301
|
+
return Name(tok.value)
|
|
302
|
+
|
|
303
|
+
if tok.type == "LPAREN":
|
|
304
|
+
self.advance()
|
|
305
|
+
expr = self.expression(0)
|
|
306
|
+
if self.token.type != "RPAREN":
|
|
307
|
+
raise SyntaxError("expected ')'")
|
|
308
|
+
self.advance()
|
|
309
|
+
return expr
|
|
310
|
+
|
|
311
|
+
# For other tokens (xpath content), return None to signal caller to handle
|
|
312
|
+
return None
|
|
169
313
|
|
|
170
|
-
# Raises when expr starts with //url(@<attr>)
|
|
171
|
-
if segments and segments[0][0] == OPS.URL_EVAL:
|
|
172
|
-
raise ValueError("Path expr cannot start with [//]url(<xpath>)")
|
|
173
|
-
|
|
174
|
-
# Raises if expr ends with INF_XPATH
|
|
175
|
-
if segments and segments[-1][0] == OPS.INF_XPATH:
|
|
176
|
-
raise ValueError("Path expr cannot end with ///<xpath>")
|
|
177
|
-
|
|
178
|
-
# Raises if expr ends with XPATH_FN_MAP_FRAG
|
|
179
|
-
if segments and segments[-1][0] == OPS.XPATH_FN_MAP_FRAG:
|
|
180
|
-
raise ValueError("Path expr cannot end with !")
|
|
181
|
-
return segments
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def parse_url_value(src: str) -> tuple[str, Optional[str]]:
|
|
185
|
-
"""
|
|
186
|
-
Parse the contents of url(...).
|
|
187
|
-
|
|
188
|
-
Examples of src:
|
|
189
|
-
"'https://example.com'"
|
|
190
|
-
"//a/@href"
|
|
191
|
-
"'https://x', follow=//a/@href"
|
|
192
|
-
"""
|
|
193
|
-
|
|
194
|
-
parts = _split_top_level_commas(src)
|
|
195
|
-
|
|
196
|
-
if not parts:
|
|
197
|
-
raise SyntaxError("url() requires at least one argument")
|
|
198
|
-
|
|
199
|
-
# ---- positional argument (target) ----
|
|
200
|
-
target_src = parts[0].strip()
|
|
201
|
-
if not target_src:
|
|
202
|
-
raise SyntaxError("url() target cannot be empty")
|
|
203
|
-
|
|
204
|
-
target = _parse_url_target(target_src)
|
|
205
|
-
|
|
206
|
-
follow = None
|
|
207
314
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
315
|
+
def capture_xpath_until_wxpath_or_end(self) -> str:
|
|
316
|
+
"""Capture xpath tokens until a WXPATH token, EOF, RPAREN, or COMMA.
|
|
317
|
+
|
|
318
|
+
Balances parentheses and braces so that xpath functions like
|
|
319
|
+
``contains()`` and map constructors like ``map { ... }`` are captured
|
|
320
|
+
correctly.
|
|
321
|
+
|
|
322
|
+
Returns:
|
|
323
|
+
The accumulated xpath content as a string.
|
|
324
|
+
"""
|
|
325
|
+
result = ""
|
|
326
|
+
paren_depth = 0
|
|
327
|
+
brace_depth = 0
|
|
328
|
+
|
|
329
|
+
while self.token.type != "EOF":
|
|
330
|
+
# Stop conditions (only at depth 0 for both parens and braces)
|
|
331
|
+
if paren_depth == 0 and brace_depth == 0:
|
|
332
|
+
if self.token.type == "WXPATH":
|
|
333
|
+
break
|
|
334
|
+
if self.token.type == "RPAREN":
|
|
335
|
+
break
|
|
336
|
+
if self.token.type == "COMMA":
|
|
337
|
+
break
|
|
338
|
+
|
|
339
|
+
# Track paren depth for xpath functions
|
|
340
|
+
if self.token.type == "LPAREN":
|
|
341
|
+
paren_depth += 1
|
|
342
|
+
elif self.token.type == "RPAREN":
|
|
343
|
+
paren_depth -= 1
|
|
344
|
+
if paren_depth < 0:
|
|
345
|
+
# This RPAREN closes an outer context
|
|
346
|
+
break
|
|
347
|
+
|
|
348
|
+
# Track brace depth for map constructors
|
|
349
|
+
if self.token.type == "LBRACE":
|
|
350
|
+
brace_depth += 1
|
|
351
|
+
elif self.token.type == "RBRACE":
|
|
352
|
+
brace_depth -= 1
|
|
353
|
+
if brace_depth < 0:
|
|
354
|
+
# This RBRACE closes an outer context
|
|
355
|
+
break
|
|
356
|
+
|
|
357
|
+
result += self.token.value
|
|
358
|
+
self.advance()
|
|
359
|
+
|
|
360
|
+
return result
|
|
361
|
+
|
|
211
362
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
363
|
+
def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
|
|
364
|
+
"""Capture content inside a url() call, handling nested wxpath expressions.
|
|
365
|
+
|
|
366
|
+
Supports patterns like::
|
|
367
|
+
|
|
368
|
+
url('...') -> [String]
|
|
369
|
+
url('...' follow=//a/@href) -> [String, Xpath]
|
|
370
|
+
url(//a/@href) -> [Xpath]
|
|
371
|
+
url( url('..')//a/@href ) -> [Call, Xpath]
|
|
372
|
+
url( url( url('..')//a )//b ) -> [Call, Xpath]
|
|
373
|
+
|
|
374
|
+
Returns:
|
|
375
|
+
A list of parsed elements: Xpath nodes for xpath content and Call
|
|
376
|
+
nodes for nested url() calls.
|
|
377
|
+
"""
|
|
378
|
+
elements = []
|
|
379
|
+
current_xpath = ""
|
|
380
|
+
paren_balance = 1 # We're already inside the opening paren of url()
|
|
381
|
+
brace_balance = 0 # Track braces for map constructors
|
|
382
|
+
reached_follow_token = False
|
|
383
|
+
follow_xpath = ""
|
|
384
|
+
while paren_balance > 0 and self.token.type != "EOF":
|
|
385
|
+
if self.token.type == "WXPATH":
|
|
386
|
+
# Found nested wxpath: save any accumulated xpath content first
|
|
387
|
+
if current_xpath.strip():
|
|
388
|
+
elements.append(Xpath(current_xpath.strip()))
|
|
389
|
+
current_xpath = ""
|
|
390
|
+
|
|
391
|
+
# Parse the nested url() call using nud()
|
|
392
|
+
# This recursively handles deeply nested wxpath
|
|
393
|
+
nested_call = self.nud()
|
|
394
|
+
if nested_call is not None:
|
|
395
|
+
elements.append(nested_call)
|
|
396
|
+
|
|
397
|
+
elif self.token.type == "FOLLOW":
|
|
398
|
+
reached_follow_token = True
|
|
399
|
+
self.advance()
|
|
400
|
+
|
|
401
|
+
elif self.token.type == "LPAREN":
|
|
402
|
+
# Opening paren that's NOT part of a url() call
|
|
403
|
+
# (it's part of an xpath function like contains(), starts-with(), etc.)
|
|
404
|
+
paren_balance += 1
|
|
405
|
+
current_xpath += self.token.value
|
|
406
|
+
self.advance()
|
|
407
|
+
|
|
408
|
+
elif self.token.type == "RPAREN":
|
|
409
|
+
paren_balance -= 1
|
|
410
|
+
if paren_balance == 0:
|
|
411
|
+
# This is the closing paren of the outer url()
|
|
412
|
+
break
|
|
413
|
+
current_xpath += self.token.value
|
|
414
|
+
self.advance()
|
|
415
|
+
|
|
416
|
+
elif self.token.type == "LBRACE":
|
|
417
|
+
# Opening brace for map constructors
|
|
418
|
+
brace_balance += 1
|
|
419
|
+
current_xpath += self.token.value
|
|
420
|
+
self.advance()
|
|
421
|
+
|
|
422
|
+
elif self.token.type == "RBRACE":
|
|
423
|
+
brace_balance -= 1
|
|
424
|
+
current_xpath += self.token.value
|
|
425
|
+
self.advance()
|
|
426
|
+
|
|
427
|
+
else:
|
|
428
|
+
# Accumulate all other tokens as xpath content
|
|
429
|
+
if not reached_follow_token:
|
|
430
|
+
current_xpath += self.token.value
|
|
431
|
+
else:
|
|
432
|
+
follow_xpath += self.token.value
|
|
433
|
+
|
|
434
|
+
self.advance()
|
|
435
|
+
|
|
436
|
+
if paren_balance != 0:
|
|
437
|
+
raise SyntaxError("unbalanced parentheses in url()")
|
|
438
|
+
|
|
439
|
+
# Save any remaining xpath content
|
|
440
|
+
if current_xpath.strip():
|
|
441
|
+
current_xpath = current_xpath.strip()
|
|
442
|
+
if current_xpath == ".":
|
|
443
|
+
elements.append(ContextItem())
|
|
444
|
+
else:
|
|
445
|
+
elements.append(Xpath(current_xpath))
|
|
446
|
+
|
|
447
|
+
if follow_xpath.strip():
|
|
448
|
+
elements.append(Xpath(follow_xpath.strip()))
|
|
449
|
+
|
|
450
|
+
return elements
|
|
451
|
+
|
|
452
|
+
def parse_call(self, func_name: str) -> Call | Segments:
|
|
453
|
+
"""Parse a function call (including url variants) and specialize node types."""
|
|
454
|
+
self.advance() # consume '('
|
|
455
|
+
args = []
|
|
456
|
+
follow_arg = None
|
|
457
|
+
|
|
458
|
+
if func_name.endswith("url"):
|
|
459
|
+
if self.token.type == "STRING":
|
|
460
|
+
# Simple case: url('literal string')
|
|
461
|
+
args = [String(self.token.value[1:-1])] # strip quotes
|
|
462
|
+
self.advance()
|
|
463
|
+
# Handle follow=...
|
|
464
|
+
if self.token.type == "FOLLOW":
|
|
465
|
+
self.advance()
|
|
466
|
+
follow_arg = self.capture_url_arg_content()
|
|
467
|
+
args.extend(follow_arg)
|
|
468
|
+
elif self.token.type == "WXPATH":
|
|
469
|
+
# Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
|
|
470
|
+
# Use capture_url_arg_content to handle nested wxpath and xpath
|
|
471
|
+
args = self.capture_url_arg_content()
|
|
472
|
+
else:
|
|
473
|
+
# Simple xpath argument: url(//a/@href)
|
|
474
|
+
# Could still contain nested wxpath, so use capture_url_arg_content
|
|
475
|
+
args = self.capture_url_arg_content()
|
|
476
|
+
|
|
477
|
+
# Handle additional comma-separated arguments (e.g., follow=...)
|
|
478
|
+
if self.token.type != "RPAREN":
|
|
479
|
+
while True:
|
|
480
|
+
args.append(self.expression(0))
|
|
481
|
+
if self.token.type == "COMMA":
|
|
482
|
+
self.advance()
|
|
483
|
+
continue
|
|
484
|
+
break
|
|
485
|
+
|
|
486
|
+
if self.token.type != "RPAREN":
|
|
487
|
+
raise SyntaxError("expected ')'")
|
|
488
|
+
self.advance()
|
|
489
|
+
|
|
490
|
+
return _specify_call_types(func_name, args)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _specify_call_types(func_name: str, args: list) -> Call | Segments:
|
|
494
|
+
if func_name == "url":
|
|
495
|
+
if len(args) == 1:
|
|
496
|
+
if isinstance(args[0], String):
|
|
497
|
+
return UrlLiteral(func_name, args)
|
|
498
|
+
elif isinstance(args[0], (Xpath, ContextItem)):
|
|
499
|
+
return UrlQuery(func_name, args)
|
|
500
|
+
else:
|
|
501
|
+
raise ValueError(f"Unknown argument type: {type(args[0])}")
|
|
502
|
+
elif len(args) == 2:
|
|
503
|
+
if isinstance(args[0], String) and isinstance(args[1], Xpath):
|
|
504
|
+
return UrlCrawl(func_name, args)
|
|
505
|
+
elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
|
|
506
|
+
args.append(UrlQuery('url', [ContextItem()]))
|
|
507
|
+
return Segments(args)
|
|
508
|
+
elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
|
|
509
|
+
segs = args[0]
|
|
510
|
+
segs.append(args[1])
|
|
511
|
+
return Segments(segs)
|
|
512
|
+
else:
|
|
513
|
+
raise ValueError(f"Unknown arguments: {args}")
|
|
216
514
|
else:
|
|
217
|
-
raise
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
if url_op_arg.startswith('@'):
|
|
225
|
-
return ".//" + url_op_arg
|
|
226
|
-
elif url_op_arg.startswith('.'):
|
|
227
|
-
return url_op_arg
|
|
228
|
-
elif url_op_arg.startswith('//'):
|
|
229
|
-
return '.' + url_op_arg
|
|
230
|
-
elif not url_op_arg.startswith('.//'):
|
|
231
|
-
return './/' + url_op_arg
|
|
232
|
-
else:
|
|
233
|
-
return url_op_arg
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
def _extract_arg_from_url_xpath_op(url_subsegment):
|
|
237
|
-
match = re.search(r"url\((.+)\)", url_subsegment)
|
|
238
|
-
if not match:
|
|
239
|
-
raise ValueError(f"Invalid url() segment: {url_subsegment}")
|
|
240
|
-
return match.group(1).strip("'\"") # Remove surrounding quotes if any
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
def _extract_arg_from_url_op(url_subsegment):
|
|
244
|
-
match = re.search(r"url\((.+)\)", url_subsegment)
|
|
245
|
-
if not match:
|
|
246
|
-
raise ValueError(f"Invalid url() segment: {url_subsegment}")
|
|
247
|
-
return match.group(1) # Remove surrounding quotes if any
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
def _split_top_level_commas(src: str) -> list[str]:
|
|
251
|
-
parts = []
|
|
252
|
-
buf = []
|
|
253
|
-
depth = 0
|
|
254
|
-
in_string = False
|
|
255
|
-
quote = None
|
|
256
|
-
|
|
257
|
-
for ch in src:
|
|
258
|
-
if in_string:
|
|
259
|
-
buf.append(ch)
|
|
260
|
-
if ch == quote:
|
|
261
|
-
in_string = False
|
|
262
|
-
continue
|
|
263
|
-
|
|
264
|
-
if ch in ("'", '"'):
|
|
265
|
-
in_string = True
|
|
266
|
-
quote = ch
|
|
267
|
-
buf.append(ch)
|
|
268
|
-
continue
|
|
269
|
-
|
|
270
|
-
if ch in "([{":
|
|
271
|
-
depth += 1
|
|
272
|
-
elif ch in ")]}":
|
|
273
|
-
depth -= 1
|
|
274
|
-
if depth < 0:
|
|
275
|
-
raise SyntaxError("unbalanced parentheses in url()")
|
|
276
|
-
|
|
277
|
-
if ch == "," and depth == 0:
|
|
278
|
-
parts.append("".join(buf).strip())
|
|
279
|
-
buf.clear()
|
|
515
|
+
raise ValueError(f"Unknown arguments: {args}")
|
|
516
|
+
elif func_name == "/url" or func_name == "//url":
|
|
517
|
+
if len(args) == 1:
|
|
518
|
+
if isinstance(args[0], (Xpath, ContextItem)):
|
|
519
|
+
return UrlQuery(func_name, args)
|
|
520
|
+
else:
|
|
521
|
+
raise ValueError(f"Unknown argument type: {type(args[0])}")
|
|
280
522
|
else:
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
def _split_kwarg(src: str) -> tuple[str, str]:
|
|
293
|
-
if "=" not in src:
|
|
294
|
-
raise SyntaxError(f"expected keyword argument, got: {src}")
|
|
295
|
-
|
|
296
|
-
name, value = src.split("=", 1)
|
|
297
|
-
name = name.strip()
|
|
298
|
-
value = value.strip()
|
|
299
|
-
|
|
300
|
-
if not name or not value:
|
|
301
|
-
raise SyntaxError(f"invalid keyword argument: {src}")
|
|
523
|
+
raise ValueError(f"Unknown arguments: {args}")
|
|
524
|
+
elif func_name == "///url":
|
|
525
|
+
if len(args) == 1:
|
|
526
|
+
if isinstance(args[0], (Xpath, ContextItem)):
|
|
527
|
+
return UrlCrawl(func_name, args)
|
|
528
|
+
else:
|
|
529
|
+
raise ValueError(f"Unknown argument type: {type(args[0])}")
|
|
530
|
+
else:
|
|
531
|
+
raise ValueError(f"Unknown arguments: {args}")
|
|
532
|
+
else:
|
|
533
|
+
return Call(func_name, args)
|
|
302
534
|
|
|
303
|
-
return name, value
|
|
304
535
|
|
|
536
|
+
def find_wxpath_boundary(tokens: list[Token]) -> tuple[int, int] | None:
|
|
537
|
+
"""Find the operator that connects pure xpath to wxpath.
|
|
305
538
|
|
|
306
|
-
|
|
307
|
-
src = src.strip()
|
|
308
|
-
# string literal
|
|
309
|
-
if (src.startswith("'") and src.endswith("'")) or \
|
|
310
|
-
(src.startswith('"') and src.endswith('"')):
|
|
311
|
-
return src[1:-1]
|
|
539
|
+
The boundary is the last operator at depth 0 before the first WXPATH token.
|
|
312
540
|
|
|
313
|
-
|
|
541
|
+
Args:
|
|
542
|
+
tokens: List of Token objects from the tokenizer.
|
|
314
543
|
|
|
544
|
+
Returns:
|
|
545
|
+
A tuple of (op_position, wxpath_position) or None if no boundary
|
|
546
|
+
exists.
|
|
547
|
+
"""
|
|
548
|
+
# Find first WXPATH token position
|
|
549
|
+
wxpath_pos = None
|
|
550
|
+
for i, tok in enumerate(tokens):
|
|
551
|
+
if tok.type == "WXPATH":
|
|
552
|
+
wxpath_pos = i
|
|
553
|
+
break
|
|
554
|
+
|
|
555
|
+
if wxpath_pos is None:
|
|
556
|
+
return None
|
|
557
|
+
|
|
558
|
+
# Walk backwards from wxpath to find connecting operator at depth 0
|
|
559
|
+
paren_depth = 0
|
|
560
|
+
for i in range(wxpath_pos - 1, -1, -1):
|
|
561
|
+
tok = tokens[i]
|
|
562
|
+
if tok.type == "RPAREN":
|
|
563
|
+
paren_depth += 1
|
|
564
|
+
elif tok.type == "LPAREN":
|
|
565
|
+
paren_depth -= 1
|
|
566
|
+
elif paren_depth == 0 and tok.type == "OP":
|
|
567
|
+
return (i, wxpath_pos)
|
|
568
|
+
|
|
569
|
+
return None
|
|
315
570
|
|
|
316
|
-
def _get_shallow_dict(instance: Value):
|
|
317
|
-
return {field.name: getattr(instance, field.name)
|
|
318
|
-
for field in fields(instance) if field.name not in {'_value'}}
|
|
319
571
|
|
|
572
|
+
def parse(src):
|
|
573
|
+
tokens = list(tokenize(src))
|
|
574
|
+
|
|
575
|
+
boundary = find_wxpath_boundary(tokens)
|
|
576
|
+
|
|
577
|
+
# If no wxpath at all, return as pure xpath
|
|
578
|
+
if boundary is None:
|
|
579
|
+
# Check if there's any WXPATH token
|
|
580
|
+
has_wxpath = any(t.type == "WXPATH" for t in tokens)
|
|
581
|
+
if not has_wxpath:
|
|
582
|
+
return Xpath(src.strip())
|
|
583
|
+
# Has wxpath but no boundary operator - parse normally
|
|
584
|
+
parser = Parser(iter(tokens))
|
|
585
|
+
return parser.parse()
|
|
586
|
+
|
|
587
|
+
op_pos, wxpath_pos = boundary
|
|
588
|
+
|
|
589
|
+
# Use source positions to extract xpath string (preserves whitespace)
|
|
590
|
+
op_token = tokens[op_pos]
|
|
591
|
+
xpath_str = src[:op_token.start].strip()
|
|
592
|
+
|
|
593
|
+
# Parse wxpath part (tokens after the operator)
|
|
594
|
+
wxpath_tokens = tokens[op_pos + 1:] # includes EOF
|
|
595
|
+
parser = Parser(iter(wxpath_tokens))
|
|
596
|
+
wxpath_ast = parser.parse()
|
|
597
|
+
|
|
598
|
+
return Binary(Xpath(xpath_str), op_token.value, wxpath_ast)
|