wxpath 0.4.1-py3-none-any.whl → 0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/exceptions.py +53 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +100 -19
- wxpath/core/parser.py +94 -24
- wxpath/core/runtime/engine.py +74 -10
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +17 -5
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1225 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/METADATA +73 -9
- wxpath-0.5.1.dist-info/RECORD +45 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/WHEEL +1 -1
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.1.dist-info/RECORD +0 -35
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/top_level.txt +0 -0
wxpath/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
wxpath/cli.py
CHANGED
@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
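The new flag reaches the crawler by simple negation (`verify_ssl=not args.insecure`). A minimal standalone sketch of that wiring, using only argparse and the flag name and help text shown in the diff (no wxpath imports):

    import argparse

    # --insecure is a plain store_true switch; SSL verification is its negation.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--insecure",
        action="store_true",
        help="Disable SSL certificate verification (use for sites with broken chains)",
    )
    args = parser.parse_args(["--insecure"])
    verify_ssl = not args.insecure
    print(verify_ssl)  # False -> the value the CLI passes to the crawler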
wxpath/core/exceptions.py
ADDED
@@ -0,0 +1,53 @@
+class XPathEvaluationError(Exception):
+    """Errors during XPath evaluation with elementpath."""
+
+    def __init__(
+        self,
+        message: str,
+        xpath: str,
+        base_url: str | None = None,
+        element_tag: str | None = None,
+        error_code: str | None = None,  # XPath error codes like XPST0003
+        position: tuple[int, int] | None = None,  # (line, column)
+        original_error: Exception | None = None
+    ):
+        context = {
+            "xpath": xpath,
+            "base_url": base_url,
+            "element_tag": element_tag,
+            "error_code": error_code,
+            "position": position,
+        }
+        if original_error:
+            context["original_error"] = str(original_error)
+            # Extract XPath error code if present (e.g., [err:XPST0003])
+            if hasattr(original_error, 'code'):
+                context["error_code"] = original_error.code
+
+        super().__init__(message, context)
+
+    def to_dict(self) -> dict:
+        return {
+            "message": self.message,
+            "xpath": self.xpath,
+            "base_url": self.base_url,
+            "element_tag": self.element_tag,
+            "error_code": self.error_code,
+            "position": self.position,
+            "original_error": self.original_error,
+        }
+
+
+class XPathSyntaxError(XPathEvaluationError):
+    """Invalid XPath syntax."""
+    pass
+
+
+class XPathTypeError(XPathEvaluationError):
+    """Type error in XPath expression."""
+    pass
+
+
+class XPathRuntimeError(XPathEvaluationError):
+    """Runtime error during XPath evaluation."""
+    pass
wxpath/core/models.py
CHANGED
wxpath/core/ops.py
CHANGED
@@ -2,11 +2,25 @@ from typing import Callable, Iterable
 from urllib.parse import urljoin
 
 import elementpath
+from elementpath import (
+    ElementPathError,
+    ElementPathSyntaxError as EPSyntaxError,
+    ElementPathTypeError as EPTypeError,
+    ElementPathZeroDivisionError,
+    ElementPathRuntimeError as EPRuntimeError,
+    MissingContextError,
+)
 from elementpath.datatypes import AnyAtomicType
 from elementpath.xpath3 import XPath3Parser
 from lxml import html
 
 from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+from wxpath.core.exceptions import (
+    XPathEvaluationError,
+    XPathSyntaxError,
+    XPathTypeError,
+    XPathRuntimeError,
+)
 from wxpath.core.models import (
     CrawlIntent,
     DataIntent,
@@ -19,6 +33,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +93,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +105,12 @@ _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
 
         yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,17 +133,52 @@ _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-
-
-
-
-
-
-
-
-
-
+
+    try:
+        elems = curr_elem.xpath3(expr)
+    except EPSyntaxError as e:
+        # Parse the error message to extract line/column if available
+        # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
+        raise XPathSyntaxError(
+            f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathZeroDivisionError as e:
+        raise XPathRuntimeError(
+            f"Division by zero in XPath: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except MissingContextError as e:
+        raise XPathRuntimeError(
+            f"XPath requires context but none provided: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        # Catch-all for other elementpath errors
+        raise XPathEvaluationError(
+            f"XPath evaluation failed: {e}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
 
     next_segments = curr_segments[1:]
     for elem in elems:
@@ -259,12 +315,37 @@ _handle_binary(curr_elem: html.HtmlElement | str,
     base_url = getattr(curr_elem, 'base_url', None)
     next_segments = right
 
-
-
-
-
-
-
+    try:
+        results = elementpath.select(
+            curr_elem,
+            left.value,
+            parser=XPath3Parser,
+            item='' if curr_elem is None else None
+        )
+    except EPSyntaxError as e:
+        raise XPathSyntaxError(
+            f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        raise XPathEvaluationError(
+            f"XPath evaluation failed in binary operation: {e}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
 
     if isinstance(results, AnyAtomicType):
         results = [results]
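For context, a small standalone sketch of the elementpath error types these new except blocks map onto. The HTML snippet and expression are illustrative, and the `code` attribute is read defensively since not every elementpath error carries one:

    import elementpath
    from elementpath import ElementPathError, ElementPathSyntaxError
    from elementpath.xpath3 import XPath3Parser
    from lxml import html

    root = html.fromstring("<html><body><a href='/x'>x</a></body></html>")

    try:
        # Unterminated predicate -> parse-time failure inside elementpath
        elementpath.select(root, "//a[@href=", parser=XPath3Parser)
    except ElementPathSyntaxError as e:
        print("syntax error:", getattr(e, "code", None))
    except ElementPathError as e:
        print("other evaluation error:", e)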
wxpath/core/parser.py
CHANGED
@@ -13,7 +13,9 @@ except ImportError:
 
 
 TOKEN_SPEC = [
-    ("
+    ("WXLOOP", r"wx:loop"),
+    ("NUMBER", r"\d+\.\d+"),
+    ("INTEGER", r"\d+"),
     ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
     ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
     # ("///URL", r"/{3}\s*url"),
@@ -22,6 +24,7 @@ TOKEN_SPEC = [
     ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
     # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
     ("FOLLOW", r",?\s{,}follow="),
+    ("DEPTH", r",?\s{,}depth="),
     ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
     ("LPAREN", r"\("),
     ("RPAREN", r"\)"),
@@ -63,6 +66,14 @@ def tokenize(src: str):
 class Number:
     value: float
 
+@dataclass
+class Integer:
+    value: int
+
+@dataclass
+class Depth(Integer):
+    pass
+
 @dataclass
 class String:
     value: str
@@ -170,7 +181,7 @@ class Parser:
 
     def parse_binary(self, min_prec: int) -> object:
         """Parse a binary expression chain honoring operator precedence."""
-        if self.token.type == "WXPATH":
+        if self.token.type == "WXPATH" or self.token.type == "WXLOOP":
            left = self.parse_segments()
         else:
            left = self.nud()
@@ -273,6 +284,10 @@
         if tok.type == "NUMBER":
             self.advance()
             return Number(float(tok.value))
+
+        if tok.type == "INTEGER":
+            self.advance()
+            return Integer(int(tok.value))
 
         if tok.type == "STRING":
             self.advance()
@@ -358,18 +373,18 @@
             self.advance()
 
         return result
-
 
     def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
         """Capture content inside a url() call, handling nested wxpath expressions.
 
         Supports patterns like::
 
-            url('...')
-            url('...' follow=//a/@href)
-            url(
-            url(
-            url( url(
+            url('...') -> [String]
+            url('...' follow=//a/@href) -> [String, Xpath]
+            url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+            url(//a/@href depth=2) -> [Xpath, Integer]
+            url( url('..')//a/@href ) -> [Call, Xpath]
+            url( url( url('..')//a )//b ) -> [Call, Xpath]
 
         Returns:
             A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +395,10 @@
         paren_balance = 1  # We're already inside the opening paren of url()
         brace_balance = 0  # Track braces for map constructors
         reached_follow_token = False
+        reached_depth_token = False
         follow_xpath = ""
+        depth_number = ""
+
         while paren_balance > 0 and self.token.type != "EOF":
             if self.token.type == "WXPATH":
                 # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +414,22 @@
 
             elif self.token.type == "FOLLOW":
                 reached_follow_token = True
+                reached_depth_token = False
+                self.advance()
+
+            elif self.token.type == "DEPTH":
+                reached_depth_token = True
+                reached_follow_token = False
                 self.advance()
 
             elif self.token.type == "LPAREN":
                 # Opening paren that's NOT part of a url() call
                 # (it's part of an xpath function like contains(), starts-with(), etc.)
                 paren_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RPAREN":
@@ -410,26 +437,37 @@
                 if paren_balance == 0:
                     # This is the closing paren of the outer url()
                     break
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "LBRACE":
                 # Opening brace for map constructors
                 brace_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RBRACE":
                 brace_balance -= 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             else:
                 # Accumulate all other tokens as xpath content
-                if
-                    current_xpath += self.token.value
-                else:
+                if reached_follow_token:
                     follow_xpath += self.token.value
+                elif reached_depth_token:
+                    depth_number += self.token.value
+                else:
+                    current_xpath += self.token.value
 
                 self.advance()
 
@@ -447,6 +485,9 @@
         if follow_xpath.strip():
             elements.append(Xpath(follow_xpath.strip()))
 
+        if depth_number.strip():
+            elements.append(Depth(int(depth_number.strip())))
+
         return elements
 
     def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +503,16 @@
             self.advance()
             # Handle follow=...
             if self.token.type == "FOLLOW":
-                self.advance()
                 follow_arg = self.capture_url_arg_content()
                 args.extend(follow_arg)
+            if self.token.type == "DEPTH":
+                depth_arg = self.capture_url_arg_content()
+                args.extend(depth_arg)
         elif self.token.type == "WXPATH":
             # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
-            #
-            args = self.capture_url_arg_content()
+            # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+            # args = self.capture_url_arg_content()
+            args = self.nud()
         else:
             # Simple xpath argument: url(//a/@href)
             # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +533,18 @@
 
         return _specify_call_types(func_name, args)
 
-
 def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+    """
+    Specify the type of a call based on the function name and arguments.
+    TODO: Provide example wxpath expressions for each call type.
+
+    Args:
+        func_name: The name of the function.
+        args: The arguments of the function.
+
+    Returns:
+        Call | Segments: The type of the call.
+    """
     if func_name == "url":
         if len(args) == 1:
             if isinstance(args[0], String):
@@ -500,17 +554,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
             else:
                 raise ValueError(f"Unknown argument type: {type(args[0])}")
         elif len(args) == 2:
-
+            arg0, arg1 = args
+            if isinstance(arg0, String) and isinstance(arg1, Xpath):
+                # Example: url('...', follow=//a/@href)
                 return UrlCrawl(func_name, args)
-            elif isinstance(
+            elif isinstance(arg0, String) and isinstance(arg1, Integer):
+                # Example: url('...', depth=2)
+                return UrlLiteral(func_name, args)
+            elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
                 args.append(UrlQuery('url', [ContextItem()]))
                 return Segments(args)
-            elif isinstance(
-                segs =
-                segs.append(
+            elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+                segs = arg0
+                segs.append(arg1)
                 return Segments(segs)
             else:
                 raise ValueError(f"Unknown arguments: {args}")
+        elif len(args) == 3:
+            arg0, arg1, arg2 = args
+            if (isinstance(arg0, String) and (
+                (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+                (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+            )):
+                # Example: url('...', follow=//a/@href, depth=2)
+                # Example: url('...', depth=2, follow=//a/@href)
+                return UrlCrawl(func_name, args)
+            else:
+                raise ValueError(f"Unknown arguments: {args}")
         else:
             raise ValueError(f"Unknown arguments: {args}")
     elif func_name == "/url" or func_name == "//url":
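Depth subclasses Integer, so the isinstance(..., Integer) checks added in _specify_call_types also accept a Depth node, while the dedicated Depth type still lets ops.py register url() handlers for it. A tiny self-contained re-creation of just the two dataclasses (copied from the diff; nothing else is assumed):

    from dataclasses import dataclass

    @dataclass
    class Integer:
        value: int

    @dataclass
    class Depth(Integer):
        # Same payload as Integer; the subclass only marks it as a depth= argument.
        pass

    d = Depth(2)
    print(isinstance(d, Integer), d.value)  # True 2

In expression form, the updated grammar targets variants like url('...' follow=//a/@href depth=2) from the docstring above, with depth= captured into a Depth node appended to the call's arguments.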
|