wxpath 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +52 -12
- wxpath/core/ops.py +163 -129
- wxpath/core/parser.py +559 -280
- wxpath/core/runtime/engine.py +133 -42
- wxpath/core/runtime/helpers.py +0 -7
- wxpath/hooks/registry.py +29 -17
- wxpath/http/client/crawler.py +46 -11
- wxpath/http/client/request.py +6 -3
- wxpath/http/client/response.py +1 -1
- wxpath/http/policy/robots.py +82 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/METADATA +84 -37
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/RECORD +16 -16
- wxpath/core/errors.py +0 -134
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/top_level.txt +0 -0
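Among the listed changes, wxpath/http/policy/robots.py is a new module (+82 lines) backing the crawler's new respect_robots option (see the --respect-robots flag in the cli.py diff below). Its actual API is not shown in this diff; the sketch below only illustrates, using the standard library's urllib.robotparser, the kind of per-host robots.txt gate such a module typically implements. The class and method names here are hypothetical.

from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

class RobotsPolicy:
    """Hypothetical per-host robots.txt gate; caches one parser per host."""

    def __init__(self, user_agent: str = "wxpath"):
        self.user_agent = user_agent
        self._parsers: dict[str, RobotFileParser] = {}

    def allowed(self, url: str) -> bool:
        parts = urlsplit(url)
        if parts.netloc not in self._parsers:
            robots_url = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))
            parser = RobotFileParser(robots_url)
            parser.read()  # blocking fetch; a real crawler would fetch asynchronously
            self._parsers[parts.netloc] = parser
        return self._parsers[parts.netloc].can_fetch(self.user_agent, url)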
wxpath/cli.py
CHANGED
@@ -2,44 +2,84 @@ import argparse
 import json
 import sys
 
-from wxpath.core
+from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
+from wxpath.http.client.crawler import Crawler
 from wxpath.util.serialize import simplify
 
 
 def main():
     registry.register(builtin.SerializeXPathMapAndNodeHook)
-
-
-
+    arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+    arg_parser.add_argument("expression", help="The wxpath expression")
+    arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
     # debug
-
+    arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
     # verbose
-
+    arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
 
-
-
+    arg_parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=16,
+        help="Number of concurrent fetches"
+    )
+    arg_parser.add_argument(
         "--concurrency-per-host",
         type=int,
         default=8,
         help="Number of concurrent fetches per host"
     )
+    arg_parser.add_argument(
+        "--header",
+        action="append",
+        dest="header_list",
+        default=[],
+        help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+    )
+    arg_parser.add_argument(
+        "--respect-robots",
+        action="store_true",
+        help="Respect robots.txt",
+        default=True
+    )
 
-    args =
+    args = arg_parser.parse_args()
 
     if args.verbose:
-
-        print("parsed expression
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
 
     if args.debug:
         from wxpath import configure_logging
         configure_logging('DEBUG')
 
-
+    custom_headers = {}
+    if args.header_list:
+        for header_item in args.header_list:
+            try:
+                key, value = header_item.split(':', 1)
+                custom_headers[key.strip()] = value.strip()
+            except ValueError:
+                print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+    if custom_headers and args.verbose:
+        print(f"Using custom headers: {custom_headers}")
+        print()
+
+    crawler = Crawler(
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
+        respect_robots=args.respect_robots,
+        headers=custom_headers
     )
+    engine = WXPathEngine(crawler=crawler)
+
     try:
         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
             clean = simplify(r)
wxpath/core/ops.py
CHANGED
@@ -1,7 +1,3 @@
-"""
-`ops` for "operations". This module contains side-effect-free functions (operators)
-for handling each segment of a wxpath expression.
-"""
 from typing import Callable, Iterable
 from urllib.parse import urljoin
 
@@ -19,16 +15,24 @@ from wxpath.core.models import (
     Intent,
     ProcessIntent,
 )
-from wxpath.core.parser import
+from wxpath.core.parser import (
+    Binary,
+    Call,
+    ContextItem,
+    Segment,
+    Segments,
+    String,
+    Url,
+    UrlCrawl,
+    Xpath,
+)
 from wxpath.util.logging import get_logger
 
 log = get_logger(__name__)
 
 
 class WxStr(str):
-    """
-    A string that has a base_url and depth associated with it. Purely for debugging.
-    """
+    """A string with associated base_url and depth metadata for debugging."""
     def __new__(cls, value, base_url=None, depth=-1):
        obj = super().__new__(cls, value)
        obj.base_url = base_url
@@ -39,61 +43,120 @@ class WxStr(str):
         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
 
 
-
+class RuntimeSetupError(Exception):
+    pass
+
+
+OPS_REGISTER: dict[str, Callable] = {}
+
+def register(func_name_or_type: str | type, args_types: tuple[type, ...] | None = None):
+    def _register(func: Callable) -> Callable:
+        global OPS_REGISTER
+        _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+        if _key in OPS_REGISTER:
+            raise RuntimeSetupError(f"The operation handler for \"{_key}\" is already registered")
+        OPS_REGISTER[_key] = func
+        return func
+    return _register
 
-def _op(name: OPS):
-    def reg(fn):
-        if name in HANDLERS:
-            raise ValueError(f"Duplicate operation: {name}")
-        HANDLERS[name] = fn
-        return fn
-    return reg
 
+def get_operator(
+    binary_or_segment: Binary | Segment
+) -> Callable[[html.HtmlElement, list[Url | Xpath], int], Iterable[Intent]]:
+    func_name_or_type = getattr(binary_or_segment, 'func', None) or binary_or_segment.__class__
 
-
-    if
-
-
+    args_types = None
+    if isinstance(binary_or_segment, Binary):
+        args_types = (binary_or_segment.left.__class__, binary_or_segment.right.__class__)
+    elif isinstance(binary_or_segment, Call):
+        args_types = tuple(arg.__class__ for arg in binary_or_segment.args)
 
+    _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+    if _key not in OPS_REGISTER:
+        raise ValueError(f"Unknown operation: {_key}")
+    return OPS_REGISTER[_key]
 
-
+
+@register('url', (String,))
+@register('url', (String, Xpath))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
-                        curr_segments: list[
+                        curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
-
-
-    log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": value.target})
+    """Handle `url('<literal>')` segments and optional follow xpath."""
+    url_call = curr_segments[0]  # type: Url
 
     next_segments = curr_segments[1:]
 
-    if
+    if len(url_call.args) == 2:
         _segments = [
-            (
+            UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
         ] + next_segments
 
-        yield CrawlIntent(url=value
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
     else:
-        yield CrawlIntent(url=value
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=next_segments)
+
+
+# @register2('url', (Xpath,))
+@register(Xpath)
+def _handle_xpath(curr_elem: html.HtmlElement,
+                  curr_segments: Segments,
+                  curr_depth: int,
+                  **kwargs) -> Iterable[Intent]:
+    """Execute an xpath step and yield data or chained processing intents."""
+    xpath_node = curr_segments[0]  # type: Xpath
+
+    expr = xpath_node.value
+
+    if curr_elem is None:
+        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+    base_url = getattr(curr_elem, 'base_url', None)
+    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+    _backlink_str = f"string('{curr_elem.get('backlink')}')"
+    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+    # increment after each url*() hop
+    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+    expr = expr.replace('wx:backlink()', _backlink_str)
+    expr = expr.replace('wx:backlink(.)', _backlink_str)
+    expr = expr.replace('wx:depth()', _depth_str)
+    expr = expr.replace('wx:depth(.)', _depth_str)
+
+    elems = curr_elem.xpath3(expr)
+
+    next_segments = curr_segments[1:]
+    for elem in elems:
+        value_or_elem = WxStr(
+            elem, base_url=base_url,
+            depth=curr_depth
+        ) if isinstance(elem, str) else elem
+        if len(curr_segments) == 1:
+            yield DataIntent(value=value_or_elem)
+        else:
+            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
 
 
-@
+@register('//url', (ContextItem,))
+@register('//url', (Xpath,))
+@register('/url', (ContextItem,))
+@register('/url', (Xpath,))
+@register('url', (ContextItem,))
+@register('url', (Xpath,))
 def _handle_url_eval(curr_elem: html.HtmlElement | str,
-                     curr_segments: list[
+                     curr_segments: list[Url | Xpath],
                      curr_depth: int,
                      **kwargs) -> Iterable[Intent]:
-
-
-
-
-
-
-    # in ops.py. It should instead be validated in the parser.
-    if _path_exp not in {'.', 'self::node()'}:
-        raise ValueError("Only '.' or 'self::node()' is supported in url() segments "
-                         f"when prior xpath operation results in a string. Got: {_path_exp}")
+    """Resolve dynamic url() arguments and enqueue crawl intents.
+
+    Yields:
+        CrawlIntent
+    """
+    url_call = curr_segments[0]  # type: Url
 
+    if isinstance(url_call.args[0], ContextItem):
         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
     else:
+        _path_exp = url_call.args[0].value
         # TODO: If prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
         # It should be handled in the parser.
         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
@@ -101,144 +164,115 @@ def _handle_url_eval(curr_elem: html.HtmlElement | str,
 
     next_segments = curr_segments[1:]
     for url in urls:
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+        # log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
         yield CrawlIntent(url=url, next_segments=next_segments)
 
 
-@
+@register('///url', (Xpath,))
 def _handle_url_inf(curr_elem: html.HtmlElement,
-                    curr_segments: list[
+                    curr_segments: list[Url | Xpath],
                     curr_depth: int,
                     **kwargs) -> Iterable[CrawlIntent]:
+    """Handle the ``///url()`` segment of a wxpath expression.
+
+    This operation is also generated internally by the parser when a
+    ``///<xpath>/[/]url()`` segment is encountered.
+
+    Instead of fetching URLs directly, this operator XPaths the current
+    element for URLs and queues them for further processing via
+    ``_handle_url_inf_and_xpath``.
     """
-
-    generated internally by the parser when a `///<xpath>/[/]url()` segment is
-    encountered by the parser.
-    This operation does not fetch URLs; instead, it XPaths the current element
-    for URLs, then queues them for further processing (see
-    _handle_url_inf_and_xpath).
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]  # type: Url
 
-    _path_exp = value
+    _path_exp = url_call.args[0].value
 
     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
 
-    log.debug("found urls",
-              extra={"depth": curr_depth, "op": op, "url": getattr(curr_elem, 'base_url', None)})
-
     tail_segments = curr_segments[1:]
     for url in dict.fromkeys(urls):
         _segments = [
-            (
+            UrlCrawl('///url', [url_call.args[0], url])
         ] + tail_segments
 
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
-
         yield CrawlIntent(url=url, next_segments=_segments)
 
 
-@
+@register('///url', (Xpath, str))
 def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
-                              curr_segments: list[
+                              curr_segments: list[Url | Xpath],
                              curr_depth: int, **kwargs) \
        -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+    """Handle infinite-crawl with an xpath extraction step.
+
+    This operation is generated internally by the parser; there is no explicit
+    wxpath expression that produces it directly.
+
+    Yields:
+        DataIntent: If the current element is not None and no next segments are provided.
+        ExtractIntent: If the current element is not None and next segments are provided.
+        InfiniteCrawlIntent: If the current element is not None and next segments are provided.
+
+    Raises:
+        ValueError: If the current element is None.
     """
-
-    no explicit wxpath expression that generates this operation.
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]
 
     try:
         if curr_elem is None:
             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
 
         next_segments = curr_segments[1:]
+
         if not next_segments:
             yield DataIntent(value=curr_elem)
         else:
             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
 
         # For url_inf, also re-enqueue for further infinite expansion
-        _segments = [(
+        _segments = [UrlCrawl('///url', url_call.args[:-1])] + next_segments
         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
-
-                  extra={"depth": curr_depth, "op": op,
-                         "url": value.target, "crawl_intent": crawl_intent})
+
         yield crawl_intent
 
     except Exception:
-        log.exception("error fetching url",
-                      extra={"depth": curr_depth, "
-
+        log.exception("error fetching url inf and xpath",
+                      extra={"depth": curr_depth, "url": url_call.args[1]})
 
-@
-def
-
-    curr_depth: int,
-    **kwargs) -> Iterable[DataIntent | ProcessIntent]:
-    """
-    Handles the [/|//]<xpath> segment of a wxpath expression. This is a plain XPath expression.
-    Also handles wxpath-specific macro expansions like wx:backlink() or wx:depth().
-    """
-    _, value = curr_segments[0]
-    expr = value.expr
-    if curr_elem is None:
-        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
-    base_url = getattr(curr_elem, 'base_url', None)
-    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
-    elems = curr_elem.xpath3(expr)
-
-    next_segments = curr_segments[1:]
-    for elem in elems:
-        value_or_elem = WxStr(
-            elem, base_url=base_url,
-            depth=curr_depth
-        ) if isinstance(elem, str) else elem
-        if len(curr_segments) == 1:
-            yield DataIntent(value=value_or_elem)
-        else:
-            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
-
-
-@_op(OPS.XPATH_FN_MAP_FRAG)
-def _handle_xpath_fn_map_frag(curr_elem: html.HtmlElement | str,
-                              curr_segments: list[Segment],
+@register(Binary, (Xpath, Segments))
+def _handle_binary(curr_elem: html.HtmlElement | str,
+                   curr_segments: list[Url | Xpath] | Binary,
                    curr_depth: int,
                    **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+    """Execute XPath expressions suffixed with the ``!`` (map) operator.
+
+    Yields:
+        ProcessIntent: Contains either a WxStr or an lxml/elementpath element.
     """
-
-
-
-
+    left = curr_segments.left
+    _ = curr_segments.op
+    right = curr_segments.right
+
+    if len(right) == 0:
+        # Binary operation on segments expects non-empty segments
+        raise ValueError("Binary operation on segments expects non-empty segments")
 
     base_url = getattr(curr_elem, 'base_url', None)
-    next_segments =
+    next_segments = right
 
-
+    results = elementpath.select(
         curr_elem,
-        value
+        left.value,
         parser=XPath3Parser,
         item='' if curr_elem is None else None
     )
 
-    if isinstance(
-
+    if isinstance(results, AnyAtomicType):
+        results = [results]
 
-    for
-
-
-        # XPATH_FN_MAP_FRAG is not a terminal operation
-        raise ValueError("XPATH_FN_MAP_FRAG is not a terminal operation")
+    for result in results:
+        if isinstance(result, str):
+            value_or_elem = WxStr(result, base_url=base_url, depth=curr_depth)
         else:
-
+            value_or_elem = result
+
+        yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)