wxpath-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
+ from .util.logging import configure_logging
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'configure_logging',
+ ]
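A minimal usage sketch of the API re-exported above. The URL and expression are placeholders, and the engine argument is omitted on the assumption that a default engine is constructed when none is given (cli.py below always passes one explicitly):

    from wxpath import configure_logging, wxpath_async_blocking_iter

    configure_logging('INFO')
    # Placeholder expression: fetch a page and extract link hrefs.
    # Assumes a default engine is built when engine= is omitted.
    for item in wxpath_async_blocking_iter(
            path_expr="url('https://example.com')//a/@href",
            max_depth=1):
        print(item)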
wxpath/cli.py ADDED
@@ -0,0 +1,137 @@
+ import argparse
+ import json
+ import sys
+
+ from wxpath.core import parser as wxpath_parser
+ from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+ from wxpath.hooks import builtin, registry
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.settings import SETTINGS
+ from wxpath.util.serialize import simplify
+
+
+ def main():
+     registry.register(builtin.SerializeXPathMapAndNodeHook)
+     arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+     arg_parser.add_argument("expression", help="The wxpath expression")
+     arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+     # debug
+     arg_parser.add_argument("--debug", action="store_true",
+                             help="Debug mode. Provides verbose runtime output and information")
+     # verbose
+     arg_parser.add_argument("--verbose", action="store_true",
+                             help="Verbose mode. Prints CLI level information")
+
+     arg_parser.add_argument(
+         "--concurrency",
+         type=int,
+         default=16,
+         help="Number of concurrent fetches"
+     )
+     arg_parser.add_argument(
+         "--concurrency-per-host",
+         type=int,
+         default=8,
+         help="Number of concurrent fetches per host"
+     )
+     arg_parser.add_argument(
+         "--header",
+         action="append",
+         dest="header_list",
+         default=[],
+         help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+     )
+     arg_parser.add_argument(
+         "--respect-robots",
+         action="store_true",
+         help="Respect robots.txt",
+         default=True
+     )
+     arg_parser.add_argument(
+         "--cache",
+         action="store_true",
+         help="Use cache",
+         default=False
+     )
+     arg_parser.add_argument(
+         "--cache-backend",
+         type=str,
+         help="Cache backend. Possible values: redis, sqlite",
+         default="sqlite"
+     )
+     arg_parser.add_argument(
+         "--cache-db-path-or-url",
+         type=str,
+         help="Path to cache database",
+         default="cache.db"
+     )
+
+     args = arg_parser.parse_args()
+
+     if args.debug:
+         from wxpath import configure_logging
+         configure_logging('DEBUG')
+
+     custom_headers = {}
+     if args.header_list:
+         for header_item in args.header_list:
+             try:
+                 key, value = header_item.split(':', 1)
+                 custom_headers[key.strip()] = value.strip()
+             except ValueError:
+                 print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+     if custom_headers and args.verbose:
+         print(f"Using custom headers: {custom_headers}")
+         print()
+
+     if args.cache:
+         SETTINGS.http.client.cache.enabled = True
+         if args.cache_backend == "redis":
+             SETTINGS.http.client.cache.backend = "redis"
+             SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+         elif args.cache_backend == "sqlite":
+             SETTINGS.http.client.cache.backend = "sqlite"
+             SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+     if args.verbose:
+         print(f"Using concurrency: {args.concurrency}")
+         print(f"Using concurrency per host: {args.concurrency_per_host}")
+         print(f"Using respect robots: {args.respect_robots}")
+         print(f"Using cache: {args.cache}")
+
+     segments = wxpath_parser.parse(args.expression)
+     print("parsed expression:\n\nSegments([")
+     for s in segments:
+         print(f"\t{s},")
+     print("])")
+     print()
+     print()
+
+     crawler = Crawler(
+         concurrency=args.concurrency,
+         per_host=args.concurrency_per_host,
+         respect_robots=args.respect_robots,
+         headers=custom_headers
+     )
+     engine = WXPathEngine(crawler=crawler)
+
+     try:
+         for r in wxpath_async_blocking_iter(
+                 path_expr=args.expression,
+                 max_depth=args.depth,
+                 engine=engine):
+             clean = simplify(r)
+             print(json.dumps(clean, ensure_ascii=False), flush=True)
+     except BrokenPipeError:
+         if args.verbose:
+             print("Pipe broken.")
+
+     if args.verbose:
+         print("Done. Printing crawl stats")
+         print(crawler._stats)
+     sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
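For illustration, the CLI above can be exercised programmatically by setting sys.argv before calling main(); the expression is a placeholder, and note that main() ends with sys.exit(0), i.e. it raises SystemExit:

    import sys
    from wxpath.cli import main

    # Equivalent to:
    #   python -m wxpath.cli "url('https://example.com')//a/@href" --depth 2 \
    #       --header "User-Agent: my-bot" --verbose
    sys.argv = [
        "wxpath",
        "url('https://example.com')//a/@href",
        "--depth", "2",
        "--header", "User-Agent: my-bot",
        "--verbose",
    ]
    main()  # prints one JSON object per result line, then exits

Because each result is emitted as a single JSON line (simplify() followed by json.dumps with flush=True), the output can be streamed into line-oriented tools.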
@@ -0,0 +1,13 @@
+ from wxpath.core.runtime.engine import (
+     WXPathEngine,
+     wxpath_async,
+     wxpath_async_blocking,
+     wxpath_async_blocking_iter,
+ )
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'WXPathEngine',
+ ]
wxpath/core/dom.py ADDED
@@ -0,0 +1,22 @@
+ from urllib.parse import urljoin
+
+
+ def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+     """
+     Convert relative links to absolute links based on the base URL.
+
+     Args:
+         links (list): List of link strings.
+         base_url (str): The base URL to resolve relative links against.
+
+     Returns:
+         List of absolute URLs.
+     """
+     if base_url is None:
+         raise ValueError("base_url must not be None when making links absolute.")
+     return [urljoin(base_url, link) for link in links if link]
+
+
+ def get_absolute_links_from_elem_and_xpath(elem, xpath):
+     base_url = getattr(elem, 'base_url', None)
+     return _make_links_absolute(elem.xpath3(xpath), base_url)
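For illustration, _make_links_absolute resolves each link against the base URL with urljoin and drops empty strings (the values below are made up):

    from wxpath.core.dom import _make_links_absolute

    _make_links_absolute(
        ['/about', 'https://other.example/x', ''],
        base_url='https://example.com/index.html',
    )
    # -> ['https://example.com/about', 'https://other.example/x']
    # (the empty string is dropped by the `if link` filter)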
wxpath/core/models.py ADDED
@@ -0,0 +1,74 @@
+ from dataclasses import dataclass, field
+ from typing import Any, List, Optional, Tuple
+
+
+ @dataclass(slots=True)
+ class CrawlTask:
+     """A unit of work for the crawler."""
+     elem: Any
+     url: str
+     segments: List[Tuple[str, str]]
+     depth: int
+     backlink: Optional[str] = None
+     base_url: Optional[str] = None
+
+     # Priority for the queue (lower number = higher priority)
+     # Useful if you want Depth-First behavior in a shared queue
+     priority: int = field(default=0)
+
+     def __post_init__(self):
+         # Automatically sync priority with depth for BFS behavior
+         self.priority = self.depth
+
+     def __lt__(self, other):
+         return self.priority < other.priority
+
+     def __iter__(self):
+         return iter((self.elem, self.segments, self.depth, self.backlink))
+
+
+ @dataclass(slots=True)
+ class Intent:
+     pass
+
+
+ @dataclass(slots=True)
+ class Result(Intent):
+     """A container for an extracted item or error."""
+     value: Any
+     url: str
+     depth: int
+     error: Optional[Exception] = None
+     backlink: Optional[str] = None
+
+
+ @dataclass(slots=True)
+ class CrawlIntent(Intent):
+     url: str  # "I found this link"
+     next_segments: list  # "Here is what to do next if you go there"
+
+
+ @dataclass(slots=True)
+ class ProcessIntent(Intent):
+     elem: Any
+     next_segments: list
+
+
+ @dataclass(slots=True)
+ class InfiniteCrawlIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class ExtractIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class CrawlFromAttributeIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class DataIntent(Intent):
+     value: Any
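Because __post_init__ copies depth into priority and __lt__ compares on priority, CrawlTask instances order themselves in a heap so that shallower tasks pop first (breadth-first scheduling by default). A small sketch with made-up field values:

    import heapq

    from wxpath.core.models import CrawlTask

    deep = CrawlTask(elem=None, url='https://example.com/a/b', segments=[], depth=2)
    shallow = CrawlTask(elem=None, url='https://example.com/', segments=[], depth=0)

    queue = [deep, shallow]
    heapq.heapify(queue)
    assert heapq.heappop(queue).url == 'https://example.com/'  # depth 0 pops first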
wxpath/core/ops.py ADDED
@@ -0,0 +1,278 @@
+ from typing import Callable, Iterable
+ from urllib.parse import urljoin
+
+ import elementpath
+ from elementpath.datatypes import AnyAtomicType
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import html
+
+ from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+ from wxpath.core.models import (
+     CrawlIntent,
+     DataIntent,
+     ExtractIntent,
+     InfiniteCrawlIntent,
+     Intent,
+     ProcessIntent,
+ )
+ from wxpath.core.parser import (
+     Binary,
+     Call,
+     ContextItem,
+     Segment,
+     Segments,
+     String,
+     Url,
+     UrlCrawl,
+     Xpath,
+ )
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class WxStr(str):
+     """A string with associated base_url and depth metadata for debugging."""
+     def __new__(cls, value, base_url=None, depth=-1):
+         obj = super().__new__(cls, value)
+         obj.base_url = base_url
+         obj.depth = depth
+         return obj
+
+     def __repr__(self):
+         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
+
+
+ class RuntimeSetupError(Exception):
+     pass
+
+
+ OPS_REGISTER: dict[str, Callable] = {}
+
+ def register(func_name_or_type: str | type, args_types: tuple[type, ...] | None = None):
+     def _register(func: Callable) -> Callable:
+         global OPS_REGISTER
+         _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+         if _key in OPS_REGISTER:
+             raise RuntimeSetupError(f"The operation handler for \"{_key}\" is already registered")
+         OPS_REGISTER[_key] = func
+         return func
+     return _register
+
+
+ def get_operator(
+     binary_or_segment: Binary | Segment
+ ) -> Callable[[html.HtmlElement, list[Url | Xpath], int], Iterable[Intent]]:
+     func_name_or_type = getattr(binary_or_segment, 'func', None) or binary_or_segment.__class__
+
+     args_types = None
+     if isinstance(binary_or_segment, Binary):
+         args_types = (binary_or_segment.left.__class__, binary_or_segment.right.__class__)
+     elif isinstance(binary_or_segment, Call):
+         args_types = tuple(arg.__class__ for arg in binary_or_segment.args)
+
+     _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+     if _key not in OPS_REGISTER:
+         raise ValueError(f"Unknown operation: {_key}")
+     return OPS_REGISTER[_key]
+
+
+ @register('url', (String,))
+ @register('url', (String, Xpath))
+ def _handle_url_str_lit(curr_elem: html.HtmlElement,
+                         curr_segments: list[Url | Xpath],
+                         curr_depth: int, **kwargs) -> Iterable[Intent]:
+     """Handle `url('<literal>')` segments and an optional follow xpath."""
+     url_call = curr_segments[0]  # type: Url
+
+     next_segments = curr_segments[1:]
+
+     if len(url_call.args) == 2:
+         _segments = [
+             UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+         ] + next_segments
+
+         yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
+     else:
+         yield CrawlIntent(url=url_call.args[0].value, next_segments=next_segments)
+
+
+ # @register2('url', (Xpath,))
+ @register(Xpath)
+ def _handle_xpath(curr_elem: html.HtmlElement,
+                   curr_segments: Segments,
+                   curr_depth: int,
+                   **kwargs) -> Iterable[Intent]:
+     """Execute an xpath step and yield data or chained processing intents."""
+     xpath_node = curr_segments[0]  # type: Xpath
+
+     expr = xpath_node.value
+
+     if curr_elem is None:
+         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+     base_url = getattr(curr_elem, 'base_url', None)
+     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+     _backlink_str = f"string('{curr_elem.get('backlink')}')"
+     # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+     # increment after each url*() hop
+     _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+     expr = expr.replace('wx:backlink()', _backlink_str)
+     expr = expr.replace('wx:backlink(.)', _backlink_str)
+     expr = expr.replace('wx:depth()', _depth_str)
+     expr = expr.replace('wx:depth(.)', _depth_str)
+
+     elems = curr_elem.xpath3(expr)
+
+     next_segments = curr_segments[1:]
+     for elem in elems:
+         value_or_elem = WxStr(
+             elem, base_url=base_url,
+             depth=curr_depth
+         ) if isinstance(elem, str) else elem
+         if len(curr_segments) == 1:
+             yield DataIntent(value=value_or_elem)
+         else:
+             yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+
+
+ @register('//url', (ContextItem,))
+ @register('//url', (Xpath,))
+ @register('/url', (ContextItem,))
+ @register('/url', (Xpath,))
+ @register('url', (ContextItem,))
+ @register('url', (Xpath,))
+ def _handle_url_eval(curr_elem: html.HtmlElement | str,
+                      curr_segments: list[Url | Xpath],
+                      curr_depth: int,
+                      **kwargs) -> Iterable[Intent]:
+     """Resolve dynamic url() arguments and enqueue crawl intents.
+
+     Yields:
+         CrawlIntent
+     """
+     url_call = curr_segments[0]  # type: Url
+
+     if isinstance(url_call.args[0], ContextItem):
+         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
+     else:
+         _path_exp = url_call.args[0].value
+         # TODO: If the prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
+         # It should be handled in the parser.
+         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+     urls = dict.fromkeys(urls)
+
+     next_segments = curr_segments[1:]
+     for url in urls:
+         # log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+         yield CrawlIntent(url=url, next_segments=next_segments)
+
+
+ @register('///url', (Xpath,))
+ def _handle_url_inf(curr_elem: html.HtmlElement,
+                     curr_segments: list[Url | Xpath],
+                     curr_depth: int,
+                     **kwargs) -> Iterable[CrawlIntent]:
+     """Handle the ``///url()`` segment of a wxpath expression.
+
+     This operation is also generated internally by the parser when a
+     ``///<xpath>/[/]url()`` segment is encountered.
+
+     Instead of fetching URLs directly, this operator XPaths the current
+     element for URLs and queues them for further processing via
+     ``_handle_url_inf_and_xpath``.
+     """
+     url_call = curr_segments[0]  # type: Url
+
+     _path_exp = url_call.args[0].value
+
+     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+
+     tail_segments = curr_segments[1:]
+     for url in dict.fromkeys(urls):
+         _segments = [
+             UrlCrawl('///url', [url_call.args[0], url])
+         ] + tail_segments
+
+         yield CrawlIntent(url=url, next_segments=_segments)
+
+
+ @register('///url', (Xpath, str))
+ def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
+                               curr_segments: list[Url | Xpath],
+                               curr_depth: int, **kwargs) \
+         -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+     """Handle infinite-crawl with an xpath extraction step.
+
+     This operation is generated internally by the parser; there is no explicit
+     wxpath expression that produces it directly.
+
+     Yields:
+         DataIntent: If the current element is not None and no next segments are provided.
+         ExtractIntent: If the current element is not None and next segments are provided.
+         InfiniteCrawlIntent: Always yielded alongside the above, re-enqueueing the element for further infinite expansion.
+
+     Raises:
+         ValueError: If the current element is None.
+     """
+     url_call = curr_segments[0]
+
+     try:
+         if curr_elem is None:
+             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
+
+         next_segments = curr_segments[1:]
+
+         if not next_segments:
+             yield DataIntent(value=curr_elem)
+         else:
+             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
+
+         # For url_inf, also re-enqueue for further infinite expansion
+         _segments = [UrlCrawl('///url', url_call.args[:-1])] + next_segments
+         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
+
+         yield crawl_intent
+
+     except Exception:
+         log.exception("error fetching url inf and xpath",
+                       extra={"depth": curr_depth, "url": url_call.args[1]})
+
+ @register(Binary, (Xpath, Segments))
+ def _handle_binary(curr_elem: html.HtmlElement | str,
+                    curr_segments: list[Url | Xpath] | Binary,
+                    curr_depth: int,
+                    **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+     """Execute XPath expressions suffixed with the ``!`` (map) operator.
+
+     Yields:
+         ProcessIntent: Contains either a WxStr or an lxml/elementpath element.
+     """
+     left = curr_segments.left
+     _ = curr_segments.op
+     right = curr_segments.right
+
+     if len(right) == 0:
+         # Binary operation on segments expects non-empty segments
+         raise ValueError("Binary operation on segments expects non-empty segments")
+
+     base_url = getattr(curr_elem, 'base_url', None)
+     next_segments = right
+
+     results = elementpath.select(
+         curr_elem,
+         left.value,
+         parser=XPath3Parser,
+         item='' if curr_elem is None else None
+     )
+
+     if isinstance(results, AnyAtomicType):
+         results = [results]
+
+     for result in results:
+         if isinstance(result, str):
+             value_or_elem = WxStr(result, base_url=base_url, depth=curr_depth)
+         else:
+             value_or_elem = result
+
+         yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
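The register/get_operator pair above dispatches on either a segment's func string or its class, optionally combined with the classes of its arguments. A hypothetical extension (the 'my-op' name is made up for illustration) would be registered the same way as the built-in handlers:

    from wxpath.core.models import DataIntent
    from wxpath.core.ops import register
    from wxpath.core.parser import Xpath

    # Hypothetical handler for a new segment type 'my-op' taking a single Xpath argument.
    @register('my-op', (Xpath,))
    def _handle_my_op(curr_elem, curr_segments, curr_depth, **kwargs):
        yield DataIntent(value=curr_elem)

get_operator() would then resolve a Call-like node with func == 'my-op' and one Xpath argument to this handler via the key ('my-op', (Xpath,)).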