wxpath 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
+ from .util.logging import configure_logging
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'configure_logging',
+ ]
wxpath/cli.py ADDED
@@ -0,0 +1,52 @@
+ import argparse
+ import json
+ import sys
+
+ from wxpath.core.parser import parse_wxpath_expr
+ from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+ from wxpath.hooks import builtin, registry
+ from wxpath.util.serialize import simplify
+
+
+ def main():
+     registry.register(builtin.SerializeXPathMapAndNodeHook)
+     parser = argparse.ArgumentParser(description="Run a wxpath expression.")
+     parser.add_argument("expression", help="The wxpath expression")
+     parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+     parser.add_argument("--debug", action="store_true", help="Debug mode")
+     parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+
+     parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
+     parser.add_argument(
+         "--concurrency-per-host",
+         type=int,
+         default=8,
+         help="Number of concurrent fetches per host"
+     )
+
+     args = parser.parse_args()
+
+     if args.verbose:
+         print("wxpath expression:", args.expression)
+         print("parsed expression:", parse_wxpath_expr(args.expression))
+
+     if args.debug:
+         from wxpath import configure_logging
+         configure_logging('DEBUG')
+
+     engine = WXPathEngine(
+         concurrency=args.concurrency,
+         per_host=args.concurrency_per_host,
+     )
+     try:
+         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
+             clean = simplify(r)
+             print(json.dumps(clean, ensure_ascii=False), flush=True)
+     except BrokenPipeError:
+         sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
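
The CLI's core loop above maps to a short library call. A minimal programmatic sketch; the expression and depth are illustrative placeholders, not taken from the package:

    from wxpath import wxpath_async_blocking_iter
    from wxpath.core.runtime.engine import WXPathEngine

    engine = WXPathEngine(concurrency=16, per_host=8)
    # Hypothetical expression; see the parser module for the actual grammar.
    for result in wxpath_async_blocking_iter("url('https://example.com')//a/@href", 2, engine):
        print(result)
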
@@ -0,0 +1,13 @@
+ from wxpath.core.runtime.engine import (
+     WXPathEngine,
+     wxpath_async,
+     wxpath_async_blocking,
+     wxpath_async_blocking_iter,
+ )
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'WXPathEngine',
+ ]
wxpath/core/dom.py ADDED
@@ -0,0 +1,22 @@
+ from urllib.parse import urljoin
+
+
+ def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+     """
+     Convert relative links to absolute links based on the base URL.
+
+     Args:
+         links (list): List of link strings.
+         base_url (str): The base URL to resolve relative links against.
+
+     Returns:
+         List of absolute URLs.
+     """
+     if base_url is None:
+         raise ValueError("base_url must not be None when making links absolute.")
+     return [urljoin(base_url, link) for link in links if link]
+
+
+ def get_absolute_links_from_elem_and_xpath(elem, xpath):
+     base_url = getattr(elem, 'base_url', None)
+     return _make_links_absolute(elem.xpath3(xpath), base_url)
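
For illustration, `_make_links_absolute` follows standard urljoin semantics and drops empty strings; a quick sketch with made-up values:

    from wxpath.core.dom import _make_links_absolute

    links = ["/about", "news/today", "https://other.example/x", ""]
    print(_make_links_absolute(links, "https://example.com/base/"))
    # ['https://example.com/about',
    #  'https://example.com/base/news/today',
    #  'https://other.example/x']   (the empty string is filtered out)
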
wxpath/core/errors.py ADDED
@@ -0,0 +1,134 @@
+
+ import collections.abc as cabc
+ import functools
+ import inspect
+ import types
+ from contextlib import contextmanager
+ from contextvars import ContextVar
+ from enum import Enum, auto
+ from typing import AsyncGenerator
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+ class ErrorPolicy(Enum):
+     IGNORE = auto()   # swallow completely
+     LOG = auto()      # just log at ERROR
+     COLLECT = auto()  # yield {"_error": ..., "_ctx": ...}
+     RAISE = auto()    # re-raise
+
+
+ _GLOBAL_DEFAULT = ErrorPolicy.LOG
+
+ # Task-local override (None => fall back to _GLOBAL_DEFAULT)
+ _CURRENT: ContextVar[ErrorPolicy | None] = ContextVar("wx_err_policy", default=None)
+
+
+ def get_current_error_policy() -> ErrorPolicy:
+     return _CURRENT.get() or _GLOBAL_DEFAULT
+
+
+ def set_default_error_policy(policy: ErrorPolicy) -> None:
+     global _GLOBAL_DEFAULT
+     _GLOBAL_DEFAULT = policy
+
+
+ @contextmanager
+ def use_error_policy(policy: ErrorPolicy):
+     token = _CURRENT.set(policy)
+     try:
+         yield
+     finally:
+         _CURRENT.reset(token)
+
+
+ def handle_error(exc: Exception, policy: ErrorPolicy, ctx: dict):
+     if policy is ErrorPolicy.IGNORE:
+         return None
+
+     if policy is ErrorPolicy.LOG:
+         log.exception("processing error", extra=ctx)
+         return None
+
+     if policy is ErrorPolicy.COLLECT:
+         return {"_error": str(exc), "_ctx": ctx}
+
+     # RAISE (safe default)
+     raise exc
+
+
+ def _is_gen(obj):  # helper
+     return isinstance(obj, (types.GeneratorType, cabc.Generator))
+
+
+ def with_errors():
+     """
+     Apply the current ErrorPolicy at call time while preserving the callable kind:
+     - async generator -> async generator wrapper
+     - coroutine       -> async wrapper
+     - sync generator  -> sync generator wrapper
+     - plain function  -> plain wrapper
+     """
+     def decorator(fn):
+         # --- ASYNC GENERATOR ---
+         if inspect.isasyncgenfunction(fn):
+             @functools.wraps(fn)
+             async def asyncgen_wrapped(*a, **kw) -> AsyncGenerator:
+                 try:
+                     async for item in fn(*a, **kw):
+                         yield item
+                 except Exception as exc:
+                     collected = handle_error(exc, get_current_error_policy(),
+                                              _ctx_from_sig(fn, a, kw))
+                     if collected is not None:
+                         yield collected
+             return asyncgen_wrapped
+
+         # --- COROUTINE ---
+         if inspect.iscoroutinefunction(fn):
+             @functools.wraps(fn)
+             async def coro_wrapped(*a, **kw):
+                 try:
+                     return await fn(*a, **kw)
+                 except Exception as exc:
+                     return handle_error(exc, get_current_error_policy(),
+                                         _ctx_from_sig(fn, a, kw))
+             return coro_wrapped
+
+         # --- SYNC GENERATOR ---
+         if inspect.isgeneratorfunction(fn):
+             @functools.wraps(fn)
+             def gen_wrapped(*a, **kw):
+                 try:
+                     for item in fn(*a, **kw):
+                         yield item
+                 except Exception as exc:
+                     collected = handle_error(exc, get_current_error_policy(),
+                                              _ctx_from_sig(fn, a, kw))
+                     if collected is not None:
+                         yield collected
+             return gen_wrapped
+
+         # --- PLAIN SYNC FUNCTION ---
+         @functools.wraps(fn)
+         def plain_wrapped(*a, **kw):
+             try:
+                 return fn(*a, **kw)
+             except Exception as exc:
+                 return handle_error(exc, get_current_error_policy(),
+                                     _ctx_from_sig(fn, a, kw))
+         return plain_wrapped
+     return decorator
+
+
+ def _ctx_from_sig(fn, a, kw):
+     """Best-effort extraction of depth/url/op for logging."""
+     # Handlers receive (elem, segments, depth, ...) positionally, so pull by position.
+     try:
+         elem, segs, depth, *_ = a
+         op, val = segs[0] if segs else ("?", "?")
+         url = getattr(elem, "base_url", None)
+         return {"op": op, "depth": depth, "url": url}
+     except Exception:
+         return {}
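
The decorator applies whatever policy is active at call time, so callers can scope behavior with `use_error_policy`. A minimal sketch with a hypothetical `parse_row` helper (not part of the package):

    from wxpath.core.errors import ErrorPolicy, use_error_policy, with_errors

    @with_errors()
    def parse_row(row):
        return int(row)

    with use_error_policy(ErrorPolicy.COLLECT):
        results = [parse_row(r) for r in ("1", "x", "3")]
    # -> [1, {'_error': "invalid literal for int() ...", '_ctx': {}}, 3]
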
wxpath/core/models.py ADDED
@@ -0,0 +1,74 @@
+ from dataclasses import dataclass, field
+ from typing import Any, List, Optional, Tuple
+
+
+ @dataclass(slots=True)
+ class CrawlTask:
+     """A unit of work for the crawler."""
+     elem: Any
+     url: str
+     segments: List[Tuple[str, str]]
+     depth: int
+     backlink: Optional[str] = None
+     base_url: Optional[str] = None
+
+     # Priority for the queue (lower number = higher priority).
+     # __post_init__ keeps this in sync with depth, giving breadth-first ordering.
+     priority: int = field(default=0)
+
+     def __post_init__(self):
+         # Automatically sync priority with depth for BFS behavior
+         self.priority = self.depth
+
+     def __lt__(self, other):
+         return self.priority < other.priority
+
+     def __iter__(self):
+         return iter((self.elem, self.segments, self.depth, self.backlink))
+
+
+ @dataclass(slots=True)
+ class Intent:
+     pass
+
+
+ @dataclass(slots=True)
+ class Result(Intent):
+     """A container for an extracted item or error."""
+     value: Any
+     url: str
+     depth: int
+     error: Optional[Exception] = None
+     backlink: Optional[str] = None
+
+
+ @dataclass(slots=True)
+ class CrawlIntent(Intent):
+     url: str             # "I found this link"
+     next_segments: list  # "Here is what to do next if you go there"
+
+
+ @dataclass(slots=True)
+ class ProcessIntent(Intent):
+     elem: Any
+     next_segments: list
+
+
+ @dataclass(slots=True)
+ class InfiniteCrawlIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class ExtractIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class CrawlFromAttributeIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class DataIntent(Intent):
+     value: Any
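
Because `CrawlTask.__lt__` compares by `priority` and `__post_init__` sets `priority = depth`, tasks order breadth-first in a standard priority queue. A small sketch with placeholder URLs:

    import heapq
    from wxpath.core.models import CrawlTask

    q = []
    heapq.heappush(q, CrawlTask(elem=None, url="https://example.com/deep", segments=[], depth=2))
    heapq.heappush(q, CrawlTask(elem=None, url="https://example.com/root", segments=[], depth=0))
    print(heapq.heappop(q).url)  # https://example.com/root (shallower tasks pop first)
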
wxpath/core/ops.py ADDED
@@ -0,0 +1,244 @@
+ """
+ `ops` is short for "operations". This module contains side-effect-free
+ functions (operators) for handling each segment of a wxpath expression.
+ """
+ from typing import Callable, Iterable
+ from urllib.parse import urljoin
+
+ import elementpath
+ from elementpath.datatypes import AnyAtomicType
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import html
+
+ from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+ from wxpath.core.models import (
+     CrawlIntent,
+     DataIntent,
+     ExtractIntent,
+     InfiniteCrawlIntent,
+     Intent,
+     ProcessIntent,
+ )
+ from wxpath.core.parser import OPS, Segment, UrlInfAndXpathValue, XpathValue
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class WxStr(str):
+     """
+     A string that carries the base_url and depth it was extracted at.
+     Purely for debugging.
+     """
+     def __new__(cls, value, base_url=None, depth=-1):
+         obj = super().__new__(cls, value)
+         obj.base_url = base_url
+         obj.depth = depth
+         return obj
+
+     def __repr__(self):
+         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
+
+
+ HANDLERS: dict[str, Callable] = {}
+
+ def _op(name: OPS):
+     def reg(fn):
+         if name in HANDLERS:
+             raise ValueError(f"Duplicate operation: {name}")
+         HANDLERS[name] = fn
+         return fn
+     return reg
+
+
+ def get_operator(name: OPS) -> Callable[[html.HtmlElement, list[Segment], int], Iterable[Intent]]:
+     if name not in HANDLERS:
+         raise ValueError(f"Unknown operation: {name}")
+     return HANDLERS[name]
+
+
+ @_op(OPS.URL_STR_LIT)
+ def _handle_url_str_lit(curr_elem: html.HtmlElement,
+                         curr_segments: list[Segment],
+                         curr_depth: int, **kwargs) -> Iterable[Intent]:
+     op, value = curr_segments[0]
+
+     log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": value.target})
+
+     next_segments = curr_segments[1:]
+
+     if value.follow:
+         _segments = [
+             (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', value.target, value.follow))
+         ] + next_segments
+
+         yield CrawlIntent(url=value.target, next_segments=_segments)
+     else:
+         yield CrawlIntent(url=value.target, next_segments=next_segments)
+
+
+ @_op(OPS.URL_EVAL)
+ def _handle_url_eval(curr_elem: html.HtmlElement | str,
+                      curr_segments: list[Segment],
+                      curr_depth: int,
+                      **kwargs) -> Iterable[Intent]:
+     op, value = curr_segments[0]
+
+     _path_exp = value.expr
+
+     if isinstance(curr_elem, str):
+         # TODO: Ideally, wxpath grammar should not be checked/validated/enforced
+         # in ops.py; it should be validated in the parser instead.
+         if _path_exp not in {'.', 'self::node()'}:
+             raise ValueError("Only '.' or 'self::node()' is supported in url() segments "
+                              f"when the prior xpath operation yields a string. Got: {_path_exp}")
+
+         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
+     else:
+         # TODO: If the prior xpath operation is XPATH_FN_MAP_FRAG, this will likely fail.
+         # It should be handled in the parser.
+         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+     urls = dict.fromkeys(urls)  # de-duplicate while preserving order
+
+     next_segments = curr_segments[1:]
+     for url in urls:
+         log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+         yield CrawlIntent(url=url, next_segments=next_segments)
+
+
+ @_op(OPS.URL_INF)
+ def _handle_url_inf(curr_elem: html.HtmlElement,
+                     curr_segments: list[Segment],
+                     curr_depth: int,
+                     **kwargs) -> Iterable[CrawlIntent]:
+     """
+     Handles the ///url() segment of a wxpath expression. The parser also
+     generates this operation internally when it encounters a
+     `///<xpath>/[/]url()` segment.
+     This operation does not fetch URLs; instead, it runs the XPath against
+     the current element to collect URLs, then queues them for further
+     processing (see _handle_url_inf_and_xpath).
+     """
+     op, value = curr_segments[0]
+
+     _path_exp = value.expr
+
+     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+
+     log.debug("found urls",
+               extra={"depth": curr_depth, "op": op, "url": getattr(curr_elem, 'base_url', None)})
+
+     tail_segments = curr_segments[1:]
+     for url in dict.fromkeys(urls):
+         _segments = [
+             (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', url, _path_exp))
+         ] + tail_segments
+
+         log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+
+         yield CrawlIntent(url=url, next_segments=_segments)
+
+
+ @_op(OPS.URL_INF_AND_XPATH)
+ def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
+                               curr_segments: list[Segment],
+                               curr_depth: int, **kwargs) \
+         -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+     """
+     An operation generated internally by the parser; no explicit wxpath
+     expression produces it directly.
+     """
+     op, value = curr_segments[0]
+
+     try:
+         if curr_elem is None:
+             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
+
+         next_segments = curr_segments[1:]
+         if not next_segments:
+             yield DataIntent(value=curr_elem)
+         else:
+             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
+
+         # For url_inf, also re-enqueue for further infinite expansion
+         _segments = [(OPS.URL_INF, XpathValue('', value.expr))] + next_segments
+         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
+         log.debug("queueing InfiniteCrawlIntent",
+                   extra={"depth": curr_depth, "op": op,
+                          "url": value.target, "crawl_intent": crawl_intent})
+         yield crawl_intent
+
+     except Exception:
+         log.exception("error expanding url_inf_and_xpath",
+                       extra={"depth": curr_depth, "op": op, "url": value.target})
+
+
+ @_op(OPS.XPATH)
+ def _handle_xpath(curr_elem: html.HtmlElement,
+                   curr_segments: list[Segment],
+                   curr_depth: int,
+                   **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+     """
+     Handles the [/|//]<xpath> segment of a wxpath expression: a plain XPath
+     expression. Also handles wxpath-specific macro expansions such as
+     wx:backlink() and wx:depth().
+     """
+     _, value = curr_segments[0]
+     expr = value.expr
+     if curr_elem is None:
+         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+     base_url = getattr(curr_elem, 'base_url', None)
+     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+     _backlink_str = f"string('{curr_elem.get('backlink')}')"
+     # We use the root tree's depth rather than curr_depth because curr_depth
+     # accounts for a +1 increment after each url*() hop.
+     _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+     expr = expr.replace('wx:backlink()', _backlink_str)
+     expr = expr.replace('wx:backlink(.)', _backlink_str)
+     expr = expr.replace('wx:depth()', _depth_str)
+     expr = expr.replace('wx:depth(.)', _depth_str)
+
+     elems = curr_elem.xpath3(expr)
+
+     next_segments = curr_segments[1:]
+     for elem in elems:
+         value_or_elem = WxStr(
+             elem, base_url=base_url,
+             depth=curr_depth
+         ) if isinstance(elem, str) else elem
+         if len(curr_segments) == 1:
+             yield DataIntent(value=value_or_elem)
+         else:
+             yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+
+
+ @_op(OPS.XPATH_FN_MAP_FRAG)
+ def _handle_xpath_fn_map_frag(curr_elem: html.HtmlElement | str,
+                               curr_segments: list[Segment],
+                               curr_depth: int,
+                               **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+     """
+     Handles the execution of XPath functions that were suffixed with the
+     '!' (map) operator.
+     """
+     _, value = curr_segments[0]
+
+     base_url = getattr(curr_elem, 'base_url', None)
+     next_segments = curr_segments[1:]
+     if not next_segments:
+         # XPATH_FN_MAP_FRAG must always be followed by another segment.
+         raise ValueError("XPATH_FN_MAP_FRAG is not a terminal operation")
+
+     result = elementpath.select(
+         curr_elem,
+         value.expr,
+         parser=XPath3Parser,
+         item='' if curr_elem is None else None
+     )
+
+     if isinstance(result, AnyAtomicType):
+         result = [result]
+
+     for r in result:
+         value_or_elem = WxStr(r, base_url=base_url, depth=curr_depth) if isinstance(r, str) else r
+         yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
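
Each handler is registered under its OPS name and yields intents rather than performing I/O, so an engine can dispatch segments generically. A hypothetical dispatch sketch; the `process` function below is illustrative, not the engine's actual loop:

    from wxpath.core.ops import get_operator

    def process(elem, segments, depth):
        # Look up the handler for the head segment and collect the
        # intents it yields; the engine decides what to do with them.
        op, _ = segments[0]
        handler = get_operator(op)
        return list(handler(elem, segments, depth))
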