wxpath-0.1.0-py3-none-any.whl → wxpath-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +52 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/errors.py +134 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +244 -0
- wxpath/core/parser.py +319 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +315 -0
- wxpath/core/runtime/helpers.py +48 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +133 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +196 -0
- wxpath/http/client/request.py +35 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/METADATA +30 -97
- wxpath-0.2.0.dist-info/RECORD +33 -0
- wxpath-0.2.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.0.dist-info/RECORD +0 -6
- wxpath-0.1.0.dist-info/top_level.txt +0 -1
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE +0 -0
wxpath/__init__.py
ADDED
wxpath/cli.py
ADDED
@@ -0,0 +1,52 @@
+import argparse
+import json
+import sys
+
+from wxpath.core.parser import parse_wxpath_expr
+from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+from wxpath.hooks import builtin, registry
+from wxpath.util.serialize import simplify
+
+
+def main():
+    registry.register(builtin.SerializeXPathMapAndNodeHook)
+    parser = argparse.ArgumentParser(description="Run wxpath expression.")
+    parser.add_argument("expression", help="The wxpath expression")
+    parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+    # debug
+    parser.add_argument("--debug", action="store_true", help="Debug mode")
+    # verbose
+    parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+
+    parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
+    parser.add_argument(
+        "--concurrency-per-host",
+        type=int,
+        default=8,
+        help="Number of concurrent fetches per host"
+    )
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        print("wxpath expression:", args.expression)
+        print("parsed expression:", parse_wxpath_expr(args.expression))
+
+    if args.debug:
+        from wxpath import configure_logging
+        configure_logging('DEBUG')
+
+    engine = WXPathEngine(
+        concurrency=args.concurrency,
+        per_host=args.concurrency_per_host,
+    )
+    try:
+        for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
+            clean = simplify(r)
+            print(json.dumps(clean, ensure_ascii=False), flush=True)
+    except BrokenPipeError:
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
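The new CLI is a thin wrapper over the engine: it registers the built-in serialization hook, parses flags, builds a WXPathEngine, and streams one JSON object per result. A minimal sketch of the equivalent programmatic call, using only names from the diff above; the expression string is hypothetical, since the wxpath expression syntax lives in wxpath/core/parser.py, whose contents are not shown here:

import json

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.hooks import builtin, registry
from wxpath.util.serialize import simplify

registry.register(builtin.SerializeXPathMapAndNodeHook)  # same hook the CLI registers

engine = WXPathEngine(concurrency=16, per_host=8)  # the CLI's defaults
expr = "url('https://example.com')//a/@href"       # hypothetical expression
for result in wxpath_async_blocking_iter(expr, 1, engine):
    print(json.dumps(simplify(result), ensure_ascii=False))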
wxpath/core/__init__.py
ADDED
wxpath/core/dom.py
ADDED
@@ -0,0 +1,22 @@
+from urllib.parse import urljoin
+
+
+def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+    """
+    Convert relative links to absolute links based on the base URL.
+
+    Args:
+        links (list): List of link strings.
+        base_url (str): The base URL to resolve relative links against.
+
+    Returns:
+        List of absolute URLs.
+    """
+    if base_url is None:
+        raise ValueError("base_url must not be None when making links absolute.")
+    return [urljoin(base_url, link) for link in links if link]
+
+
+def get_absolute_links_from_elem_and_xpath(elem, xpath):
+    base_url = getattr(elem, 'base_url', None)
+    return _make_links_absolute(elem.xpath3(xpath), base_url)
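A quick sketch of _make_links_absolute's behavior (the URLs are illustrative): empty strings are dropped by the `if link` filter, absolute URLs pass through urljoin unchanged, and relative paths resolve against base_url:

from wxpath.core.dom import _make_links_absolute

links = ["/wiki/Python", "https://other.example/x", "", "../up"]
print(_make_links_absolute(links, "https://example.com/a/b/"))
# ['https://example.com/wiki/Python', 'https://other.example/x',
#  'https://example.com/a/up']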
wxpath/core/errors.py
ADDED
@@ -0,0 +1,134 @@
+
+import collections.abc as cabc
+import functools
+import inspect
+import types
+from contextlib import contextmanager
+from contextvars import ContextVar
+from enum import Enum, auto
+from typing import AsyncGenerator
+
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+class ErrorPolicy(Enum):
+    IGNORE = auto()   # swallow completely
+    LOG = auto()      # just log at ERROR
+    COLLECT = auto()  # yield {"_error": ..., "_ctx": ...}
+    RAISE = auto()    # re-raise
+
+
+_GLOBAL_DEFAULT = ErrorPolicy.LOG
+
+# Task-local override (None => fall back to _GLOBAL_DEFAULT)
+_CURRENT: ContextVar[ErrorPolicy | None] = ContextVar("wx_err_policy", default=None)
+
+
+def get_current_error_policy() -> ErrorPolicy:
+    return _CURRENT.get() or _GLOBAL_DEFAULT
+
+
+def set_default_error_policy(policy: ErrorPolicy) -> None:
+    global _GLOBAL_DEFAULT
+    _GLOBAL_DEFAULT = policy
+
+
+@contextmanager
+def use_error_policy(policy: ErrorPolicy):
+    token = _CURRENT.set(policy)
+    try:
+        yield
+    finally:
+        _CURRENT.reset(token)
+
+
+def handle_error(exc: Exception, policy: ErrorPolicy, ctx: dict):
+    if policy is ErrorPolicy.IGNORE:
+        return None
+
+    if policy is ErrorPolicy.LOG:
+        log.exception("processing error", extra=ctx)
+        return None
+
+    if policy is ErrorPolicy.COLLECT:
+        return {"_error": str(exc), "_ctx": ctx}
+
+    # RAISE (safe default)
+    raise exc
+
+
+def _is_gen(obj):  # helper
+    return isinstance(obj, (types.GeneratorType, cabc.Generator))
+
+
+def with_errors():
+    """
+    Apply the current ErrorPolicy at call time while preserving the callable kind:
+    - async generator -> async generator wrapper
+    - coroutine       -> async wrapper
+    - sync generator  -> sync generator wrapper
+    - plain function  -> plain wrapper
+    """
+    def decorator(fn):
+        # --- ASYNC GENERATOR ---
+        if inspect.isasyncgenfunction(fn):
+            @functools.wraps(fn)
+            async def asyncgen_wrapped(*a, **kw) -> AsyncGenerator:
+                try:
+                    async for item in fn(*a, **kw):
+                        yield item
+                except Exception as exc:
+                    collected = handle_error(exc, get_current_error_policy(),
+                                             _ctx_from_sig(fn, a, kw))
+                    if collected is not None:
+                        yield collected
+            return asyncgen_wrapped
+
+        # --- COROUTINE ---
+        if inspect.iscoroutinefunction(fn):
+            @functools.wraps(fn)
+            async def coro_wrapped(*a, **kw):
+                try:
+                    return await fn(*a, **kw)
+                except Exception as exc:
+                    return handle_error(exc, get_current_error_policy(),
+                                        _ctx_from_sig(fn, a, kw))
+            return coro_wrapped
+
+        # --- SYNC GENERATOR ---
+        if inspect.isgeneratorfunction(fn):
+            @functools.wraps(fn)
+            def gen_wrapped(*a, **kw):
+                try:
+                    for item in fn(*a, **kw):
+                        yield item
+                except Exception as exc:
+                    collected = handle_error(exc, get_current_error_policy(),
+                                             _ctx_from_sig(fn, a, kw))
+                    if collected is not None:
+                        yield collected
+            return gen_wrapped
+
+        # --- PLAIN SYNC FUNCTION ---
+        @functools.wraps(fn)
+        def plain_wrapped(*a, **kw):
+            try:
+                return fn(*a, **kw)
+            except Exception as exc:
+                return handle_error(exc, get_current_error_policy(),
+                                    _ctx_from_sig(fn, a, kw))
+        return plain_wrapped
+    return decorator
+
+
+def _ctx_from_sig(fn, a, kw):
+    """Best-effort extraction of depth/url/op for logging."""
+    # you already pass these in every handler, so pull by position
+    try:
+        elem, segs, depth, *_ = a
+        op, val = segs[0] if segs else ("?", "?")
+        url = getattr(elem, "base_url", None)
+        return {"op": op, "depth": depth, "url": url}
+    except Exception:
+        return {}
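The policy machinery composes in two layers: with_errors() wraps a callable once, and the policy in effect is looked up per call through the ContextVar. A minimal sketch with a sync generator (COLLECT turns the failure into a yielded record instead of a raised exception):

from wxpath.core.errors import ErrorPolicy, use_error_policy, with_errors

@with_errors()
def parse_ints(raw):
    for s in raw:
        yield int(s)  # raises ValueError on "oops"

with use_error_policy(ErrorPolicy.COLLECT):
    print(list(parse_ints(["1", "2", "oops"])))
# [1, 2, {'_error': "invalid literal for int() with base 10: 'oops'", '_ctx': {}}]

Note that _ctx comes back empty here because _ctx_from_sig only recognizes the handlers' (elem, segments, depth, ...) positional signature.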
wxpath/core/models.py
ADDED
@@ -0,0 +1,74 @@
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Tuple
+
+
+@dataclass(slots=True)
+class CrawlTask:
+    """A unit of work for the crawler."""
+    elem: Any
+    url: str
+    segments: List[Tuple[str, str]]
+    depth: int
+    backlink: Optional[str] = None
+    base_url: Optional[str] = None
+
+    # Priority for the queue (lower number = higher priority)
+    # Useful if you want Depth-First behavior in a shared queue
+    priority: int = field(default=0)
+
+    def __post_init__(self):
+        # Automatically sync priority with depth for BFS behavior
+        self.priority = self.depth
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    def __iter__(self):
+        return iter((self.elem, self.segments, self.depth, self.backlink))
+
+
+@dataclass(slots=True)
+class Intent:
+    pass
+
+
+@dataclass(slots=True)
+class Result(Intent):
+    """A container for an extracted item or error."""
+    value: Any
+    url: str
+    depth: int
+    error: Optional[Exception] = None
+    backlink: Optional[str] = None
+
+
+@dataclass(slots=True)
+class CrawlIntent(Intent):
+    url: str             # "I found this link"
+    next_segments: list  # "Here is what to do next if you go there"
+
+
+@dataclass(slots=True)
+class ProcessIntent(Intent):
+    elem: Any
+    next_segments: list
+
+
+@dataclass(slots=True)
+class InfiniteCrawlIntent(ProcessIntent):
+    pass
+
+
+@dataclass(slots=True)
+class ExtractIntent(ProcessIntent):
+    pass
+
+
+@dataclass(slots=True)
+class CrawlFromAttributeIntent(ProcessIntent):
+    pass
+
+
+@dataclass(slots=True)
+class DataIntent(Intent):
+    value: Any
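Because __post_init__ copies depth into priority and __lt__ compares priorities, CrawlTask works directly in a heap and pops shallowest tasks first (the BFS behavior the comments describe). A small sketch with illustrative URLs:

import heapq

from wxpath.core.models import CrawlTask

tasks = [
    CrawlTask(elem=None, url="https://example.com/deep", segments=[], depth=2),
    CrawlTask(elem=None, url="https://example.com/", segments=[], depth=0),
    CrawlTask(elem=None, url="https://example.com/mid", segments=[], depth=1),
]
heapq.heapify(tasks)
while tasks:
    print(heapq.heappop(tasks).url)  # root first, then /mid, then /deep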
wxpath/core/ops.py
ADDED
@@ -0,0 +1,244 @@
+"""
+`ops` for "operations". This module contains side-effect-free functions (operators)
+for handling each segment of a wxpath expression.
+"""
+from typing import Callable, Iterable
+from urllib.parse import urljoin
+
+import elementpath
+from elementpath.datatypes import AnyAtomicType
+from elementpath.xpath3 import XPath3Parser
+from lxml import html
+
+from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+from wxpath.core.models import (
+    CrawlIntent,
+    DataIntent,
+    ExtractIntent,
+    InfiniteCrawlIntent,
+    Intent,
+    ProcessIntent,
+)
+from wxpath.core.parser import OPS, Segment, UrlInfAndXpathValue, XpathValue
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+
+class WxStr(str):
+    """
+    A string that has a base_url and depth associated with it. Purely for debugging.
+    """
+    def __new__(cls, value, base_url=None, depth=-1):
+        obj = super().__new__(cls, value)
+        obj.base_url = base_url
+        obj.depth = depth
+        return obj
+
+    def __repr__(self):
+        return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
+
+
+HANDLERS: dict[str, Callable] = {}
+
+def _op(name: OPS):
+    def reg(fn):
+        if name in HANDLERS:
+            raise ValueError(f"Duplicate operation: {name}")
+        HANDLERS[name] = fn
+        return fn
+    return reg
+
+
+def get_operator(name: OPS) -> Callable[[html.HtmlElement, list[Segment], int], Iterable[Intent]]:
+    if name not in HANDLERS:
+        raise ValueError(f"Unknown operation: {name}")
+    return HANDLERS[name]
+
+
+@_op(OPS.URL_STR_LIT)
+def _handle_url_str_lit(curr_elem: html.HtmlElement,
+                        curr_segments: list[Segment],
+                        curr_depth: int, **kwargs) -> Iterable[Intent]:
+    op, value = curr_segments[0]
+
+    log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": value.target})
+
+    next_segments = curr_segments[1:]
+
+    if value.follow:
+        _segments = [
+            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', value.target, value.follow))
+        ] + next_segments
+
+        yield CrawlIntent(url=value.target, next_segments=_segments)
+    else:
+        yield CrawlIntent(url=value.target, next_segments=next_segments)
+
+
+@_op(OPS.URL_EVAL)
+def _handle_url_eval(curr_elem: html.HtmlElement | str,
+                     curr_segments: list[Segment],
+                     curr_depth: int,
+                     **kwargs) -> Iterable[Intent]:
+    op, value = curr_segments[0]
+
+    _path_exp = value.expr
+
+    if isinstance(curr_elem, str):
+        # TODO: IMO, ideally, wxpath grammar should not be checked/validated/enforced
+        # in ops.py. It should instead be validated in the parser.
+        if _path_exp not in {'.', 'self::node()'}:
+            raise ValueError("Only '.' or 'self::node()' is supported in url() segments "
+                             f"when prior xpath operation results in a string. Got: {_path_exp}")
+
+        urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
+    else:
+        # TODO: If prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
+        # It should be handled in the parser.
+        urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+    urls = dict.fromkeys(urls)
+
+    next_segments = curr_segments[1:]
+    for url in urls:
+        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+        yield CrawlIntent(url=url, next_segments=next_segments)
+
+
+@_op(OPS.URL_INF)
+def _handle_url_inf(curr_elem: html.HtmlElement,
+                    curr_segments: list[Segment],
+                    curr_depth: int,
+                    **kwargs) -> Iterable[CrawlIntent]:
+    """
+    Handles the ///url() segment of a wxpath expression. This operation is also
+    generated internally by the parser when a `///<xpath>/[/]url()` segment is
+    encountered.
+    This operation does not fetch URLs; instead, it XPaths the current element
+    for URLs, then queues them for further processing (see
+    _handle_url_inf_and_xpath).
+    """
+    op, value = curr_segments[0]
+
+    _path_exp = value.expr
+
+    urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+
+    log.debug("found urls",
+              extra={"depth": curr_depth, "op": op, "url": getattr(curr_elem, 'base_url', None)})
+
+    tail_segments = curr_segments[1:]
+    for url in dict.fromkeys(urls):
+        _segments = [
+            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', url, _path_exp))
+        ] + tail_segments
+
+        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+
+        yield CrawlIntent(url=url, next_segments=_segments)
+
+
+@_op(OPS.URL_INF_AND_XPATH)
+def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
+                              curr_segments: list[Segment],
+                              curr_depth: int, **kwargs) \
+        -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+    """
+    This operation is generated internally by the parser; no explicit wxpath
+    expression produces it directly.
+    """
+    op, value = curr_segments[0]
+
+    try:
+        if curr_elem is None:
+            raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
+
+        next_segments = curr_segments[1:]
+        if not next_segments:
+            yield DataIntent(value=curr_elem)
+        else:
+            yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
+
+        # For url_inf, also re-enqueue for further infinite expansion
+        _segments = [(OPS.URL_INF, XpathValue('', value.expr))] + next_segments
+        crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
+        log.debug("queueing InfiniteCrawlIntent",
+                  extra={"depth": curr_depth, "op": op,
+                         "url": value.target, "crawl_intent": crawl_intent})
+        yield crawl_intent

+    except Exception:
+        log.exception("error fetching url",
+                      extra={"depth": curr_depth, "op": op, "url": value.target})
+
+
+@_op(OPS.XPATH)
+def _handle_xpath(curr_elem: html.HtmlElement,
+                  curr_segments: list[Segment],
+                  curr_depth: int,
+                  **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+    """
+    Handles the [/|//]<xpath> segment of a wxpath expression. This is a plain XPath expression.
+    Also handles wxpath-specific macro expansions like wx:backlink() or wx:depth().
+    """
+    _, value = curr_segments[0]
+    expr = value.expr
+    if curr_elem is None:
+        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+    base_url = getattr(curr_elem, 'base_url', None)
+    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+    _backlink_str = f"string('{curr_elem.get('backlink')}')"
+    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+    # increment after each url*() hop
+    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+    expr = expr.replace('wx:backlink()', _backlink_str)
+    expr = expr.replace('wx:backlink(.)', _backlink_str)
+    expr = expr.replace('wx:depth()', _depth_str)
+    expr = expr.replace('wx:depth(.)', _depth_str)
+
+    elems = curr_elem.xpath3(expr)
+
+    next_segments = curr_segments[1:]
+    for elem in elems:
+        value_or_elem = WxStr(
+            elem, base_url=base_url,
+            depth=curr_depth
+        ) if isinstance(elem, str) else elem
+        if len(curr_segments) == 1:
+            yield DataIntent(value=value_or_elem)
+        else:
+            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+
+
+@_op(OPS.XPATH_FN_MAP_FRAG)
+def _handle_xpath_fn_map_frag(curr_elem: html.HtmlElement | str,
+                              curr_segments: list[Segment],
+                              curr_depth: int,
+                              **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+    """
+    Handles the execution of XPath functions that were initially suffixed with a
+    '!' (map) operator.
+    """
+    _, value = curr_segments[0]
+
+    base_url = getattr(curr_elem, 'base_url', None)
+    next_segments = curr_segments[1:]
+
+    result = elementpath.select(
+        curr_elem,
+        value.expr,
+        parser=XPath3Parser,
+        item='' if curr_elem is None else None
+    )
+
+    if isinstance(result, AnyAtomicType):
+        result = [result]
+
+    for r in result:
+        value_or_elem = WxStr(r, base_url=base_url, depth=curr_depth) if isinstance(r, str) else r
+        if len(curr_segments) == 1:
+            # XPATH_FN_MAP_FRAG is not a terminal operation
+            raise ValueError("XPATH_FN_MAP_FRAG is not a terminal operation")
+        else:
+            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
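Handlers never fetch; they yield Intents that the runtime engine interprets, and the engine looks each one up by segment op through the HANDLERS registry. A small sketch of the two pieces fully visible above, WxStr provenance and operator lookup (the string and URL values are illustrative):

from wxpath.core.ops import WxStr, get_operator
from wxpath.core.parser import OPS

s = WxStr("Guido van Rossum", base_url="https://example.com/bio", depth=1)
print(repr(s))
# WxStr('Guido van Rossum', base_url='https://example.com/bio', depth=1)

handler = get_operator(OPS.XPATH)  # -> _handle_xpath; raises ValueError if unregistered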