wxpath-0.1.1-py3-none-any.whl → wxpath-0.2.0-py3-none-any.whl
- wxpath/__init__.py +9 -0
- wxpath/cli.py +52 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/errors.py +134 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +244 -0
- wxpath/core/parser.py +319 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +315 -0
- wxpath/core/runtime/helpers.py +48 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +133 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +196 -0
- wxpath/http/client/request.py +35 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/METADATA +28 -97
- wxpath-0.2.0.dist-info/RECORD +33 -0
- wxpath-0.2.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE +0 -0
wxpath/core/parser.py
ADDED
@@ -0,0 +1,319 @@
"""
This module contains two main kinds of functions:

1. functions for parsing wxpath expressions.
2. functions for extracting information from wxpath expressions or subexpressions.

"""
import re
from dataclasses import dataclass, fields
from typing import NamedTuple, Optional, TypeAlias

try:
    from enum import StrEnum
except ImportError:
    from enum import Enum

    class StrEnum(str, Enum):
        pass


@dataclass(frozen=True, slots=True)
class ValueBase:
    _value: str


@dataclass(frozen=True, slots=True)
class UrlValue(ValueBase):
    target: str
    follow: str | None = None


@dataclass(frozen=True, slots=True)
class XpathValue(ValueBase):
    expr: str


@dataclass(frozen=True, slots=True)
class UrlInfAndXpathValue(ValueBase):
    target: str
    expr: str


Value: TypeAlias = UrlValue | XpathValue | UrlInfAndXpathValue


class Segment(NamedTuple):
    op: str
    value: Value


class OPS(StrEnum):
    URL_STR_LIT = "url_str_lit"
    URL_EVAL = "url_eval"
    URL_INF = "url_inf"
    URL_INF_AND_XPATH = "url_inf_and_xpath"
    XPATH = "xpath"
    XPATH_FN_MAP_FRAG = "xpath_fn_map_frag"  # XPath function ending with map operator '!'
    INF_XPATH = "inf_xpath"  # Experimental
    OBJECT = "object"  # Deprecated
    URL_FROM_ATTR = "url_from_attr"  # Deprecated
    URL_OPR_AND_ARG = "url_opr_and_arg"  # Deprecated


def _scan_path_expr(path_expr: str) -> list[str]:
    """
    Given a wxpath expression, produce a list of all xpath and url() partitions.

    :param path_expr: The wxpath expression to scan.
    """
    # remove newlines
    path_expr = path_expr.replace('\n', '')
    partitions = []  # type: list[str]
    i = 0
    n = len(path_expr)
    while i < n:
        # Detect ///url(, //url(, /url(, or url(
        match = re.match(r'/{0,3}url\(', path_expr[i:])
        if match:
            seg_start = i
            i += match.end()  # Move past the matched "url("
            paren_depth = 1
            while i < n and paren_depth > 0:
                if path_expr[i] == '(':
                    paren_depth += 1
                elif path_expr[i] == ')':
                    paren_depth -= 1
                i += 1
            partitions.append(path_expr[seg_start:i])
        else:
            # Grab until the next /url(
            next_url = re.search(r'/{0,3}url\(', path_expr[i:])
            next_pos = next_url.start() + i if next_url else n
            if i != next_pos:
                partitions.append(path_expr[i:next_pos])
            i = next_pos

    return partitions


def parse_wxpath_expr(path_expr):
    partitions = _scan_path_expr(path_expr)

    # Lex and parse
    segments = []  # type: list[Segment]
    for s in partitions:
        s = s.strip()
        if not s:
            continue
        if s.startswith('url("') or s.startswith("url('"):
            segments.append(
                Segment(
                    OPS.URL_STR_LIT,
                    UrlValue(s, *parse_url_value(_extract_arg_from_url_op(s))),
                )
            )
        elif s.startswith('///url('):
            segments.append(
                Segment(
                    OPS.URL_INF,
                    # XpathValue(extract_url_op_arg(s))
                    XpathValue(_value=s, expr=_extract_arg_from_url_xpath_op(s))
                )
            )
        elif s.startswith('/url("') or s.startswith('//url("'):
            raise ValueError("url() segment cannot have string literal "
                             f"argument and preceding navigation slashes (/|//): {s}")
        elif s.startswith("/url('") or s.startswith("//url('"):
            raise ValueError("url() segment cannot have string literal "
                             f"argument and preceding navigation slashes (/|//): {s}")
        elif s.startswith('/url(') or s.startswith("//url("):
            segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
        elif s.startswith('url('):
            segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
        elif s.startswith('///'):
            raise ValueError(f"xpath segment cannot have preceding triple slashes: {s}")
            # segments.append(Segment(OPS.INF_XPATH, XpathValue(s, "//" + s[3:])))
        elif s.endswith('!'):
            segments.append(Segment(OPS.XPATH_FN_MAP_FRAG, XpathValue(s, s[:-1])))
        else:
            segments.append(Segment(OPS.XPATH, XpathValue(s, s)))

    ## EXPERIMENTAL
    ## Disabled for now
    ## Collapses an inf_xpath segment and the succeeding url_eval segment into a single url_inf segment
    # for i in range(len(segments) - 1, 0, -1):
    #     if segments[i - 1][0] == OPS.INF_XPATH and segments[i][0] == OPS.URL_EVAL:
    #         inf_xpath_value = segments[i - 1][1]
    #         url_eval_value = segments[i][1]
    #         url_eval_traversal_fragment = url_eval_value._value.split('url')[0]
    #         segments[i - 1] = Segment(
    #             OPS.URL_INF,
    #             XpathValue(
    #                 _value='',
    #                 expr=(f'{inf_xpath_value.expr}'
    #                       f'{url_eval_traversal_fragment}'
    #                       f'{url_eval_value.expr}')
    #             )
    #         )
    #         segments.pop(i)

    #### RAISE ERRORS FROM INVALID SEGMENTS ####
    # Raises if multiple ///url() are present
    if len([op for op, val in segments if op == OPS.URL_INF]) > 1:
        raise ValueError("Only one ///url() is allowed")

    # Raises if multiple url() with string literals are present
    if len([op for op, _ in segments if op == OPS.URL_STR_LIT]) > 1:
        raise ValueError("Only one url() with string literal argument is allowed")

    # Raises when expr starts with //url(@<attr>)
    if segments and segments[0][0] == OPS.URL_EVAL:
        raise ValueError("Path expr cannot start with [//]url(<xpath>)")

    # Raises if expr ends with INF_XPATH
    if segments and segments[-1][0] == OPS.INF_XPATH:
        raise ValueError("Path expr cannot end with ///<xpath>")

    # Raises if expr ends with XPATH_FN_MAP_FRAG
    if segments and segments[-1][0] == OPS.XPATH_FN_MAP_FRAG:
        raise ValueError("Path expr cannot end with !")
    return segments


def parse_url_value(src: str) -> tuple[str, Optional[str]]:
    """
    Parse the contents of url(...).

    Examples of src:
        "'https://example.com'"
        "//a/@href"
        "'https://x', follow=//a/@href"
    """

    parts = _split_top_level_commas(src)

    if not parts:
        raise SyntaxError("url() requires at least one argument")

    # ---- positional argument (target) ----
    target_src = parts[0].strip()
    if not target_src:
        raise SyntaxError("url() target cannot be empty")

    target = _parse_url_target(target_src)

    follow = None

    # ---- keyword arguments ----
    for part in parts[1:]:
        name, value = _split_kwarg(part)

        if name == "follow":
            if follow is not None:
                raise SyntaxError("duplicate follow= in url()")
            follow = value.strip()
        else:
            raise SyntaxError(f"unknown url() argument: {name}")

    return target, follow


def extract_url_op_arg(url_op_and_arg: str) -> str:
    url_op_arg = _extract_arg_from_url_xpath_op(url_op_and_arg)
    if url_op_arg.startswith('@'):
        return ".//" + url_op_arg
    elif url_op_arg.startswith('.'):
        return url_op_arg
    elif url_op_arg.startswith('//'):
        return '.' + url_op_arg
    elif not url_op_arg.startswith('.//'):
        return './/' + url_op_arg
    else:
        return url_op_arg


def _extract_arg_from_url_xpath_op(url_subsegment):
    match = re.search(r"url\((.+)\)", url_subsegment)
    if not match:
        raise ValueError(f"Invalid url() segment: {url_subsegment}")
    return match.group(1).strip("'\"")  # Remove surrounding quotes if any


def _extract_arg_from_url_op(url_subsegment):
    match = re.search(r"url\((.+)\)", url_subsegment)
    if not match:
        raise ValueError(f"Invalid url() segment: {url_subsegment}")
    return match.group(1)  # Keep surrounding quotes if any


def _split_top_level_commas(src: str) -> list[str]:
    parts = []
    buf = []
    depth = 0
    in_string = False
    quote = None

    for ch in src:
        if in_string:
            buf.append(ch)
            if ch == quote:
                in_string = False
            continue

        if ch in ("'", '"'):
            in_string = True
            quote = ch
            buf.append(ch)
            continue

        if ch in "([{":
            depth += 1
        elif ch in ")]}":
            depth -= 1
            if depth < 0:
                raise SyntaxError("unbalanced parentheses in url()")

        if ch == "," and depth == 0:
            parts.append("".join(buf).strip())
            buf.clear()
        else:
            buf.append(ch)

    if in_string or depth != 0:
        raise SyntaxError("unbalanced expression in url()")

    if buf:
        parts.append("".join(buf).strip())

    return parts


def _split_kwarg(src: str) -> tuple[str, str]:
    if "=" not in src:
        raise SyntaxError(f"expected keyword argument, got: {src}")

    name, value = src.split("=", 1)
    name = name.strip()
    value = value.strip()

    if not name or not value:
        raise SyntaxError(f"invalid keyword argument: {src}")

    return name, value


def _parse_url_target(src: str):
    src = src.strip()
    # string literal
    if (src.startswith("'") and src.endswith("'")) or \
       (src.startswith('"') and src.endswith('"')):
        return src[1:-1]

    return src


def _get_shallow_dict(instance: Value):
    return {field.name: getattr(instance, field.name)
            for field in fields(instance) if field.name not in {'_value'}}
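For orientation, here is a small hand-worked sketch of what parse_wxpath_expr produces for a two-segment expression. The URL is illustrative, and the expected output follows from the URL_STR_LIT and XPATH branches above:

    from wxpath.core.parser import parse_wxpath_expr

    segments = parse_wxpath_expr('url("https://example.com")//a/@href')
    # Expected, per the branches above:
    # [Segment(op=OPS.URL_STR_LIT,
    #          value=UrlValue(_value='url("https://example.com")',
    #                         target='https://example.com',
    #                         follow=None)),
    #  Segment(op=OPS.XPATH,
    #          value=XpathValue(_value='//a/@href', expr='//a/@href'))]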
wxpath/core/runtime/engine.py
ADDED
@@ -0,0 +1,315 @@
import asyncio
import contextlib
import inspect
from collections import deque
from typing import Any, AsyncGenerator

from lxml.html import HtmlElement

from wxpath import patches  # noqa: F401
from wxpath.core.models import (
    CrawlIntent,
    CrawlTask,
    DataIntent,
    ExtractIntent,
    InfiniteCrawlIntent,
    ProcessIntent,
)
from wxpath.core.ops import get_operator
from wxpath.core.parser import parse_wxpath_expr
from wxpath.core.runtime.helpers import parse_html
from wxpath.hooks.registry import FetchContext, get_hooks
from wxpath.http.client.crawler import Crawler
from wxpath.http.client.request import Request
from wxpath.util.logging import get_logger

log = get_logger(__name__)


class HookedEngineBase:
    async def post_fetch_hooks(self, body, task):
        for hook in get_hooks():
            hook_method = getattr(hook, "post_fetch", lambda _, b: b)
            if inspect.iscoroutinefunction(hook_method):
                body = await hook_method(
                    FetchContext(task.url, task.backlink, task.depth, task.segments),
                    body
                )
            else:
                body = hook_method(
                    FetchContext(task.url, task.backlink, task.depth, task.segments),
                    body
                )
            if not body:
                log.debug(f"hook {type(hook).__name__} dropped {task.url}")
                break
        return body

    async def post_parse_hooks(self, elem, task):
        for hook in get_hooks():
            hook_method = getattr(hook, "post_parse", lambda _, e: e)
            if inspect.iscoroutinefunction(hook_method):
                elem = await hook_method(
                    FetchContext(
                        url=task.url,
                        backlink=task.backlink,
                        depth=task.depth,
                        segments=task.segments
                    ),
                    elem,
                )
            else:
                elem = hook_method(
                    FetchContext(
                        url=task.url,
                        backlink=task.backlink,
                        depth=task.depth,
                        segments=task.segments
                    ),
                    elem,
                )
            if elem is None:
                log.debug(f"hook {type(hook).__name__} dropped {task.url}")
                break
        return elem

    async def post_extract_hooks(self, value):
        for hook in get_hooks():
            hook_method = getattr(hook, "post_extract", lambda v: v)
            if inspect.iscoroutinefunction(hook_method):
                value = await hook_method(value)
            else:
                value = hook_method(value)
            if value is None:
                log.debug(f"hook {type(hook).__name__} dropped value")
                break
        return value


class WXPathEngine(HookedEngineBase):
    """
    Main class for executing wxpath expressions.

    The engine's core pattern is to build a queue of CrawlTasks that is
    crawled and processed FIFO. The traversal of the queue (and therefore
    the web graph) is concurrent and proceeds in BFS-ish order.

    Args:
        crawler: Crawler instance
        concurrency: number of concurrent fetches at the Crawler (request engine) level
        per_host: number of concurrent fetches per host
    """
    def __init__(
        self,
        crawler: Crawler | None = None,
        concurrency: int = 16,
        per_host: int = 8
    ):
        self.seen_urls: set[str] = set()
        self.crawler = crawler or Crawler(concurrency=concurrency, per_host=per_host)

    async def run(self, expression: str, max_depth: int):
        segments = parse_wxpath_expr(expression)

        queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
        inflight: dict[str, CrawlTask] = {}
        pending_tasks = 0

        def is_terminal():
            return queue.empty() and pending_tasks <= 0

        async with self.crawler as crawler:
            async def submitter():
                nonlocal pending_tasks
                while True:
                    task = await queue.get()

                    if task is None:
                        break

                    if task.url in self.seen_urls or task.url in inflight:
                        queue.task_done()
                        continue

                    # Mark URL as seen immediately
                    self.seen_urls.add(task.url)
                    inflight[task.url] = task

                    pending_tasks += 1
                    crawler.submit(Request(task.url, max_retries=0))
                    queue.task_done()

            submit_task = asyncio.create_task(submitter())

            # Seed the pipeline with a dummy task
            seed_task = CrawlTask(
                elem=None,
                url=None,
                segments=segments,
                depth=-1,
                backlink=None,
            )
            async for output in self._process_pipeline(
                task=seed_task,
                elem=None,
                depth=seed_task.depth,
                max_depth=max_depth,
                queue=queue,
            ):
                yield await self.post_extract_hooks(output)

            # While looping asynchronous generators, you MUST make sure
            # to check terminal conditions before re-iteration.
            async for resp in crawler:
                task = inflight.pop(resp.request.url, None)
                pending_tasks -= 1

                if task is None:
                    log.warning(f"Got unexpected response from {resp.request.url}")
                    if is_terminal():
                        break
                    continue

                if resp.error:
                    log.warning(f"Got error from {resp.request.url}: {resp.error}")
                    if is_terminal():
                        break
                    continue

                # NOTE: Consider allowing redirects
                if resp.status != 200 or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
                    if is_terminal():
                        break
                    continue

                body = await self.post_fetch_hooks(resp.body, task)
                if not body:
                    if is_terminal():
                        break
                    continue

                elem = parse_html(
                    body,
                    base_url=task.url,
                    backlink=task.backlink,
                    depth=task.depth,
                )

                elem = await self.post_parse_hooks(elem, task)
                if elem is None:
                    if is_terminal():
                        break
                    continue

                if task.segments:
                    async for output in self._process_pipeline(
                        task=task,
                        elem=elem,
                        depth=task.depth,
                        max_depth=max_depth,
                        queue=queue,
                    ):
                        yield await self.post_extract_hooks(output)
                else:
                    yield await self.post_extract_hooks(elem)

                # Termination condition
                if is_terminal():
                    break

            submit_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await submit_task

    async def _process_pipeline(
        self,
        task: CrawlTask,
        elem,
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
    ):
        mini_queue: deque[tuple[HtmlElement, list[tuple[str, str]]]] = deque([(elem, task.segments)])

        while mini_queue:
            elem, segments = mini_queue.popleft()

            op, _ = segments[0]
            operator = get_operator(op)

            intents = operator(elem, segments, depth)

            if not intents:
                return

            for intent in intents:
                if isinstance(intent, DataIntent):
                    yield intent.value

                elif isinstance(intent, CrawlIntent):
                    next_depth = task.depth + 1
                    # if intent.url not in self.seen_urls and next_depth <= max_depth:
                    if next_depth <= max_depth:
                        # self.seen_urls.add(intent.url)
                        queue.put_nowait(
                            CrawlTask(
                                elem=None,
                                url=intent.url,
                                segments=intent.next_segments,
                                depth=next_depth,
                                backlink=task.url,
                            )
                        )

                elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
                    # immediately traverse the extraction
                    elem = intent.elem
                    next_segments = intent.next_segments
                    mini_queue.append((elem, next_segments))


def wxpath_async(path_expr: str,
                 max_depth: int,
                 engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
    return engine.run(path_expr, max_depth)


##### ASYNC IN SYNC #####
def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine | None = None):
    """
    Evaluate a wxpath expression using concurrent breadth-first traversal.

    Args:
        path_expr (str): A wxpath expression.
        max_depth (int, optional): Maximum crawl depth. Must be at least the
            number of `url*` segments minus one. Defaults to `1`.

    Yields:
        lxml.html.HtmlElement | wxpath.models.WxStr | dict | Any: The same objects
        produced by the sequential evaluator.

    Warning:
        Spins up its own event loop, so this function must **not** be
        invoked from within an active asyncio event loop.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)

    try:
        while True:
            try:
                yield loop.run_until_complete(agen.__anext__())
            except StopAsyncIteration:
                break
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()


def wxpath_async_blocking(path_expr, max_depth=1, engine: WXPathEngine | None = None):
    return list(wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine))
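A minimal usage sketch for the blocking wrapper above. The expression and URL are illustrative, and per the docstring this must run outside any active event loop:

    from wxpath.core.runtime.engine import wxpath_async_blocking_iter

    # Fetch the seed page and extract every href on it.
    expr = 'url("https://example.com")//a/@href'
    for value in wxpath_async_blocking_iter(expr, max_depth=1):
        print(value)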
wxpath/core/runtime/helpers.py
ADDED
@@ -0,0 +1,48 @@
import requests
from lxml import etree, html

from wxpath import patches
from wxpath.util.logging import get_logger

log = get_logger(__name__)


def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
    elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
    if base_url:
        elem.getroottree().docinfo.URL = base_url  # make base-uri() work
        # Also set xml:base on the root element for XPath base-uri()
        elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
        elem.base_url = base_url  # sets both attribute and doc-level URL

    # NOTE: some pages may have multiple root elements, i.e.
    # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
    # This breaks elementpath. If elem has siblings, recreate the
    # root element and only the root element.
    if len(list(elem.itersiblings())) > 0:
        elem = detach_html_root(elem, base_url)

    for k, v in elem_kv_pairs.items():
        elem.set(k, str(v))
    return elem


def detach_html_root(elem, base_url=None):
    new_root = etree.HTML(
        etree.tostring(elem, encoding="utf-8"),
        parser=patches.html_parser_with_xpath3,
        base_url=base_url
    )

    if base_url:
        new_root.getroottree().docinfo.URL = base_url
        new_root.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
        new_root.base_url = base_url

    return new_root


def fetch_html(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.content
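A quick sketch of calling parse_html directly, mirroring the engine's call above; the HTML snippet and URL are illustrative. Extra keyword pairs become attributes on the root element:

    body = b'<html><body><a href="/x">x</a></body></html>'
    elem = parse_html(body, base_url="https://example.com", depth=0)
    print(elem.get("depth"))                # "0"
    print(elem.getroottree().docinfo.URL)   # "https://example.com"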
wxpath/hooks/__init__.py
ADDED
@@ -0,0 +1,9 @@
from wxpath.hooks.builtin import JSONLWriter as JSONLWriter
from wxpath.hooks.builtin import SerializeXPathMapAndNodeHook as SerializeXPathMapAndNodeHook
from wxpath.hooks.registry import register as register

__all__ = [
    "JSONLWriter",
    "SerializeXPathMapAndNodeHook",
    "register",
]
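Judging from the hook call sites in engine.py above, a hook is a plain object exposing any of post_fetch, post_parse, or post_extract, and returning a falsy/None result drops the item. A sketch, assuming register() accepts a hook instance (the actual registry API lives in wxpath/hooks/registry.py, whose body is not shown in this diff):

    from wxpath.hooks import register

    class DropEmptyStrings:
        # Returning None from post_extract drops the value
        # (see post_extract_hooks in engine.py above).
        def post_extract(self, value):
            if isinstance(value, str) and not value.strip():
                return None
            return value

    register(DropEmptyStrings())  # assumed signature; see registry.py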