wxpath-0.2.0-py3-none-any.whl → wxpath-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +52 -12
- wxpath/core/ops.py +163 -129
- wxpath/core/parser.py +559 -280
- wxpath/core/runtime/engine.py +133 -42
- wxpath/core/runtime/helpers.py +0 -7
- wxpath/hooks/registry.py +29 -17
- wxpath/http/client/crawler.py +46 -11
- wxpath/http/client/request.py +6 -3
- wxpath/http/client/response.py +1 -1
- wxpath/http/policy/robots.py +82 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/METADATA +84 -37
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/RECORD +16 -16
- wxpath/core/errors.py +0 -134
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/top_level.txt +0 -0
wxpath/core/runtime/engine.py
CHANGED
@@ -2,11 +2,12 @@ import asyncio
 import contextlib
 import inspect
 from collections import deque
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, Iterator
 
 from lxml.html import HtmlElement
 
 from wxpath import patches  # noqa: F401
+from wxpath.core import parser
 from wxpath.core.models import (
     CrawlIntent,
     CrawlTask,
@@ -16,7 +17,7 @@ from wxpath.core.models import (
     ProcessIntent,
 )
 from wxpath.core.ops import get_operator
-from wxpath.core.parser import
+from wxpath.core.parser import Binary, Segment, Segments
 from wxpath.core.runtime.helpers import parse_html
 from wxpath.hooks.registry import FetchContext, get_hooks
 from wxpath.http.client.crawler import Crawler
@@ -27,7 +28,21 @@ log = get_logger(__name__)
 
 
 class HookedEngineBase:
-
+    """Common hook invocation helpers shared by engine variants."""
+
+    async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
+        """Run registered `post_fetch` hooks over a fetched response body.
+
+        Hooks may be synchronous or asynchronous and can transform or drop the
+        response payload entirely.
+
+        Args:
+            body: Raw response body bytes from the crawler.
+            task: The `CrawlTask` that produced the response.
+
+        Returns:
+            The transformed body, or `None` if any hook chooses to drop it.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_fetch", lambda _, b: b)
             if inspect.iscoroutinefunction(hook_method):
@@ -45,7 +60,18 @@ class HookedEngineBase:
                 break
         return body
 
-    async def post_parse_hooks(
+    async def post_parse_hooks(
+        self, elem: HtmlElement | None, task: CrawlTask
+    ) -> HtmlElement | None:
+        """Run registered `post_parse` hooks on a parsed DOM element.
+
+        Args:
+            elem: Parsed `lxml` element to process.
+            task: The originating `CrawlTask`.
+
+        Returns:
+            The transformed element, or `None` if a hook drops the branch.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_parse", lambda _, e: e)
             if inspect.iscoroutinefunction(hook_method):
@@ -73,7 +99,15 @@ class HookedEngineBase:
                 break
         return elem
 
-    async def post_extract_hooks(self, value):
+    async def post_extract_hooks(self, value: Any) -> Any | None:
+        """Run registered `post_extract` hooks on extracted values.
+
+        Args:
+            value: The extracted datum to post-process.
+
+        Returns:
+            The transformed value, or `None` if a hook drops it.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_extract", lambda v: v)
             if inspect.iscoroutinefunction(hook_method):
@@ -87,35 +121,65 @@ class HookedEngineBase:
 
 
 class WXPathEngine(HookedEngineBase):
-    """
-    Main class for executing wxpath expressions.
+    """Main class for executing wxpath expressions.
 
-    The core pattern
-
-
+    The core pattern is to build a queue of CrawlTasks that are crawled and
+    processed FIFO. Traversal of the queue (and therefore the web graph) is
+    done concurrently in BFS-ish order.
 
     Args:
-        crawler: Crawler instance
-        concurrency:
-        per_host:
+        crawler: Crawler instance to use for HTTP requests.
+        concurrency: Number of concurrent fetches at the Crawler level.
+        per_host: Number of concurrent fetches per host.
+        respect_robots: Whether to respect robots.txt directives.
+        allowed_response_codes: Set of allowed HTTP response codes. Defaults
+            to ``{200}``. Responses may still be filtered and dropped.
+        allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
     """
     def __init__(
        self,
        crawler: Crawler | None = None,
        concurrency: int = 16,
-        per_host: int = 8
+        per_host: int = 8,
+        respect_robots: bool = True,
+        allowed_response_codes: set[int] = None,
+        allow_redirects: bool = True,
    ):
+        # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
        self.seen_urls: set[str] = set()
-        self.crawler = crawler or Crawler(
-
-
-
+        self.crawler = crawler or Crawler(
+            concurrency=concurrency,
+            per_host=per_host,
+            respect_robots=respect_robots
+        )
+        self.allowed_response_codes = allowed_response_codes or {200}
+        self.allow_redirects = allow_redirects
+        if allow_redirects:
+            self.allowed_response_codes |= {301, 302, 303, 307, 308}
+
+    async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
+        """Execute a wxpath expression concurrently and yield results.
+
+        Builds and drives a BFS-like crawl pipeline that honors robots rules,
+        throttling, and hook callbacks while walking the web graph.
+
+        Args:
+            expression: WXPath expression string to evaluate.
+            max_depth: Maximum crawl depth to follow for url hops.
+
+        Yields:
+            Extracted values produced by the expression (HTML elements or
+            wxpath-specific value types).
+        """
+        segments = parser.parse(expression)
 
         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
         inflight: dict[str, CrawlTask] = {}
         pending_tasks = 0
 
         def is_terminal():
+            # NOTE: consider adopting state machine pattern for determining
+            # the current state of the engine.
            return queue.empty() and pending_tasks <= 0
 
        async with self.crawler as crawler:
@@ -177,7 +241,7 @@ class WXPathEngine(HookedEngineBase):
                    continue
 
                # NOTE: Consider allowing redirects
-                if resp.status
+                if resp.status not in self.allowed_response_codes or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
                    if is_terminal():
                        break
@@ -226,20 +290,36 @@ class WXPathEngine(HookedEngineBase):
     async def _process_pipeline(
        self,
        task: CrawlTask,
-        elem,
+        elem: Any,
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
-    ):
-
+    ) -> AsyncGenerator[Any, None]:
+        """Process a queue of intents for a single crawl branch.
+
+        Traverses wxpath segments depth-first within a page while coordinating
+        newly discovered crawl intents back to the shared queue.
+
+        Args:
+            task: The originating crawl task for this branch.
+            elem: Current DOM element (or extracted value) being processed.
+            depth: Current traversal depth.
+            max_depth: Maximum permitted crawl depth.
+            queue: Shared crawl queue for enqueuing downstream URLs.
+
+        Yields:
+            object: Extracted values or processed elements as produced by operators.
+        """
+        mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
+            [(elem, task.segments)]
+        )
 
        while mini_queue:
-            elem,
-
-            op, _ = segments[0]
-            operator = get_operator(op)
+            elem, bin_or_segs = mini_queue.popleft()
 
-
+            binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
+            operator = get_operator(binary_or_segment)
+            intents = operator(elem, bin_or_segs, depth)
 
            if not intents:
                return
@@ -253,6 +333,7 @@ class WXPathEngine(HookedEngineBase):
                # if intent.url not in self.seen_urls and next_depth <= max_depth:
                if next_depth <= max_depth:
                    # self.seen_urls.add(intent.url)
+                    log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
                    queue.put_nowait(
                        CrawlTask(
                            elem=None,
@@ -272,29 +353,33 @@ class WXPathEngine(HookedEngineBase):
 
 def wxpath_async(path_expr: str,
                 max_depth: int,
-                 engine: WXPathEngine = None) -> AsyncGenerator[Any, None]:
+                 engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
    return engine.run(path_expr, max_depth)
 
 
 ##### ASYNC IN SYNC #####
-def wxpath_async_blocking_iter(
-
-
-
-
-
-        max_depth (int, optional): Maximum crawl depth. Must be at least the
-            number of `url*` segments minus one. Defaults to `1`.
-
-    Yields:
-        lxml.html.HtmlElement | wxpath.models.WxStr | dict | Any: The same objects
-            produced by the sequential evaluator.
+def wxpath_async_blocking_iter(
+    path_expr: str,
+    max_depth: int = 1,
+    engine: WXPathEngine | None = None,
+) -> Iterator[Any]:
+    """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
    Warning:
        Spins up its own event loop therefore this function must **not** be
        invoked from within an active asyncio event loop.
+
+    Args:
+        path_expr: A wxpath expression.
+        max_depth: Maximum crawl depth. Must be at least the number of
+            ``url*`` segments minus one.
+        engine: Optional pre-configured WXPathEngine instance.
+
+    Yields:
+        object: Extracted objects (HtmlElement, WxStr, dict, or other values)
+            produced by the expression evaluator.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
@@ -311,5 +396,11 @@ def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine = No
    loop.close()
 
 
-def wxpath_async_blocking(
-
+def wxpath_async_blocking(
+    path_expr: str,
+    max_depth: int = 1,
+    engine: WXPathEngine | None = None,
+) -> list[Any]:
+    return list(
+        wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine)
+    )
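
For orientation, a minimal usage sketch of the 0.3.0 constructor options and the typed blocking iterator above. This is illustrative only: the expression string and URL are hypothetical placeholders (the wxpath expression syntax is not shown in this diff), and the import path simply mirrors the file location wxpath/core/runtime/engine.py.

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter

# Engine configured with the options introduced in 0.3.0.
engine = WXPathEngine(
    concurrency=8,
    per_host=4,
    respect_robots=True,    # enforce robots.txt via the crawler
    allow_redirects=True,   # also admits 301/302/303/307/308 responses
)

# Blocking iteration; per the docstring above, this must not be called
# from inside an already-running asyncio event loop.
for item in wxpath_async_blocking_iter(
    "url('https://example.com')//a/@href",  # hypothetical wxpath expression
    max_depth=1,
    engine=engine,
):
    print(item)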
wxpath/core/runtime/helpers.py
CHANGED
@@ -1,4 +1,3 @@
-import requests
 from lxml import etree, html
 
 from wxpath import patches
@@ -40,9 +39,3 @@ def detach_html_root(elem, base_url=None):
        new_root.base_url = base_url
 
    return new_root
-
-
-def fetch_html(url):
-    response = requests.get(url, timeout=10)
-    response.raise_for_status()
-    return response.content
wxpath/hooks/registry.py
CHANGED
@@ -17,7 +17,6 @@ Write once:
 from __future__ import annotations
 
 import functools
-from collections import OrderedDict
 from collections.abc import Generator
 from dataclasses import dataclass, field
 from typing import Any, Iterable, List, Optional, Protocol
@@ -66,20 +65,24 @@ class Hook(Protocol):
 # --------------------------------------------------------------------------- #
 # Global registry helpers
 # --------------------------------------------------------------------------- #
-_global_hooks:
+_global_hooks: dict[str, Hook] = dict()
 
 
 def register(hook: Hook | type) -> Hook:
-    """
-
-
-
-
-
-
-
-
-
+    """Decorator/helper to add a Hook to the global list.
+
+    Args:
+        hook: A Hook class or instance to register.
+
+    Returns:
+        The registered hook (instantiated if a class was provided).
+
+    Example:
+        >>> @register
+        ... class DebugHook:
+        ...     def post_fetch(self, ctx, html_bytes):
+        ...         print("Fetched", ctx.url)
+        ...         return html_bytes
    """
 
    hook_name = getattr(hook, '__name__', hook.__class__.__name__)
@@ -101,9 +104,13 @@ def iter_post_extract_hooks() -> Iterable[Hook]:
 
 
 def pipe_post_extract(gen_func):
-    """
-
-
+    """Wrap a generator function to pipe yielded values through post_extract hooks.
+
+    Args:
+        gen_func: A generator function to wrap.
+
+    Returns:
+        A wrapped generator that filters values through registered hooks.
    """
    @functools.wraps(gen_func)
    def wrapper(*args, **kwargs) -> Generator:
@@ -118,8 +125,13 @@ def pipe_post_extract(gen_func):
 
 
 def pipe_post_extract_async(async_gen_func):
-    """
-
+    """Wrap an async generator function to pipe yielded values through hooks.
+
+    Args:
+        async_gen_func: An async generator function to wrap.
+
+    Returns:
+        A wrapped async generator that filters values through registered hooks.
    """
    @functools.wraps(async_gen_func)
    async def wrapper(*args, **kwargs):
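
As a sketch, hook registration following the docstring example restored above; the post_extract method shown here is an assumption inferred from the engine's `lambda v: v` fallback rather than something this diff defines.

from wxpath.hooks.registry import register

@register
class DebugHook:
    def post_fetch(self, ctx, html_bytes):
        print("Fetched", ctx.url)
        return html_bytes

    def post_extract(self, value):  # assumed single-value signature
        # Returning None would drop the value from the pipeline.
        return value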
wxpath/http/client/crawler.py
CHANGED
@@ -10,6 +10,7 @@ import aiohttp
 from wxpath.http.client.request import Request
 from wxpath.http.client.response import Response
 from wxpath.http.policy.retry import RetryPolicy
+from wxpath.http.policy.robots import RobotsTxtPolicy
 from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
 from wxpath.http.stats import CrawlerStats, build_trace_config
 from wxpath.util.logging import get_logger
@@ -22,6 +23,8 @@ HEADERS = {"User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
 
 
 class Crawler:
+    """Concurrent HTTP crawler that manages throttling, retries, and robots."""
+
     def __init__(
        self,
        concurrency: int = 16,
@@ -35,11 +38,13 @@ class Crawler:
        auto_throttle_target_concurrency: float = None,
        auto_throttle_start_delay: float = 0.25,
        auto_throttle_max_delay: float = 10.0,
+        respect_robots: bool = True,
    ):
        self.concurrency = concurrency
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._headers = HEADERS | (headers or {})  # merge headers
-        self._proxies = proxies or {}
+        self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
+        self.respect_robots = respect_robots
 
        self.retry_policy = retry_policy or RetryPolicy()
        self.throttler = throttler or AutoThrottler(
@@ -57,8 +62,10 @@ class Crawler:
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()
+        self._robots_policy: RobotsTxtPolicy | None = None
 
-    def build_session(self):
+    def build_session(self) -> aiohttp.ClientSession:
+        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
@@ -69,26 +76,34 @@ class Crawler:
            trace_configs=[trace_config]
        )
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> "Crawler":
+        """Initialize HTTP session and start background workers."""
        if self._session is None:
            # self._session = aiohttp.ClientSession(timeout=self._timeout)
            self._session = self.build_session()
 
+        if self.respect_robots:
+            self._robots_policy = RobotsTxtPolicy(self._session)
+
        self._workers = [
            asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
            for i in range(self.concurrency)
        ]
        return self
 
-    async def __aexit__(self, *_):
+    async def __aexit__(self, *_) -> None:
+        """Tear down workers and close the HTTP session."""
        self._closed = True
        for w in self._workers:
            w.cancel()
+
        await asyncio.gather(*self._workers, return_exceptions=True)
+
        if self._session:
            await self._session.close()
 
-    def submit(self, req: Request):
+    def submit(self, req: Request) -> None:
+        """Queue a request for fetching or raise if crawler already closed."""
        if self._closed:
            raise RuntimeError("crawler is closed")
        self._pending.put_nowait(req)
@@ -96,18 +111,28 @@ class Crawler:
     def __aiter__(self) -> AsyncIterator[Response]:
        return self._result_iter()
 
-    async def _result_iter(self):
+    async def _result_iter(self) -> AsyncIterator[Response]:
+        """Async iterator yielding responses as workers produce them."""
        # while not self._closed:
        while not (self._closed and self._results.empty()):
            resp = await self._results.get()
            self._results.task_done()
            yield resp
 
-    def _proxy_for(self, url: str):
+    def _proxy_for(self, url: str) -> str | None:
        host = urllib.parse.urlsplit(url).hostname
-
-
-
+        try:
+            # bracket notation first, for defaultdicts
+            value = self._proxies[host]
+        except KeyError:
+            value = self._proxies.get(host)
+
+        if not value:
+            log.debug("proxy", extra={"host": host, "value": value})
+        return value
+
+    async def _worker(self) -> None:
+        """Worker loop that fetches pending requests and enqueues results."""
        while True:
            req = await self._pending.get()
            try:
@@ -133,8 +158,17 @@ class Crawler:
                self._pending.task_done()
 
     async def _fetch_one(self, req: Request) -> Response | None:
+        """Fetch a single request, handling robots, throttling, and retries."""
        host = req.hostname
 
+        if self._robots_policy:
+            can_fetch = await self._robots_policy.can_fetch(
+                req.url, self._headers.get("User-Agent")
+            )
+            if not can_fetch:
+                log.debug("disallowed by robots.txt", extra={"url": req.url})
+                return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))
+
        # TODO: Move this filter to hooks
        if req.url.lower().endswith((".pdf", ".zip", ".exe")):
            req.max_retries = 0
@@ -181,7 +215,8 @@ class Crawler:
            log.error("request failed", extra={"url": req.url}, exc_info=exc)
            return Response(req, 0, b"", error=exc)
 
-    async def _retry(self, req: Request):
+    async def _retry(self, req: Request) -> None:
+        """Reschedule a request according to the retry policy."""
        req.retries += 1
        delay = self.retry_policy.get_delay(req)
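
A rough sketch of driving the Crawler directly, based on the submit / async-iterate flow above; the Response attributes used (status, body, request, error) are assumed from the constructor calls visible in this diff, and the URL is a placeholder.

import asyncio

from wxpath.http.client.crawler import Crawler
from wxpath.http.client.request import Request

async def main():
    async with Crawler(concurrency=4, respect_robots=True) as crawler:
        # Queue one request, then consume results as workers produce them.
        crawler.submit(Request(url="https://example.com/"))
        async for resp in crawler:
            if resp.error:
                print("failed:", resp.request.url, resp.error)
            else:
                print(resp.status, resp.request.url, len(resp.body))
            break  # this sketch stops after the first result

asyncio.run(main())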
wxpath/http/client/request.py
CHANGED
@@ -1,24 +1,26 @@
 import time
 from dataclasses import dataclass, field
-from typing import Any
+from typing import Any
 
 
 @dataclass
 class Request:
+    """HTTP request envelope used by the crawler."""
     url: str
     method: str = "GET"
-    headers:
+    headers: dict[str, str] = field(default_factory=dict)
     timeout: float = 15.0
 
     retries: int = 0
     max_retries: int | None = None
     dont_retry: bool = False
 
-    meta:
+    meta: dict[str, Any] = field(default_factory=dict)
 
     created_at: float = field(default_factory=time.monotonic)
 
     def copy_for_retry(self) -> "Request":
+        """Create a copy incrementing the retry counter for scheduling."""
        return Request(
            url=self.url,
            method=self.method,
@@ -26,6 +28,7 @@ class Request:
            timeout=self.timeout,
            retries=self.retries + 1,
            max_retries=self.max_retries,
+            dont_retry=self.dont_retry,
            meta=self.meta,
        )
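
A small sketch of the dataclass defaults and the copy_for_retry fix above: the retry copy now carries dont_retry forward, and meta is passed by reference rather than copied. Field values are illustrative.

from wxpath.http.client.request import Request

req = Request(url="https://example.com/page", meta={"depth": 0}, dont_retry=True)
retry = req.copy_for_retry()

assert retry.retries == req.retries + 1
assert retry.dont_retry is True   # preserved in 0.3.0; previously reset on retry copies
assert retry.meta is req.meta     # same dict object, not a copy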
wxpath/http/policy/robots.py
ADDED
@@ -0,0 +1,82 @@
+import asyncio
+import urllib.parse
+import urllib.robotparser
+
+import aiohttp
+
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+
+class RobotsTxtPolicy:
+    """Caches and evaluates robots.txt rules for crawler requests."""
+
+    def __init__(self,
+                 session: aiohttp.ClientSession,
+                 default_parser: type['RobotsParserBase'] | None = None):
+        self._session = session
+        self._parsers: dict[str, "RobotsParserBase"] = {}
+        self._lock = asyncio.Lock()
+        self._default_parser = default_parser or UrllibRobotParser
+
+    async def can_fetch(self, url: str, user_agent: str | None) -> bool:
+        """Return whether the crawler is allowed to fetch `url`."""
+        host = urllib.parse.urlsplit(url).hostname
+        if not host:
+            return False
+
+        # Due to multiple aiohttp workers running concurrently, we need to lock
+        async with self._lock:
+            if host not in self._parsers:
+                self._parsers[host] = await self._fetch_robots_txt(host)
+
+        return self._parsers[host].can_fetch(url, user_agent)
+
+    async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
+        """Retrieve and parse the robots.txt for `host`, failing open on errors."""
+        url = f"http://{host}/robots.txt"
+        try:
+            async with self._session.get(url) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    # Pass the text as-is to the parser, let it handle the format
+                    if self._default_parser == UrllibRobotParser:
+                        return self._default_parser(text.splitlines())
+                    else:
+                        return self._default_parser(text)
+                else:
+                    # Empty robots.txt - allow all
+                    if self._default_parser == UrllibRobotParser:
+                        return self._default_parser([])
+                    else:
+                        return self._default_parser("")
+        except Exception:
+            # If robots.txt is unavailable, allow all requests (fail open)
+            log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
+            if self._default_parser == UrllibRobotParser:
+                return self._default_parser([])
+            else:
+                return self._default_parser("")
+
+
+class RobotsParserBase:
+    """Base type for robots.txt parsers used by the policy."""
+
+
+class UrllibRobotParser(RobotsParserBase):
+    """Adapter around `urllib.robotparser.RobotFileParser`."""
+
+    def __init__(self, text):
+        self._parser = urllib.robotparser.RobotFileParser()
+        # urllib.robotparser.RobotFileParser.parse() expects a list of lines
+        if isinstance(text, str):
+            lines = text.splitlines() if text else []
+        else:
+            lines = text if text else []
+        self._parser.parse(lines)
+
+    def can_fetch(self, url, user_agent):
+        """Return whether the URL is allowed for the given user agent."""
+        return self._parser.can_fetch(user_agent, url)
+