wxpath 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +92 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +406 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +231 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA +107 -129
- wxpath-0.3.0.dist-info/RECORD +33 -0
- wxpath-0.3.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
wxpath/hooks/registry.py
ADDED
@@ -0,0 +1,145 @@
"""
Pluggable hook system for wxpath.

Write once:

    from wxpath import hooks

    @hooks.register
    class OnlyEnglish:
        def post_parse(self, ctx, elem):
            lang = elem.xpath('string(/html/@lang)').lower()[:2]
            return elem if lang in ("en", "") else None

... and wxpath.engine will call it automatically.
"""

from __future__ import annotations

import functools
from collections.abc import Generator
from dataclasses import dataclass, field
from typing import Any, Iterable, List, Optional, Protocol

from lxml import html

from wxpath.util.logging import get_logger

log = get_logger(__name__)


# --------------------------------------------------------------------------- #
# Dataclass describing the crawl context for a single URL
# --------------------------------------------------------------------------- #
@dataclass
class FetchContext:
    url: str
    backlink: Optional[str]
    depth: int
    segments: list  # remaining op/value pairs
    user_data: dict = field(default_factory=dict)


# --------------------------------------------------------------------------- #
# Hook protocol - every method is optional
# --------------------------------------------------------------------------- #
class Hook(Protocol):
    # Return False to abort fetching this URL
    # def pre_fetch(self, ctx: FetchContext) -> bool: ...

    # May return modified HTML bytes or None to drop this branch entirely
    def post_fetch(self, ctx: FetchContext, html_bytes: bytes) -> bytes | None: ...

    # May return modified element or None to drop this branch entirely
    def post_parse(
        self, ctx: FetchContext, elem: html.HtmlElement
    ) -> html.HtmlElement | None: ...

    # Called for every candidate link; return False to prevent enqueueing it
    # def pre_queue(self, ctx: FetchContext, url: str) -> bool: ...

    # Called for every extracted value; may transform or drop it
    def post_extract(self, value: Any) -> Any: ...


# --------------------------------------------------------------------------- #
# Global registry helpers
# --------------------------------------------------------------------------- #
_global_hooks: dict[str, Hook] = dict()


def register(hook: Hook | type) -> Hook:
    """Decorator/helper to add a Hook to the global list.

    Args:
        hook: A Hook class or instance to register.

    Returns:
        The registered hook (instantiated if a class was provided).

    Example:
        >>> @register
        ... class DebugHook:
        ...     def post_fetch(self, ctx, html_bytes):
        ...         print("Fetched", ctx.url)
        ...         return html_bytes
    """

    hook_name = getattr(hook, '__name__', hook.__class__.__name__)
    if hook_name in _global_hooks:
        return hook

    instance = hook() if isinstance(hook, type) else hook
    _global_hooks[hook_name] = instance
    return hook


def get_hooks() -> List[Hook]:
    """Return the list of globally-registered hooks (read-only)."""
    return list(_global_hooks.values())


def iter_post_extract_hooks() -> Iterable[Hook]:
    yield from (h for h in _global_hooks.values() if hasattr(h, "post_extract"))


def pipe_post_extract(gen_func):
    """Wrap a generator function to pipe yielded values through post_extract hooks.

    Args:
        gen_func: A generator function to wrap.

    Returns:
        A wrapped generator that filters values through registered hooks.
    """
    @functools.wraps(gen_func)
    def wrapper(*args, **kwargs) -> Generator:
        for item in gen_func(*args, **kwargs):
            for hook in iter_post_extract_hooks():
                item = hook.post_extract(item)
                if item is None:  # hook decided to drop it
                    break
            if item is not None:
                yield item
    return wrapper


def pipe_post_extract_async(async_gen_func):
    """Wrap an async generator function to pipe yielded values through hooks.

    Args:
        async_gen_func: An async generator function to wrap.

    Returns:
        A wrapped async generator that filters values through registered hooks.
    """
    @functools.wraps(async_gen_func)
    async def wrapper(*args, **kwargs):
        async for item in async_gen_func(*args, **kwargs):
            for hook in iter_post_extract_hooks():
                item = hook.post_extract(item)
                if item is None:
                    break
            if item is not None:
                yield item
    return wrapper
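
How the pieces above fit together, as a minimal runnable sketch: the `Lowercase` hook and the `extract_titles` generator are illustrative names, not part of the package; only `register`, `pipe_post_extract`, and the `post_extract` hook point come from the module shown here.

    from wxpath.hooks.registry import pipe_post_extract, register

    @register
    class Lowercase:
        # post_extract may transform each extracted value; returning None drops it
        def post_extract(self, value):
            return value.lower() if isinstance(value, str) else value

    @pipe_post_extract
    def extract_titles():
        # stand-in generator; in wxpath these values come from the extraction engine
        yield "Hello"
        yield "WORLD"

    print(list(extract_titles()))  # -> ['hello', 'world']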
wxpath/http/__init__.py
ADDED
File without changes

wxpath/http/client/crawler.py
ADDED
@@ -0,0 +1,231 @@
import asyncio
import time
import urllib.parse
from collections import defaultdict
from socket import gaierror
from typing import AsyncIterator

import aiohttp

from wxpath.http.client.request import Request
from wxpath.http.client.response import Response
from wxpath.http.policy.retry import RetryPolicy
from wxpath.http.policy.robots import RobotsTxtPolicy
from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
from wxpath.http.stats import CrawlerStats, build_trace_config
from wxpath.util.logging import get_logger

log = get_logger(__name__)

HEADERS = {"User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/142.0.0.0 Safari/537.36")}


class Crawler:
    """Concurrent HTTP crawler that manages throttling, retries, and robots."""

    def __init__(
        self,
        concurrency: int = 16,
        per_host: int = 8,
        timeout: int = 15,
        *,
        headers: dict | None = None,
        proxies: dict | None = None,
        retry_policy: RetryPolicy | None = None,
        throttler: AbstractThrottler | None = None,
        auto_throttle_target_concurrency: float = None,
        auto_throttle_start_delay: float = 0.25,
        auto_throttle_max_delay: float = 10.0,
        respect_robots: bool = True,
    ):
        self.concurrency = concurrency
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._headers = HEADERS | (headers or {})  # merge headers
        self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
        self.respect_robots = respect_robots

        self.retry_policy = retry_policy or RetryPolicy()
        self.throttler = throttler or AutoThrottler(
            target_concurrency=auto_throttle_target_concurrency or concurrency/4.0,
            start_delay=auto_throttle_start_delay,
            max_delay=auto_throttle_max_delay,
        )
        self._sem_global = asyncio.Semaphore(concurrency)
        self._sem_host = defaultdict(lambda: asyncio.Semaphore(per_host))

        self._pending: asyncio.Queue[Request] = asyncio.Queue()
        self._results: asyncio.Queue[Response] = asyncio.Queue()

        self._session: aiohttp.ClientSession | None = None
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()
        self._robots_policy: RobotsTxtPolicy | None = None

    def build_session(self) -> aiohttp.ClientSession:
        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
        return aiohttp.ClientSession(
            headers=self._headers,
            timeout=self._timeout,
            connector=connector,
            trace_configs=[trace_config]
        )

    async def __aenter__(self) -> "Crawler":
        """Initialize HTTP session and start background workers."""
        if self._session is None:
            # self._session = aiohttp.ClientSession(timeout=self._timeout)
            self._session = self.build_session()

        if self.respect_robots:
            self._robots_policy = RobotsTxtPolicy(self._session)

        self._workers = [
            asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
            for i in range(self.concurrency)
        ]
        return self

    async def __aexit__(self, *_) -> None:
        """Tear down workers and close the HTTP session."""
        self._closed = True
        for w in self._workers:
            w.cancel()

        await asyncio.gather(*self._workers, return_exceptions=True)

        if self._session:
            await self._session.close()

    def submit(self, req: Request) -> None:
        """Queue a request for fetching or raise if crawler already closed."""
        if self._closed:
            raise RuntimeError("crawler is closed")
        self._pending.put_nowait(req)

    def __aiter__(self) -> AsyncIterator[Response]:
        return self._result_iter()

    async def _result_iter(self) -> AsyncIterator[Response]:
        """Async iterator yielding responses as workers produce them."""
        # while not self._closed:
        while not (self._closed and self._results.empty()):
            resp = await self._results.get()
            self._results.task_done()
            yield resp

    def _proxy_for(self, url: str) -> str | None:
        host = urllib.parse.urlsplit(url).hostname
        try:
            # bracket notation first, for defaultdicts
            value = self._proxies[host]
        except KeyError:
            value = self._proxies.get(host)

        if not value:
            log.debug("proxy", extra={"host": host, "value": value})
        return value

    async def _worker(self) -> None:
        """Worker loop that fetches pending requests and enqueues results."""
        while True:
            req = await self._pending.get()
            try:
                resp = await self._fetch_one(req)
                if resp is not None:
                    await self._results.put(resp)

            except asyncio.CancelledError:
                # Must propagate cancellation
                log.debug("cancelled error", extra={"url": req.url})
                raise

            except gaierror:
                # Ignore DNS errors
                log.warning("DNS error", extra={"url": req.url})
                pass

            except Exception as exc:
                log.warning("exception", extra={"url": req.url})
                # Last-resort safety: never drop a request silently
                await self._results.put(Response(req, 0, b"", error=exc))
            finally:
                self._pending.task_done()

    async def _fetch_one(self, req: Request) -> Response | None:
        """Fetch a single request, handling robots, throttling, and retries."""
        host = req.hostname

        if self._robots_policy:
            can_fetch = await self._robots_policy.can_fetch(
                req.url, self._headers.get("User-Agent")
            )
            if not can_fetch:
                log.debug("disallowed by robots.txt", extra={"url": req.url})
                return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))

        # TODO: Move this filter to hooks
        if req.url.lower().endswith((".pdf", ".zip", ".exe")):
            req.max_retries = 0

        async with self._sem_global, self._sem_host[host]:
            t0 = asyncio.get_running_loop().time()
            await self.throttler.wait(host)
            dt = asyncio.get_running_loop().time() - t0

            self._stats.throttle_waits += 1
            self._stats.throttle_wait_time += dt
            self._stats.throttle_waits_by_host[host] += 1

            start = time.monotonic()
            try:
                async with self._session.get(
                    req.url,
                    headers=self._headers | req.headers,
                    proxy=self._proxy_for(req.url),
                    timeout=req.timeout or self._timeout,
                ) as resp:
                    body = await resp.read()

                latency = time.monotonic() - start
                self.throttler.record_latency(host, latency)

                if self.retry_policy.should_retry(req, response=resp):
                    await self._retry(req)
                    return None

                return Response(req, resp.status, body, dict(resp.headers))
            except asyncio.CancelledError:
                # Normal during shutdown / timeout propagation
                log.debug("cancelled error", extra={"url": req.url})
                raise
            except Exception as exc:
                latency = time.monotonic() - start
                self.throttler.record_latency(host, latency)

                if self.retry_policy.should_retry(req, exception=exc):
                    await self._retry(req)
                    return None

                log.error("request failed", extra={"url": req.url}, exc_info=exc)
                return Response(req, 0, b"", error=exc)

    async def _retry(self, req: Request) -> None:
        """Reschedule a request according to the retry policy."""
        req.retries += 1
        delay = self.retry_policy.get_delay(req)

        log.warning(
            "retrying",
            extra={"url": req.url, "retry": req.retries, "delay": delay},
        )

        if delay:
            await asyncio.sleep(delay)

        self.submit(req)
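
A hedged sketch of how this class is driven, inferred only from the code above (the URL and the early `break` are illustrative; module paths follow the file layout listed at the top of this diff):

    import asyncio

    from wxpath.http.client.crawler import Crawler
    from wxpath.http.client.request import Request

    async def main():
        async with Crawler(concurrency=4, respect_robots=True) as crawler:
            crawler.submit(Request("https://example.com/"))
            async for resp in crawler:   # Response objects arrive as workers finish
                print(resp.status, len(resp.body), resp.request.url)
                break                    # the result iterator does not terminate on its own

    asyncio.run(main())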

wxpath/http/client/request.py
ADDED
@@ -0,0 +1,38 @@
import time
from dataclasses import dataclass, field
from typing import Any


@dataclass
class Request:
    """HTTP request envelope used by the crawler."""
    url: str
    method: str = "GET"
    headers: dict[str, str] = field(default_factory=dict)
    timeout: float = 15.0

    retries: int = 0
    max_retries: int | None = None
    dont_retry: bool = False

    meta: dict[str, Any] = field(default_factory=dict)

    created_at: float = field(default_factory=time.monotonic)

    def copy_for_retry(self) -> "Request":
        """Create a copy incrementing the retry counter for scheduling."""
        return Request(
            url=self.url,
            method=self.method,
            headers=self.headers,
            timeout=self.timeout,
            retries=self.retries + 1,
            max_retries=self.max_retries,
            dont_retry=self.dont_retry,
            meta=self.meta,
        )

    @property
    def hostname(self) -> str:
        from urllib.parse import urlsplit
        return urlsplit(self.url).hostname or ""
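
A small illustration of the dataclass above (the URL and header values are arbitrary):

    from wxpath.http.client.request import Request

    req = Request("https://example.com/page", headers={"Accept": "text/html"})
    print(req.hostname)            # example.com
    retry = req.copy_for_retry()   # same request with the retry counter bumped
    print(retry.retries)           # 1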

wxpath/http/client/response.py
ADDED
@@ -0,0 +1,14 @@
# wxpath/http/response.py
from dataclasses import dataclass, field
from typing import Optional

from wxpath.http.client.request import Request


@dataclass
class Response:
    request: Request
    status: int
    body: bytes
    headers: dict[str, str] | None = None
    error: Optional[Exception] = field(default=None, kw_only=True)

wxpath/http/policy/backoff.py
ADDED
@@ -0,0 +1,16 @@
import random


def exponential_backoff(
    attempt: int,
    base: float = 0.5,
    cap: float = 30.0,
    jitter: bool = True,
) -> float:
    """
    Exponential backoff with optional jitter.
    """
    delay = min(cap, base * (2 ** attempt))
    if jitter:
        delay *= random.uniform(0.7, 1.3)
    return delay
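
To make the growth concrete, the defaults above with jitter disabled produce 0.5, 1.0, 2.0, 4.0, 8.0, 16.0 and then 30.0 once the cap kicks in; with jitter on, each value is scaled by a uniform factor in [0.7, 1.3]:

    from wxpath.http.policy.backoff import exponential_backoff

    for attempt in range(7):
        print(attempt, exponential_backoff(attempt, jitter=False))
    # 0 -> 0.5, 1 -> 1.0, 2 -> 2.0, 3 -> 4.0, 4 -> 8.0, 5 -> 16.0, 6 -> 30.0 (capped)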

wxpath/http/policy/retry.py
ADDED
@@ -0,0 +1,35 @@
from wxpath.http.policy.backoff import exponential_backoff
from wxpath.util.logging import get_logger

log = get_logger(__name__)


class RetryPolicy:
    def __init__(
        self,
        max_retries: int = 3,
        retry_statuses: set[int] = None,
    ):
        self.max_retries = max_retries
        self.retry_statuses = retry_statuses or {500, 502, 503, 504}

    def should_retry(self, request, response=None, exception=None) -> bool:
        if request.dont_retry:
            return False

        if request.max_retries is not None and request.retries >= request.max_retries:
            return False

        if request.retries >= self.max_retries:
            return False

        if response is not None and response.status in self.retry_statuses:
            return True

        if exception is not None:
            return True

        return False

    def get_delay(self, request) -> float:
        return exponential_backoff(request.retries)
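
A quick sketch of how the defaults above decide (the Request values are illustrative):

    from wxpath.http.client.request import Request
    from wxpath.http.policy.retry import RetryPolicy

    policy = RetryPolicy()  # max_retries=3, retry on {500, 502, 503, 504} or any exception

    req = Request("https://example.com/")
    print(policy.should_retry(req, exception=TimeoutError()))     # True: exceptions are retried
    req.retries = 3
    print(policy.should_retry(req, exception=TimeoutError()))     # False: global cap reached
    capped = Request("https://example.com/report.pdf", max_retries=0)
    print(policy.should_retry(capped, exception=TimeoutError()))  # False: per-request cap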

wxpath/http/policy/robots.py
ADDED
@@ -0,0 +1,82 @@
import asyncio
import urllib.parse
import urllib.robotparser

import aiohttp

from wxpath.util.logging import get_logger

log = get_logger(__name__)


class RobotsTxtPolicy:
    """Caches and evaluates robots.txt rules for crawler requests."""

    def __init__(self,
                 session: aiohttp.ClientSession,
                 default_parser: type['RobotsParserBase'] | None = None):
        self._session = session
        self._parsers: dict[str, "RobotsParserBase"] = {}
        self._lock = asyncio.Lock()
        self._default_parser = default_parser or UrllibRobotParser

    async def can_fetch(self, url: str, user_agent: str | None) -> bool:
        """Return whether the crawler is allowed to fetch `url`."""
        host = urllib.parse.urlsplit(url).hostname
        if not host:
            return False

        # Due to multiple aiohttp workers running concurrently, we need to lock
        async with self._lock:
            if host not in self._parsers:
                self._parsers[host] = await self._fetch_robots_txt(host)

        return self._parsers[host].can_fetch(url, user_agent)

    async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
        """Retrieve and parse the robots.txt for `host`, failing open on errors."""
        url = f"http://{host}/robots.txt"
        try:
            async with self._session.get(url) as response:
                if response.status == 200:
                    text = await response.text()
                    # Pass the text as-is to the parser, let it handle the format
                    if self._default_parser == UrllibRobotParser:
                        return self._default_parser(text.splitlines())
                    else:
                        return self._default_parser(text)
                else:
                    # Empty robots.txt - allow all
                    if self._default_parser == UrllibRobotParser:
                        return self._default_parser([])
                    else:
                        return self._default_parser("")
        except Exception:
            # If robots.txt is unavailable, allow all requests (fail open)
            log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
            if self._default_parser == UrllibRobotParser:
                return self._default_parser([])
            else:
                return self._default_parser("")


class RobotsParserBase:
    """Base type for robots.txt parsers used by the policy."""


class UrllibRobotParser(RobotsParserBase):
    """Adapter around `urllib.robotparser.RobotFileParser`."""

    def __init__(self, text):
        self._parser = urllib.robotparser.RobotFileParser()
        # urllib.robotparser.RobotFileParser.parse() expects a list of lines
        if isinstance(text, str):
            lines = text.splitlines() if text else []
        else:
            lines = text if text else []
        self._parser.parse(lines)

    def can_fetch(self, url, user_agent):
        """Return whether the URL is allowed for the given user agent."""
        return self._parser.can_fetch(user_agent, url)
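
A minimal async sketch of using the policy on its own; inside the package, `Crawler.__aenter__` wires this up when `respect_robots` is True (the URL and user agent here are placeholders):

    import asyncio

    import aiohttp

    from wxpath.http.policy.robots import RobotsTxtPolicy

    async def main():
        async with aiohttp.ClientSession() as session:
            policy = RobotsTxtPolicy(session)
            allowed = await policy.can_fetch("https://example.com/private", "wxpath-bot")
            print(allowed)  # True unless robots.txt disallows it (fetch failures fail open)

    asyncio.run(main())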

wxpath/http/policy/throttler.py
ADDED
@@ -0,0 +1,114 @@
import asyncio
from abc import ABC, abstractmethod
from collections import defaultdict

from wxpath.util.logging import get_logger

log = get_logger(__name__)


# Abstract Base Class
class AbstractThrottler(ABC):
    @abstractmethod
    async def wait(self, host: str):
        pass

    @abstractmethod
    def record_latency(self, host: str, latency: float):
        pass


class AutoThrottler(AbstractThrottler):
    """
    Scrapy-inspired auto-throttle, simplified:
    - increases delay when latency increases
    - decreases delay when responses are fast

    Explanation:
    - target_concurrency is the desired number of concurrent requests
    - start_delay is the initial delay
    - max_delay is the maximum delay
    - smoothing is the exponential smoothing factor
    """

    def __init__(
        self,
        start_delay: float = 0.25,
        max_delay: float = 10.0,
        target_concurrency: float = 1.0,
        smoothing: float = 0.7,
    ):
        self.start_delay = start_delay
        self.max_delay = max_delay
        self.target_concurrency = target_concurrency
        self.smoothing = smoothing

        self._delay = defaultdict(lambda: start_delay)
        self._latency = defaultdict(lambda: None)

    def record_latency(self, host: str, latency: float):
        prev = self._latency[host]
        if prev is None:
            self._latency[host] = latency
        else:
            self._latency[host] = (
                # exponential smoothing
                self.smoothing * prev + (1 - self.smoothing) * latency
            )

        self._recalculate_delay(host)

    def _recalculate_delay(self, host: str):
        latency = self._latency[host]
        if not latency:
            return

        target_delay = latency / self.target_concurrency
        delay = min(self.max_delay, max(0.0, target_delay))
        self._delay[host] = delay

        log.debug(
            "auto-throttle",
            extra={"host": host, "latency": latency, "delay": delay},
        )

    async def wait(self, host: str):
        delay = self._delay[host]
        if delay > 0:
            await asyncio.sleep(delay)


class ImpoliteThrottle(AbstractThrottler):
    """
    Zero delay throttler
    """

    async def wait(self, host: str):
        pass

    def record_latency(self, host: str, latency: float):
        pass


ZeroWaitThrottler = ImpoliteThrottle


class SimpleThrottler(AbstractThrottler):
    """
    Fixed delay throttler. Optionally provide per-host delays via `per_host_delays`.
    """
    def __init__(self, delay: float, per_host_delays: dict[str, float] = None):
        self.delay = delay
        self._delays = per_host_delays or defaultdict(lambda: delay)

    async def wait(self, host: str):
        if host in self._delays:
            await asyncio.sleep(self._delays[host])
        else:
            await asyncio.sleep(self.delay)

    def record_latency(self, host: str, latency: float):
        pass


FixedDelayThrottler = SimpleThrottler
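
A short sketch of the AutoThrottler adjustment, with made-up latencies to show the smoothing and the `latency / target_concurrency` rule (`_delay` is an internal dict, read here only for illustration):

    import asyncio

    from wxpath.http.policy.throttler import AutoThrottler

    async def main():
        throttler = AutoThrottler(target_concurrency=2.0, smoothing=0.7)

        throttler.record_latency("example.com", 0.4)  # first sample: delay = 0.4 / 2.0 = 0.2s
        throttler.record_latency("example.com", 1.0)  # smoothed: 0.7*0.4 + 0.3*1.0 = 0.58 -> 0.29s
        print(throttler._delay["example.com"])

        await throttler.wait("example.com")           # sleeps the current per-host delay

    asyncio.run(main())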