wxpath 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +137 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +444 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/cache.py +43 -0
- wxpath/http/client/crawler.py +315 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +102 -0
- wxpath/patches.py +63 -0
- wxpath/settings.py +108 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- wxpath-0.4.0.dist-info/METADATA +460 -0
- wxpath-0.4.0.dist-info/RECORD +35 -0
- wxpath-0.4.0.dist-info/WHEEL +5 -0
- wxpath-0.4.0.dist-info/entry_points.txt +2 -0
- wxpath-0.4.0.dist-info/licenses/LICENSE +21 -0
- wxpath-0.4.0.dist-info/top_level.txt +1 -0
wxpath/hooks/registry.py
ADDED
@@ -0,0 +1,145 @@
"""
Pluggable hook system for wxpath.

Write once:

    from wxpath import hooks

    @hooks.register
    class OnlyEnglish:
        def post_parse(self, ctx, elem):
            lang = elem.xpath('string(/html/@lang)').lower()[:2]
            return elem if lang in ("en", "") else None

... and wxpath.engine will call it automatically.
"""

from __future__ import annotations

import functools
from collections.abc import Generator
from dataclasses import dataclass, field
from typing import Any, Iterable, List, Optional, Protocol

from lxml import html

from wxpath.util.logging import get_logger

log = get_logger(__name__)


# --------------------------------------------------------------------------- #
# Dataclass describing the crawl context for a single URL
# --------------------------------------------------------------------------- #
@dataclass
class FetchContext:
    url: str
    backlink: Optional[str]
    depth: int
    segments: list  # remaining op/value pairs
    user_data: dict = field(default_factory=dict)


# --------------------------------------------------------------------------- #
# Hook protocol - every method is optional
# --------------------------------------------------------------------------- #
class Hook(Protocol):
    # Return False to abort fetching this URL
    # def pre_fetch(self, ctx: FetchContext) -> bool: ...

    # May return modified HTML bytes or None to drop this branch entirely
    def post_fetch(self, ctx: FetchContext, html_bytes: bytes) -> bytes | None: ...

    # May return modified element or None to drop this branch entirely
    def post_parse(
        self, ctx: FetchContext, elem: html.HtmlElement
    ) -> html.HtmlElement | None: ...

    # Called for every candidate link; return False to prevent enqueueing it
    # def pre_queue(self, ctx: FetchContext, url: str) -> bool: ...

    # Called for every extracted value; may transform or drop it
    def post_extract(self, value: Any) -> Any: ...


# --------------------------------------------------------------------------- #
# Global registry helpers
# --------------------------------------------------------------------------- #
_global_hooks: dict[str, Hook] = dict()


def register(hook: Hook | type) -> Hook:
    """Decorator/helper to add a Hook to the global list.

    Args:
        hook: A Hook class or instance to register.

    Returns:
        The registered hook (instantiated if a class was provided).

    Example:
        >>> @register
        ... class DebugHook:
        ...     def post_fetch(self, ctx, html_bytes):
        ...         print("Fetched", ctx.url)
        ...         return html_bytes
    """

    hook_name = getattr(hook, '__name__', hook.__class__.__name__)
    if hook_name in _global_hooks:
        return hook

    instance = hook() if isinstance(hook, type) else hook
    _global_hooks[hook_name] = instance
    return hook


def get_hooks() -> List[Hook]:
    """Return the list of globally-registered hooks (read-only)."""
    return list(_global_hooks.values())


def iter_post_extract_hooks() -> Iterable[Hook]:
    yield from (h for h in _global_hooks.values() if hasattr(h, "post_extract"))


def pipe_post_extract(gen_func):
    """Wrap a generator function to pipe yielded values through post_extract hooks.

    Args:
        gen_func: A generator function to wrap.

    Returns:
        A wrapped generator that filters values through registered hooks.
    """
    @functools.wraps(gen_func)
    def wrapper(*args, **kwargs) -> Generator:
        for item in gen_func(*args, **kwargs):
            for hook in iter_post_extract_hooks():
                item = hook.post_extract(item)
                if item is None:  # hook decided to drop it
                    break
            if item is not None:
                yield item
    return wrapper


def pipe_post_extract_async(async_gen_func):
    """Wrap an async generator function to pipe yielded values through hooks.

    Args:
        async_gen_func: An async generator function to wrap.

    Returns:
        A wrapped async generator that filters values through registered hooks.
    """
    @functools.wraps(async_gen_func)
    async def wrapper(*args, **kwargs):
        async for item in async_gen_func(*args, **kwargs):
            for hook in iter_post_extract_hooks():
                item = hook.post_extract(item)
                if item is None:
                    break
            if item is not None:
                yield item
    return wrapper
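Usage sketch for the registry above (not part of the package; the hook class, the wrapped generator, and its values are illustrative only):

    from wxpath.hooks.registry import register, get_hooks, pipe_post_extract

    @register                      # registers an instance under the class name
    class StripWhitespace:
        # post_extract hooks may transform a value or return None to drop it
        def post_extract(self, value):
            if isinstance(value, str):
                value = value.strip()
            return value or None   # empty strings are dropped

    @pipe_post_extract             # yielded values are piped through post_extract hooks
    def extract_titles():
        yield "  Hello  "
        yield "   "                # dropped: the hook returns None

    print(get_hooks())             # -> [<StripWhitespace object at ...>]
    print(list(extract_titles()))  # -> ['Hello']
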
wxpath/http/__init__.py
ADDED
File without changes
wxpath/http/client/cache.py
ADDED
@@ -0,0 +1,43 @@
try:
    from aiohttp_client_cache import SQLiteBackend
except ImportError:
    CachedSession = None

from wxpath.settings import SETTINGS
from wxpath.util.logging import get_logger

log = get_logger(__name__)

CACHE_SETTINGS = SETTINGS.http.client.cache

def get_cache_backend():
    log.info("cache backend", extra={"backend": CACHE_SETTINGS.backend})
    if CACHE_SETTINGS.backend == "redis":
        from aiohttp_client_cache.backends.redis import RedisBackend
        return RedisBackend(
            expire_after=CACHE_SETTINGS.expire_after,
            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
            allowed_methods=CACHE_SETTINGS.allowed_methods,
            allowed_codes=CACHE_SETTINGS.allowed_codes,
            include_headers=CACHE_SETTINGS.include_headers,
            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
            **CACHE_SETTINGS.redis
            # cache_name=CACHE_SETTINGS.redis.cache_name,
            # host=CACHE_SETTINGS.redis.host,
            # port=CACHE_SETTINGS.redis.port,
            # db=CACHE_SETTINGS.redis.db,
            # cache_control=CACHE_SETTINGS.cache_control,
        )
    elif CACHE_SETTINGS.backend == "sqlite":
        return SQLiteBackend(
            cache_name=CACHE_SETTINGS.sqlite.cache_name,
            expire_after=CACHE_SETTINGS.expire_after,
            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
            allowed_methods=CACHE_SETTINGS.allowed_methods,
            allowed_codes=CACHE_SETTINGS.allowed_codes,
            include_headers=CACHE_SETTINGS.include_headers,
            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
            # cache_control=CACHE_SETTINGS.cache_control,
        )
    else:
        raise ValueError(f"Unknown cache backend: {CACHE_SETTINGS.backend}")
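A short sketch of how get_cache_backend() plugs into aiohttp-client-cache. This is not package code: it assumes aiohttp-client-cache is installed and that wxpath/settings.py (not reproduced here) selects a valid backend; the URL is a placeholder.

    import asyncio

    from aiohttp_client_cache import CachedSession

    from wxpath.http.client.cache import get_cache_backend

    async def main():
        # get_cache_backend() reads SETTINGS.http.client.cache and returns either a
        # RedisBackend or a SQLiteBackend, or raises ValueError for unknown backends.
        backend = get_cache_backend()
        async with CachedSession(cache=backend) as session:
            async with session.get("https://example.org/") as resp:
                print(resp.status, getattr(resp, "from_cache", False))

    asyncio.run(main())
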
wxpath/http/client/crawler.py
ADDED
@@ -0,0 +1,315 @@
import aiohttp

try:
    from aiohttp_client_cache import CachedSession, SQLiteBackend
except ImportError:
    CachedSession = None

import asyncio
import time
import urllib.parse
from collections import defaultdict
from socket import gaierror
from typing import AsyncIterator

from wxpath.http.client.cache import get_cache_backend
from wxpath.http.client.request import Request
from wxpath.http.client.response import Response
from wxpath.http.policy.retry import RetryPolicy
from wxpath.http.policy.robots import RobotsTxtPolicy
from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
from wxpath.http.stats import CrawlerStats, build_trace_config
from wxpath.settings import SETTINGS
from wxpath.util.logging import get_logger

log = get_logger(__name__)

CACHE_SETTINGS = SETTINGS.http.client.cache
CRAWLER_SETTINGS = SETTINGS.http.client.crawler

def get_async_session(
    headers: dict | None = None,
    timeout: aiohttp.ClientTimeout | None = None,
    connector: aiohttp.TCPConnector | None = None,
    trace_config: aiohttp.TraceConfig | None = None
) -> aiohttp.ClientSession:
    """
    Create and return a new aiohttp session. If aiohttp-client-cache is available
    and enabled, return a new CachedSession bound to the configured SQLite backend.
    The caller is responsible for closing the session.
    """

    if timeout is None:
        timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)

    if CACHE_SETTINGS.enabled and CachedSession and SQLiteBackend:
        log.info("using aiohttp-client-cache")
        return CachedSession(
            cache=get_cache_backend(),
            headers=headers,
            timeout=timeout,
            connector=connector,
            trace_configs=[trace_config] if trace_config is not None else None
        )

    return aiohttp.ClientSession(
        headers=headers,
        timeout=timeout,
        connector=connector,
        trace_configs=[trace_config] if trace_config is not None else None
    )


class Crawler:
    """Concurrent HTTP crawler that manages throttling, retries, and robots."""

    def __init__(
        self,
        concurrency: int = None,
        per_host: int = None,
        timeout: int = None,
        *,
        headers: dict | None = None,
        proxies: dict | None = None,
        retry_policy: RetryPolicy | None = None,
        throttler: AbstractThrottler | None = None,
        auto_throttle_target_concurrency: float = None,
        auto_throttle_start_delay: float = None,
        auto_throttle_max_delay: float = None,
        respect_robots: bool = True,
    ):
        cfg = CRAWLER_SETTINGS

        self.concurrency = concurrency if concurrency is not None else cfg.concurrency
        self.per_host = per_host if per_host is not None else cfg.per_host

        timeout = timeout if timeout is not None else cfg.timeout
        self._timeout = aiohttp.ClientTimeout(total=timeout)

        self._headers = cfg.headers | (headers or {})  # merge headers

        _proxies = proxies if proxies is not None else cfg.proxies
        self._proxies = _proxies if (isinstance(_proxies, defaultdict) or _proxies) else {}

        self.retry_policy = retry_policy or RetryPolicy()

        # auto-throttle defaults
        auto_throttle_target_concurrency = auto_throttle_target_concurrency \
            if auto_throttle_target_concurrency is not None \
            else cfg.auto_throttle_target_concurrency

        auto_throttle_start_delay = auto_throttle_start_delay \
            if auto_throttle_start_delay is not None \
            else cfg.auto_throttle_start_delay

        auto_throttle_max_delay = auto_throttle_max_delay \
            if auto_throttle_max_delay is not None \
            else cfg.auto_throttle_max_delay

        self.throttler = throttler or AutoThrottler(
            target_concurrency=auto_throttle_target_concurrency or self.concurrency/4.0,
            start_delay=auto_throttle_start_delay,
            max_delay=auto_throttle_max_delay,
        )

        self._sem_global = asyncio.Semaphore(self.concurrency)
        self._sem_host = defaultdict(lambda: asyncio.Semaphore(self.per_host))

        self._pending: asyncio.Queue[Request] = asyncio.Queue()
        self._results: asyncio.Queue[Response] = asyncio.Queue()

        self._session: aiohttp.ClientSession | None = None
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()

        self.respect_robots = respect_robots if respect_robots is not None else cfg.respect_robots
        self._robots_policy: RobotsTxtPolicy | None = None

        # WARN: If SQLiteBackend caching is enabled and min(concurrency, per_host) > 1,
        # write-contention is likely to occur.
        if (CACHE_SETTINGS.enabled
                and CACHE_SETTINGS.backend == "sqlite"
                and min(self.concurrency, self.per_host) > 1
        ):
            log.warning(
                "SQLiteBackend caching is enabled and min(concurrency, per_host) > 1. "
                "Write-contention is likely to occur. Consider using RedisBackend."
            )

    def build_session(self) -> aiohttp.ClientSession:
        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
        return get_async_session(
            headers=self._headers,
            timeout=self._timeout,
            connector=connector,
            trace_config=trace_config
        )

    async def __aenter__(self) -> "Crawler":
        """Initialize HTTP session and start background workers."""
        if self._session is None:
            # self._session = aiohttp.ClientSession(timeout=self._timeout)
            self._session = self.build_session()

        # Note: Set robots policy after session is created
        if self.respect_robots:
            self._robots_policy = RobotsTxtPolicy(self._session)

        self._workers = [
            asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
            for i in range(self.concurrency)
        ]
        return self

    async def __aexit__(self, *_) -> None:
        """Tear down workers and close the HTTP session."""
        self._closed = True
        for w in self._workers:
            w.cancel()

        await asyncio.gather(*self._workers, return_exceptions=True)

        if self._session:
            await self._session.close()

    def submit(self, req: Request) -> None:
        """Queue a request for fetching or raise if crawler already closed."""
        if self._closed:
            raise RuntimeError("crawler is closed")
        self._pending.put_nowait(req)

    def __aiter__(self) -> AsyncIterator[Response]:
        return self._result_iter()

    async def _result_iter(self) -> AsyncIterator[Response]:
        """Async iterator yielding responses as workers produce them."""
        # while not self._closed:
        while not (self._closed and self._results.empty()):
            resp = await self._results.get()
            self._results.task_done()
            yield resp

    def _proxy_for(self, url: str) -> str | None:
        host = urllib.parse.urlsplit(url).hostname
        try:
            # bracket notation first, for defaultdicts
            value = self._proxies[host]
        except KeyError:
            value = self._proxies.get(host)

        if not value:
            log.debug("proxy", extra={"host": host, "value": value})
        return value

    async def _worker(self) -> None:
        """Worker loop that fetches pending requests and enqueues results."""
        while True:
            req = await self._pending.get()
            try:
                resp = await self._fetch_one(req)
                if resp is not None:
                    await self._results.put(resp)

            except asyncio.CancelledError:
                # Must propagate cancellation
                log.debug("cancelled error", extra={"url": req.url})
                raise

            except gaierror:
                # Ignore DNS errors
                log.warning("DNS error", extra={"url": req.url})
                pass

            except Exception as exc:
                log.warning("exception", extra={"url": req.url})
                # Last-resort safety: never drop a request silently
                await self._results.put(Response(req, 0, b"", error=exc))
            finally:
                self._pending.task_done()

    async def _fetch_one(self, req: Request) -> Response | None:
        """Fetch a single request, handling robots, throttling, and retries."""
        host = req.hostname

        if self._robots_policy:
            can_fetch = await self._robots_policy.can_fetch(
                req.url, self._headers.get("User-Agent")
            )
            if not can_fetch:
                log.debug("disallowed by robots.txt", extra={"url": req.url})
                return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))

        # TODO: Move this filter to hooks
        if req.url.lower().endswith((".pdf", ".zip", ".exe")):
            req.max_retries = 0

        async with self._sem_global, self._sem_host[host]:
            t0 = asyncio.get_running_loop().time()
            await self.throttler.wait(host)
            dt = asyncio.get_running_loop().time() - t0

            self._stats.throttle_waits += 1
            self._stats.throttle_wait_time += dt
            self._stats.throttle_waits_by_host[host] += 1

            start = time.monotonic()
            try:
                log.info("fetching", extra={"url": req.url})
                async with self._session.get(
                    req.url,
                    headers=self._headers | req.headers,
                    proxy=self._proxy_for(req.url),
                    timeout=req.timeout or self._timeout,
                ) as resp:
                    from_cache = getattr(resp, "from_cache", False)
                    if from_cache:
                        # NOTE: This is a bit of a hack, but it works. aiohttp-client-cache does not
                        # interface with TraceConfigs on cache hit, so we have to do it here.
                        self._stats.requests_cache_hit += 1
                        log.info("[CACHE HIT]", extra={"req.url": req.url, "resp.url": resp.url})
                    else:
                        log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})

                    body = await resp.read()

                    latency = time.monotonic() - start
                    self.throttler.record_latency(host, latency)

                    if self.retry_policy.should_retry(req, response=resp):
                        await self._retry(req)
                        return None

                    return Response(req, resp.status, body, dict(resp.headers))
            except asyncio.CancelledError:
                # Normal during shutdown / timeout propagation
                log.debug("cancelled error", extra={"url": req.url})
                raise
            except Exception as exc:
                latency = time.monotonic() - start
                self.throttler.record_latency(host, latency)

                if self.retry_policy.should_retry(req, exception=exc):
                    await self._retry(req)
                    return None

                log.error("request failed", extra={"url": req.url}, exc_info=exc)
                return Response(req, 0, b"", error=exc)

    async def _retry(self, req: Request) -> None:
        """Reschedule a request according to the retry policy."""
        req.retries += 1
        delay = self.retry_policy.get_delay(req)

        log.warning(
            "retrying",
            extra={"url": req.url, "retry": req.retries, "delay": delay},
        )

        if delay:
            await asyncio.sleep(delay)

        self.submit(req)
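A minimal end-to-end sketch of driving the Crawler directly, using only the API shown in the hunk above. It is not package code: the seed URL is a placeholder, default settings are assumed, and stopping after one response is purely illustrative.

    import asyncio

    from wxpath.http.client.crawler import Crawler
    from wxpath.http.client.request import Request

    async def main():
        # __aenter__ builds the session, robots policy, and worker tasks;
        # __aexit__ cancels the workers and closes the session.
        async with Crawler(concurrency=4, per_host=2) as crawler:
            crawler.submit(Request(url="https://example.org/"))
            async for resp in crawler:          # Response objects as workers finish
                print(resp.status, resp.request.url, resp.error)
                break                           # stop after the first result

    asyncio.run(main())
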
wxpath/http/client/request.py
ADDED
@@ -0,0 +1,38 @@
import time
from dataclasses import dataclass, field
from typing import Any


@dataclass
class Request:
    """HTTP request envelope used by the crawler."""
    url: str
    method: str = "GET"
    headers: dict[str, str] = field(default_factory=dict)
    timeout: float = 15.0

    retries: int = 0
    max_retries: int | None = None
    dont_retry: bool = False

    meta: dict[str, Any] = field(default_factory=dict)

    created_at: float = field(default_factory=time.monotonic)

    def copy_for_retry(self) -> "Request":
        """Create a copy incrementing the retry counter for scheduling."""
        return Request(
            url=self.url,
            method=self.method,
            headers=self.headers,
            timeout=self.timeout,
            retries=self.retries + 1,
            max_retries=self.max_retries,
            dont_retry=self.dont_retry,
            meta=self.meta,
        )

    @property
    def hostname(self) -> str:
        from urllib.parse import urlsplit
        return urlsplit(self.url).hostname or ""
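For illustration only, how the Request envelope behaves (the URL is a placeholder, not from the package):

    from wxpath.http.client.request import Request

    req = Request(url="https://example.org/docs", headers={"Accept": "text/html"})
    print(req.hostname)       # "example.org"
    print(req.retries)        # 0

    # copy_for_retry() returns a new Request with the retry counter bumped;
    # the Crawler itself mutates req.retries in _retry(), but the copy is
    # available for schedulers that prefer fresh request objects.
    retry_req = req.copy_for_retry()
    print(retry_req.retries)  # 1
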
wxpath/http/client/response.py
ADDED
@@ -0,0 +1,14 @@
# wxpath/http/response.py
from dataclasses import dataclass, field
from typing import Optional

from wxpath.http.client.request import Request


@dataclass
class Response:
    request: Request
    status: int
    body: bytes
    headers: dict[str, str] | None = None
    error: Optional[Exception] = field(default=None, kw_only=True)
wxpath/http/policy/backoff.py
ADDED
@@ -0,0 +1,16 @@
import random


def exponential_backoff(
    attempt: int,
    base: float = 0.5,
    cap: float = 30.0,
    jitter: bool = True,
) -> float:
    """
    Exponential backoff with optional jitter.
    """
    delay = min(cap, base * (2 ** attempt))
    if jitter:
        delay *= random.uniform(0.7, 1.3)
    return delay
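A worked example of the delay schedule above with jitter disabled, so the values are deterministic: base 0.5 doubles each attempt and is capped at 30 seconds.

    from wxpath.http.policy.backoff import exponential_backoff

    # attempt: 0    1    2    3    4     5     6            7
    # delay:   0.5  1.0  2.0  4.0  8.0   16.0  30.0 (cap)   30.0 (cap)
    for attempt in range(8):
        print(attempt, exponential_backoff(attempt, jitter=False))

    # With jitter=True (the default) each delay is scaled by a uniform factor
    # in [0.7, 1.3] so that concurrent retries do not synchronize.
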
wxpath/http/policy/retry.py
ADDED
@@ -0,0 +1,35 @@
from wxpath.http.policy.backoff import exponential_backoff
from wxpath.util.logging import get_logger

log = get_logger(__name__)


class RetryPolicy:
    def __init__(
        self,
        max_retries: int = 3,
        retry_statuses: set[int] = None,
    ):
        self.max_retries = max_retries
        self.retry_statuses = retry_statuses or {500, 502, 503, 504}

    def should_retry(self, request, response=None, exception=None) -> bool:
        if request.dont_retry:
            return False

        if request.max_retries is not None and request.retries >= request.max_retries:
            return False

        if request.retries >= self.max_retries:
            return False

        if response is not None and response.status in self.retry_statuses:
            return True

        if exception is not None:
            return True

        return False

    def get_delay(self, request) -> float:
        return exponential_backoff(request.retries)
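A small sketch of how RetryPolicy interacts with Request. Not package code: the SimpleNamespace object is a stand-in for an aiohttp response, which works here only because should_retry reads nothing but .status.

    from types import SimpleNamespace

    from wxpath.http.client.request import Request
    from wxpath.http.policy.retry import RetryPolicy

    policy = RetryPolicy(max_retries=2)
    req = Request(url="https://example.org/")

    # 503 is in the default retry_statuses {500, 502, 503, 504}
    server_error = SimpleNamespace(status=503)
    print(policy.should_retry(req, response=server_error))      # True
    print(round(policy.get_delay(req), 2))                      # ~0.5 (jittered backoff, retries=0)

    # Once the request has exhausted its retries, the policy gives up.
    req.retries = 2
    print(policy.should_retry(req, response=server_error))      # False

    # dont_retry short-circuits everything, even exceptions.
    req2 = Request(url="https://example.org/", dont_retry=True)
    print(policy.should_retry(req2, exception=TimeoutError()))  # False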