wxpath-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
+ """
+ Pluggable hook system for wxpath.
+
+ Write once:
+
+     from wxpath import hooks
+
+     @hooks.register
+     class OnlyEnglish:
+         def post_parse(self, ctx, elem):
+             lang = elem.xpath('string(/html/@lang)').lower()[:2]
+             return elem if lang in ("en", "") else None
+
+ ... and wxpath.engine will call it automatically.
+ """
+
+ from __future__ import annotations
+
+ import functools
+ from collections.abc import Generator
+ from dataclasses import dataclass, field
+ from typing import Any, Iterable, List, Optional, Protocol
+
+ from lxml import html
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ # --------------------------------------------------------------------------- #
+ # Dataclass describing the crawl context for a single URL
+ # --------------------------------------------------------------------------- #
+ @dataclass
+ class FetchContext:
+     url: str
+     backlink: Optional[str]
+     depth: int
+     segments: list  # remaining op/value pairs
+     user_data: dict = field(default_factory=dict)
+
+
+ # --------------------------------------------------------------------------- #
+ # Hook protocol - every method is optional
+ # --------------------------------------------------------------------------- #
+ class Hook(Protocol):
+     # Return False to abort fetching this URL
+     # def pre_fetch(self, ctx: FetchContext) -> bool: ...
+
+     # May return modified HTML bytes or None to drop this branch entirely
+     def post_fetch(self, ctx: FetchContext, html_bytes: bytes) -> bytes | None: ...
+
+     # May return modified element or None to drop this branch entirely
+     def post_parse(
+         self, ctx: FetchContext, elem: html.HtmlElement
+     ) -> html.HtmlElement | None: ...
+
+     # Called for every candidate link; return False to prevent enqueueing it
+     # def pre_queue(self, ctx: FetchContext, url: str) -> bool: ...
+
+     # Called for every extracted value; may transform or drop it
+     def post_extract(self, value: Any) -> Any: ...
+
+
+ # --------------------------------------------------------------------------- #
+ # Global registry helpers
+ # --------------------------------------------------------------------------- #
+ _global_hooks: dict[str, Hook] = dict()
+
+
+ def register(hook: Hook | type) -> Hook:
+     """Decorator/helper to add a Hook to the global registry.
+
+     Args:
+         hook: A Hook class or instance to register.
+
+     Returns:
+         The hook as passed in; classes are instantiated before being stored.
+
+     Example:
+         >>> @register
+         ... class DebugHook:
+         ...     def post_fetch(self, ctx, html_bytes):
+         ...         print("Fetched", ctx.url)
+         ...         return html_bytes
+     """
+
+     hook_name = getattr(hook, '__name__', hook.__class__.__name__)
+     if hook_name in _global_hooks:
+         return hook
+
+     instance = hook() if isinstance(hook, type) else hook
+     _global_hooks[hook_name] = instance
+     return hook
+
+
+ def get_hooks() -> List[Hook]:
+     """Return the list of globally-registered hooks (read-only)."""
+     return list(_global_hooks.values())
+
+
+ def iter_post_extract_hooks() -> Iterable[Hook]:
+     yield from (h for h in _global_hooks.values() if hasattr(h, "post_extract"))
+
+
+ def pipe_post_extract(gen_func):
+     """Wrap a generator function to pipe yielded values through post_extract hooks.
+
+     Args:
+         gen_func: A generator function to wrap.
+
+     Returns:
+         A wrapped generator that filters values through registered hooks.
+     """
+     @functools.wraps(gen_func)
+     def wrapper(*args, **kwargs) -> Generator:
+         for item in gen_func(*args, **kwargs):
+             for hook in iter_post_extract_hooks():
+                 item = hook.post_extract(item)
+                 if item is None:  # hook decided to drop it
+                     break
+             if item is not None:
+                 yield item
+     return wrapper
+
+
+ def pipe_post_extract_async(async_gen_func):
+     """Wrap an async generator function to pipe yielded values through hooks.
+
+     Args:
+         async_gen_func: An async generator function to wrap.
+
+     Returns:
+         A wrapped async generator that filters values through registered hooks.
+     """
+     @functools.wraps(async_gen_func)
+     async def wrapper(*args, **kwargs):
+         async for item in async_gen_func(*args, **kwargs):
+             for hook in iter_post_extract_hooks():
+                 item = hook.post_extract(item)
+                 if item is None:
+                     break
+             if item is not None:
+                 yield item
+     return wrapper
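
As a usage sketch of the hook pipeline above: `register` stores an instance under the class name, `iter_post_extract_hooks` finds anything with a `post_extract` method, and `pipe_post_extract` threads every yielded value through those hooks. The `DropEmptyStrings` hook and `extract_titles` generator below are hypothetical stand-ins, not part of the package:

    from wxpath import hooks

    @hooks.register
    class DropEmptyStrings:
        # post_extract may transform a value or return None to drop it
        def post_extract(self, value):
            if isinstance(value, str) and not value.strip():
                return None
            return value

    @hooks.pipe_post_extract
    def extract_titles():
        yield "First title"
        yield "   "             # dropped by DropEmptyStrings
        yield "Second title"

    print(list(extract_titles()))   # ['First title', 'Second title']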
File without changes
@@ -0,0 +1,9 @@
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.http.client.request import Request
+ from wxpath.http.client.response import Response
+
+ __all__ = [
+     "Crawler",
+     "Request",
+     "Response"
+ ]
@@ -0,0 +1,43 @@
+ try:
+     from aiohttp_client_cache import SQLiteBackend
+ except ImportError:
+     SQLiteBackend = None  # aiohttp-client-cache is an optional dependency
+
+ from wxpath.settings import SETTINGS
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+
+ def get_cache_backend():
+     log.info("cache backend", extra={"backend": CACHE_SETTINGS.backend})
+     if CACHE_SETTINGS.backend == "redis":
+         from aiohttp_client_cache.backends.redis import RedisBackend
+         return RedisBackend(
+             expire_after=CACHE_SETTINGS.expire_after,
+             urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+             allowed_methods=CACHE_SETTINGS.allowed_methods,
+             allowed_codes=CACHE_SETTINGS.allowed_codes,
+             include_headers=CACHE_SETTINGS.include_headers,
+             ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+             **CACHE_SETTINGS.redis
+             # cache_name=CACHE_SETTINGS.redis.cache_name,
+             # host=CACHE_SETTINGS.redis.host,
+             # port=CACHE_SETTINGS.redis.port,
+             # db=CACHE_SETTINGS.redis.db,
+             # cache_control=CACHE_SETTINGS.cache_control,
+         )
+     elif CACHE_SETTINGS.backend == "sqlite":
+         return SQLiteBackend(
+             cache_name=CACHE_SETTINGS.sqlite.cache_name,
+             expire_after=CACHE_SETTINGS.expire_after,
+             urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+             allowed_methods=CACHE_SETTINGS.allowed_methods,
+             allowed_codes=CACHE_SETTINGS.allowed_codes,
+             include_headers=CACHE_SETTINGS.include_headers,
+             ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+             # cache_control=CACHE_SETTINGS.cache_control,
+         )
+     else:
+         raise ValueError(f"Unknown cache backend: {CACHE_SETTINGS.backend}")
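
A minimal sketch of how the returned backend is consumed, mirroring what `get_async_session` in the crawler module does; the standalone session below is illustrative and assumes aiohttp-client-cache is installed and the configured cache settings point at a reachable backend:

    import asyncio

    from aiohttp_client_cache import CachedSession

    from wxpath.http.client.cache import get_cache_backend

    async def main():
        # CachedSession consults the backend before hitting the network
        async with CachedSession(cache=get_cache_backend()) as session:
            async with session.get("https://example.com") as resp:
                print(resp.status, getattr(resp, "from_cache", False))

    asyncio.run(main())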
@@ -0,0 +1,315 @@
+ import aiohttp
+
+ try:
+     from aiohttp_client_cache import CachedSession, SQLiteBackend
+ except ImportError:
+     CachedSession = SQLiteBackend = None  # optional dependency; caching is skipped below
+
+ import asyncio
+ import time
+ import urllib.parse
+ from collections import defaultdict
+ from socket import gaierror
+ from typing import AsyncIterator
+
+ from wxpath.http.client.cache import get_cache_backend
+ from wxpath.http.client.request import Request
+ from wxpath.http.client.response import Response
+ from wxpath.http.policy.retry import RetryPolicy
+ from wxpath.http.policy.robots import RobotsTxtPolicy
+ from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
+ from wxpath.http.stats import CrawlerStats, build_trace_config
+ from wxpath.settings import SETTINGS
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+ CRAWLER_SETTINGS = SETTINGS.http.client.crawler
+
+ def get_async_session(
+     headers: dict | None = None,
+     timeout: aiohttp.ClientTimeout | None = None,
+     connector: aiohttp.TCPConnector | None = None,
+     trace_config: aiohttp.TraceConfig | None = None
+ ) -> aiohttp.ClientSession:
+     """
+     Create and return a new aiohttp session. If aiohttp-client-cache is available
+     and caching is enabled, return a CachedSession bound to the configured cache backend.
+     The caller is responsible for closing the session.
+     """
+
+     if timeout is None:
+         timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
+
+     if CACHE_SETTINGS.enabled and CachedSession and SQLiteBackend:
+         log.info("using aiohttp-client-cache")
+         return CachedSession(
+             cache=get_cache_backend(),
+             headers=headers,
+             timeout=timeout,
+             connector=connector,
+             trace_configs=[trace_config] if trace_config is not None else None
+         )
+
+     return aiohttp.ClientSession(
+         headers=headers,
+         timeout=timeout,
+         connector=connector,
+         trace_configs=[trace_config] if trace_config is not None else None
+     )
+
+
+ class Crawler:
+     """Concurrent HTTP crawler that manages throttling, retries, and robots."""
+
+     def __init__(
+         self,
+         concurrency: int | None = None,
+         per_host: int | None = None,
+         timeout: float | None = None,
+         *,
+         headers: dict | None = None,
+         proxies: dict | None = None,
+         retry_policy: RetryPolicy | None = None,
+         throttler: AbstractThrottler | None = None,
+         auto_throttle_target_concurrency: float | None = None,
+         auto_throttle_start_delay: float | None = None,
+         auto_throttle_max_delay: float | None = None,
+         respect_robots: bool | None = None,
+     ):
+         cfg = CRAWLER_SETTINGS
+
+         self.concurrency = concurrency if concurrency is not None else cfg.concurrency
+         self.per_host = per_host if per_host is not None else cfg.per_host
+
+         timeout = timeout if timeout is not None else cfg.timeout
+         self._timeout = aiohttp.ClientTimeout(total=timeout)
+
+         self._headers = cfg.headers | (headers or {})  # merge headers
+
+         _proxies = proxies if proxies is not None else cfg.proxies
+         self._proxies = _proxies if (isinstance(_proxies, defaultdict) or _proxies) else {}
+
+         self.retry_policy = retry_policy or RetryPolicy()
+
+         # auto-throttle defaults
+         auto_throttle_target_concurrency = auto_throttle_target_concurrency \
+             if auto_throttle_target_concurrency is not None \
+             else cfg.auto_throttle_target_concurrency
+
+         auto_throttle_start_delay = auto_throttle_start_delay \
+             if auto_throttle_start_delay is not None \
+             else cfg.auto_throttle_start_delay
+
+         auto_throttle_max_delay = auto_throttle_max_delay \
+             if auto_throttle_max_delay is not None \
+             else cfg.auto_throttle_max_delay
+
+         self.throttler = throttler or AutoThrottler(
+             target_concurrency=auto_throttle_target_concurrency or self.concurrency / 4.0,
+             start_delay=auto_throttle_start_delay,
+             max_delay=auto_throttle_max_delay,
+         )
+
+         self._sem_global = asyncio.Semaphore(self.concurrency)
+         self._sem_host = defaultdict(lambda: asyncio.Semaphore(self.per_host))
+
+         self._pending: asyncio.Queue[Request] = asyncio.Queue()
+         self._results: asyncio.Queue[Response] = asyncio.Queue()
+
+         self._session: aiohttp.ClientSession | None = None
+         self._workers: list[asyncio.Task] = []
+         self._closed = False
+         self._stats = CrawlerStats()
+
+         self.respect_robots = respect_robots if respect_robots is not None else cfg.respect_robots
+         self._robots_policy: RobotsTxtPolicy | None = None
+
+         # WARN: If SQLiteBackend caching is enabled and min(concurrency, per_host) > 1,
+         # write-contention is likely to occur.
+         if (CACHE_SETTINGS.enabled
+             and CACHE_SETTINGS.backend == "sqlite"
+             and min(self.concurrency, self.per_host) > 1
+         ):
+             log.warning(
+                 "SQLiteBackend caching is enabled and min(concurrency, per_host) > 1. "
+                 "Write-contention is likely to occur. Consider using RedisBackend."
+             )
+
+     def build_session(self) -> aiohttp.ClientSession:
+         """Construct an `aiohttp.ClientSession` with tracing and pooling."""
+         trace_config = build_trace_config(self._stats)
+         # Build the connector as late as possible; it needs the running event loop
+         connector = aiohttp.TCPConnector(limit=self.concurrency * 2, ttl_dns_cache=300)
+         return get_async_session(
+             headers=self._headers,
+             timeout=self._timeout,
+             connector=connector,
+             trace_config=trace_config
+         )
+
+     async def __aenter__(self) -> "Crawler":
+         """Initialize HTTP session and start background workers."""
+         if self._session is None:
+             # self._session = aiohttp.ClientSession(timeout=self._timeout)
+             self._session = self.build_session()
+
+         # Note: Set robots policy after session is created
+         if self.respect_robots:
+             self._robots_policy = RobotsTxtPolicy(self._session)
+
+         self._workers = [
+             asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
+             for i in range(self.concurrency)
+         ]
+         return self
+
+     async def __aexit__(self, *_) -> None:
+         """Tear down workers and close the HTTP session."""
+         self._closed = True
+         for w in self._workers:
+             w.cancel()
+
+         await asyncio.gather(*self._workers, return_exceptions=True)
+
+         if self._session:
+             await self._session.close()
+
+     def submit(self, req: Request) -> None:
+         """Queue a request for fetching or raise if crawler already closed."""
+         if self._closed:
+             raise RuntimeError("crawler is closed")
+         self._pending.put_nowait(req)
+
+     def __aiter__(self) -> AsyncIterator[Response]:
+         return self._result_iter()
+
+     async def _result_iter(self) -> AsyncIterator[Response]:
+         """Async iterator yielding responses as workers produce them."""
+         # while not self._closed:
+         while not (self._closed and self._results.empty()):
+             resp = await self._results.get()
+             self._results.task_done()
+             yield resp
+
+     def _proxy_for(self, url: str) -> str | None:
+         host = urllib.parse.urlsplit(url).hostname
+         try:
+             # bracket notation first, for defaultdicts
+             value = self._proxies[host]
+         except KeyError:
+             value = self._proxies.get(host)
+
+         if not value:
+             log.debug("proxy", extra={"host": host, "value": value})
+         return value
+
+     async def _worker(self) -> None:
+         """Worker loop that fetches pending requests and enqueues results."""
+         while True:
+             req = await self._pending.get()
+             try:
+                 resp = await self._fetch_one(req)
+                 if resp is not None:
+                     await self._results.put(resp)
+
+             except asyncio.CancelledError:
+                 # Must propagate cancellation
+                 log.debug("cancelled error", extra={"url": req.url})
+                 raise
+
+             except gaierror:
+                 # Ignore DNS errors
+                 log.warning("DNS error", extra={"url": req.url})
+                 pass
+
+             except Exception as exc:
+                 log.warning("exception", extra={"url": req.url})
+                 # Last-resort safety: never drop a request silently
+                 await self._results.put(Response(req, 0, b"", error=exc))
+             finally:
+                 self._pending.task_done()
+
+     async def _fetch_one(self, req: Request) -> Response | None:
+         """Fetch a single request, handling robots, throttling, and retries."""
+         host = req.hostname
+
+         if self._robots_policy:
+             can_fetch = await self._robots_policy.can_fetch(
+                 req.url, self._headers.get("User-Agent")
+             )
+             if not can_fetch:
+                 log.debug("disallowed by robots.txt", extra={"url": req.url})
+                 return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))
+
+         # TODO: Move this filter to hooks
+         if req.url.lower().endswith((".pdf", ".zip", ".exe")):
+             req.max_retries = 0
+
+         async with self._sem_global, self._sem_host[host]:
+             t0 = asyncio.get_running_loop().time()
+             await self.throttler.wait(host)
+             dt = asyncio.get_running_loop().time() - t0
+
+             self._stats.throttle_waits += 1
+             self._stats.throttle_wait_time += dt
+             self._stats.throttle_waits_by_host[host] += 1
+
+             start = time.monotonic()
+             try:
+                 log.info("fetching", extra={"url": req.url})
+                 async with self._session.get(
+                     req.url,
+                     headers=self._headers | req.headers,
+                     proxy=self._proxy_for(req.url),
+                     timeout=req.timeout or self._timeout,
+                 ) as resp:
+                     from_cache = getattr(resp, "from_cache", False)
+                     if from_cache:
+                         # NOTE: This is a bit of a hack, but it works. aiohttp-client-cache does not
+                         # interface with TraceConfigs on cache hit, so we have to do it here.
+                         self._stats.requests_cache_hit += 1
+                         log.info("[CACHE HIT]", extra={"req.url": req.url, "resp.url": resp.url})
+                     else:
+                         log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
+
+                     body = await resp.read()
+
+                     latency = time.monotonic() - start
+                     self.throttler.record_latency(host, latency)
+
+                     if self.retry_policy.should_retry(req, response=resp):
+                         await self._retry(req)
+                         return None
+
+                     return Response(req, resp.status, body, dict(resp.headers))
+             except asyncio.CancelledError:
+                 # Normal during shutdown / timeout propagation
+                 log.debug("cancelled error", extra={"url": req.url})
+                 raise
+             except Exception as exc:
+                 latency = time.monotonic() - start
+                 self.throttler.record_latency(host, latency)
+
+                 if self.retry_policy.should_retry(req, exception=exc):
+                     await self._retry(req)
+                     return None
+
+                 log.error("request failed", extra={"url": req.url}, exc_info=exc)
+                 return Response(req, 0, b"", error=exc)
+
+     async def _retry(self, req: Request) -> None:
+         """Reschedule a request according to the retry policy."""
+         req.retries += 1
+         delay = self.retry_policy.get_delay(req)
+
+         log.warning(
+             "retrying",
+             extra={"url": req.url, "retry": req.retries, "delay": delay},
+         )
+
+         if delay:
+             await asyncio.sleep(delay)
+
+         self.submit(req)
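
Putting the crawler together, a usage sketch (the URLs are placeholders and the concurrency values arbitrary). Responses arrive in completion order, so the loop counts them and stops once every submitted request has produced a result; the iterator itself does not terminate on its own:

    import asyncio

    from wxpath.http.client import Crawler, Request

    async def main():
        urls = ["https://example.com", "https://example.org"]
        async with Crawler(concurrency=4, per_host=2) as crawler:
            for url in urls:
                crawler.submit(Request(url))

            seen = 0
            async for resp in crawler:        # yields Response objects as workers finish
                print(resp.request.url, resp.status, resp.error)
                seen += 1
                if seen == len(urls):
                    break

    asyncio.run(main())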
@@ -0,0 +1,38 @@
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class Request:
+     """HTTP request envelope used by the crawler."""
+     url: str
+     method: str = "GET"
+     headers: dict[str, str] = field(default_factory=dict)
+     timeout: float = 15.0
+
+     retries: int = 0
+     max_retries: int | None = None
+     dont_retry: bool = False
+
+     meta: dict[str, Any] = field(default_factory=dict)
+
+     created_at: float = field(default_factory=time.monotonic)
+
+     def copy_for_retry(self) -> "Request":
+         """Create a copy incrementing the retry counter for scheduling."""
+         return Request(
+             url=self.url,
+             method=self.method,
+             headers=self.headers,
+             timeout=self.timeout,
+             retries=self.retries + 1,
+             max_retries=self.max_retries,
+             dont_retry=self.dont_retry,
+             meta=self.meta,
+         )
+
+     @property
+     def hostname(self) -> str:
+         from urllib.parse import urlsplit
+         return urlsplit(self.url).hostname or ""
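
A short sketch of the envelope in use; the URL and values are illustrative:

    from wxpath.http.client.request import Request

    req = Request(
        "https://example.com/page",
        headers={"Accept": "text/html"},
        meta={"depth": 0},          # free-form data carried alongside the request
    )
    print(req.hostname)             # "example.com"

    retry = req.copy_for_retry()
    print(retry.retries)            # 1 (headers and meta are shared by reference)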
@@ -0,0 +1,14 @@
+ # wxpath/http/client/response.py
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from wxpath.http.client.request import Request
+
+
+ @dataclass
+ class Response:
+     request: Request
+     status: int
+     body: bytes
+     headers: dict[str, str] | None = None
+     error: Optional[Exception] = field(default=None, kw_only=True)
@@ -0,0 +1,16 @@
+ import random
+
+
+ def exponential_backoff(
+     attempt: int,
+     base: float = 0.5,
+     cap: float = 30.0,
+     jitter: bool = True,
+ ) -> float:
+     """
+     Exponential backoff with optional jitter.
+     """
+     delay = min(cap, base * (2 ** attempt))
+     if jitter:
+         delay *= random.uniform(0.7, 1.3)  # jitter may push the delay slightly above cap
+     return delay
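
With jitter disabled the schedule is deterministic, which makes the growth easy to see:

    from wxpath.http.policy.backoff import exponential_backoff

    for attempt in range(8):
        print(attempt, exponential_backoff(attempt, jitter=False))
    # 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 30.0, 30.0  -- doubling from base until capped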
@@ -0,0 +1,35 @@
+ from wxpath.http.policy.backoff import exponential_backoff
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class RetryPolicy:
+     def __init__(
+         self,
+         max_retries: int = 3,
+         retry_statuses: set[int] | None = None,
+     ):
+         self.max_retries = max_retries
+         self.retry_statuses = retry_statuses or {500, 502, 503, 504}
+
+     def should_retry(self, request, response=None, exception=None) -> bool:
+         if request.dont_retry:
+             return False
+
+         if request.max_retries is not None and request.retries >= request.max_retries:
+             return False
+
+         if request.retries >= self.max_retries:
+             return False
+
+         if response is not None and response.status in self.retry_statuses:
+             return True
+
+         if exception is not None:
+             return True
+
+         return False
+
+     def get_delay(self, request) -> float:
+         return exponential_backoff(request.retries)
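
A custom policy can be passed straight to the Crawler; the sketch below also retries 429 responses, which the default status set does not cover:

    from wxpath.http.client import Crawler
    from wxpath.http.policy.retry import RetryPolicy

    policy = RetryPolicy(max_retries=5, retry_statuses={429, 500, 502, 503, 504})
    crawler = Crawler(retry_policy=policy)
    # Per-request controls still apply: Request.dont_retry disables retries entirely,
    # and Request.max_retries can cap retries below the policy's max_retries.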