wxpath 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
+ import atexit
+ import json
+ import os
+ import queue
+ import threading
+ import time
+
+ from elementpath.serialization import XPathMap, XPathNode
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class SerializeXPathMapAndNodeHook:
+     """
+     Serialize XPathMap and XPathNode objects to plain Python types.
+     This is enabled by default (once this module is imported).
+     """
+     def post_extract(self, value):
+         if isinstance(value, (list, tuple, set)):
+             return type(value)(self.post_extract(v) for v in value)
+         if isinstance(value, XPathMap):
+             return {k: self.post_extract(v) for k, v in value.items()}
+         if isinstance(value, XPathNode):
+             return self.post_extract(value.obj)
+         return value
+
+
+ class JSONLWriter:
+     """
+     Efficient writer that mirrors items to an NDJSON file.
+     - Non-blocking: post_extract enqueues and returns immediately.
+     - Background thread flushes to disk.
+     - Skips non-JSONable values (e.g., raw HtmlElement) by default.
+       Customize _jsonable() to change behavior.
+     """
+     def __init__(self, path=None):
+         self.path = path or os.getenv("WXPATH_OUT", "extractions.ndjson")
+         self._q: "queue.Queue[str]" = queue.Queue(maxsize=10000)
+         self._written = 0
+         self._dropped = 0
+         self._stop = False
+         self._t = threading.Thread(target=self._writer, name="wxpath-ndjson-writer", daemon=True)
+         self._t.start()
+         atexit.register(self._shutdown)
+
+     # ---- hook API ----
+     def post_extract(self, value):
+         js = self._jsonable(value)
+         if js is not None:
+             line = json.dumps(js, ensure_ascii=False, separators=(",", ":"))
+             try:
+                 self._q.put_nowait(line)
+             except queue.Full:
+                 self._dropped += 1
+                 if self._dropped in (1, 100, 1000) or self._dropped % 10000 == 0:
+                     log.warning("NDJSON queue full; dropping items",
+                                 extra={"dropped": self._dropped, "written": self._written})
+         return value  # always pass-through
+
+     # ---- internals ----
+     def _writer(self):
+         # Open lazily to avoid creating files when nothing is produced.
+         f = None
+         try:
+             last_flush = time.time()
+             while not self._stop or not self._q.empty():
+                 try:
+                     line = self._q.get(timeout=0.5)
+                 except queue.Empty:
+                     line = None
+                 if line is not None:
+                     if f is None:
+                         f = open(self.path, "a", buffering=1, encoding="utf-8")  # line-buffered
+                     f.write(line)
+                     f.write("\n")
+                     self._written += 1
+                 # periodic flush guard for OS buffers even with line buffering
+                 if f and (time.time() - last_flush) > 1.0:
+                     f.flush()
+                     last_flush = time.time()
+         finally:
+             if f:
+                 f.flush()
+                 f.close()
+             if self._dropped:
+                 log.warning("NDJSON writer finished with drops",
+                             extra={"dropped": self._dropped, "written": self._written})
+
+     def _shutdown(self):
+         self._stop = True
+         if self._t.is_alive():
+             self._t.join(timeout=2)
+
+     def _jsonable(self, v):
+         # Keep it conservative: only write JSON-friendly shapes by default.
+         # You can relax this if you want to serialize HtmlElement metadata, etc.
+         if v is None or isinstance(v, (bool, int, float, str, list, dict)):
+             return v
+         # Handle common wxpath types gently:
+         # - WxStr: stringify
+         if v.__class__.__name__ == "WxStr":
+             return str(v)
+         # - lxml HtmlElement: record minimal metadata instead of the whole DOM
+         base_url = getattr(v, "base_url", None)
+         tag = getattr(v, "tag", None)
+         if base_url or tag:
+             return {"_element": tag, "url": base_url}
+         return None  # skip unknowns
+
+
+ NDJSONWriter = JSONLWriter
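A minimal usage sketch for the new writer, wired through the hook registry added in the next file. The import path for JSONLWriter is hypothetical: the diff viewer omits file names, so the module's real location in the package is not shown here.

    from wxpath import hooks
    # Hypothetical import path -- adjust to wherever this module actually lives.
    from wxpath.hooks_ndjson import JSONLWriter

    # Mirror every extracted value to a newline-delimited JSON file.
    hooks.register(JSONLWriter(path="extractions.ndjson"))

Each JSON-friendly value then lands as one compact JSON document per line; raw HtmlElement values are reduced to {"_element": tag, "url": base_url} metadata, and anything else is skipped.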
@@ -0,0 +1,133 @@
+ """
+ Pluggable hook system for wxpath.
+
+ Write once:
+
+     from wxpath import hooks
+
+     @hooks.register
+     class OnlyEnglish:
+         def post_parse(self, ctx, elem):
+             lang = elem.xpath('string(/html/@lang)').lower()[:2]
+             return elem if lang in ("en", "") else None
+
+ ... and wxpath.engine will call it automatically.
+ """
+
+ from __future__ import annotations
+
+ import functools
+ from collections import OrderedDict
+ from collections.abc import Generator
+ from dataclasses import dataclass, field
+ from typing import Any, Iterable, List, Optional, Protocol
+
+ from lxml import html
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ # --------------------------------------------------------------------------- #
+ # Dataclass describing the crawl context for a single URL
+ # --------------------------------------------------------------------------- #
+ @dataclass
+ class FetchContext:
+     url: str
+     backlink: Optional[str]
+     depth: int
+     segments: list  # remaining op/value pairs
+     user_data: dict = field(default_factory=dict)
+
+
+ # --------------------------------------------------------------------------- #
+ # Hook protocol - every method is optional
+ # --------------------------------------------------------------------------- #
+ class Hook(Protocol):
+     # Return False to abort fetching this URL
+     # def pre_fetch(self, ctx: FetchContext) -> bool: ...
+
+     # May return modified HTML bytes or None to drop this branch entirely
+     def post_fetch(self, ctx: FetchContext, html_bytes: bytes) -> bytes | None: ...
+
+     # May return modified element or None to drop this branch entirely
+     def post_parse(
+         self, ctx: FetchContext, elem: html.HtmlElement
+     ) -> html.HtmlElement | None: ...
+
+     # Called for every candidate link; return False to prevent enqueueing it
+     # def pre_queue(self, ctx: FetchContext, url: str) -> bool: ...
+
+     # Called for every extracted value; may transform or drop it
+     def post_extract(self, value: Any) -> Any: ...
+
+
+ # --------------------------------------------------------------------------- #
+ # Global registry helpers
+ # --------------------------------------------------------------------------- #
+ _global_hooks: OrderedDict[str, Hook] = OrderedDict()
+
+
+ def register(hook: Hook | type) -> Hook:
+     """
+     Decorator / helper to add a Hook to the global list.
+
+     Example
+     -------
+     >>> @register
+     ... class DebugHook:
+     ...     def post_fetch(self, ctx, html_bytes):
+     ...         print("Fetched", ctx.url)
+     ...         return html_bytes
+     """
+
+     hook_name = getattr(hook, '__name__', hook.__class__.__name__)
+     if hook_name in _global_hooks:
+         return hook
+
+     instance = hook() if isinstance(hook, type) else hook
+     _global_hooks[hook_name] = instance
+     return hook
+
+
+ def get_hooks() -> List[Hook]:
+     """Return the list of globally-registered hooks (read-only)."""
+     return list(_global_hooks.values())
+
+
+ def iter_post_extract_hooks() -> Iterable[Hook]:
+     yield from (h for h in _global_hooks.values() if hasattr(h, "post_extract"))
+
+
+ def pipe_post_extract(gen_func):
+     """
+     Decorator: wrap a *generator function* so every yielded value
+     is piped through the registered post_extract hooks.
+     """
+     @functools.wraps(gen_func)
+     def wrapper(*args, **kwargs) -> Generator:
+         for item in gen_func(*args, **kwargs):
+             for hook in iter_post_extract_hooks():
+                 item = hook.post_extract(item)
+                 if item is None:  # hook decided to drop it
+                     break
+             if item is not None:
+                 yield item
+     return wrapper
+
+
+ def pipe_post_extract_async(async_gen_func):
+     """
+     Async variant - wraps an *async* generator function.
+     """
+     @functools.wraps(async_gen_func)
+     async def wrapper(*args, **kwargs):
+         async for item in async_gen_func(*args, **kwargs):
+             for hook in iter_post_extract_hooks():
+                 item = hook.post_extract(item)
+                 if item is None:
+                     break
+             if item is not None:
+                 yield item
+     return wrapper
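A short sketch of how the registry and the pipe_post_extract decorator fit together. The import path follows the module's own docstring; the hook class and generator below are illustrative, not part of the package.

    from wxpath import hooks

    @hooks.register
    class Uppercase:
        # post_extract is the only hook method this example implements.
        def post_extract(self, value):
            return value.upper() if isinstance(value, str) else value

    @hooks.pipe_post_extract
    def extract_titles():
        yield "first title"
        yield "second title"

    print(list(extract_titles()))  # ['FIRST TITLE', 'SECOND TITLE']

Returning None from any post_extract hook drops the item entirely; otherwise each hook sees the value produced by the previous one, in registration order.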
File without changes
@@ -0,0 +1,9 @@
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.http.client.request import Request
+ from wxpath.http.client.response import Response
+
+ __all__ = [
+     "Crawler",
+     "Request",
+     "Response"
+ ]
@@ -0,0 +1,196 @@
+ import asyncio
+ import time
+ import urllib.parse
+ from collections import defaultdict
+ from socket import gaierror
+ from typing import AsyncIterator
+
+ import aiohttp
+
+ from wxpath.http.client.request import Request
+ from wxpath.http.client.response import Response
+ from wxpath.http.policy.retry import RetryPolicy
+ from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
+ from wxpath.http.stats import CrawlerStats, build_trace_config
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+ HEADERS = {"User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                           "AppleWebKit/537.36 (KHTML, like Gecko) "
+                           "Chrome/142.0.0.0 Safari/537.36")}
+
+
+ class Crawler:
+     def __init__(
+         self,
+         concurrency: int = 16,
+         per_host: int = 8,
+         timeout: int = 15,
+         *,
+         headers: dict | None = None,
+         proxies: dict | None = None,
+         retry_policy: RetryPolicy | None = None,
+         throttler: AbstractThrottler | None = None,
+         auto_throttle_target_concurrency: float | None = None,
+         auto_throttle_start_delay: float = 0.25,
+         auto_throttle_max_delay: float = 10.0,
+     ):
+         self.concurrency = concurrency
+         self._timeout = aiohttp.ClientTimeout(total=timeout)
+         self._headers = HEADERS | (headers or {})  # merge headers
+         self._proxies = proxies or {}
+
+         self.retry_policy = retry_policy or RetryPolicy()
+         self.throttler = throttler or AutoThrottler(
+             target_concurrency=auto_throttle_target_concurrency or concurrency/4.0,
+             start_delay=auto_throttle_start_delay,
+             max_delay=auto_throttle_max_delay,
+         )
+         self._sem_global = asyncio.Semaphore(concurrency)
+         self._sem_host = defaultdict(lambda: asyncio.Semaphore(per_host))
+
+         self._pending: asyncio.Queue[Request] = asyncio.Queue()
+         self._results: asyncio.Queue[Response] = asyncio.Queue()
+
+         self._session: aiohttp.ClientSession | None = None
+         self._workers: list[asyncio.Task] = []
+         self._closed = False
+         self._stats = CrawlerStats()
+
+     def build_session(self):
+         trace_config = build_trace_config(self._stats)
+         # Build the connector as late as possible: it needs the running event loop.
+         connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
+         return aiohttp.ClientSession(
+             headers=self._headers,
+             timeout=self._timeout,
+             connector=connector,
+             trace_configs=[trace_config]
+         )
+
+     async def __aenter__(self):
+         if self._session is None:
+             # self._session = aiohttp.ClientSession(timeout=self._timeout)
+             self._session = self.build_session()
+
+         self._workers = [
+             asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
+             for i in range(self.concurrency)
+         ]
+         return self
+
+     async def __aexit__(self, *_):
+         self._closed = True
+         for w in self._workers:
+             w.cancel()
+         await asyncio.gather(*self._workers, return_exceptions=True)
+         if self._session:
+             await self._session.close()
+
+     def submit(self, req: Request):
+         if self._closed:
+             raise RuntimeError("crawler is closed")
+         self._pending.put_nowait(req)
+
+     def __aiter__(self) -> AsyncIterator[Response]:
+         return self._result_iter()
+
+     async def _result_iter(self):
+         # while not self._closed:
+         while not (self._closed and self._results.empty()):
+             resp = await self._results.get()
+             self._results.task_done()
+             yield resp
+
+     def _proxy_for(self, url: str):
+         host = urllib.parse.urlsplit(url).hostname
+         return self._proxies.get(host)
+
+     async def _worker(self):
+         while True:
+             req = await self._pending.get()
+             try:
+                 resp = await self._fetch_one(req)
+                 if resp is not None:
+                     await self._results.put(resp)
+
+             except asyncio.CancelledError:
+                 # Must propagate cancellation
+                 log.debug("cancelled error", extra={"url": req.url})
+                 raise
+
+             except gaierror:
+                 # Ignore DNS errors
+                 log.warning("DNS error", extra={"url": req.url})
+                 pass
+
+             except Exception as exc:
+                 log.warning("exception", extra={"url": req.url})
+                 # Last-resort safety: never drop a request silently
+                 await self._results.put(Response(req, 0, b"", error=exc))
+             finally:
+                 self._pending.task_done()
+
+     async def _fetch_one(self, req: Request) -> Response | None:
+         host = req.hostname
+
+         # TODO: Move this filter to hooks
+         if req.url.lower().endswith((".pdf", ".zip", ".exe")):
+             req.max_retries = 0
+
+         async with self._sem_global, self._sem_host[host]:
+             t0 = asyncio.get_running_loop().time()
+             await self.throttler.wait(host)
+             dt = asyncio.get_running_loop().time() - t0
+
+             self._stats.throttle_waits += 1
+             self._stats.throttle_wait_time += dt
+             self._stats.throttle_waits_by_host[host] += 1
+
+             start = time.monotonic()
+             try:
+                 async with self._session.get(
+                     req.url,
+                     headers=self._headers | req.headers,
+                     proxy=self._proxy_for(req.url),
+                     timeout=req.timeout or self._timeout,
+                 ) as resp:
+                     body = await resp.read()
+
+                 latency = time.monotonic() - start
+                 self.throttler.record_latency(host, latency)
+
+                 if self.retry_policy.should_retry(req, response=resp):
+                     await self._retry(req)
+                     return None
+
+                 return Response(req, resp.status, body, dict(resp.headers))
+             except asyncio.CancelledError:
+                 # Normal during shutdown / timeout propagation
+                 log.debug("cancelled error", extra={"url": req.url})
+                 raise
+             except Exception as exc:
+                 latency = time.monotonic() - start
+                 self.throttler.record_latency(host, latency)
+
+                 if self.retry_policy.should_retry(req, exception=exc):
+                     await self._retry(req)
+                     return None
+
+                 log.error("request failed", extra={"url": req.url}, exc_info=exc)
+                 return Response(req, 0, b"", error=exc)
+
+     async def _retry(self, req: Request):
+         req.retries += 1
+         delay = self.retry_policy.get_delay(req)
+
+         log.warning(
+             "retrying",
+             extra={"url": req.url, "retry": req.retries, "delay": delay},
+         )
+
+         if delay:
+             await asyncio.sleep(delay)
+
+         self.submit(req)
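A usage sketch for the crawler, assuming the __init__ file above is wxpath/http/client/__init__.py (the diff omits file names). Note that the response iterator only terminates once the crawler is closed, so the consumer stops itself after the expected number of responses.

    import asyncio

    from wxpath.http.client import Crawler, Request

    async def main():
        urls = ["https://example.com", "https://example.org"]
        async with Crawler(concurrency=4, per_host=2) as crawler:
            for url in urls:
                crawler.submit(Request(url=url))
            done = 0
            async for resp in crawler:  # Response objects arrive as workers finish
                print(resp.request.url, resp.status, len(resp.body))
                done += 1
                if done == len(urls):  # stop manually; iteration never ends on its own
                    break

    asyncio.run(main())

Failed requests still produce a Response with status 0 and the exception attached to resp.error, so the count of responses eventually matches the count of submitted requests.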
@@ -0,0 +1,35 @@
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Dict
+
+
+ @dataclass
+ class Request:
+     url: str
+     method: str = "GET"
+     headers: Dict[str, str] = field(default_factory=dict)
+     timeout: float = 15.0
+
+     retries: int = 0
+     max_retries: int | None = None
+     dont_retry: bool = False
+
+     meta: Dict[str, Any] = field(default_factory=dict)
+
+     created_at: float = field(default_factory=time.monotonic)
+
+     def copy_for_retry(self) -> "Request":
+         return Request(
+             url=self.url,
+             method=self.method,
+             headers=self.headers,
+             timeout=self.timeout,
+             retries=self.retries + 1,
+             max_retries=self.max_retries,
+             meta=self.meta,
+         )
+
+     @property
+     def hostname(self) -> str:
+         from urllib.parse import urlsplit
+         return urlsplit(self.url).hostname or ""
@@ -0,0 +1,14 @@
+ # wxpath/http/client/response.py
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from wxpath.http.client.request import Request
+
+
+ @dataclass
+ class Response:
+     request: Request
+     status: int
+     body: bytes
+     headers: dict | None = None
+     error: Optional[Exception] = field(default=None, kw_only=True)
@@ -0,0 +1,16 @@
+ import random
+
+
+ def exponential_backoff(
+     attempt: int,
+     base: float = 0.5,
+     cap: float = 30.0,
+     jitter: bool = True,
+ ) -> float:
+     """
+     Exponential backoff with optional jitter.
+     """
+     delay = min(cap, base * (2 ** attempt))
+     if jitter:
+         delay *= random.uniform(0.7, 1.3)
+     return delay
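With the defaults above, the delay doubles per attempt until it reaches the cap. A quick check of the schedule, with jitter disabled so the numbers are deterministic:

    from wxpath.http.policy.backoff import exponential_backoff

    # attempts 0..6 -> 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 30.0 (32.0 capped at 30.0)
    delays = [exponential_backoff(attempt, jitter=False) for attempt in range(7)]
    print(delays)

Because jitter is applied after the cap, a jittered delay can exceed cap by up to 30%.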
@@ -0,0 +1,35 @@
+ from wxpath.http.policy.backoff import exponential_backoff
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class RetryPolicy:
+     def __init__(
+         self,
+         max_retries: int = 3,
+         retry_statuses: set[int] | None = None,
+     ):
+         self.max_retries = max_retries
+         self.retry_statuses = retry_statuses or {500, 502, 503, 504}
+
+     def should_retry(self, request, response=None, exception=None) -> bool:
+         if request.dont_retry:
+             return False
+
+         if request.max_retries is not None and request.retries >= request.max_retries:
+             return False
+
+         if request.retries >= self.max_retries:
+             return False
+
+         if response is not None and response.status in self.retry_statuses:
+             return True
+
+         if exception is not None:
+             return True
+
+         return False
+
+     def get_delay(self, request) -> float:
+         return exponential_backoff(request.retries)
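A small sketch of the decision logic, using a minimal stand-in for the response since should_retry only reads its status attribute:

    from types import SimpleNamespace

    from wxpath.http.client.request import Request
    from wxpath.http.policy.retry import RetryPolicy

    policy = RetryPolicy(max_retries=2)
    req = Request(url="https://example.com")

    server_error = SimpleNamespace(status=503)
    assert policy.should_retry(req, response=server_error)      # 503 is retryable

    req.retries = 2
    assert not policy.should_retry(req, response=server_error)  # retry budget spent

    req = Request(url="https://example.com", dont_retry=True)
    assert not policy.should_retry(req, exception=TimeoutError())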
@@ -0,0 +1,114 @@
+ import asyncio
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ # Abstract Base Class
+ class AbstractThrottler(ABC):
+     @abstractmethod
+     async def wait(self, host: str):
+         pass
+
+     @abstractmethod
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ class AutoThrottler(AbstractThrottler):
+     """
+     Scrapy-inspired auto-throttle, simplified:
+     - increases delay when latency increases
+     - decreases delay when responses are fast
+
+     Explanation:
+     - target_concurrency is the desired number of concurrent requests
+     - start_delay is the initial delay
+     - max_delay is the maximum delay
+     - smoothing is the exponential smoothing factor
+     """
+
+     def __init__(
+         self,
+         start_delay: float = 0.25,
+         max_delay: float = 10.0,
+         target_concurrency: float = 1.0,
+         smoothing: float = 0.7,
+     ):
+         self.start_delay = start_delay
+         self.max_delay = max_delay
+         self.target_concurrency = target_concurrency
+         self.smoothing = smoothing
+
+         self._delay = defaultdict(lambda: start_delay)
+         self._latency = defaultdict(lambda: None)
+
+     def record_latency(self, host: str, latency: float):
+         prev = self._latency[host]
+         if prev is None:
+             self._latency[host] = latency
+         else:
+             self._latency[host] = (
+                 # exponential smoothing
+                 self.smoothing * prev + (1 - self.smoothing) * latency
+             )
+
+         self._recalculate_delay(host)
+
+     def _recalculate_delay(self, host: str):
+         latency = self._latency[host]
+         if not latency:
+             return
+
+         target_delay = latency / self.target_concurrency
+         delay = min(self.max_delay, max(0.0, target_delay))
+         self._delay[host] = delay
+
+         log.debug(
+             "auto-throttle",
+             extra={"host": host, "latency": latency, "delay": delay},
+         )
+
+     async def wait(self, host: str):
+         delay = self._delay[host]
+         if delay > 0:
+             await asyncio.sleep(delay)
+
+
+ class ImpoliteThrottle(AbstractThrottler):
+     """
+     Zero-delay throttler.
+     """
+
+     async def wait(self, host: str):
+         pass
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ ZeroWaitThrottler = ImpoliteThrottle
+
+
+ class SimpleThrottler(AbstractThrottler):
+     """
+     Fixed-delay throttler. Optionally provide per-host delays via `per_host_delays`.
+     """
+     def __init__(self, delay: float, per_host_delays: dict[str, float] | None = None):
+         self.delay = delay
+         self._delays = per_host_delays or defaultdict(lambda: delay)
+
+     async def wait(self, host: str):
+         if host in self._delays:
+             await asyncio.sleep(self._delays[host])
+         else:
+             await asyncio.sleep(self.delay)
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ FixedDelayThrottler = SimpleThrottler
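A sketch of how AutoThrottler turns observed latency into a per-host delay (delay = smoothed latency / target_concurrency, clamped to max_delay); the host name and timings are illustrative:

    import asyncio

    from wxpath.http.policy.throttler import AutoThrottler

    async def demo():
        throttler = AutoThrottler(start_delay=0.25, target_concurrency=2.0)
        throttler.record_latency("example.com", 1.0)   # first sample, no smoothing yet
        # delay is now 1.0 / 2.0 = 0.5s for example.com; other hosts keep start_delay
        await throttler.wait("example.com")            # sleeps ~0.5s
        throttler.record_latency("example.com", 0.2)   # smoothed: 0.7*1.0 + 0.3*0.2 = 0.76
        await throttler.wait("example.com")            # sleeps ~0.38s

    asyncio.run(demo())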