wxpath-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
+ import asyncio
+ import urllib.parse
+ import urllib.robotparser
+
+ import aiohttp
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class RobotsTxtPolicy:
+     """Caches and evaluates robots.txt rules for crawler requests."""
+
+     def __init__(self,
+                  session: aiohttp.ClientSession,
+                  default_parser: type['RobotsParserBase'] | None = None):
+         self._session = session
+         self._parsers: dict[str, "RobotsParserBase"] = {}
+         self._lock = asyncio.Lock()
+         self._default_parser = default_parser or UrllibRobotParser
+
+     async def can_fetch(self, url: str, user_agent: str | None) -> bool:
+         """Return whether the crawler is allowed to fetch `url`."""
+         host = urllib.parse.urlsplit(url).hostname
+         if not host:
+             return False
+
+         # Multiple aiohttp workers run concurrently, so guard the parser cache with a lock.
+         async with self._lock:
+             if host not in self._parsers:
+                 self._parsers[host] = await self._fetch_robots_txt(host)
+
+         return self._parsers[host].can_fetch(url, user_agent)
+
+     async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
+         """Retrieve and parse the robots.txt for `host`, failing open on errors."""
+         url = f"http://{host}/robots.txt"
+         try:
+             async with self._session.get(url) as response:
+                 if response.status == 200:
+                     text = await response.text()
+                     # UrllibRobotParser expects a list of lines; other parsers take the raw text.
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser(text.splitlines())
+                     else:
+                         return self._default_parser(text)
+                 else:
+                     # Non-200 response: treat as an empty robots.txt and allow all.
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser([])
+                     else:
+                         return self._default_parser("")
+         except Exception:
+             # If robots.txt is unavailable, allow all requests (fail open).
+             log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
+             if self._default_parser == UrllibRobotParser:
+                 return self._default_parser([])
+             else:
+                 return self._default_parser("")
+
+
+ class RobotsParserBase:
+     """Base type for robots.txt parsers used by the policy."""
+
+
+ class UrllibRobotParser(RobotsParserBase):
+     """Adapter around `urllib.robotparser.RobotFileParser`."""
+
+     def __init__(self, text):
+         self._parser = urllib.robotparser.RobotFileParser()
+         # urllib.robotparser.RobotFileParser.parse() expects a list of lines.
+         if isinstance(text, str):
+             lines = text.splitlines() if text else []
+         else:
+             lines = text if text else []
+         self._parser.parse(lines)
+
+     def can_fetch(self, url, user_agent):
+         """Return whether the URL is allowed for the given user agent."""
+         return self._parser.can_fetch(user_agent, url)
+
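A minimal usage sketch of `RobotsTxtPolicy`. The import path `wxpath.http.robots` is an assumption (this hunk carries no filename), so adjust it to the actual module:

```python
# Hypothetical usage sketch; the import path is assumed, not confirmed by the diff.
import asyncio

import aiohttp

from wxpath.http.robots import RobotsTxtPolicy  # assumed module path


async def main():
    async with aiohttp.ClientSession() as session:
        policy = RobotsTxtPolicy(session)
        # robots.txt is fetched once per host, cached, and consulted for later checks
        allowed = await policy.can_fetch("https://example.com/some/page", "wxpath-bot")
        print("allowed:", allowed)


asyncio.run(main())
```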
@@ -0,0 +1,114 @@
+ import asyncio
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ # Abstract base class
+ class AbstractThrottler(ABC):
+     @abstractmethod
+     async def wait(self, host: str):
+         pass
+
+     @abstractmethod
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ class AutoThrottler(AbstractThrottler):
+     """
+     Scrapy-inspired auto-throttle, simplified:
+     - increases delay when latency increases
+     - decreases delay when responses are fast
+
+     Parameters:
+     - target_concurrency is the desired number of concurrent requests
+     - start_delay is the initial delay
+     - max_delay is the maximum delay
+     - smoothing is the exponential smoothing factor
+     """
+
+     def __init__(
+         self,
+         start_delay: float = 0.25,
+         max_delay: float = 10.0,
+         target_concurrency: float = 1.0,
+         smoothing: float = 0.7,
+     ):
+         self.start_delay = start_delay
+         self.max_delay = max_delay
+         self.target_concurrency = target_concurrency
+         self.smoothing = smoothing
+
+         self._delay = defaultdict(lambda: start_delay)
+         self._latency = defaultdict(lambda: None)
+
+     def record_latency(self, host: str, latency: float):
+         prev = self._latency[host]
+         if prev is None:
+             self._latency[host] = latency
+         else:
+             self._latency[host] = (
+                 # exponential smoothing
+                 self.smoothing * prev + (1 - self.smoothing) * latency
+             )
+
+         self._recalculate_delay(host)
+
+     def _recalculate_delay(self, host: str):
+         latency = self._latency[host]
+         if not latency:
+             return
+
+         target_delay = latency / self.target_concurrency
+         delay = min(self.max_delay, max(0.0, target_delay))
+         self._delay[host] = delay
+
+         log.debug(
+             "auto-throttle",
+             extra={"host": host, "latency": latency, "delay": delay},
+         )
+
+     async def wait(self, host: str):
+         delay = self._delay[host]
+         if delay > 0:
+             await asyncio.sleep(delay)
+
+
+ class ImpoliteThrottle(AbstractThrottler):
+     """
+     Zero-delay throttler.
+     """
+
+     async def wait(self, host: str):
+         pass
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ ZeroWaitThrottler = ImpoliteThrottle
+
+
+ class SimpleThrottler(AbstractThrottler):
+     """
+     Fixed-delay throttler. Optionally provide per-host delays via `per_host_delays`.
+     """
+     def __init__(self, delay: float, per_host_delays: dict[str, float] | None = None):
+         self.delay = delay
+         self._delays = per_host_delays or defaultdict(lambda: delay)
+
+     async def wait(self, host: str):
+         if host in self._delays:
+             await asyncio.sleep(self._delays[host])
+         else:
+             await asyncio.sleep(self.delay)
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ FixedDelayThrottler = SimpleThrottler
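A small worked example of the delay rule above: per-host delays converge toward `latency / target_concurrency`, capped at `max_delay`. The numbers are illustrative only, and the import path is assumed (this hunk carries no filename):

```python
# Illustrative only: exercises AutoThrottler's smoothing and delay update directly.
import asyncio

from wxpath.http.throttle import AutoThrottler  # assumed module path


async def demo():
    throttler = AutoThrottler(start_delay=0.25, target_concurrency=2.0, smoothing=0.7)

    # First sample seeds the EWMA with the raw latency -> delay = 0.8 / 2.0 = 0.4s
    throttler.record_latency("example.com", 0.8)

    # Second sample: EWMA = 0.7 * 0.8 + 0.3 * 0.2 = 0.62 -> delay = 0.62 / 2.0 = 0.31s
    throttler.record_latency("example.com", 0.2)

    await throttler.wait("example.com")  # sleeps roughly 0.31s


asyncio.run(demo())
```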
wxpath/http/stats.py ADDED
@@ -0,0 +1,102 @@
+ """
+ aiohttp request statistics and tracing hooks.
+ """
+
+ import time
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from aiohttp import TraceConfig
+
+
+ @dataclass
+ class CrawlerStats:
+     # ---- Lifecycle counts ----
+     requests_enqueued: int = 0
+     requests_started: int = 0
+     requests_completed: int = 0
+     requests_cache_hit: int = 0
+
+     # ---- Concurrency ----
+     in_flight_global: int = 0
+     in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Queueing ----
+     queue_size: int = 0
+     queue_wait_time_total: float = 0.0
+
+     # ---- Throttling ----
+     throttle_waits: int = 0
+     throttle_wait_time: float = 0.0
+     throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Latency feedback ----
+     latency_samples: int = 0
+     latency_ewma: float = 0.0
+     min_latency: Optional[float] = None
+     max_latency: Optional[float] = None
+
+     # ---- Errors / retries ----
+     retries_scheduled: int = 0
+     retries_executed: int = 0
+     errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
+ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
+     """
+     Returns an aiohttp TraceConfig wired to the given stats instance.
+     Tracks detailed per-request, per-host, and queue/throttle metrics.
+     """
+     trace = TraceConfig()
+
+     async def on_request_start(session, context, params):
+         stats.requests_started += 1
+         stats.in_flight_global += 1
+         host = params.url.host
+         stats.in_flight_per_host[host] += 1
+         context._start_time = time.monotonic()
+
+     async def on_request_end(session, context, params):
+         """
+         Update stats on request completion.
+         """
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+
+         latency = time.monotonic() - context._start_time
+         stats.latency_samples += 1
+         # EWMA update: alpha = 0.3
+         alpha = 0.3
+         stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
+         stats.min_latency = latency if stats.min_latency is None \
+             else min(stats.min_latency, latency)
+         stats.max_latency = latency if stats.max_latency is None \
+             else max(stats.max_latency, latency)
+
+         status = getattr(params.response, "status", None)
+         if status is not None:
+             if not hasattr(stats, "status_counts"):
+                 stats.status_counts = defaultdict(int)
+             stats.status_counts[status] += 1
+
+         content_length = getattr(params.response, "content_length", None)
+         if content_length:
+             if not hasattr(stats, "bytes_received"):
+                 stats.bytes_received = 0
+             stats.bytes_received += content_length
+
+         stats.requests_completed += 1
+
+     async def on_request_exception(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+         stats.errors_by_host[host] += 1
+
+     trace.on_request_start.append(on_request_start)
+     trace.on_request_end.append(on_request_end)
+     trace.on_request_exception.append(on_request_exception)
+
+     return trace
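A sketch of wiring the trace config into a session. `trace_configs` is standard aiohttp; the `wxpath.http.stats` import path is taken from the file header above:

```python
# Sketch: attach CrawlerStats to an aiohttp session via trace hooks.
import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def main():
    stats = CrawlerStats()
    async with aiohttp.ClientSession(trace_configs=[build_trace_config(stats)]) as session:
        async with session.get("https://example.com") as resp:
            await resp.read()
    print(stats.requests_completed, stats.latency_ewma)


asyncio.run(main())
```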
wxpath/patches.py ADDED
@@ -0,0 +1,63 @@
+ import elementpath
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import etree, html
+
+
+ def html_element_repr(self):
+     return (f"HtmlElement(tag={self.tag}, "
+             f"depth={self.get('depth', -1)}, "
+             f"base_url={getattr(self, 'base_url', None)!r})")
+
+ # Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
+ html.HtmlElement.__repr__ = html_element_repr
+
+
+ class XPath3Element(etree.ElementBase):
+     def xpath3(self, expr, **kwargs):
+         """
+         Evaluate an XPath 3 expression using the elementpath library,
+         returning the results as a list.
+         """
+         kwargs.setdefault("parser", XPath3Parser)
+         kwargs.setdefault(
+             "uri",
+             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
+         )
+         return elementpath.select(self, expr, **kwargs)
+
+     # --- Convenience property for backward-compatibility -----------------
+     @property
+     def base_url(self):
+         # 1) Per-element override (keeps our "multiple base URLs" feature)
+         url = self.get("base_url")
+         if url is not None:
+             return url
+         # 2) Fall back to document URL (O(1))
+         return self.getroottree().docinfo.URL
+
+     @base_url.setter
+     def base_url(self, value):
+         # Keep the per-element attribute (used by our crawler)
+         self.set("base_url", value)
+         # Set xml:base attribute so XPath base-uri() picks it up
+         self.set("{http://www.w3.org/XML/1998/namespace}base", value)
+         # Also store on the document so descendants can fetch it quickly
+         self.getroottree().docinfo.URL = value
+
+     @property
+     def depth(self):
+         return int(self.get("depth", -1))
+
+     @depth.setter
+     def depth(self, value):
+         self.set("depth", str(value))
+
+ # Create and register a custom parser that returns XPath3Element instances
+ lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
+ parser = etree.HTMLParser()
+ parser.set_element_class_lookup(lookup)
+
+
+ # Expose the parser for use in parse_html
+ html_parser_with_xpath3 = parser
+ html.HtmlElement.xpath3 = XPath3Element.xpath3
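A sketch of how the patched parser and `xpath3` method might be used. The module path `wxpath.patches` comes from the file header; exact return types from elementpath may vary:

```python
# Sketch: parse HTML with the XPath3-aware parser and run an XPath 3 query.
from lxml import etree

from wxpath.patches import html_parser_with_xpath3  # importing also applies the patches

doc = etree.fromstring(
    "<html><body><a href='/a'>A</a><a href='/b'>B</a></body></html>",
    parser=html_parser_with_xpath3,
)
doc.base_url = "https://example.com/"  # stored per element, as xml:base, and on docinfo

# string-join() is XPath 2.0+, handled by elementpath's XPath3Parser
print(doc.xpath3("string-join(//a/@href, ', ')"))
```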
wxpath/settings.py ADDED
@@ -0,0 +1,108 @@
+ """
+ Settings for wxpath.
+
+ These settings are global and can be accessed from any module in the wxpath package.
+
+ They are typically used by various modules to configure class initializers.
+
+ The SETTINGS dict structure follows the structure of wxpath submodules.
+
+ Expected usage:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+ ```
+
+ Once initialized, the settings are expected to be immutable (not enforced).
+ """
+
+ from datetime import timedelta
+
+ # Settings mirror the structure of the wxpath submodules (see module docstring).
+ SETTINGS = {
+     'http': {
+         'client': {
+             'cache': {
+                 'enabled': False,
+                 # 'db_path': 'cache.db',
+                 'expire_after': timedelta(days=7),
+                 'urls_expire_after': None,
+                 'allowed_methods': ("GET", "HEAD"),
+                 'allowed_codes': (200, 203, 301, 302, 307, 308),
+                 'ignored_parameters': ["utm_*", "fbclid"],
+                 'include_headers': False,  # don't vary cache keys on headers by default
+                 'cache_control': False,  # if True, honor Cache-Control/Expires headers
+                 # # TODO: size hedges (soft, enforced by wxpath)
+                 # 'max_entries': None,  # e.g. 1_000_000
+                 # 'max_response_size': None,  # bytes, e.g. 2_000_000
+                 # 'max_db_size': None,  # bytes, e.g. 5 * 1024**3
+                 'backend': "sqlite",
+                 'sqlite': {
+                     'cache_name': "cache.db",
+                 },
+                 'redis': {
+                     # 'host': "localhost",
+                     # 'port': 6379,
+                     # 'db': 0,
+                     'address': 'redis://localhost:6379/0',
+                     'cache_name': "wxpath:",
+                 }
+             },
+             'crawler': {
+                 'concurrency': 16,
+                 'per_host': 8,
+                 'timeout': 15,
+                 'headers': {
+                     "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                                    "Chrome/142.0.0.0 Safari/537.36")},
+                 'proxies': None,
+                 'auto_throttle_target_concurrency': None,
+                 'auto_throttle_start_delay': 0.25,
+                 'auto_throttle_max_delay': 10.0,
+                 'respect_robots': True,
+             },
+         },
+     },
+ }
+
+
+ class AttrDict(dict):
+     """
+     A dictionary subclass that allows dot-notation access while
+     recursively converting nested dictionaries.
+     """
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # Point the instance __dict__ to itself to allow attribute access
+         self.__dict__ = self
+         # Recursively convert any dicts passed during initialization
+         for key, value in self.items():
+             self[key] = self._convert(value)
+
+     @classmethod
+     def _convert(cls, value):
+         """Recursively converts dicts to AttrDicts, leaving other types alone."""
+         if isinstance(value, dict):
+             return cls(value)
+         elif isinstance(value, list):
+             # Optional: converts dicts inside lists while keeping the list container
+             return [cls._convert(item) for item in value]
+         return value
+
+     def __setitem__(self, key, value):
+         # Ensure that new items added via dict-syntax are also converted
+         super().__setitem__(key, self._convert(value))
+
+     def __getattr__(self, key):
+         try:
+             return self[key]
+         except KeyError as exc:
+             raise AttributeError(f"AttrDict object has no attribute '{key}'") from exc
+
+
+ SETTINGS = AttrDict(SETTINGS)
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+ CRAWLER_SETTINGS = SETTINGS.http.client.crawler
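The dot-notation access described in the module docstring ends up looking like this (a short sketch; key names are taken from the dict above):

```python
# Sketch: reading settings and applying an early override.
from wxpath.settings import CRAWLER_SETTINGS, SETTINGS

print(SETTINGS.http.client.cache.backend)  # "sqlite"
print(CRAWLER_SETTINGS.concurrency)        # 16

# Settings are plain AttrDicts, so overrides are possible, but they are expected
# to happen once, before anything reads them (immutability is not enforced).
SETTINGS.http.client.crawler['respect_robots'] = False
```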
File without changes
wxpath/util/logging.py ADDED
@@ -0,0 +1,91 @@
+ import logging
+ from logging.config import dictConfig
+ from typing import Any, Mapping
+
+
+ class KeyValueFormatter(logging.Formatter):
+     """
+     Formatter that automatically renders any 'extra' context added to the record
+     as key=value pairs at the end of the log line.
+     """
+     # Reserved keys that already exist in LogRecord and shouldn't be printed again
+     _RESERVED = {
+         'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
+         'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
+         'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
+         'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
+     }
+
+     def format(self, record: logging.LogRecord) -> str:
+         # 1. Format the standard message first
+         s = super().format(record)
+
+         # 2. Find all 'extra' keys
+         extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}
+
+         # 3. Append them as key=value
+         if extras:
+             # Sort for deterministic logs
+             context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
+             s = f"{s} | {context_str}"
+
+         return s
+
+
+ _DEFAULT_LOGGING_CONF = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "kv": {
+             # Note: the '()' factory key points dictConfig at our custom formatter class
+             "()": KeyValueFormatter,
+             "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
+         }
+     },
+     "handlers": {
+         "stderr": {
+             "class": "logging.StreamHandler",
+             "formatter": "kv",
+         }
+     },
+     "loggers": {
+         "wxpath": {"level": "INFO", "handlers": ["stderr"]},
+     },
+ }
+
+ def configure_logging(level: str | int = "INFO", **overrides) -> None:
+     """
+     Configure wxpath's logger.
+
+     Call this once in an application entry-point **or** rely on defaults.
+
+     Parameters
+     ----------
+     level
+         "DEBUG"|"INFO"|... or `logging.DEBUG`; overrides the root wxpath logger.
+     overrides
+         Dict that is merged (shallow) into the default dictConfig.
+         Lets advanced users swap formatters/handlers.
+     """
+     conf = {**_DEFAULT_LOGGING_CONF, **overrides}
+     conf["loggers"]["wxpath"]["level"] = level
+     dictConfig(conf)
+
+
+ class CrawlAdapter(logging.LoggerAdapter):
+     """
+     Inject crawl context (depth, op, url) so the handler/formatter
+     never needs to know scraping internals.
+     """
+     def process(self, msg: str, kwargs: Mapping[str, Any]):
+         extra = self.extra.copy()
+         extra.update(kwargs.pop("extra", {}))
+         kwargs["extra"] = extra
+         return msg, kwargs
+
+ def get_logger(name: str, **ctx) -> CrawlAdapter:
+     base = logging.getLogger(name)
+     # default placeholders so the formatter never blows up
+     defaults = {"depth": "-", "op": "-", "url": "-"}
+     defaults.update(ctx)
+     return CrawlAdapter(base, defaults)
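A short sketch of the intended flow: apply the default config, then log through an adapter carrying crawl context, which `KeyValueFormatter` renders as key=value pairs. The output line shown is approximate:

```python
# Sketch: configure the default key=value logging and emit a contextual record.
from wxpath.util.logging import configure_logging, get_logger

configure_logging("DEBUG")

log = get_logger("wxpath.demo", op="crawl", url="https://example.com", depth=0)
log.info("fetched page", extra={"status": 200})
# -> 2024-.. [I] wxpath.demo | <module> | fetched page | depth=0 op=crawl status=200 url=https://example.com
```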
@@ -0,0 +1,22 @@
+ from wxpath.core.ops import WxStr
+
+
+ def simplify(obj):
+     """
+     Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
+     lxml elements) into plain built-in Python types so that printing or
+     JSON serialising shows clean values.
+     """
+     # Scalars
+     if isinstance(obj, WxStr):
+         return str(obj)
+
+     # Mapping
+     if isinstance(obj, dict):
+         return {k: simplify(v) for k, v in obj.items()}
+
+     # Sequence (but not str/bytes)
+     if isinstance(obj, (list, tuple, set)):
+         return type(obj)(simplify(v) for v in obj)
+
+     return obj
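For example, assuming `WxStr` behaves like a `str` subclass (as the import suggests) and that this hunk lives somewhere under `wxpath.util` (the module path is not shown in the diff):

```python
# Sketch: nested wrapper values collapse to plain built-ins.
from wxpath.core.ops import WxStr
from wxpath.util.simplify import simplify  # module path assumed

data = {"title": WxStr("Hello"), "links": [WxStr("/a"), WxStr("/b")]}
print(simplify(data))  # {'title': 'Hello', 'links': ['/a', '/b']}
```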