wxpath-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
+ import asyncio
+ import urllib.parse
+ import urllib.robotparser
+
+ import aiohttp
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class RobotsTxtPolicy:
+     """Caches and evaluates robots.txt rules for crawler requests."""
+
+     def __init__(self,
+                  session: aiohttp.ClientSession,
+                  default_parser: type['RobotsParserBase'] | None = None):
+         self._session = session
+         self._parsers: dict[str, "RobotsParserBase"] = {}
+         self._lock = asyncio.Lock()
+         self._default_parser = default_parser or UrllibRobotParser
+
+     async def can_fetch(self, url: str, user_agent: str | None) -> bool:
+         """Return whether the crawler is allowed to fetch `url`."""
+         host = urllib.parse.urlsplit(url).hostname
+         if not host:
+             return False
+
+         # Multiple aiohttp workers run concurrently, so guard the parser cache with a lock.
+         async with self._lock:
+             if host not in self._parsers:
+                 self._parsers[host] = await self._fetch_robots_txt(host)
+
+         return self._parsers[host].can_fetch(url, user_agent)
+
+     async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
+         """Retrieve and parse the robots.txt for `host`, failing open on errors."""
+         url = f"http://{host}/robots.txt"
+         try:
+             async with self._session.get(url) as response:
+                 if response.status == 200:
+                     text = await response.text()
+                     # UrllibRobotParser expects a list of lines; other parsers take the raw text.
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser(text.splitlines())
+                     else:
+                         return self._default_parser(text)
+                 else:
+                     # Non-200 response: treat as an empty robots.txt and allow all.
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser([])
+                     else:
+                         return self._default_parser("")
+         except Exception:
+             # If robots.txt is unavailable, allow all requests (fail open).
+             log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
+             if self._default_parser == UrllibRobotParser:
+                 return self._default_parser([])
+             else:
+                 return self._default_parser("")
+
+
+ class RobotsParserBase:
+     """Base type for robots.txt parsers used by the policy."""
+
+
+ class UrllibRobotParser(RobotsParserBase):
+     """Adapter around `urllib.robotparser.RobotFileParser`."""
+
+     def __init__(self, text):
+         self._parser = urllib.robotparser.RobotFileParser()
+         # urllib.robotparser.RobotFileParser.parse() expects a list of lines.
+         if isinstance(text, str):
+             lines = text.splitlines() if text else []
+         else:
+             lines = text if text else []
+         self._parser.parse(lines)
+
+     def can_fetch(self, url, user_agent):
+         """Return whether the URL is allowed for the given user agent."""
+         return self._parser.can_fetch(user_agent, url)
+
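A minimal usage sketch of `RobotsTxtPolicy`. The import path `wxpath.http.robots` is an assumption (this hunk carries no filename), so adjust it to the actual module:

```python
# Hypothetical usage sketch; the import path is assumed, not confirmed by the diff.
import asyncio

import aiohttp

from wxpath.http.robots import RobotsTxtPolicy  # assumed module path


async def main():
    async with aiohttp.ClientSession() as session:
        policy = RobotsTxtPolicy(session)
        # robots.txt is fetched once per host, cached, and consulted for later checks
        allowed = await policy.can_fetch("https://example.com/some/page", "wxpath-bot")
        print("allowed:", allowed)


asyncio.run(main())
```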
@@ -0,0 +1,114 @@
+ import asyncio
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ # Abstract base class
+ class AbstractThrottler(ABC):
+     @abstractmethod
+     async def wait(self, host: str):
+         pass
+
+     @abstractmethod
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ class AutoThrottler(AbstractThrottler):
+     """
+     Scrapy-inspired auto-throttle, simplified:
+     - increases delay when latency increases
+     - decreases delay when responses are fast
+
+     Parameters:
+     - target_concurrency is the desired number of concurrent requests
+     - start_delay is the initial delay
+     - max_delay is the maximum delay
+     - smoothing is the exponential smoothing factor
+     """
+
+     def __init__(
+         self,
+         start_delay: float = 0.25,
+         max_delay: float = 10.0,
+         target_concurrency: float = 1.0,
+         smoothing: float = 0.7,
+     ):
+         self.start_delay = start_delay
+         self.max_delay = max_delay
+         self.target_concurrency = target_concurrency
+         self.smoothing = smoothing
+
+         self._delay = defaultdict(lambda: start_delay)
+         self._latency = defaultdict(lambda: None)
+
+     def record_latency(self, host: str, latency: float):
+         prev = self._latency[host]
+         if prev is None:
+             self._latency[host] = latency
+         else:
+             self._latency[host] = (
+                 # exponential smoothing
+                 self.smoothing * prev + (1 - self.smoothing) * latency
+             )
+
+         self._recalculate_delay(host)
+
+     def _recalculate_delay(self, host: str):
+         latency = self._latency[host]
+         if not latency:
+             return
+
+         target_delay = latency / self.target_concurrency
+         delay = min(self.max_delay, max(0.0, target_delay))
+         self._delay[host] = delay
+
+         log.debug(
+             "auto-throttle",
+             extra={"host": host, "latency": latency, "delay": delay},
+         )
+
+     async def wait(self, host: str):
+         delay = self._delay[host]
+         if delay > 0:
+             await asyncio.sleep(delay)
+
+
+ class ImpoliteThrottle(AbstractThrottler):
+     """
+     Zero-delay throttler.
+     """
+
+     async def wait(self, host: str):
+         pass
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ ZeroWaitThrottler = ImpoliteThrottle
+
+
+ class SimpleThrottler(AbstractThrottler):
+     """
+     Fixed-delay throttler. Optionally provide per-host delays via `per_host_delays`.
+     """
+     def __init__(self, delay: float, per_host_delays: dict[str, float] | None = None):
+         self.delay = delay
+         self._delays = per_host_delays or defaultdict(lambda: delay)
+
+     async def wait(self, host: str):
+         if host in self._delays:
+             await asyncio.sleep(self._delays[host])
+         else:
+             await asyncio.sleep(self.delay)
+
+     def record_latency(self, host: str, latency: float):
+         pass
+
+
+ FixedDelayThrottler = SimpleThrottler
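A small worked example of the delay rule above: per-host delays converge toward `latency / target_concurrency`, capped at `max_delay`. The numbers are illustrative only, and the import path is assumed (this hunk carries no filename):

```python
# Illustrative only: exercises AutoThrottler's smoothing and delay update directly.
import asyncio

from wxpath.http.throttle import AutoThrottler  # assumed module path


async def demo():
    throttler = AutoThrottler(start_delay=0.25, target_concurrency=2.0, smoothing=0.7)

    # First sample seeds the EWMA with the raw latency -> delay = 0.8 / 2.0 = 0.4s
    throttler.record_latency("example.com", 0.8)

    # Second sample: EWMA = 0.7 * 0.8 + 0.3 * 0.2 = 0.62 -> delay = 0.62 / 2.0 = 0.31s
    throttler.record_latency("example.com", 0.2)

    await throttler.wait("example.com")  # sleeps roughly 0.31s


asyncio.run(demo())
```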
wxpath/http/stats.py ADDED
@@ -0,0 +1,102 @@
+ """
+ aiohttp request statistics and tracing hooks.
+ """
+
+ import time
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from aiohttp import TraceConfig
+
+
+ @dataclass
+ class CrawlerStats:
+     # ---- Lifecycle counts ----
+     requests_enqueued: int = 0
+     requests_started: int = 0
+     requests_completed: int = 0
+     requests_cache_hit: int = 0
+
+     # ---- Concurrency ----
+     in_flight_global: int = 0
+     in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Queueing ----
+     queue_size: int = 0
+     queue_wait_time_total: float = 0.0
+
+     # ---- Throttling ----
+     throttle_waits: int = 0
+     throttle_wait_time: float = 0.0
+     throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Latency feedback ----
+     latency_samples: int = 0
+     latency_ewma: float = 0.0
+     min_latency: Optional[float] = None
+     max_latency: Optional[float] = None
+
+     # ---- Errors / retries ----
+     retries_scheduled: int = 0
+     retries_executed: int = 0
+     errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
+ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
+     """
+     Returns an aiohttp TraceConfig wired to the given stats instance.
+     Tracks detailed per-request, per-host, and queue/throttle metrics.
+     """
+     trace = TraceConfig()
+
+     async def on_request_start(session, context, params):
+         stats.requests_started += 1
+         stats.in_flight_global += 1
+         host = params.url.host
+         stats.in_flight_per_host[host] += 1
+         context._start_time = time.monotonic()
+
+     async def on_request_end(session, context, params):
+         """
+         Update stats on request completion.
+         """
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+
+         latency = time.monotonic() - context._start_time
+         stats.latency_samples += 1
+         # EWMA update: alpha = 0.3
+         alpha = 0.3
+         stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
+         stats.min_latency = latency if stats.min_latency is None \
+             else min(stats.min_latency, latency)
+         stats.max_latency = latency if stats.max_latency is None \
+             else max(stats.max_latency, latency)
+
+         status = getattr(params.response, "status", None)
+         if status is not None:
+             if not hasattr(stats, "status_counts"):
+                 stats.status_counts = defaultdict(int)
+             stats.status_counts[status] += 1
+
+         content_length = getattr(params.response, "content_length", None)
+         if content_length:
+             if not hasattr(stats, "bytes_received"):
+                 stats.bytes_received = 0
+             stats.bytes_received += content_length
+
+         stats.requests_completed += 1
+
+     async def on_request_exception(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+         stats.errors_by_host[host] += 1
+
+     trace.on_request_start.append(on_request_start)
+     trace.on_request_end.append(on_request_end)
+     trace.on_request_exception.append(on_request_exception)
+
+     return trace
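A sketch of wiring the trace config into a session. `trace_configs` is standard aiohttp; the `wxpath.http.stats` import path is taken from the file header above:

```python
# Sketch: attach CrawlerStats to an aiohttp session via trace hooks.
import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def main():
    stats = CrawlerStats()
    async with aiohttp.ClientSession(trace_configs=[build_trace_config(stats)]) as session:
        async with session.get("https://example.com") as resp:
            await resp.read()
    print(stats.requests_completed, stats.latency_ewma)


asyncio.run(main())
```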
wxpath/patches.py ADDED
@@ -0,0 +1,63 @@
+ import elementpath
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import etree, html
+
+
+ def html_element_repr(self):
+     return (f"HtmlElement(tag={self.tag}, "
+             f"depth={self.get('depth', -1)}, "
+             f"base_url={getattr(self, 'base_url', None)!r})")
+
+ # Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
+ html.HtmlElement.__repr__ = html_element_repr
+
+
+ class XPath3Element(etree.ElementBase):
+     def xpath3(self, expr, **kwargs):
+         """
+         Evaluate an XPath 3 expression using the elementpath library,
+         returning the results as a list.
+         """
+         kwargs.setdefault("parser", XPath3Parser)
+         kwargs.setdefault(
+             "uri",
+             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
+         )
+         return elementpath.select(self, expr, **kwargs)
+
+     # --- Convenience property for backward-compatibility -----------------
+     @property
+     def base_url(self):
+         # 1) Per-element override (keeps our "multiple base URLs" feature)
+         url = self.get("base_url")
+         if url is not None:
+             return url
+         # 2) Fall back to document URL (O(1))
+         return self.getroottree().docinfo.URL
+
+     @base_url.setter
+     def base_url(self, value):
+         # Keep the per-element attribute (used by our crawler)
+         self.set("base_url", value)
+         # Set xml:base attribute so XPath base-uri() picks it up
+         self.set("{http://www.w3.org/XML/1998/namespace}base", value)
+         # Also store on the document so descendants can fetch it quickly
+         self.getroottree().docinfo.URL = value
+
+     @property
+     def depth(self):
+         return int(self.get("depth", -1))
+
+     @depth.setter
+     def depth(self, value):
+         self.set("depth", str(value))
+
+ # Create and register a custom parser that returns XPath3Element instances
+ lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
+ parser = etree.HTMLParser()
+ parser.set_element_class_lookup(lookup)
+
+
+ # Expose the parser for use in parse_html
+ html_parser_with_xpath3 = parser
+ html.HtmlElement.xpath3 = XPath3Element.xpath3
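A sketch of how the patched parser and `xpath3` method might be used. The module path `wxpath.patches` comes from the file header; exact return types from elementpath may vary:

```python
# Sketch: parse HTML with the XPath3-aware parser and run an XPath 3 query.
from lxml import etree

from wxpath.patches import html_parser_with_xpath3  # importing also applies the patches

doc = etree.fromstring(
    "<html><body><a href='/a'>A</a><a href='/b'>B</a></body></html>",
    parser=html_parser_with_xpath3,
)
doc.base_url = "https://example.com/"  # stored per element, as xml:base, and on docinfo

# string-join() is XPath 2.0+, handled by elementpath's XPath3Parser
print(doc.xpath3("string-join(//a/@href, ', ')"))
```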
wxpath/settings.py ADDED
@@ -0,0 +1,108 @@
+ """
+ Settings for wxpath.
+
+ These settings are global and can be accessed from any module in the wxpath package.
+
+ They are typically used by various modules to configure class initializers.
+
+ The SETTINGS dict structure follows the structure of wxpath submodules.
+
+ Expected usage:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+ ```
+
+ Once initialized, the settings are expected to be immutable (not enforced).
+ """
+
+ from datetime import timedelta
+
+ # Settings mirror the structure of the wxpath submodules (see module docstring).
+ SETTINGS = {
+     'http': {
+         'client': {
+             'cache': {
+                 'enabled': False,
+                 # 'db_path': 'cache.db',
+                 'expire_after': timedelta(days=7),
+                 'urls_expire_after': None,
+                 'allowed_methods': ("GET", "HEAD"),
+                 'allowed_codes': (200, 203, 301, 302, 307, 308),
+                 'ignored_parameters': ["utm_*", "fbclid"],
+                 'include_headers': False,  # don't vary cache keys on headers by default
+                 'cache_control': False,  # if True, honor Cache-Control/Expires headers
+                 # # TODO: size hedges (soft, enforced by wxpath)
+                 # 'max_entries': None,  # e.g. 1_000_000
+                 # 'max_response_size': None,  # bytes, e.g. 2_000_000
+                 # 'max_db_size': None,  # bytes, e.g. 5 * 1024**3
+                 'backend': "sqlite",
+                 'sqlite': {
+                     'cache_name': "cache.db",
+                 },
+                 'redis': {
+                     # 'host': "localhost",
+                     # 'port': 6379,
+                     # 'db': 0,
+                     'address': 'redis://localhost:6379/0',
+                     'cache_name': "wxpath:",
+                 }
+             },
+             'crawler': {
+                 'concurrency': 16,
+                 'per_host': 8,
+                 'timeout': 15,
+                 'headers': {
+                     "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                                    "Chrome/142.0.0.0 Safari/537.36")},
+                 'proxies': None,
+                 'auto_throttle_target_concurrency': None,
+                 'auto_throttle_start_delay': 0.25,
+                 'auto_throttle_max_delay': 10.0,
+                 'respect_robots': True,
+             },
+         },
+     },
+ }
+
+
+ class AttrDict(dict):
+     """
+     A dictionary subclass that allows dot-notation access while
+     recursively converting nested dictionaries.
+     """
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # Point the instance __dict__ to itself to allow attribute access
+         self.__dict__ = self
+         # Recursively convert any dicts passed during initialization
+         for key, value in self.items():
+             self[key] = self._convert(value)
+
+     @classmethod
+     def _convert(cls, value):
+         """Recursively converts dicts to AttrDicts, leaving other types alone."""
+         if isinstance(value, dict):
+             return cls(value)
+         elif isinstance(value, list):
+             # Optional: converts dicts inside lists while keeping the list container
+             return [cls._convert(item) for item in value]
+         return value
+
+     def __setitem__(self, key, value):
+         # Ensure that new items added via dict-syntax are also converted
+         super().__setitem__(key, self._convert(value))
+
+     def __getattr__(self, key):
+         try:
+             return self[key]
+         except KeyError as exc:
+             raise AttributeError(f"AttrDict object has no attribute '{key}'") from exc
+
+
+ SETTINGS = AttrDict(SETTINGS)
+ CACHE_SETTINGS = SETTINGS.http.client.cache
+ CRAWLER_SETTINGS = SETTINGS.http.client.crawler
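The dot-notation access described in the module docstring ends up looking like this (a short sketch; key names are taken from the dict above):

```python
# Sketch: reading settings and applying an early override.
from wxpath.settings import CRAWLER_SETTINGS, SETTINGS

print(SETTINGS.http.client.cache.backend)  # "sqlite"
print(CRAWLER_SETTINGS.concurrency)        # 16

# Settings are plain AttrDicts, so overrides are possible, but they are expected
# to happen once, before anything reads them (immutability is not enforced).
SETTINGS.http.client.crawler['respect_robots'] = False
```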
File without changes
wxpath/util/logging.py ADDED
@@ -0,0 +1,91 @@
+ import logging
+ from logging.config import dictConfig
+ from typing import Any, Mapping
+
+
+ class KeyValueFormatter(logging.Formatter):
+     """
+     Formatter that automatically renders any 'extra' context added to the record
+     as key=value pairs at the end of the log line.
+     """
+     # Reserved keys that already exist in LogRecord and shouldn't be printed again
+     _RESERVED = {
+         'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
+         'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
+         'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
+         'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
+     }
+
+     def format(self, record: logging.LogRecord) -> str:
+         # 1. Format the standard message first
+         s = super().format(record)
+
+         # 2. Find all 'extra' keys
+         extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}
+
+         # 3. Append them as key=value
+         if extras:
+             # Sort for deterministic logs
+             context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
+             s = f"{s} | {context_str}"
+
+         return s
+
+
+ _DEFAULT_LOGGING_CONF = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "kv": {
+             # Note: the '()' factory key points dictConfig at our custom formatter class
+             "()": KeyValueFormatter,
+             "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
+         }
+     },
+     "handlers": {
+         "stderr": {
+             "class": "logging.StreamHandler",
+             "formatter": "kv",
+         }
+     },
+     "loggers": {
+         "wxpath": {"level": "INFO", "handlers": ["stderr"]},
+     },
+ }
+
+ def configure_logging(level: str | int = "INFO", **overrides) -> None:
+     """
+     Configure wxpath's logger.
+
+     Call this once in an application entry-point **or** rely on defaults.
+
+     Parameters
+     ----------
+     level
+         "DEBUG"|"INFO"|... or `logging.DEBUG`; overrides the root wxpath logger.
+     overrides
+         Dict that is merged (shallow) into the default dictConfig.
+         Lets advanced users swap formatters/handlers.
+     """
+     conf = {**_DEFAULT_LOGGING_CONF, **overrides}
+     conf["loggers"]["wxpath"]["level"] = level
+     dictConfig(conf)
+
+
+ class CrawlAdapter(logging.LoggerAdapter):
+     """
+     Inject crawl context (depth, op, url) so the handler/formatter
+     never needs to know scraping internals.
+     """
+     def process(self, msg: str, kwargs: Mapping[str, Any]):
+         extra = self.extra.copy()
+         extra.update(kwargs.pop("extra", {}))
+         kwargs["extra"] = extra
+         return msg, kwargs
+
+ def get_logger(name: str, **ctx) -> CrawlAdapter:
+     base = logging.getLogger(name)
+     # default placeholders so the formatter never blows up
+     defaults = {"depth": "-", "op": "-", "url": "-"}
+     defaults.update(ctx)
+     return CrawlAdapter(base, defaults)
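A short sketch of the intended flow: apply the default config, then log through an adapter carrying crawl context, which `KeyValueFormatter` renders as key=value pairs. The output line shown is approximate:

```python
# Sketch: configure the default key=value logging and emit a contextual record.
from wxpath.util.logging import configure_logging, get_logger

configure_logging("DEBUG")

log = get_logger("wxpath.demo", op="crawl", url="https://example.com", depth=0)
log.info("fetched page", extra={"status": 200})
# -> 2024-.. [I] wxpath.demo | <module> | fetched page | depth=0 op=crawl status=200 url=https://example.com
```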
@@ -0,0 +1,22 @@
+ from wxpath.core.ops import WxStr
+
+
+ def simplify(obj):
+     """
+     Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
+     lxml elements) into plain built-in Python types so that printing or
+     JSON serialising shows clean values.
+     """
+     # Scalars
+     if isinstance(obj, WxStr):
+         return str(obj)
+
+     # Mapping
+     if isinstance(obj, dict):
+         return {k: simplify(v) for k, v in obj.items()}
+
+     # Sequence (but not str/bytes)
+     if isinstance(obj, (list, tuple, set)):
+         return type(obj)(simplify(v) for v in obj)
+
+     return obj
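For example, assuming `WxStr` behaves like a `str` subclass (as the import suggests) and that this hunk lives somewhere under `wxpath.util` (the module path is not shown in the diff):

```python
# Sketch: nested wrapper values collapse to plain built-ins.
from wxpath.core.ops import WxStr
from wxpath.util.simplify import simplify  # module path assumed

data = {"title": WxStr("Hello"), "links": [WxStr("/a"), WxStr("/b")]}
print(simplify(data))  # {'title': 'Hello', 'links': ['/a', '/b']}
```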