wxpath-0.2.0-py3-none-any.whl → wxpath-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +52 -12
- wxpath/core/ops.py +163 -129
- wxpath/core/parser.py +559 -280
- wxpath/core/runtime/engine.py +133 -42
- wxpath/core/runtime/helpers.py +0 -7
- wxpath/hooks/registry.py +29 -17
- wxpath/http/client/crawler.py +46 -11
- wxpath/http/client/request.py +6 -3
- wxpath/http/client/response.py +1 -1
- wxpath/http/policy/robots.py +82 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/METADATA +84 -37
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/RECORD +16 -16
- wxpath/core/errors.py +0 -134
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.2.0.dist-info → wxpath-0.3.0.dist-info}/top_level.txt +0 -0
wxpath/core/runtime/engine.py
CHANGED
@@ -2,11 +2,12 @@ import asyncio
 import contextlib
 import inspect
 from collections import deque
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, Iterator
 
 from lxml.html import HtmlElement
 
 from wxpath import patches  # noqa: F401
+from wxpath.core import parser
 from wxpath.core.models import (
     CrawlIntent,
     CrawlTask,
@@ -16,7 +17,7 @@ from wxpath.core.models import (
     ProcessIntent,
 )
 from wxpath.core.ops import get_operator
-from wxpath.core.parser import
+from wxpath.core.parser import Binary, Segment, Segments
 from wxpath.core.runtime.helpers import parse_html
 from wxpath.hooks.registry import FetchContext, get_hooks
 from wxpath.http.client.crawler import Crawler
@@ -27,7 +28,21 @@ log = get_logger(__name__)
 
 
 class HookedEngineBase:
-
+    """Common hook invocation helpers shared by engine variants."""
+
+    async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
+        """Run registered `post_fetch` hooks over a fetched response body.
+
+        Hooks may be synchronous or asynchronous and can transform or drop the
+        response payload entirely.
+
+        Args:
+            body: Raw response body bytes from the crawler.
+            task: The `CrawlTask` that produced the response.
+
+        Returns:
+            The transformed body, or `None` if any hook chooses to drop it.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_fetch", lambda _, b: b)
             if inspect.iscoroutinefunction(hook_method):
@@ -45,7 +60,18 @@ class HookedEngineBase:
                 break
         return body
 
-    async def post_parse_hooks(
+    async def post_parse_hooks(
+        self, elem: HtmlElement | None, task: CrawlTask
+    ) -> HtmlElement | None:
+        """Run registered `post_parse` hooks on a parsed DOM element.
+
+        Args:
+            elem: Parsed `lxml` element to process.
+            task: The originating `CrawlTask`.
+
+        Returns:
+            The transformed element, or `None` if a hook drops the branch.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_parse", lambda _, e: e)
             if inspect.iscoroutinefunction(hook_method):
@@ -73,7 +99,15 @@ class HookedEngineBase:
                 break
         return elem
 
-    async def post_extract_hooks(self, value):
+    async def post_extract_hooks(self, value: Any) -> Any | None:
+        """Run registered `post_extract` hooks on extracted values.
+
+        Args:
+            value: The extracted datum to post-process.
+
+        Returns:
+            The transformed value, or `None` if a hook drops it.
+        """
         for hook in get_hooks():
             hook_method = getattr(hook, "post_extract", lambda v: v)
             if inspect.iscoroutinefunction(hook_method):
@@ -87,35 +121,65 @@ class HookedEngineBase:
 
 
 class WXPathEngine(HookedEngineBase):
-    """
-    Main class for executing wxpath expressions.
+    """Main class for executing wxpath expressions.
 
-    The core pattern
-
-
+    The core pattern is to build a queue of CrawlTasks that are crawled and
+    processed FIFO. Traversal of the queue (and therefore the web graph) is
+    done concurrently in BFS-ish order.
 
     Args:
-        crawler: Crawler instance
-        concurrency:
-        per_host:
+        crawler: Crawler instance to use for HTTP requests.
+        concurrency: Number of concurrent fetches at the Crawler level.
+        per_host: Number of concurrent fetches per host.
+        respect_robots: Whether to respect robots.txt directives.
+        allowed_response_codes: Set of allowed HTTP response codes. Defaults
+            to ``{200}``. Responses may still be filtered and dropped.
+        allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
     """
     def __init__(
        self,
        crawler: Crawler | None = None,
        concurrency: int = 16,
-        per_host: int = 8
+        per_host: int = 8,
+        respect_robots: bool = True,
+        allowed_response_codes: set[int] = None,
+        allow_redirects: bool = True,
    ):
+        # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
        self.seen_urls: set[str] = set()
-        self.crawler = crawler or Crawler(
-
-
-
+        self.crawler = crawler or Crawler(
+            concurrency=concurrency,
+            per_host=per_host,
+            respect_robots=respect_robots
+        )
+        self.allowed_response_codes = allowed_response_codes or {200}
+        self.allow_redirects = allow_redirects
+        if allow_redirects:
+            self.allowed_response_codes |= {301, 302, 303, 307, 308}
+
+    async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
+        """Execute a wxpath expression concurrently and yield results.
+
+        Builds and drives a BFS-like crawl pipeline that honors robots rules,
+        throttling, and hook callbacks while walking the web graph.
+
+        Args:
+            expression: WXPath expression string to evaluate.
+            max_depth: Maximum crawl depth to follow for url hops.
+
+        Yields:
+            Extracted values produced by the expression (HTML elements or
+            wxpath-specific value types).
+        """
+        segments = parser.parse(expression)
 
         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
         inflight: dict[str, CrawlTask] = {}
         pending_tasks = 0
 
         def is_terminal():
+            # NOTE: consider adopting state machine pattern for determining
+            # the current state of the engine.
            return queue.empty() and pending_tasks <= 0
 
        async with self.crawler as crawler:
@@ -177,7 +241,7 @@ class WXPathEngine(HookedEngineBase):
                    continue
 
                # NOTE: Consider allowing redirects
-                if resp.status
+                if resp.status not in self.allowed_response_codes or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
                    if is_terminal():
                        break
@@ -226,20 +290,36 @@ class WXPathEngine(HookedEngineBase):
     async def _process_pipeline(
        self,
        task: CrawlTask,
-        elem,
+        elem: Any,
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
-    ):
-
+    ) -> AsyncGenerator[Any, None]:
+        """Process a queue of intents for a single crawl branch.
+
+        Traverses wxpath segments depth-first within a page while coordinating
+        newly discovered crawl intents back to the shared queue.
+
+        Args:
+            task: The originating crawl task for this branch.
+            elem: Current DOM element (or extracted value) being processed.
+            depth: Current traversal depth.
+            max_depth: Maximum permitted crawl depth.
+            queue: Shared crawl queue for enqueuing downstream URLs.
+
+        Yields:
+            object: Extracted values or processed elements as produced by operators.
+        """
+        mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
+            [(elem, task.segments)]
+        )
 
        while mini_queue:
-            elem,
-
-            op, _ = segments[0]
-            operator = get_operator(op)
+            elem, bin_or_segs = mini_queue.popleft()
 
-
+            binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
+            operator = get_operator(binary_or_segment)
+            intents = operator(elem, bin_or_segs, depth)
 
            if not intents:
                return
@@ -253,6 +333,7 @@ class WXPathEngine(HookedEngineBase):
                # if intent.url not in self.seen_urls and next_depth <= max_depth:
                if next_depth <= max_depth:
                    # self.seen_urls.add(intent.url)
+                    log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
                    queue.put_nowait(
                        CrawlTask(
                            elem=None,
@@ -272,29 +353,33 @@ class WXPathEngine(HookedEngineBase):
 
 def wxpath_async(path_expr: str,
                 max_depth: int,
-                 engine: WXPathEngine = None) -> AsyncGenerator[Any, None]:
+                 engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
    return engine.run(path_expr, max_depth)
 
 
 ##### ASYNC IN SYNC #####
-def wxpath_async_blocking_iter(
-
-
-
-
-
-        max_depth (int, optional): Maximum crawl depth. Must be at least the
-            number of `url*` segments minus one. Defaults to `1`.
-
-    Yields:
-        lxml.html.HtmlElement | wxpath.models.WxStr | dict | Any: The same objects
-            produced by the sequential evaluator.
+def wxpath_async_blocking_iter(
+    path_expr: str,
+    max_depth: int = 1,
+    engine: WXPathEngine | None = None,
+) -> Iterator[Any]:
+    """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
    Warning:
        Spins up its own event loop therefore this function must **not** be
        invoked from within an active asyncio event loop.
+
+    Args:
+        path_expr: A wxpath expression.
+        max_depth: Maximum crawl depth. Must be at least the number of
+            ``url*`` segments minus one.
+        engine: Optional pre-configured WXPathEngine instance.
+
+    Yields:
+        object: Extracted objects (HtmlElement, WxStr, dict, or other values)
+            produced by the expression evaluator.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
@@ -311,5 +396,11 @@ def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine = No
    loop.close()
 
 
-def wxpath_async_blocking(
-
+def wxpath_async_blocking(
+    path_expr: str,
+    max_depth: int = 1,
+    engine: WXPathEngine | None = None,
+) -> list[Any]:
+    return list(
+        wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine)
+    )
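
For orientation, a minimal usage sketch of the 0.3.0 constructor options and the typed blocking iterator above. This is illustrative only: the expression string and URL are hypothetical placeholders (the wxpath expression syntax is not shown in this diff), and the import path simply mirrors the file location wxpath/core/runtime/engine.py.

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter

# Engine configured with the options introduced in 0.3.0.
engine = WXPathEngine(
    concurrency=8,
    per_host=4,
    respect_robots=True,    # enforce robots.txt via the crawler
    allow_redirects=True,   # also admits 301/302/303/307/308 responses
)

# Blocking iteration; per the docstring above, this must not be called
# from inside an already-running asyncio event loop.
for item in wxpath_async_blocking_iter(
    "url('https://example.com')//a/@href",  # hypothetical wxpath expression
    max_depth=1,
    engine=engine,
):
    print(item)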
wxpath/core/runtime/helpers.py
CHANGED
@@ -1,4 +1,3 @@
-import requests
 from lxml import etree, html
 
 from wxpath import patches
@@ -40,9 +39,3 @@ def detach_html_root(elem, base_url=None):
        new_root.base_url = base_url
 
    return new_root
-
-
-def fetch_html(url):
-    response = requests.get(url, timeout=10)
-    response.raise_for_status()
-    return response.content
wxpath/hooks/registry.py
CHANGED
@@ -17,7 +17,6 @@ Write once:
 from __future__ import annotations
 
 import functools
-from collections import OrderedDict
 from collections.abc import Generator
 from dataclasses import dataclass, field
 from typing import Any, Iterable, List, Optional, Protocol
@@ -66,20 +65,24 @@ class Hook(Protocol):
 # --------------------------------------------------------------------------- #
 # Global registry helpers
 # --------------------------------------------------------------------------- #
-_global_hooks:
+_global_hooks: dict[str, Hook] = dict()
 
 
 def register(hook: Hook | type) -> Hook:
-    """
-
-
-
-
-
-
-
-
-
+    """Decorator/helper to add a Hook to the global list.
+
+    Args:
+        hook: A Hook class or instance to register.
+
+    Returns:
+        The registered hook (instantiated if a class was provided).
+
+    Example:
+        >>> @register
+        ... class DebugHook:
+        ...     def post_fetch(self, ctx, html_bytes):
+        ...         print("Fetched", ctx.url)
+        ...         return html_bytes
    """
 
    hook_name = getattr(hook, '__name__', hook.__class__.__name__)
@@ -101,9 +104,13 @@ def iter_post_extract_hooks() -> Iterable[Hook]:
 
 
 def pipe_post_extract(gen_func):
-    """
-
-
+    """Wrap a generator function to pipe yielded values through post_extract hooks.
+
+    Args:
+        gen_func: A generator function to wrap.
+
+    Returns:
+        A wrapped generator that filters values through registered hooks.
    """
    @functools.wraps(gen_func)
    def wrapper(*args, **kwargs) -> Generator:
@@ -118,8 +125,13 @@ def pipe_post_extract(gen_func):
 
 
 def pipe_post_extract_async(async_gen_func):
-    """
-
+    """Wrap an async generator function to pipe yielded values through hooks.
+
+    Args:
+        async_gen_func: An async generator function to wrap.
+
+    Returns:
+        A wrapped async generator that filters values through registered hooks.
    """
    @functools.wraps(async_gen_func)
    async def wrapper(*args, **kwargs):
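
As a sketch, hook registration following the docstring example restored above; the post_extract method shown here is an assumption inferred from the engine's `lambda v: v` fallback rather than something this diff defines.

from wxpath.hooks.registry import register

@register
class DebugHook:
    def post_fetch(self, ctx, html_bytes):
        print("Fetched", ctx.url)
        return html_bytes

    def post_extract(self, value):  # assumed single-value signature
        # Returning None would drop the value from the pipeline.
        return value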
wxpath/http/client/crawler.py
CHANGED
@@ -10,6 +10,7 @@ import aiohttp
 from wxpath.http.client.request import Request
 from wxpath.http.client.response import Response
 from wxpath.http.policy.retry import RetryPolicy
+from wxpath.http.policy.robots import RobotsTxtPolicy
 from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
 from wxpath.http.stats import CrawlerStats, build_trace_config
 from wxpath.util.logging import get_logger
@@ -22,6 +23,8 @@ HEADERS = {"User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
 
 
 class Crawler:
+    """Concurrent HTTP crawler that manages throttling, retries, and robots."""
+
     def __init__(
        self,
        concurrency: int = 16,
@@ -35,11 +38,13 @@ class Crawler:
        auto_throttle_target_concurrency: float = None,
        auto_throttle_start_delay: float = 0.25,
        auto_throttle_max_delay: float = 10.0,
+        respect_robots: bool = True,
    ):
        self.concurrency = concurrency
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._headers = HEADERS | (headers or {})  # merge headers
-        self._proxies = proxies or {}
+        self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
+        self.respect_robots = respect_robots
 
        self.retry_policy = retry_policy or RetryPolicy()
        self.throttler = throttler or AutoThrottler(
@@ -57,8 +62,10 @@ class Crawler:
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()
+        self._robots_policy: RobotsTxtPolicy | None = None
 
-    def build_session(self):
+    def build_session(self) -> aiohttp.ClientSession:
+        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
@@ -69,26 +76,34 @@ class Crawler:
            trace_configs=[trace_config]
        )
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> "Crawler":
+        """Initialize HTTP session and start background workers."""
        if self._session is None:
            # self._session = aiohttp.ClientSession(timeout=self._timeout)
            self._session = self.build_session()
 
+        if self.respect_robots:
+            self._robots_policy = RobotsTxtPolicy(self._session)
+
        self._workers = [
            asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
            for i in range(self.concurrency)
        ]
        return self
 
-    async def __aexit__(self, *_):
+    async def __aexit__(self, *_) -> None:
+        """Tear down workers and close the HTTP session."""
        self._closed = True
        for w in self._workers:
            w.cancel()
+
        await asyncio.gather(*self._workers, return_exceptions=True)
+
        if self._session:
            await self._session.close()
 
-    def submit(self, req: Request):
+    def submit(self, req: Request) -> None:
+        """Queue a request for fetching or raise if crawler already closed."""
        if self._closed:
            raise RuntimeError("crawler is closed")
        self._pending.put_nowait(req)
@@ -96,18 +111,28 @@ class Crawler:
     def __aiter__(self) -> AsyncIterator[Response]:
        return self._result_iter()
 
-    async def _result_iter(self):
+    async def _result_iter(self) -> AsyncIterator[Response]:
+        """Async iterator yielding responses as workers produce them."""
        # while not self._closed:
        while not (self._closed and self._results.empty()):
            resp = await self._results.get()
            self._results.task_done()
            yield resp
 
-    def _proxy_for(self, url: str):
+    def _proxy_for(self, url: str) -> str | None:
        host = urllib.parse.urlsplit(url).hostname
-
-
-
+        try:
+            # bracket notation first, for defaultdicts
+            value = self._proxies[host]
+        except KeyError:
+            value = self._proxies.get(host)
+
+        if not value:
+            log.debug("proxy", extra={"host": host, "value": value})
+        return value
+
+    async def _worker(self) -> None:
+        """Worker loop that fetches pending requests and enqueues results."""
        while True:
            req = await self._pending.get()
            try:
@@ -133,8 +158,17 @@ class Crawler:
                self._pending.task_done()
 
     async def _fetch_one(self, req: Request) -> Response | None:
+        """Fetch a single request, handling robots, throttling, and retries."""
        host = req.hostname
 
+        if self._robots_policy:
+            can_fetch = await self._robots_policy.can_fetch(
+                req.url, self._headers.get("User-Agent")
+            )
+            if not can_fetch:
+                log.debug("disallowed by robots.txt", extra={"url": req.url})
+                return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))
+
        # TODO: Move this filter to hooks
        if req.url.lower().endswith((".pdf", ".zip", ".exe")):
            req.max_retries = 0
@@ -181,7 +215,8 @@ class Crawler:
            log.error("request failed", extra={"url": req.url}, exc_info=exc)
            return Response(req, 0, b"", error=exc)
 
-    async def _retry(self, req: Request):
+    async def _retry(self, req: Request) -> None:
+        """Reschedule a request according to the retry policy."""
        req.retries += 1
        delay = self.retry_policy.get_delay(req)
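
A rough sketch of driving the Crawler directly, based on the submit / async-iterate flow above; the Response attributes used (status, body, request, error) are assumed from the constructor calls visible in this diff, and the URL is a placeholder.

import asyncio

from wxpath.http.client.crawler import Crawler
from wxpath.http.client.request import Request

async def main():
    async with Crawler(concurrency=4, respect_robots=True) as crawler:
        # Queue one request, then consume results as workers produce them.
        crawler.submit(Request(url="https://example.com/"))
        async for resp in crawler:
            if resp.error:
                print("failed:", resp.request.url, resp.error)
            else:
                print(resp.status, resp.request.url, len(resp.body))
            break  # this sketch stops after the first result

asyncio.run(main())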
wxpath/http/client/request.py
CHANGED
@@ -1,24 +1,26 @@
 import time
 from dataclasses import dataclass, field
-from typing import Any
+from typing import Any
 
 
 @dataclass
 class Request:
+    """HTTP request envelope used by the crawler."""
     url: str
     method: str = "GET"
-    headers:
+    headers: dict[str, str] = field(default_factory=dict)
     timeout: float = 15.0
 
     retries: int = 0
     max_retries: int | None = None
     dont_retry: bool = False
 
-    meta:
+    meta: dict[str, Any] = field(default_factory=dict)
 
     created_at: float = field(default_factory=time.monotonic)
 
     def copy_for_retry(self) -> "Request":
+        """Create a copy incrementing the retry counter for scheduling."""
        return Request(
            url=self.url,
            method=self.method,
@@ -26,6 +28,7 @@ class Request:
            timeout=self.timeout,
            retries=self.retries + 1,
            max_retries=self.max_retries,
+            dont_retry=self.dont_retry,
            meta=self.meta,
        )
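
A small sketch of the dataclass defaults and the copy_for_retry fix above: the retry copy now carries dont_retry forward, and meta is passed by reference rather than copied. Field values are illustrative.

from wxpath.http.client.request import Request

req = Request(url="https://example.com/page", meta={"depth": 0}, dont_retry=True)
retry = req.copy_for_retry()

assert retry.retries == req.retries + 1
assert retry.dont_retry is True   # preserved in 0.3.0; previously reset on retry copies
assert retry.meta is req.meta     # same dict object, not a copy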
wxpath/http/policy/robots.py
ADDED
@@ -0,0 +1,82 @@
+import asyncio
+import urllib.parse
+import urllib.robotparser
+
+import aiohttp
+
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+
+class RobotsTxtPolicy:
+    """Caches and evaluates robots.txt rules for crawler requests."""
+
+    def __init__(self,
+                 session: aiohttp.ClientSession,
+                 default_parser: type['RobotsParserBase'] | None = None):
+        self._session = session
+        self._parsers: dict[str, "RobotsParserBase"] = {}
+        self._lock = asyncio.Lock()
+        self._default_parser = default_parser or UrllibRobotParser
+
+    async def can_fetch(self, url: str, user_agent: str | None) -> bool:
+        """Return whether the crawler is allowed to fetch `url`."""
+        host = urllib.parse.urlsplit(url).hostname
+        if not host:
+            return False
+
+        # Due to multiple aiohttp workers running concurrently, we need to lock
+        async with self._lock:
+            if host not in self._parsers:
+                self._parsers[host] = await self._fetch_robots_txt(host)
+
+        return self._parsers[host].can_fetch(url, user_agent)
+
+    async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
+        """Retrieve and parse the robots.txt for `host`, failing open on errors."""
+        url = f"http://{host}/robots.txt"
+        try:
+            async with self._session.get(url) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    # Pass the text as-is to the parser, let it handle the format
+                    if self._default_parser == UrllibRobotParser:
+                        return self._default_parser(text.splitlines())
+                    else:
+                        return self._default_parser(text)
+                else:
+                    # Empty robots.txt - allow all
+                    if self._default_parser == UrllibRobotParser:
+                        return self._default_parser([])
+                    else:
+                        return self._default_parser("")
+        except Exception:
+            # If robots.txt is unavailable, allow all requests (fail open)
+            log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
+            if self._default_parser == UrllibRobotParser:
+                return self._default_parser([])
+            else:
+                return self._default_parser("")
+
+
+class RobotsParserBase:
+    """Base type for robots.txt parsers used by the policy."""
+
+
+class UrllibRobotParser(RobotsParserBase):
+    """Adapter around `urllib.robotparser.RobotFileParser`."""
+
+    def __init__(self, text):
+        self._parser = urllib.robotparser.RobotFileParser()
+        # urllib.robotparser.RobotFileParser.parse() expects a list of lines
+        if isinstance(text, str):
+            lines = text.splitlines() if text else []
+        else:
+            lines = text if text else []
+        self._parser.parse(lines)
+
+    def can_fetch(self, url, user_agent):
+        """Return whether the URL is allowed for the given user agent."""
+        return self._parser.can_fetch(user_agent, url)
+