wxpath 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +57 -12
- wxpath/core/runtime/engine.py +87 -11
- wxpath/http/client/cache.py +43 -0
- wxpath/http/client/crawler.py +106 -22
- wxpath/http/client/request.py +1 -1
- wxpath/http/stats.py +6 -0
- wxpath/settings.py +108 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/METADATA +140 -23
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/RECORD +13 -11
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/WHEEL +1 -1
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/entry_points.txt +0 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/top_level.txt +0 -0
wxpath/cli.py
CHANGED
@@ -6,6 +6,7 @@ from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
 from wxpath.http.client.crawler import Crawler
+from wxpath.settings import SETTINGS
 from wxpath.util.serialize import simplify
 
 
@@ -15,9 +16,11 @@ def main():
     arg_parser.add_argument("expression", help="The wxpath expression")
     arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
     # debug
-    arg_parser.add_argument("--debug", action="store_true",
+    arg_parser.add_argument("--debug", action="store_true",
+                            help="Debug mode. Provides verbose runtime output and information")
     # verbose
-    arg_parser.add_argument("--verbose", action="store_true",
+    arg_parser.add_argument("--verbose", action="store_true",
+                            help="Verbose mode. Prints CLI level information")
 
     arg_parser.add_argument(
         "--concurrency",
@@ -44,17 +47,27 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--cache",
+        action="store_true",
+        help="Use cache",
+        default=False
+    )
+    arg_parser.add_argument(
+        "--cache-backend",
+        type=str,
+        help="Cache backend. Possible values: redis, sqlite",
+        default="sqlite"
+    )
+    arg_parser.add_argument(
+        "--cache-db-path-or-url",
+        type=str,
+        help="Path to cache database",
+        default="cache.db"
+    )
 
     args = arg_parser.parse_args()
 
-    if args.verbose:
-        segments = wxpath_parser.parse(args.expression)
-        print("parsed expression:\n\nSegments([")
-        for s in segments:
-            print(f"\t{s},")
-        print("])")
-        print()
-
     if args.debug:
         from wxpath import configure_logging
         configure_logging('DEBUG')
@@ -72,6 +85,29 @@ def main():
         print(f"Using custom headers: {custom_headers}")
         print()
 
+    if args.cache:
+        SETTINGS.http.client.cache.enabled = True
+        if args.cache_backend == "redis":
+            SETTINGS.http.client.cache.backend = "redis"
+            SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+        elif args.cache_backend == "sqlite":
+            SETTINGS.http.client.cache.backend = "sqlite"
+            SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+    if args.verbose:
+        print(f"Using concurrency: {args.concurrency}")
+        print(f"Using concurrency per host: {args.concurrency_per_host}")
+        print(f"Using respect robots: {args.respect_robots}")
+        print(f"Using cache: {args.cache}")
+
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
+        print()
+
     crawler = Crawler(
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
@@ -81,11 +117,20 @@ def main():
     engine = WXPathEngine(crawler=crawler)
 
     try:
-        for r in wxpath_async_blocking_iter(
+        for r in wxpath_async_blocking_iter(
+                path_expr=args.expression,
+                max_depth=args.depth,
+                engine=engine):
             clean = simplify(r)
             print(json.dumps(clean, ensure_ascii=False), flush=True)
     except BrokenPipeError:
-
+        if args.verbose:
+            print("Pipe broken.")
+
+    if args.verbose:
+        print("Done. Printing crawl stats")
+        print(crawler._stats)
+    sys.exit(0)
 
 
 if __name__ == "__main__":
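For reference, the new `--cache`, `--cache-backend`, and `--cache-db-path-or-url` flags simply set fields on the global `SETTINGS` object before the `Crawler` and engine are constructed. A minimal sketch of the equivalent programmatic wiring (assumes the optional `cache-sqlite` extra is installed; the expression and cache path are placeholders):

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler
from wxpath.settings import SETTINGS

# Equivalent of: --cache --cache-backend sqlite --cache-db-path-or-url my_cache.db
SETTINGS.http.client.cache.enabled = True
SETTINGS.http.client.cache.backend = "sqlite"
SETTINGS.http.client.cache.sqlite.cache_name = "my_cache.db"  # placeholder path

crawler = Crawler(concurrency=4, per_host=2)
engine = WXPathEngine(crawler=crawler)

for r in wxpath_async_blocking_iter(
        path_expr="url('https://example.com')//a/@href",  # placeholder expression
        max_depth=1,
        engine=engine):
    print(r)
```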
wxpath/core/runtime/engine.py
CHANGED
@@ -5,6 +5,7 @@ from collections import deque
 from typing import Any, AsyncGenerator, Iterator
 
 from lxml.html import HtmlElement
+from tqdm import tqdm
 
 from wxpath import patches  # noqa: F401
 from wxpath.core import parser
@@ -157,7 +158,13 @@ class WXPathEngine(HookedEngineBase):
         if allow_redirects:
             self.allowed_response_codes |= {301, 302, 303, 307, 308}
 
-    async def run(
+    async def run(
+        self,
+        expression: str,
+        max_depth: int,
+        progress: bool = False,
+        yield_errors: bool = False,
+    ) -> AsyncGenerator[Any, None]:
         """Execute a wxpath expression concurrently and yield results.
 
         Builds and drives a BFS-like crawl pipeline that honors robots rules,
@@ -166,6 +173,7 @@ class WXPathEngine(HookedEngineBase):
         Args:
             expression: WXPath expression string to evaluate.
             max_depth: Maximum crawl depth to follow for url hops.
+            progress: Whether to display a progress bar.
 
         Yields:
             Extracted values produced by the expression (HTML elements or
@@ -182,6 +190,12 @@ class WXPathEngine(HookedEngineBase):
             # the current state of the engine.
             return queue.empty() and pending_tasks <= 0
 
+        total_yielded = 0
+        if progress:
+            pbar = tqdm(total=0)
+        else:
+            pbar = None
+
         async with self.crawler as crawler:
             async def submitter():
                 nonlocal pending_tasks
@@ -219,23 +233,48 @@ class WXPathEngine(HookedEngineBase):
                     depth=seed_task.depth,
                     max_depth=max_depth,
                     queue=queue,
+                    pbar=pbar,
                 ):
                     yield await self.post_extract_hooks(output)
 
            # While looping asynchronous generators, you MUST make sure
            # to check terminal conditions before re-iteration.
            async for resp in crawler:
+               if pbar is not None:
+                   pbar.update(1)
+                   pbar.refresh()
+
                task = inflight.pop(resp.request.url, None)
                pending_tasks -= 1
 
                if task is None:
                    log.warning(f"Got unexpected response from {resp.request.url}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "unexpected_response",
+                           "status": resp.body,
+                           "body": resp.body
+                       }
+
                    if is_terminal():
                        break
                    continue
 
                if resp.error:
                    log.warning(f"Got error from {resp.request.url}: {resp.error}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "network_error",
+                           "exception": str(resp.error),
+                           "status": resp.status,
+                           "body": resp.body
+                       }
                    if is_terminal():
                        break
                    continue
@@ -243,6 +282,16 @@ class WXPathEngine(HookedEngineBase):
                # NOTE: Consider allowing redirects
                if resp.status not in self.allowed_response_codes or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "bad_status",
+                           "status": resp.status,
+                           "body": resp.body
+                       }
+
                    if is_terminal():
                        break
                    continue
@@ -273,10 +322,18 @@ class WXPathEngine(HookedEngineBase):
                            depth=task.depth,
                            max_depth=max_depth,
                            queue=queue,
-
+                            pbar=pbar
+                        ):
+                            total_yielded += 1
+                            if pbar is not None:
+                                pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
 
                            yield await self.post_extract_hooks(output)
                    else:
+                        total_yielded += 1
+                        if pbar is not None:
+                            pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
+
                        yield await self.post_extract_hooks(elem)
 
                # Termination condition
@@ -287,6 +344,9 @@ class WXPathEngine(HookedEngineBase):
            with contextlib.suppress(asyncio.CancelledError):
                await submit_task
 
+        if pbar is not None:
+            pbar.close()
+
    async def _process_pipeline(
        self,
        task: CrawlTask,
@@ -294,6 +354,7 @@ class WXPathEngine(HookedEngineBase):
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
+        pbar: tqdm = None
    ) -> AsyncGenerator[Any, None]:
        """Process a queue of intents for a single crawl branch.
 
@@ -331,9 +392,10 @@ class WXPathEngine(HookedEngineBase):
            elif isinstance(intent, CrawlIntent):
                next_depth = task.depth + 1
                # if intent.url not in self.seen_urls and next_depth <= max_depth:
-                if next_depth <= max_depth:
+                if next_depth <= max_depth and intent.url not in self.seen_urls:
                    # self.seen_urls.add(intent.url)
                    log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
+
                    queue.put_nowait(
                        CrawlTask(
                            elem=None,
@@ -343,6 +405,9 @@ class WXPathEngine(HookedEngineBase):
                            backlink=task.url,
                        )
                    )
+                    if pbar is not None:
+                        pbar.total += 1
+                        pbar.refresh()
 
            elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
                # immediately traverse the extraction
@@ -351,19 +416,24 @@
                mini_queue.append((elem, next_segments))
 
 
-def wxpath_async(path_expr: str,
-                 max_depth: int,
-
+def wxpath_async(path_expr: str,
+                 max_depth: int,
+                 progress: bool = False,
+                 engine: WXPathEngine | None = None,
+                 yield_errors: bool = False
+                 ) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
-    return engine.run(path_expr, max_depth)
+    return engine.run(path_expr, max_depth, progress=progress, yield_errors=yield_errors)
 
 
 ##### ASYNC IN SYNC #####
 def wxpath_async_blocking_iter(
    path_expr: str,
    max_depth: int = 1,
+    progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> Iterator[Any]:
    """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
@@ -383,7 +453,8 @@ def wxpath_async_blocking_iter(
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
-    agen = wxpath_async(path_expr, max_depth=max_depth,
+    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+                        engine=engine, yield_errors=yield_errors)
 
    try:
        while True:
@@ -399,8 +470,13 @@
 def wxpath_async_blocking(
    path_expr: str,
    max_depth: int = 1,
+    progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> list[Any]:
-    return list(
-
-
+    return list(wxpath_async_blocking_iter(path_expr,
+                                           max_depth=max_depth,
+                                           progress=progress,
+                                           engine=engine,
+                                           yield_errors=yield_errors,
+                                           ))
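The `progress` and `yield_errors` keywords added here thread through `wxpath_async`, `wxpath_async_blocking_iter`, and `wxpath_async_blocking`. With `yield_errors=True`, failed fetches are yielded as plain dicts tagged `"__type__": "error"` alongside normal results. A hedged usage sketch (the expression is a placeholder):

```python
from wxpath.core.runtime.engine import wxpath_async_blocking_iter

results, errors = [], []
# progress=True draws a tqdm bar; yield_errors=True surfaces bad statuses and network errors
for item in wxpath_async_blocking_iter(
        "url('https://example.com')//a/@href",  # placeholder expression
        max_depth=1,
        progress=True,
        yield_errors=True):
    if isinstance(item, dict) and item.get("__type__") == "error":
        errors.append(item)   # carries "url", "reason", "status", "body"
    else:
        results.append(item)

print(f"{len(results)} results, {len(errors)} errors")
```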
wxpath/http/client/cache.py
ADDED
@@ -0,0 +1,43 @@
+try:
+    from aiohttp_client_cache import SQLiteBackend
+except ImportError:
+    CachedSession = None
+
+from wxpath.settings import SETTINGS
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+CACHE_SETTINGS = SETTINGS.http.client.cache
+
+def get_cache_backend():
+    log.info("cache backend", extra={"backend": CACHE_SETTINGS.backend})
+    if CACHE_SETTINGS.backend == "redis":
+        from aiohttp_client_cache.backends.redis import RedisBackend
+        return RedisBackend(
+            expire_after=CACHE_SETTINGS.expire_after,
+            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+            allowed_methods=CACHE_SETTINGS.allowed_methods,
+            allowed_codes=CACHE_SETTINGS.allowed_codes,
+            include_headers=CACHE_SETTINGS.include_headers,
+            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+            **CACHE_SETTINGS.redis
+            # cache_name=CACHE_SETTINGS.redis.cache_name,
+            # host=CACHE_SETTINGS.redis.host,
+            # port=CACHE_SETTINGS.redis.port,
+            # db=CACHE_SETTINGS.redis.db,
+            # cache_control=CACHE_SETTINGS.cache_control,
+        )
+    elif CACHE_SETTINGS.backend == "sqlite":
+        return SQLiteBackend(
+            cache_name=CACHE_SETTINGS.sqlite.cache_name,
+            expire_after=CACHE_SETTINGS.expire_after,
+            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+            allowed_methods=CACHE_SETTINGS.allowed_methods,
+            allowed_codes=CACHE_SETTINGS.allowed_codes,
+            include_headers=CACHE_SETTINGS.include_headers,
+            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+            # cache_control=CACHE_SETTINGS.cache_control,
+        )
+    else:
+        raise ValueError(f"Unknown cache backend: {CACHE_SETTINGS.backend}")
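`get_cache_backend()` is driven entirely by `SETTINGS.http.client.cache`, so switching backends is a settings change rather than a code change. A sketch of selecting the Redis backend (assumes `aiohttp-client-cache[redis]` is installed and a Redis server is reachable at the configured address):

```python
from wxpath.settings import SETTINGS
from wxpath.http.client.cache import get_cache_backend

cache_cfg = SETTINGS.http.client.cache
cache_cfg.enabled = True
cache_cfg.backend = "redis"
cache_cfg.redis.address = "redis://localhost:6379/0"  # example address

backend = get_cache_backend()  # an aiohttp_client_cache RedisBackend built from the settings
print(type(backend).__name__)
```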
wxpath/http/client/crawler.py
CHANGED
@@ -1,3 +1,10 @@
+import aiohttp
+
+try:
+    from aiohttp_client_cache import CachedSession
+except ImportError:
+    CachedSession = None
+
 import asyncio
 import time
 import urllib.parse
@@ -5,21 +12,52 @@ from collections import defaultdict
 from socket import gaierror
 from typing import AsyncIterator
 
-import
-
+from wxpath.http.client.cache import get_cache_backend
 from wxpath.http.client.request import Request
 from wxpath.http.client.response import Response
 from wxpath.http.policy.retry import RetryPolicy
 from wxpath.http.policy.robots import RobotsTxtPolicy
 from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
 from wxpath.http.stats import CrawlerStats, build_trace_config
+from wxpath.settings import SETTINGS
 from wxpath.util.logging import get_logger
 
 log = get_logger(__name__)
 
-
-
-
+CACHE_SETTINGS = SETTINGS.http.client.cache
+CRAWLER_SETTINGS = SETTINGS.http.client.crawler
+
+def get_async_session(
+    headers: dict | None = None,
+    timeout: aiohttp.ClientTimeout | None = None,
+    connector: aiohttp.TCPConnector | None = None,
+    trace_config: aiohttp.TraceConfig | None = None
+) -> aiohttp.ClientSession:
+    """
+    Create and return a new aiohttp session. If aiohttp-client-cache is available
+    and enabled, return a new CachedSession bound to the configured SQLite backend.
+    The caller is responsible for closing the session.
+    """
+
+    if timeout is None:
+        timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
+
+    if CACHE_SETTINGS.enabled and CachedSession:
+        log.info("using aiohttp-client-cache")
+        return CachedSession(
+            cache=get_cache_backend(),
+            headers=headers,
+            timeout=timeout,
+            connector=connector,
+            trace_configs=[trace_config] if trace_config is not None else None
+        )
+
+    return aiohttp.ClientSession(
+        headers=headers,
+        timeout=timeout,
+        connector=connector,
+        trace_configs=[trace_config] if trace_config is not None else None
+    )
 
 
 class Crawler:
@@ -27,33 +65,55 @@ class Crawler:
 
     def __init__(
         self,
-        concurrency: int =
-        per_host: int =
-        timeout: int =
+        concurrency: int = None,
+        per_host: int = None,
+        timeout: int = None,
         *,
         headers: dict | None = None,
         proxies: dict | None = None,
         retry_policy: RetryPolicy | None = None,
         throttler: AbstractThrottler | None = None,
         auto_throttle_target_concurrency: float = None,
-        auto_throttle_start_delay: float =
-        auto_throttle_max_delay: float =
+        auto_throttle_start_delay: float = None,
+        auto_throttle_max_delay: float = None,
         respect_robots: bool = True,
     ):
-
+        cfg = CRAWLER_SETTINGS
+
+        self.concurrency = concurrency if concurrency is not None else cfg.concurrency
+        self.per_host = per_host if per_host is not None else cfg.per_host
+
+        timeout = timeout if timeout is not None else cfg.timeout
         self._timeout = aiohttp.ClientTimeout(total=timeout)
-        self._headers = HEADERS | (headers or {})  # merge headers
-        self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
-        self.respect_robots = respect_robots
 
+        self._headers = cfg.headers | (headers or {})  # merge headers
+
+        _proxies = proxies if proxies is not None else cfg.proxies
+        self._proxies = _proxies if (isinstance(_proxies, defaultdict) or _proxies) else {}
+
         self.retry_policy = retry_policy or RetryPolicy()
+
+        # auto-throttle defaults
+        auto_throttle_target_concurrency = auto_throttle_target_concurrency \
+            if auto_throttle_target_concurrency is not None \
+            else cfg.auto_throttle_target_concurrency
+
+        auto_throttle_start_delay = auto_throttle_start_delay \
+            if auto_throttle_start_delay is not None \
+            else cfg.auto_throttle_start_delay
+
+        auto_throttle_max_delay = auto_throttle_max_delay \
+            if auto_throttle_max_delay is not None \
+            else cfg.auto_throttle_max_delay
+
         self.throttler = throttler or AutoThrottler(
-            target_concurrency=auto_throttle_target_concurrency or concurrency/4.0,
+            target_concurrency=auto_throttle_target_concurrency or self.concurrency/4.0,
            start_delay=auto_throttle_start_delay,
            max_delay=auto_throttle_max_delay,
        )
-
-        self.
+
+        self._sem_global = asyncio.Semaphore(self.concurrency)
+        self._sem_host = defaultdict(lambda: asyncio.Semaphore(self.per_host))
 
        self._pending: asyncio.Queue[Request] = asyncio.Queue()
        self._results: asyncio.Queue[Response] = asyncio.Queue()
@@ -62,18 +122,31 @@ class Crawler:
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()
+
+        self.respect_robots = respect_robots if respect_robots is not None else cfg.respect_robots
        self._robots_policy: RobotsTxtPolicy | None = None
 
+        # WARN: If SQLiteBackend caching is enabled and min(concurrency, per_host) > 1,
+        # write-contention is likely to occur.
+        if (CACHE_SETTINGS.enabled
+                and CACHE_SETTINGS.backend == "sqlite"
+                and min(self.concurrency, self.per_host) > 1
+        ):
+            log.warning(
+                "SQLiteBackend caching is enabled and min(concurrency, per_host) > 1. "
+                "Write-contention is likely to occur. Consider using RedisBackend."
+            )
+
    def build_session(self) -> aiohttp.ClientSession:
        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
-        return
-            headers=self._headers,
-            timeout=self._timeout,
-            connector=connector,
-
+        return get_async_session(
+            headers=self._headers,
+            timeout=self._timeout,
+            connector=connector,
+            trace_config=trace_config
        )
 
    async def __aenter__(self) -> "Crawler":
@@ -82,6 +155,7 @@ class Crawler:
        # self._session = aiohttp.ClientSession(timeout=self._timeout)
        self._session = self.build_session()
 
+        # Note: Set robots policy after session is created
        if self.respect_robots:
            self._robots_policy = RobotsTxtPolicy(self._session)
 
@@ -184,12 +258,22 @@ class Crawler:
 
        start = time.monotonic()
        try:
+            log.info("fetching", extra={"url": req.url})
            async with self._session.get(
                req.url,
                headers=self._headers | req.headers,
                proxy=self._proxy_for(req.url),
                timeout=req.timeout or self._timeout,
            ) as resp:
+                from_cache = getattr(resp, "from_cache", False)
+                if from_cache:
+                    # NOTE: This is a bit of a hack, but it works. aiohttp-client-cache does not
+                    # interface with TraceConfigs on cache hit, so we have to do it here.
+                    self._stats.requests_cache_hit += 1
+                    log.info("[CACHE HIT]", extra={"req.url": req.url, "resp.url": resp.url})
+                else:
+                    log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
+
                body = await resp.read()
 
                latency = time.monotonic() - start
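Because the constructor defaults now come from `CRAWLER_SETTINGS`, a `Crawler()` built with no arguments picks up concurrency, per-host limits, timeout, headers, and robots behavior from `wxpath/settings.py`. A sketch of overriding those defaults globally before constructing the crawler (the User-Agent value is a placeholder):

```python
from wxpath.http.client.crawler import Crawler
from wxpath.settings import CRAWLER_SETTINGS

# Override the settings-backed defaults before any Crawler is created
CRAWLER_SETTINGS.concurrency = 4
CRAWLER_SETTINGS.per_host = 2
CRAWLER_SETTINGS.headers = {"User-Agent": "my-app/0.1 (contact: you@example.com)"}

crawler = Crawler()  # no arguments: falls back to CRAWLER_SETTINGS
print(crawler.concurrency, crawler.per_host)
```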
wxpath/http/client/request.py
CHANGED
wxpath/http/stats.py
CHANGED
@@ -16,6 +16,7 @@ class CrawlerStats:
     requests_enqueued: int = 0
     requests_started: int = 0
     requests_completed: int = 0
+    requests_cache_hit: int = 0
 
     # ---- Concurrency ----
     in_flight_global: int = 0
@@ -57,6 +58,9 @@ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
         context._start_time = time.monotonic()
 
     async def on_request_end(session, context, params):
+        """
+        Update stats on request completion.
+        """
         host = params.url.host
         stats.in_flight_global -= 1
         stats.in_flight_per_host[host] -= 1
@@ -82,6 +86,8 @@ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
         if not hasattr(stats, "bytes_received"):
             stats.bytes_received = 0
         stats.bytes_received += content_length
+
+        stats.requests_completed += 1
 
     async def on_request_exception(session, context, params):
         host = params.url.host
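The new `requests_cache_hit` counter (together with the `requests_completed` increment in `on_request_end`) makes cache effectiveness visible on the crawler's `CrawlerStats` object. A small sketch of inspecting it after a run, assuming you hold on to the `Crawler` instance passed to the engine (the expression is a placeholder):

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler

crawler = Crawler()
engine = WXPathEngine(crawler=crawler)
list(wxpath_async_blocking_iter("url('https://example.com')//a/@href",
                                max_depth=1, engine=engine))

stats = crawler._stats  # CrawlerStats dataclass populated by the trace config
print(stats.requests_completed, stats.requests_cache_hit)
```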
wxpath/settings.py
ADDED
@@ -0,0 +1,108 @@
+"""
+Settings for wxpath.
+
+These settings are global and can be accessed from any module in the wxpath package.
+
+They are typically used by various modules to configure Class initializers.
+
+The SETTINGS dict structure follows the structure of wxpath submodules.
+
+Expected usage behavior:
+
+```python
+from wxpath.settings import SETTINGS
+
+CACHE_SETTINGS = SETTINGS.http.client.cache
+```
+
+Once initialized, the settings are expected to be immutable (not enforced).
+"""
+
+from datetime import timedelta
+
+# Settings match
+SETTINGS = {
+    'http': {
+        'client': {
+            'cache': {
+                'enabled': False,
+                # 'db_path': 'cache.db',
+                'expire_after': timedelta(days=7),
+                'urls_expire_after': None,
+                'allowed_methods': ("GET", "HEAD"),
+                'allowed_codes': (200, 203, 301, 302, 307, 308),
+                'ignored_parameters': ["utm_*", "fbclid"],
+                'include_headers': False,  # don't vary cache keys on headers by default
+                'cache_control': False,  # honor Cache-Control/Expires if present
+                # # TODO: size hedges (soft, enforced by wxpath)
+                # 'max_entries': None,  # e.g. 1_000_000
+                # 'max_response_size': None,  # bytes, e.g. 2_000_000
+                # 'max_db_size': None,  # bytes, e.g. 5 * 1024**3
+                'backend': "sqlite",
+                'sqlite': {
+                    'cache_name': "cache.db",
+                },
+                'redis': {
+                    # 'host': "localhost",
+                    # 'port': 6379,
+                    # 'db': 0,
+                    'address': 'redis://localhost:6379/0',
+                    'cache_name': "wxpath:",
+                }
+            },
+            'crawler': {
+                'concurrency': 16,
+                'per_host': 8,
+                'timeout': 15,
+                'headers': {
+                    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
+                                   "AppleWebKit/537.36 (KHTML, like Gecko) "
+                                   "Chrome/142.0.0.0 Safari/537.36")},
+                'proxies': None,
+                'auto_throttle_target_concurrency': None,
+                'auto_throttle_start_delay': 0.25,
+                'auto_throttle_max_delay': 10.0,
+                'respect_robots': True,
+            },
+        },
+    },
+}
+
+
+class AttrDict(dict):
+    """
+    A dictionary subclass that allows dot-notation access while
+    recursively converting nested dictionaries.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Point the instance __dict__ to itself to allow attribute access
+        self.__dict__ = self
+        # Recursively convert any dicts passed during initialization
+        for key, value in self.items():
+            self[key] = self._convert(value)
+
+    @classmethod
+    def _convert(cls, value):
+        """Recursively converts dicts to AttrDicts, leaving other types alone."""
+        if isinstance(value, dict):
+            return cls(value)
+        elif isinstance(value, list):
+            # Optional: converts dicts inside lists while keeping the list container
+            return [cls._convert(item) for item in value]
+        return value
+
+    def __setitem__(self, key, value):
+        # Ensure that new items added via dict-syntax are also converted
+        super().__setitem__(key, self._convert(value))
+
+    def __getattr__(self, key):
+        try:
+            return self[key]
+        except KeyError as exc:
+            raise AttributeError(f"AttrDict object has no attribute '{key}'") from exc
+
+
+SETTINGS = AttrDict(SETTINGS)
+CACHE_SETTINGS = SETTINGS.http.client.cache
+CRAWLER_SETTINGS = SETTINGS.http.client.crawler
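`AttrDict` is what makes the `SETTINGS.http.client.cache` dot-path style work: nested plain dicts are converted recursively at construction time, and values assigned later through `__setitem__` are converted as well. A small sketch of that behavior in isolation:

```python
from wxpath.settings import AttrDict

cfg = AttrDict({"http": {"client": {"cache": {"enabled": False}}}})

print(cfg.http.client.cache.enabled)        # False; nested dicts became AttrDicts
cfg["http"]["client"]["cache"]["enabled"] = True
print(cfg.http.client.cache.enabled)        # True; dict-style and attribute access share storage

cfg["new_section"] = {"answer": 42}         # dicts assigned via __setitem__ are converted too
print(cfg.new_section.answer)               # 42
```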
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.3.0
+Version: 0.4.1
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
 Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+Requires-Dist: tqdm>=4.0.0
+Provides-Extra: cache
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+Provides-Extra: cache-sqlite
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+Provides-Extra: cache-redis
+Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -23,9 +30,42 @@ Dynamic: license-file
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
 
-
+```python
+import wxpath
+
+expr = "url('https://example.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+    ///url(//a/@href)
+        //a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** inverts that:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
 
 
 ## Contents
@@ -38,7 +78,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -46,7 +89,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support
+- [Commercial support/consulting](#commercial-supportconsulting)
+- [Versioning](#versioning)
 - [License](#license)
 
 
@@ -54,32 +98,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 
 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}
 
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+    ///url(
+        //main//a/@href[
+            starts-with(., '/wiki/') and not(contains(., ':'))
+        ]
+    )
+    /map{
+        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+        'url': string(base-uri(.)),
+        'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+        'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+    }
 """
 
 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -195,6 +238,17 @@ path_expr = """
 # ...]
 ```
 
+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+
 
 ## CLI
 
@@ -237,9 +291,46 @@ Command line options:
 --concurrency-per-host <concurrency>   Number of concurrent fetches per host
 --header "Key:Value"                   Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false]          (Default: True) Respects robots.txt
+--cache [true|false]                   (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
+
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use, you must install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```
 
 
+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)
 
 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +381,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```
 
+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
 
 ## More Examples
 
@@ -336,6 +434,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
@@ -345,7 +454,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls
 
-###
+### Goals
 
 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +465,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 
 The following features are not yet supported:
 
-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -364,13 +472,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 
@@ -379,6 +489,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti
 
 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
 
+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License
 
 MIT
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/RECORD
CHANGED
@@ -1,22 +1,24 @@
 wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
-wxpath/cli.py,sha256=
+wxpath/cli.py,sha256=e0-mHkpuC1B_WyJw7wH43UBmtuF8oL8phQ4GEzUX0Ns,4332
 wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
+wxpath/settings.py,sha256=a4TlCAOvmO03oOXiiYQzIDBMZU0XpTqntwnjVsumnas,3809
 wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
 wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
 wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
 wxpath/core/ops.py,sha256=PTjX6c4QvCqGaByYYqaK4dte5iWO3lZzgqGrMXp6f6g,9727
 wxpath/core/parser.py,sha256=WfjQNixBz7nWtX2O0t19MOhUJmzGMg8Qol40P6oC8zc,18827
 wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
-wxpath/core/runtime/engine.py,sha256=
+wxpath/core/runtime/engine.py,sha256=UQ8wSr49TJibRRtXzIgXVSBvuB1VttYicKEwV4xcG6Q,17345
 wxpath/core/runtime/helpers.py,sha256=M1i4BryCktAxeboa4LOXMTNiKVCJLDBD-KpWCQXadpw,1434
 wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
 wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
 wxpath/hooks/registry.py,sha256=-D11f_mMboeVAH8qsTkbKTQ0aGNaQ7F6zbXDsOIYxN0,4513
 wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxpath/http/stats.py,sha256=
+wxpath/http/stats.py,sha256=aqZWuybc5RCv-AmKdNbEX4uw1YvZtFoE6591UfukZns,3319
 wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
-wxpath/http/client/
-wxpath/http/client/
+wxpath/http/client/cache.py,sha256=cHS4XlfOStoHTG83ypNITk3Oc0lqGoTRqV0_UWBWQFY,1811
+wxpath/http/client/crawler.py,sha256=UiKtc5K2KBc0bBw2fTdRHLNTa2OFoE1tZsDjR7J4Xeo,12126
+wxpath/http/client/request.py,sha256=cpqo_ASG_wKz0q6m33lsE0kIIthfANt8fx7ptxlyehY,1057
 wxpath/http/client/response.py,sha256=z9LQPnDN-NZRnQpIKozaWCqgpRejc6nixCr_XaPyqUQ,334
 wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
 wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
@@ -25,9 +27,9 @@ wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G
 wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
 wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
+wxpath-0.4.1.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
+wxpath-0.4.1.dist-info/METADATA,sha256=LxmOTsWpspYFedvP02fDL1Wy5t1ygZKuIg2cHVQU_aY,19445
+wxpath-0.4.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+wxpath-0.4.1.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
+wxpath-0.4.1.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
+wxpath-0.4.1.dist-info/RECORD,,
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/entry_points.txt: File without changes
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/licenses/LICENSE: File without changes
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/top_level.txt: File without changes