wxpath-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
+ from .util.logging import configure_logging
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'configure_logging',
+ ]
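A minimal usage sketch of the API re-exported above. The URL and expression are placeholders, and the engine argument is omitted on the assumption that a default engine is constructed when none is given (cli.py below always passes one explicitly):

    from wxpath import configure_logging, wxpath_async_blocking_iter

    configure_logging('INFO')
    # Placeholder expression: fetch a page and extract link hrefs.
    # Assumes a default engine is built when engine= is omitted.
    for item in wxpath_async_blocking_iter(
            path_expr="url('https://example.com')//a/@href",
            max_depth=1):
        print(item)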
wxpath/cli.py ADDED
@@ -0,0 +1,137 @@
+ import argparse
+ import json
+ import sys
+
+ from wxpath.core import parser as wxpath_parser
+ from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+ from wxpath.hooks import builtin, registry
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.settings import SETTINGS
+ from wxpath.util.serialize import simplify
+
+
+ def main():
+     registry.register(builtin.SerializeXPathMapAndNodeHook)
+     arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+     arg_parser.add_argument("expression", help="The wxpath expression")
+     arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+     # debug
+     arg_parser.add_argument("--debug", action="store_true",
+                             help="Debug mode. Provides verbose runtime output and information")
+     # verbose
+     arg_parser.add_argument("--verbose", action="store_true",
+                             help="Verbose mode. Prints CLI level information")
+
+     arg_parser.add_argument(
+         "--concurrency",
+         type=int,
+         default=16,
+         help="Number of concurrent fetches"
+     )
+     arg_parser.add_argument(
+         "--concurrency-per-host",
+         type=int,
+         default=8,
+         help="Number of concurrent fetches per host"
+     )
+     arg_parser.add_argument(
+         "--header",
+         action="append",
+         dest="header_list",
+         default=[],
+         help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+     )
+     arg_parser.add_argument(
+         "--respect-robots",
+         action="store_true",
+         help="Respect robots.txt",
+         default=True
+     )
+     arg_parser.add_argument(
+         "--cache",
+         action="store_true",
+         help="Use cache",
+         default=False
+     )
+     arg_parser.add_argument(
+         "--cache-backend",
+         type=str,
+         help="Cache backend. Possible values: redis, sqlite",
+         default="sqlite"
+     )
+     arg_parser.add_argument(
+         "--cache-db-path-or-url",
+         type=str,
+         help="Path to cache database",
+         default="cache.db"
+     )
+
+     args = arg_parser.parse_args()
+
+     if args.debug:
+         from wxpath import configure_logging
+         configure_logging('DEBUG')
+
+     custom_headers = {}
+     if args.header_list:
+         for header_item in args.header_list:
+             try:
+                 key, value = header_item.split(':', 1)
+                 custom_headers[key.strip()] = value.strip()
+             except ValueError:
+                 print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+     if custom_headers and args.verbose:
+         print(f"Using custom headers: {custom_headers}")
+         print()
+
+     if args.cache:
+         SETTINGS.http.client.cache.enabled = True
+         if args.cache_backend == "redis":
+             SETTINGS.http.client.cache.backend = "redis"
+             SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+         elif args.cache_backend == "sqlite":
+             SETTINGS.http.client.cache.backend = "sqlite"
+             SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+     if args.verbose:
+         print(f"Using concurrency: {args.concurrency}")
+         print(f"Using concurrency per host: {args.concurrency_per_host}")
+         print(f"Using respect robots: {args.respect_robots}")
+         print(f"Using cache: {args.cache}")
+
+     segments = wxpath_parser.parse(args.expression)
+     print("parsed expression:\n\nSegments([")
+     for s in segments:
+         print(f"\t{s},")
+     print("])")
+     print()
+     print()
+
+     crawler = Crawler(
+         concurrency=args.concurrency,
+         per_host=args.concurrency_per_host,
+         respect_robots=args.respect_robots,
+         headers=custom_headers
+     )
+     engine = WXPathEngine(crawler=crawler)
+
+     try:
+         for r in wxpath_async_blocking_iter(
+                 path_expr=args.expression,
+                 max_depth=args.depth,
+                 engine=engine):
+             clean = simplify(r)
+             print(json.dumps(clean, ensure_ascii=False), flush=True)
+     except BrokenPipeError:
+         if args.verbose:
+             print("Pipe broken.")
+
+     if args.verbose:
+         print("Done. Printing crawl stats")
+         print(crawler._stats)
+     sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
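For illustration, the CLI above can be exercised programmatically by setting sys.argv before calling main(); the expression is a placeholder, and note that main() ends with sys.exit(0), i.e. it raises SystemExit:

    import sys
    from wxpath.cli import main

    # Equivalent to:
    #   python -m wxpath.cli "url('https://example.com')//a/@href" --depth 2 \
    #       --header "User-Agent: my-bot" --verbose
    sys.argv = [
        "wxpath",
        "url('https://example.com')//a/@href",
        "--depth", "2",
        "--header", "User-Agent: my-bot",
        "--verbose",
    ]
    main()  # prints one JSON object per result line, then exits

Because each result is emitted as a single JSON line (simplify() followed by json.dumps with flush=True), the output can be streamed into line-oriented tools.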
@@ -0,0 +1,13 @@
+ from wxpath.core.runtime.engine import (
+     WXPathEngine,
+     wxpath_async,
+     wxpath_async_blocking,
+     wxpath_async_blocking_iter,
+ )
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'WXPathEngine',
+ ]
wxpath/core/dom.py ADDED
@@ -0,0 +1,22 @@
+ from urllib.parse import urljoin
+
+
+ def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+     """
+     Convert relative links to absolute links based on the base URL.
+
+     Args:
+         links (list): List of link strings.
+         base_url (str): The base URL to resolve relative links against.
+
+     Returns:
+         List of absolute URLs.
+     """
+     if base_url is None:
+         raise ValueError("base_url must not be None when making links absolute.")
+     return [urljoin(base_url, link) for link in links if link]
+
+
+ def get_absolute_links_from_elem_and_xpath(elem, xpath):
+     base_url = getattr(elem, 'base_url', None)
+     return _make_links_absolute(elem.xpath3(xpath), base_url)
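For illustration, _make_links_absolute resolves each link against the base URL with urljoin and drops empty strings (the values below are made up):

    from wxpath.core.dom import _make_links_absolute

    _make_links_absolute(
        ['/about', 'https://other.example/x', ''],
        base_url='https://example.com/index.html',
    )
    # -> ['https://example.com/about', 'https://other.example/x']
    # (the empty string is dropped by the `if link` filter)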
wxpath/core/models.py ADDED
@@ -0,0 +1,74 @@
+ from dataclasses import dataclass, field
+ from typing import Any, List, Optional, Tuple
+
+
+ @dataclass(slots=True)
+ class CrawlTask:
+     """A unit of work for the crawler."""
+     elem: Any
+     url: str
+     segments: List[Tuple[str, str]]
+     depth: int
+     backlink: Optional[str] = None
+     base_url: Optional[str] = None
+
+     # Priority for the queue (lower number = higher priority)
+     # Useful if you want Depth-First behavior in a shared queue
+     priority: int = field(default=0)
+
+     def __post_init__(self):
+         # Automatically sync priority with depth for BFS behavior
+         self.priority = self.depth
+
+     def __lt__(self, other):
+         return self.priority < other.priority
+
+     def __iter__(self):
+         return iter((self.elem, self.segments, self.depth, self.backlink))
+
+
+ @dataclass(slots=True)
+ class Intent:
+     pass
+
+
+ @dataclass(slots=True)
+ class Result(Intent):
+     """A container for an extracted item or error."""
+     value: Any
+     url: str
+     depth: int
+     error: Optional[Exception] = None
+     backlink: Optional[str] = None
+
+
+ @dataclass(slots=True)
+ class CrawlIntent(Intent):
+     url: str  # "I found this link"
+     next_segments: list  # "Here is what to do next if you go there"
+
+
+ @dataclass(slots=True)
+ class ProcessIntent(Intent):
+     elem: Any
+     next_segments: list
+
+
+ @dataclass(slots=True)
+ class InfiniteCrawlIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class ExtractIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class CrawlFromAttributeIntent(ProcessIntent):
+     pass
+
+
+ @dataclass(slots=True)
+ class DataIntent(Intent):
+     value: Any
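Because __post_init__ copies depth into priority and __lt__ compares on priority, CrawlTask instances order themselves in a heap so that shallower tasks pop first (breadth-first scheduling by default). A small sketch with made-up field values:

    import heapq

    from wxpath.core.models import CrawlTask

    deep = CrawlTask(elem=None, url='https://example.com/a/b', segments=[], depth=2)
    shallow = CrawlTask(elem=None, url='https://example.com/', segments=[], depth=0)

    queue = [deep, shallow]
    heapq.heapify(queue)
    assert heapq.heappop(queue).url == 'https://example.com/'  # depth 0 pops first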
wxpath/core/ops.py ADDED
@@ -0,0 +1,278 @@
+ from typing import Callable, Iterable
+ from urllib.parse import urljoin
+
+ import elementpath
+ from elementpath.datatypes import AnyAtomicType
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import html
+
+ from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+ from wxpath.core.models import (
+     CrawlIntent,
+     DataIntent,
+     ExtractIntent,
+     InfiniteCrawlIntent,
+     Intent,
+     ProcessIntent,
+ )
+ from wxpath.core.parser import (
+     Binary,
+     Call,
+     ContextItem,
+     Segment,
+     Segments,
+     String,
+     Url,
+     UrlCrawl,
+     Xpath,
+ )
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class WxStr(str):
+     """A string with associated base_url and depth metadata for debugging."""
+     def __new__(cls, value, base_url=None, depth=-1):
+         obj = super().__new__(cls, value)
+         obj.base_url = base_url
+         obj.depth = depth
+         return obj
+
+     def __repr__(self):
+         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
+
+
+ class RuntimeSetupError(Exception):
+     pass
+
+
+ OPS_REGISTER: dict[str, Callable] = {}
+
+ def register(func_name_or_type: str | type, args_types: tuple[type, ...] | None = None):
+     def _register(func: Callable) -> Callable:
+         global OPS_REGISTER
+         _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+         if _key in OPS_REGISTER:
+             raise RuntimeSetupError(f"The operation handler for \"{_key}\" is already registered")
+         OPS_REGISTER[_key] = func
+         return func
+     return _register
+
+
+ def get_operator(
+     binary_or_segment: Binary | Segment
+ ) -> Callable[[html.HtmlElement, list[Url | Xpath], int], Iterable[Intent]]:
+     func_name_or_type = getattr(binary_or_segment, 'func', None) or binary_or_segment.__class__
+
+     args_types = None
+     if isinstance(binary_or_segment, Binary):
+         args_types = (binary_or_segment.left.__class__, binary_or_segment.right.__class__)
+     elif isinstance(binary_or_segment, Call):
+         args_types = tuple(arg.__class__ for arg in binary_or_segment.args)
+
+     _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+     if _key not in OPS_REGISTER:
+         raise ValueError(f"Unknown operation: {_key}")
+     return OPS_REGISTER[_key]
+
+
+ @register('url', (String,))
+ @register('url', (String, Xpath))
+ def _handle_url_str_lit(curr_elem: html.HtmlElement,
+                         curr_segments: list[Url | Xpath],
+                         curr_depth: int, **kwargs) -> Iterable[Intent]:
+     """Handle `url('<literal>')` segments and an optional follow xpath."""
+     url_call = curr_segments[0]  # type: Url
+
+     next_segments = curr_segments[1:]
+
+     if len(url_call.args) == 2:
+         _segments = [
+             UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+         ] + next_segments
+
+         yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
+     else:
+         yield CrawlIntent(url=url_call.args[0].value, next_segments=next_segments)
+
+
+ # @register2('url', (Xpath,))
+ @register(Xpath)
+ def _handle_xpath(curr_elem: html.HtmlElement,
+                   curr_segments: Segments,
+                   curr_depth: int,
+                   **kwargs) -> Iterable[Intent]:
+     """Execute an xpath step and yield data or chained processing intents."""
+     xpath_node = curr_segments[0]  # type: Xpath
+
+     expr = xpath_node.value
+
+     if curr_elem is None:
+         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+     base_url = getattr(curr_elem, 'base_url', None)
+     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+     _backlink_str = f"string('{curr_elem.get('backlink')}')"
+     # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+     # increment after each url*() hop
+     _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+     expr = expr.replace('wx:backlink()', _backlink_str)
+     expr = expr.replace('wx:backlink(.)', _backlink_str)
+     expr = expr.replace('wx:depth()', _depth_str)
+     expr = expr.replace('wx:depth(.)', _depth_str)
+
+     elems = curr_elem.xpath3(expr)
+
+     next_segments = curr_segments[1:]
+     for elem in elems:
+         value_or_elem = WxStr(
+             elem, base_url=base_url,
+             depth=curr_depth
+         ) if isinstance(elem, str) else elem
+         if len(curr_segments) == 1:
+             yield DataIntent(value=value_or_elem)
+         else:
+             yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+
+
+ @register('//url', (ContextItem,))
+ @register('//url', (Xpath,))
+ @register('/url', (ContextItem,))
+ @register('/url', (Xpath,))
+ @register('url', (ContextItem,))
+ @register('url', (Xpath,))
+ def _handle_url_eval(curr_elem: html.HtmlElement | str,
+                      curr_segments: list[Url | Xpath],
+                      curr_depth: int,
+                      **kwargs) -> Iterable[Intent]:
+     """Resolve dynamic url() arguments and enqueue crawl intents.
+
+     Yields:
+         CrawlIntent
+     """
+     url_call = curr_segments[0]  # type: Url
+
+     if isinstance(url_call.args[0], ContextItem):
+         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
+     else:
+         _path_exp = url_call.args[0].value
+         # TODO: If the prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
+         # It should be handled in the parser.
+         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+     urls = dict.fromkeys(urls)
+
+     next_segments = curr_segments[1:]
+     for url in urls:
+         # log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+         yield CrawlIntent(url=url, next_segments=next_segments)
+
+
+ @register('///url', (Xpath,))
+ def _handle_url_inf(curr_elem: html.HtmlElement,
+                     curr_segments: list[Url | Xpath],
+                     curr_depth: int,
+                     **kwargs) -> Iterable[CrawlIntent]:
+     """Handle the ``///url()`` segment of a wxpath expression.
+
+     This operation is also generated internally by the parser when a
+     ``///<xpath>/[/]url()`` segment is encountered.
+
+     Instead of fetching URLs directly, this operator XPaths the current
+     element for URLs and queues them for further processing via
+     ``_handle_url_inf_and_xpath``.
+     """
+     url_call = curr_segments[0]  # type: Url
+
+     _path_exp = url_call.args[0].value
+
+     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
+
+     tail_segments = curr_segments[1:]
+     for url in dict.fromkeys(urls):
+         _segments = [
+             UrlCrawl('///url', [url_call.args[0], url])
+         ] + tail_segments
+
+         yield CrawlIntent(url=url, next_segments=_segments)
+
+
+ @register('///url', (Xpath, str))
+ def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
+                               curr_segments: list[Url | Xpath],
+                               curr_depth: int, **kwargs) \
+         -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+     """Handle infinite-crawl with an xpath extraction step.
+
+     This operation is generated internally by the parser; there is no explicit
+     wxpath expression that produces it directly.
+
+     Yields:
+         DataIntent: If the current element is not None and no next segments are provided.
+         ExtractIntent: If the current element is not None and next segments are provided.
+         InfiniteCrawlIntent: Always yielded alongside the above, re-enqueueing the element for further infinite expansion.
+
+     Raises:
+         ValueError: If the current element is None.
+     """
+     url_call = curr_segments[0]
+
+     try:
+         if curr_elem is None:
+             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
+
+         next_segments = curr_segments[1:]
+
+         if not next_segments:
+             yield DataIntent(value=curr_elem)
+         else:
+             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
+
+         # For url_inf, also re-enqueue for further infinite expansion
+         _segments = [UrlCrawl('///url', url_call.args[:-1])] + next_segments
+         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
+
+         yield crawl_intent
+
+     except Exception:
+         log.exception("error fetching url inf and xpath",
+                       extra={"depth": curr_depth, "url": url_call.args[1]})
+
+ @register(Binary, (Xpath, Segments))
+ def _handle_binary(curr_elem: html.HtmlElement | str,
+                    curr_segments: list[Url | Xpath] | Binary,
+                    curr_depth: int,
+                    **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+     """Execute XPath expressions suffixed with the ``!`` (map) operator.
+
+     Yields:
+         ProcessIntent: Contains either a WxStr or an lxml/elementpath element.
+     """
+     left = curr_segments.left
+     _ = curr_segments.op
+     right = curr_segments.right
+
+     if len(right) == 0:
+         # Binary operation on segments expects non-empty segments
+         raise ValueError("Binary operation on segments expects non-empty segments")
+
+     base_url = getattr(curr_elem, 'base_url', None)
+     next_segments = right
+
+     results = elementpath.select(
+         curr_elem,
+         left.value,
+         parser=XPath3Parser,
+         item='' if curr_elem is None else None
+     )
+
+     if isinstance(results, AnyAtomicType):
+         results = [results]
+
+     for result in results:
+         if isinstance(result, str):
+             value_or_elem = WxStr(result, base_url=base_url, depth=curr_depth)
+         else:
+             value_or_elem = result
+
+         yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
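The register/get_operator pair above dispatches on either a segment's func string or its class, optionally combined with the classes of its arguments. A hypothetical extension (the 'my-op' name is made up for illustration) would be registered the same way as the built-in handlers:

    from wxpath.core.models import DataIntent
    from wxpath.core.ops import register
    from wxpath.core.parser import Xpath

    # Hypothetical handler for a new segment type 'my-op' taking a single Xpath argument.
    @register('my-op', (Xpath,))
    def _handle_my_op(curr_elem, curr_segments, curr_depth, **kwargs):
        yield DataIntent(value=curr_elem)

get_operator() would then resolve a Call-like node with func == 'my-op' and one Xpath argument to this handler via the key ('my-op', (Xpath,)).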