wxpath-0.2.0-py3-none-any.whl → wxpath-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/cli.py CHANGED
@@ -2,44 +2,84 @@ import argparse
 import json
 import sys
 
-from wxpath.core.parser import parse_wxpath_expr
+from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
+from wxpath.http.client.crawler import Crawler
 from wxpath.util.serialize import simplify
 
 
 def main():
     registry.register(builtin.SerializeXPathMapAndNodeHook)
-    parser = argparse.ArgumentParser(description="Run wxpath expression.")
-    parser.add_argument("expression", help="The wxpath expression")
-    parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+    arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+    arg_parser.add_argument("expression", help="The wxpath expression")
+    arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
 
     # debug
-    parser.add_argument("--debug", action="store_true", help="Debug mode")
+    arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
 
     # verbose
-    parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+    arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
 
-    parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
-    parser.add_argument(
+    arg_parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=16,
+        help="Number of concurrent fetches"
+    )
+    arg_parser.add_argument(
         "--concurrency-per-host",
         type=int,
         default=8,
         help="Number of concurrent fetches per host"
     )
+    arg_parser.add_argument(
+        "--header",
+        action="append",
+        dest="header_list",
+        default=[],
+        help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+    )
+    arg_parser.add_argument(
+        "--respect-robots",
+        action="store_true",
+        help="Respect robots.txt",
+        default=True
+    )
 
-    args = parser.parse_args()
+    args = arg_parser.parse_args()
 
     if args.verbose:
-        print("wxpath expression:", args.expression)
-        print("parsed expression:", parse_wxpath_expr(args.expression))
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
 
     if args.debug:
         from wxpath import configure_logging
         configure_logging('DEBUG')
 
-    engine = WXPathEngine(
+    custom_headers = {}
+    if args.header_list:
+        for header_item in args.header_list:
+            try:
+                key, value = header_item.split(':', 1)
+                custom_headers[key.strip()] = value.strip()
+            except ValueError:
+                print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+    if custom_headers and args.verbose:
+        print(f"Using custom headers: {custom_headers}")
+        print()
+
+    crawler = Crawler(
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
+        respect_robots=args.respect_robots,
+        headers=custom_headers
     )
+    engine = WXPathEngine(crawler=crawler)
+
     try:
         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
             clean = simplify(r)
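
The header-parsing loop above splits each --header value on the first ':' only,
so header values that themselves contain colons (for example,
--header 'Referer:https://example.com/start') survive intact. A standalone
restatement of that logic, with made-up header values:

    # Mirrors the split(':', 1) parsing in main(); inputs are illustrative.
    header_list = ["User-Agent:my-bot/0.1", "Referer:https://example.com/start"]
    custom_headers = {}
    for header_item in header_list:
        key, value = header_item.split(':', 1)   # split once; ':' kept in value
        custom_headers[key.strip()] = value.strip()
    print(custom_headers)
    # {'User-Agent': 'my-bot/0.1', 'Referer': 'https://example.com/start'}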
wxpath/core/ops.py CHANGED
@@ -1,7 +1,3 @@
-"""
-`ops` for "operations". This module contains side-effect-free functions (operators)
-for handling each segment of a wxpath expression.
-"""
 from typing import Callable, Iterable
 from urllib.parse import urljoin
 
@@ -19,16 +15,24 @@ from wxpath.core.models import (
     Intent,
     ProcessIntent,
 )
-from wxpath.core.parser import OPS, Segment, UrlInfAndXpathValue, XpathValue
+from wxpath.core.parser import (
+    Binary,
+    Call,
+    ContextItem,
+    Segment,
+    Segments,
+    String,
+    Url,
+    UrlCrawl,
+    Xpath,
+)
 from wxpath.util.logging import get_logger
 
 log = get_logger(__name__)
 
 
 class WxStr(str):
-    """
-    A string that has a base_url and depth associated with it. Purely for debugging.
-    """
+    """A string with associated base_url and depth metadata for debugging."""
     def __new__(cls, value, base_url=None, depth=-1):
        obj = super().__new__(cls, value)
        obj.base_url = base_url
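
WxStr, whose docstring this hunk tightens, is a plain str subclass that carries
provenance. Restated standalone (the obj.depth assignment and return fall in
the unshown lines between this hunk and the next; the values below are
illustrative):

    class WxStr(str):
        """A string with associated base_url and depth metadata for debugging."""
        def __new__(cls, value, base_url=None, depth=-1):
            obj = super().__new__(cls, value)
            obj.base_url = base_url
            obj.depth = depth
            return obj

        def __repr__(self):
            return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"

    s = WxStr("Example Domain", base_url="https://example.com", depth=1)
    assert s == "Example Domain"   # equality and hashing stay plain-str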
@@ -39,61 +43,120 @@ class WxStr(str):
         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
 
 
-HANDLERS: dict[str, Callable] = {}
+class RuntimeSetupError(Exception):
+    pass
+
+
+OPS_REGISTER: dict[str, Callable] = {}
+
+def register(func_name_or_type: str | type, args_types: tuple[type, ...] | None = None):
+    def _register(func: Callable) -> Callable:
+        global OPS_REGISTER
+        _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+        if _key in OPS_REGISTER:
+            raise RuntimeSetupError(f"The operation handler for \"{_key}\" already registered")
+        OPS_REGISTER[_key] = func
+        return func
+    return _register
 
-def _op(name: OPS):
-    def reg(fn):
-        if name in HANDLERS:
-            raise ValueError(f"Duplicate operation: {name}")
-        HANDLERS[name] = fn
-        return fn
-    return reg
 
+def get_operator(
+    binary_or_segment: Binary | Segment
+) -> Callable[[html.HtmlElement, list[Url | Xpath], int], Iterable[Intent]]:
+    func_name_or_type = getattr(binary_or_segment, 'func', None) or binary_or_segment.__class__
 
-def get_operator(name: OPS) -> Callable[[html.HtmlElement, list[Segment], int], Iterable[Intent]]:
-    if name not in HANDLERS:
-        raise ValueError(f"Unknown operation: {name}")
-    return HANDLERS[name]
+    args_types = None
+    if isinstance(binary_or_segment, Binary):
+        args_types = (binary_or_segment.left.__class__, binary_or_segment.right.__class__)
+    elif isinstance(binary_or_segment, Call):
+        args_types = tuple(arg.__class__ for arg in binary_or_segment.args)
 
+    _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+    if _key not in OPS_REGISTER:
+        raise ValueError(f"Unknown operation: {_key}")
+    return OPS_REGISTER[_key]
 
-@_op(OPS.URL_STR_LIT)
+
+@register('url', (String,))
+@register('url', (String, Xpath))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
-                        curr_segments: list[Segment],
+                        curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
-    op, value = curr_segments[0]
-
-    log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": value.target})
+    """Handle `url('<literal>')` segments and optional follow xpath."""
+    url_call = curr_segments[0]  # type: Url
 
     next_segments = curr_segments[1:]
 
-    if value.follow:
+    if len(url_call.args) == 2:
         _segments = [
-            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', value.target, value.follow))
+            UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
         ] + next_segments
 
-        yield CrawlIntent(url=value.target, next_segments=_segments)
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
     else:
-        yield CrawlIntent(url=value.target, next_segments=next_segments)
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=next_segments)
+
+
+# @register2('url', (Xpath,))
+@register(Xpath)
+def _handle_xpath(curr_elem: html.HtmlElement,
+                  curr_segments: Segments,
+                  curr_depth: int,
+                  **kwargs) -> Iterable[Intent]:
+    """Execute an xpath step and yield data or chained processing intents."""
+    xpath_node = curr_segments[0]  # type: Xpath
+
+    expr = xpath_node.value
+
+    if curr_elem is None:
+        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+    base_url = getattr(curr_elem, 'base_url', None)
+    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+    _backlink_str = f"string('{curr_elem.get('backlink')}')"
+    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+    # increment after each url*() hop
+    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+    expr = expr.replace('wx:backlink()', _backlink_str)
+    expr = expr.replace('wx:backlink(.)', _backlink_str)
+    expr = expr.replace('wx:depth()', _depth_str)
+    expr = expr.replace('wx:depth(.)', _depth_str)
+
+    elems = curr_elem.xpath3(expr)
+
+    next_segments = curr_segments[1:]
+    for elem in elems:
+        value_or_elem = WxStr(
+            elem, base_url=base_url,
+            depth=curr_depth
+        ) if isinstance(elem, str) else elem
+        if len(curr_segments) == 1:
+            yield DataIntent(value=value_or_elem)
+        else:
+            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
 
 
-@_op(OPS.URL_EVAL)
+@register('//url', (ContextItem,))
+@register('//url', (Xpath,))
+@register('/url', (ContextItem,))
+@register('/url', (Xpath,))
+@register('url', (ContextItem,))
+@register('url', (Xpath,))
 def _handle_url_eval(curr_elem: html.HtmlElement | str,
-                     curr_segments: list[Segment],
+                     curr_segments: list[Url | Xpath],
                      curr_depth: int,
                      **kwargs) -> Iterable[Intent]:
-    op, value = curr_segments[0]
-
-    _path_exp = value.expr
-
-    if isinstance(curr_elem, str):
-        # TODO: IMO, ideally, wxpath grammar should not be checked/validated/enforced
-        # in ops.py. It should instead be validated in the parser.
-        if _path_exp not in {'.', 'self::node()'}:
-            raise ValueError("Only '.' or 'self::node()' is supported in url() segments "
-                             f"when prior xpath operation results in a string. Got: {_path_exp}")
+    """Resolve dynamic url() arguments and enqueue crawl intents.
+
+    Yields:
+        CrawlIntent
+    """
+    url_call = curr_segments[0]  # type: Url
 
+    if isinstance(url_call.args[0], ContextItem):
         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
     else:
+        _path_exp = url_call.args[0].value
         # TODO: If prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
         # It should be handled in the parser.
         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
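
The registry introduced above amounts to multiple dispatch on segment shape:
a handler keys on either a segment class or a (function name, argument-type
tuple) pair. A minimal self-contained sketch of the scheme, with toy stand-ins
for the parser's node classes:

    from typing import Callable

    OPS_REGISTER: dict[object, Callable] = {}

    def register(name_or_type, args_types=None):
        def _register(func):
            key = (name_or_type, args_types) if args_types else name_or_type
            if key in OPS_REGISTER:
                raise RuntimeError(f"handler for {key!r} already registered")
            OPS_REGISTER[key] = func
            return func
        return _register

    class String: ...   # stand-in for wxpath.core.parser.String
    class Xpath: ...    # stand-in for wxpath.core.parser.Xpath

    @register('url', (String,))
    def handle_url_literal(*args): ...

    @register(Xpath)
    def handle_xpath(*args): ...

    # Lookup mirrors get_operator(): a call node keys on (func, arg classes),
    # a bare segment keys on its own class.
    assert OPS_REGISTER[('url', (String,))] is handle_url_literal
    assert OPS_REGISTER[Xpath] is handle_xpath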
@@ -101,144 +164,115 @@ def _handle_url_eval(curr_elem: html.HtmlElement | str,
 
     next_segments = curr_segments[1:]
     for url in urls:
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+        # log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
         yield CrawlIntent(url=url, next_segments=next_segments)
 
 
-@_op(OPS.URL_INF)
+@register('///url', (Xpath,))
 def _handle_url_inf(curr_elem: html.HtmlElement,
-                    curr_segments: list[Segment],
+                    curr_segments: list[Url | Xpath],
                     curr_depth: int,
                     **kwargs) -> Iterable[CrawlIntent]:
+    """Handle the ``///url()`` segment of a wxpath expression.
+
+    This operation is also generated internally by the parser when a
+    ``///<xpath>/[/]url()`` segment is encountered.
+
+    Instead of fetching URLs directly, this operator XPaths the current
+    element for URLs and queues them for further processing via
+    ``_handle_url_inf_and_xpath``.
     """
-    Handles the ///url() segment of a wxpath expression. This operation is also
-    generated internally by the parser when a `///<xpath>/[/]url()` segment is
-    encountered by the parser.
-    This operation does not fetch URLs; instead, it XPaths the current element
-    for URLs, then queues them for further processing (see
-    _handle_url_inf_and_xpath).
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]  # type: Url
 
-    _path_exp = value.expr
+    _path_exp = url_call.args[0].value
 
     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
 
-    log.debug("found urls",
-              extra={"depth": curr_depth, "op": op, "url": getattr(curr_elem, 'base_url', None)})
-
     tail_segments = curr_segments[1:]
     for url in dict.fromkeys(urls):
         _segments = [
-            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', url, _path_exp))
+            UrlCrawl('///url', [url_call.args[0], url])
         ] + tail_segments
 
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
-
         yield CrawlIntent(url=url, next_segments=_segments)
 
 
-@_op(OPS.URL_INF_AND_XPATH)
+@register('///url', (Xpath, str))
 def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
-                              curr_segments: list[Segment],
+                              curr_segments: list[Url | Xpath],
                               curr_depth: int, **kwargs) \
         -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+    """Handle infinite-crawl with an xpath extraction step.
+
+    This operation is generated internally by the parser; there is no explicit
+    wxpath expression that produces it directly.
+
+    Yields:
+        DataIntent: If the current element is not None and no next segments are provided.
+        ExtractIntent: If the current element is not None and next segments are provided.
+        InfiniteCrawlIntent: If the current element is not None and next segments are provided.
+
+    Raises:
+        ValueError: If the current element is None.
     """
-    This is an operation that is generated internally by the parser. There is
-    no explicit wxpath expression that generates this operation.
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]
 
     try:
         if curr_elem is None:
             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
 
         next_segments = curr_segments[1:]
+
         if not next_segments:
             yield DataIntent(value=curr_elem)
         else:
             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
 
         # For url_inf, also re-enqueue for further infinite expansion
-        _segments = [(OPS.URL_INF, XpathValue('', value.expr))] + next_segments
+        _segments = [UrlCrawl('///url', url_call.args[:-1])] + next_segments
         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
-        log.debug("queueing InfiniteCrawlIntent",
-                  extra={"depth": curr_depth, "op": op,
-                         "url": value.target, "crawl_intent": crawl_intent})
+
         yield crawl_intent
 
     except Exception:
-        log.exception("error fetching url",
-                      extra={"depth": curr_depth, "op": op, "url": value.target})
-
+        log.exception("error fetching url inf and xpath",
+                      extra={"depth": curr_depth, "url": url_call.args[1]})
 
-@_op(OPS.XPATH)
-def _handle_xpath(curr_elem: html.HtmlElement,
-                  curr_segments: list[Segment],
-                  curr_depth: int,
-                  **kwargs) -> Iterable[DataIntent | ProcessIntent]:
-    """
-    Handles the [/|//]<xpath> segment of a wxpath expression. This is a plain XPath expression.
-    Also handles wxpath-specific macro expansions like wx:backlink() or wx:depth().
-    """
-    _, value = curr_segments[0]
-    expr = value.expr
-    if curr_elem is None:
-        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
-    base_url = getattr(curr_elem, 'base_url', None)
-    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
-    elems = curr_elem.xpath3(expr)
-
-    next_segments = curr_segments[1:]
-    for elem in elems:
-        value_or_elem = WxStr(
-            elem, base_url=base_url,
-            depth=curr_depth
-        ) if isinstance(elem, str) else elem
-        if len(curr_segments) == 1:
-            yield DataIntent(value=value_or_elem)
-        else:
-            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
-
-
-@_op(OPS.XPATH_FN_MAP_FRAG)
-def _handle_xpath_fn_map_frag(curr_elem: html.HtmlElement | str,
-                              curr_segments: list[Segment],
+@register(Binary, (Xpath, Segments))
+def _handle_binary(curr_elem: html.HtmlElement | str,
+                   curr_segments: list[Url | Xpath] | Binary,
                   curr_depth: int,
                   **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+    """Execute XPath expressions suffixed with the ``!`` (map) operator.
+
+    Yields:
+        ProcessIntent: Contains either a WxStr or an lxml or elementpath element.
     """
-    Handles the execution of XPath functions that were initially suffixed with a
-    '!' (map) operator.
-    """
-    _, value = curr_segments[0]
+    left = curr_segments.left
+    _ = curr_segments.op
+    right = curr_segments.right
+
+    if len(right) == 0:
+        # Binary operation on segments expects non-empty segments
+        raise ValueError("Binary operation on segments expects non-empty segments")
 
     base_url = getattr(curr_elem, 'base_url', None)
-    next_segments = curr_segments[1:]
+    next_segments = right
 
-    result = elementpath.select(
+    results = elementpath.select(
         curr_elem,
-        value.expr,
+        left.value,
         parser=XPath3Parser,
         item='' if curr_elem is None else None
     )
 
-    if isinstance(result, AnyAtomicType):
-        result = [result]
+    if isinstance(results, AnyAtomicType):
+        results = [results]
 
-    for r in result:
-        value_or_elem = WxStr(r, base_url=base_url, depth=curr_depth) if isinstance(r, str) else r
-        if len(curr_segments) == 1:
-            # XPATH_FN_MAP_FRAG is not a terminal operation
-            raise ValueError("XPATH_FN_MAP_FRAG is not a terminal operation")
+    for result in results:
+        if isinstance(result, str):
+            value_or_elem = WxStr(result, base_url=base_url, depth=curr_depth)
         else:
-            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+            value_or_elem = result
+
+        yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
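
For orientation, the map-operator handler above delegates evaluation to the
elementpath library the module already imports, then wraps string results in
WxStr. A hedged sketch of just the evaluation step, outside the engine (the
HTML snippet is invented; XPath 2.0+ allows a path to end in a function call):

    import elementpath
    from elementpath.xpath3 import XPath3Parser
    from lxml import html

    doc = html.fromstring("<ul><li>a</li><li>b</li></ul>")

    # Each li maps to its string value, much as a '!'-suffixed step would.
    results = elementpath.select(doc, "//li/string(.)", parser=XPath3Parser)
    print(results)   # ['a', 'b'] -- strings the handler would wrap in WxStr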