PyPI - wxpath - Versions diffs - 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

wxpath-0.4.1-py3-none-any.whl → wxpath-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

wxpath/__init__.py +2 -0
wxpath/cli.py +6 -0
wxpath/core/exceptions.py +53 -0
wxpath/core/models.py +1 -0
wxpath/core/ops.py +100 -19
wxpath/core/parser.py +94 -24
wxpath/core/runtime/engine.py +74 -10
wxpath/core/runtime/helpers.py +6 -3
wxpath/http/client/__init__.py +1 -1
wxpath/http/client/crawler.py +17 -5
wxpath/http/client/response.py +7 -1
wxpath/http/policy/retry.py +2 -2
wxpath/integrations/__init__.py +0 -0
wxpath/integrations/langchain/__init__.py +0 -0
wxpath/integrations/langchain/examples/basic_rag.py +85 -0
wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
wxpath/integrations/langchain/loader.py +60 -0
wxpath/patches.py +215 -5
wxpath/settings.py +3 -1
wxpath/tui.py +1225 -0
wxpath/tui_settings.py +151 -0
wxpath/util/cleaners.py +31 -0
wxpath/util/common_paths.py +22 -0
wxpath/util/logging.py +3 -7
{wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/METADATA +73 -9
wxpath-0.5.1.dist-info/RECORD +45 -0
{wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/WHEEL +1 -1
{wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/entry_points.txt +1 -0
wxpath-0.4.1.dist-info/RECORD +0 -35
{wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/licenses/LICENSE +0 -0
{wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/top_level.txt +0 -0

wxpath/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]

wxpath/cli.py CHANGED Viewed

@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)

wxpath/core/exceptions.py ADDED Viewed

@@ -0,0 +1,53 @@
+class XPathEvaluationError(Exception):
+    """Error raised while evaluating an XPath expression with elementpath.
+
+    The details are kept as instance attributes (``message``, ``xpath``,
+    ``base_url``, ``element_tag``, ``error_code``, ``position`` and
+    ``original_error``) and are also packed into a context dict handed to
+    ``Exception.__init__``, so ``args`` is ``(message, context)``.
+    """
+    def __init__(
+        self,
+        message: str,
+        xpath: str,
+        base_url: str | None = None,
+        element_tag: str | None = None,
+        error_code: str | None = None,  # XPath error codes like XPST0003
+        position: tuple[int, int] | None = None,  # (line, column)
+        original_error: Exception | None = None
+    ):
+        """Record the failure details on the instance and in ``args``.
+
+        Args:
+            message: Human-readable description of the failure.
+            xpath: The XPath expression that was being evaluated.
+            base_url: Base URL of the document being queried, if known.
+            element_tag: Tag of the context element, if known.
+            error_code: XPath error code such as ``XPST0003``.
+            position: ``(line, column)`` of the error, if known.
+            original_error: The underlying exception being wrapped, if any.
+        """
+        if original_error is not None:
+            # Prefer the code carried by the wrapped error (e.g. XPST0003),
+            # but never replace an explicit error_code with None.
+            wrapped_code = getattr(original_error, 'code', None)
+            if wrapped_code is not None:
+                error_code = wrapped_code
+        # Keep every detail as an attribute; to_dict() reads them back.
+        self.message = message
+        self.xpath = xpath
+        self.base_url = base_url
+        self.element_tag = element_tag
+        self.error_code = error_code
+        self.position = position
+        self.original_error = original_error
+        context = {
+            "xpath": xpath,
+            "base_url": base_url,
+            "element_tag": element_tag,
+            "error_code": error_code,
+            "position": position,
+        }
+        if original_error is not None:
+            context["original_error"] = str(original_error)
+        super().__init__(message, context)
+    def to_dict(self) -> dict:
+        """Return the error details as a plain dict.
+
+        ``original_error`` is rendered with ``str()`` (or ``None`` when no
+        error was wrapped) so the result holds no exception objects.
+        """
+        wrapped = self.original_error
+        return {
+            "message": self.message,
+            "xpath": self.xpath,
+            "base_url": self.base_url,
+            "element_tag": self.element_tag,
+            "error_code": self.error_code,
+            "position": self.position,
+            "original_error": None if wrapped is None else str(wrapped),
+        }
+class XPathSyntaxError(XPathEvaluationError):
+    """Raised when an XPath expression is syntactically invalid."""
+class XPathTypeError(XPathEvaluationError):
+    """Raised when an XPath expression fails with a type error."""
+class XPathRuntimeError(XPathEvaluationError):
+    """Raised when XPath evaluation fails at runtime (e.g. division by zero)."""

wxpath/core/models.py CHANGED Viewed

@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):
 @dataclass(slots=True)
 class ExtractIntent(ProcessIntent):
+    """Intent type that adds no fields or methods of its own to ProcessIntent.
+
+    TODO: May be redundant with ProcessIntent?
+    """
     pass

wxpath/core/ops.py CHANGED Viewed

@@ -2,11 +2,25 @@ from typing import Callable, Iterable
 from urllib.parse import urljoin
 import elementpath
+from elementpath import (
+    ElementPathError,
+    ElementPathSyntaxError as EPSyntaxError,
+    ElementPathTypeError as EPTypeError,
+    ElementPathZeroDivisionError,
+    ElementPathRuntimeError as EPRuntimeError,
+    MissingContextError,
+)
 from elementpath.datatypes import AnyAtomicType
 from elementpath.xpath3 import XPath3Parser
 from lxml import html
 from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+from wxpath.core.exceptions import (
+    XPathEvaluationError,
+    XPathSyntaxError,
+    XPathTypeError,
+    XPathRuntimeError,
+)
 from wxpath.core.models import (
     CrawlIntent,
     DataIntent,
@@ -19,6 +33,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +93,10 @@ def get_operator(
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +105,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
     next_segments = curr_segments[1:]
-    if len(url_call.args) == 2:
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
         yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,17 +133,52 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-    elems = curr_elem.xpath3(expr)
+    try:
+        elems = curr_elem.xpath3(expr)
+    except EPSyntaxError as e:
+        # Parse the error message to extract line/column if available
+        # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
+        raise XPathSyntaxError(
+            f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathZeroDivisionError as e:
+        raise XPathRuntimeError(
+            f"Division by zero in XPath: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except MissingContextError as e:
+        raise XPathRuntimeError(
+            f"XPath requires context but none provided: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        # Catch-all for other elementpath errors
+        raise XPathEvaluationError(
+            f"XPath evaluation failed: {e}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
     next_segments = curr_segments[1:]
     for elem in elems:
@@ -259,12 +315,37 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
     base_url = getattr(curr_elem, 'base_url', None)
     next_segments = right
-    results = elementpath.select(
-        curr_elem,
-        left.value,
-        parser=XPath3Parser,
-        item='' if curr_elem is None else None
-    )
+    try:
+        results = elementpath.select(
+            curr_elem,
+            left.value,
+            parser=XPath3Parser,
+            item='' if curr_elem is None else None
+        )
+    except EPSyntaxError as e:
+        raise XPathSyntaxError(
+            f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        raise XPathEvaluationError(
+            f"XPath evaluation failed in binary operation: {e}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
     if isinstance(results, AnyAtomicType):
         results = [results]

wxpath/core/parser.py CHANGED Viewed

@@ -13,7 +13,9 @@ except ImportError:
 TOKEN_SPEC = [
-    ("NUMBER",   r"\d+(\.\d+)?"),
+    ("WXLOOP",  r"wx:loop"),
+    ("NUMBER",   r"\d+\.\d+"),
+    ("INTEGER",  r"\d+"),
     ("STRING",   r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
     ("WXPATH",   r"/{0,3}\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("///URL",   r"/{3}\s*url"),
@@ -22,6 +24,7 @@ TOKEN_SPEC = [
     ("URL",      r"\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("NAME",     r"[a-zA-Z_][a-zA-Z0-9_]*"),
     ("FOLLOW",   r",?\s{,}follow="),
+    ("DEPTH",    r",?\s{,}depth="),
     ("OP",       r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # Added || for string concat
     ("LPAREN",   r"\("),
     ("RPAREN",   r"\)"),
@@ -63,6 +66,14 @@ def tokenize(src: str):
 class Number:
     value: float
+@dataclass
+class Integer:
+    """Integer literal node; the parser builds it from an INTEGER token."""
+    value: int
+@dataclass
+class Depth(Integer):
+    """Integer node holding the value of a ``depth=`` argument inside ``url(...)``."""
 @dataclass
 class String:
     value: str
@@ -170,7 +181,7 @@ class Parser:
     def parse_binary(self, min_prec: int) -> object:
         """Parse a binary expression chain honoring operator precedence."""
-        if self.token.type == "WXPATH":
+        if self.token.type == "WXPATH" or self.token.type == "WXLOOP":
             left = self.parse_segments()
         else:
             left = self.nud()
@@ -273,6 +284,10 @@ class Parser:
         if tok.type == "NUMBER":
             self.advance()
             return Number(float(tok.value))
+        if tok.type == "INTEGER":
+            self.advance()
+            return Integer(int(tok.value))
         if tok.type == "STRING":
             self.advance()
@@ -358,18 +373,18 @@ class Parser:
             self.advance()
         return result
     def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
         """Capture content inside a url() call, handling nested wxpath expressions.
         Supports patterns like::
-            url('...')                      -> [String]
-            url('...' follow=//a/@href)     -> [String, Xpath]
-            url(//a/@href)                  -> [Xpath]
-            url( url('..')//a/@href )       -> [Call, Xpath]
-            url( url( url('..')//a )//b )   -> [Call, Xpath]
+            url('...')                          -> [String]
+            url('...' follow=//a/@href)         -> [String, Xpath]
+            url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+            url(//a/@href depth=2)              -> [Xpath, Integer]
+            url( url('..')//a/@href )           -> [Call, Xpath]
+            url( url( url('..')//a )//b )       -> [Call, Xpath]
         Returns:
             A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +395,10 @@ class Parser:
         paren_balance = 1  # We're already inside the opening paren of url()
         brace_balance = 0  # Track braces for map constructors
         reached_follow_token = False
+        reached_depth_token = False
         follow_xpath = ""
+        depth_number = ""
         while paren_balance > 0 and self.token.type != "EOF":
             if self.token.type == "WXPATH":
                 # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +414,22 @@ class Parser:
             elif self.token.type == "FOLLOW":
                 reached_follow_token = True
+                reached_depth_token = False
+                self.advance()
+            elif self.token.type == "DEPTH":
+                reached_depth_token = True
+                reached_follow_token = False
                 self.advance()
             elif self.token.type == "LPAREN":
                 # Opening paren that's NOT part of a url() call
                 # (it's part of an xpath function like contains(), starts-with(), etc.)
                 paren_balance += 1
-                current_xpath += self.token.value
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
             elif self.token.type == "RPAREN":
@@ -410,26 +437,37 @@ class Parser:
                 if paren_balance == 0:
                     # This is the closing paren of the outer url()
                     break
-                current_xpath += self.token.value
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
             elif self.token.type == "LBRACE":
                 # Opening brace for map constructors
                 brace_balance += 1
-                current_xpath += self.token.value
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
             elif self.token.type == "RBRACE":
                 brace_balance -= 1
-                current_xpath += self.token.value
+                if not reached_follow_token:
+                        current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
             else:
                 # Accumulate all other tokens as xpath content
-                if not reached_follow_token:
-                    current_xpath += self.token.value
-                else:
+                if reached_follow_token:
                     follow_xpath += self.token.value
+                elif reached_depth_token:
+                    depth_number += self.token.value
+                else:
+                    current_xpath += self.token.value
                 self.advance()
@@ -447,6 +485,9 @@ class Parser:
         if follow_xpath.strip():
             elements.append(Xpath(follow_xpath.strip()))
+        if depth_number.strip():
+            elements.append(Depth(int(depth_number.strip())))
         return elements
     def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +503,16 @@ class Parser:
                 self.advance()
                 # Handle follow=...
                 if self.token.type == "FOLLOW":
-                    self.advance()
                     follow_arg = self.capture_url_arg_content()
                     args.extend(follow_arg)
+                if self.token.type == "DEPTH":
+                    depth_arg = self.capture_url_arg_content()
+                    args.extend(depth_arg)
             elif self.token.type == "WXPATH":
                 # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
-                # Use capture_url_arg_content to handle nested wxpath and xpath
-                args = self.capture_url_arg_content()
+                # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+                # args = self.capture_url_arg_content()
+                args = self.nud()
             else:
                 # Simple xpath argument: url(//a/@href)
                 # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +533,18 @@ class Parser:
         return _specify_call_types(func_name, args)
 def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+    """
+    Specify the type of a call based on the function name and arguments.
+    TODO: Provide example wxpath expressions for each call type.
+    Args:
+        func_name: The name of the function.
+        args: The arguments of the function.
+    Returns:
+        Call | Segments: The type of the call.
+    """
     if func_name == "url":
         if len(args) == 1:
             if isinstance(args[0], String):
@@ -500,17 +554,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
             else:
                 raise ValueError(f"Unknown argument type: {type(args[0])}")
         elif len(args) == 2:
-            if isinstance(args[0], String) and isinstance(args[1], Xpath):
+            arg0, arg1 = args
+            if isinstance(arg0, String) and isinstance(arg1, Xpath):
+                # Example: url('...', follow=//a/@href)
                 return UrlCrawl(func_name, args)
-            elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
+            elif isinstance(arg0, String) and isinstance(arg1, Integer):
+                # Example: url('...', depth=2)
+                return UrlLiteral(func_name, args)
+            elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
                 args.append(UrlQuery('url', [ContextItem()]))
                 return Segments(args)
-            elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
-                segs = args[0]
-                segs.append(args[1])
+            elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+                segs = arg0
+                segs.append(arg1)
                 return Segments(segs)
             else:
                 raise ValueError(f"Unknown arguments: {args}")
+        elif len(args) == 3:
+            arg0, arg1, arg2 = args
+            if (isinstance(arg0, String) and (
+                (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+                (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+            )):
+                # Example: url('...', follow=//a/@href, depth=2)
+                # Example: url('...', depth=2, follow=//a/@href)
+                return UrlCrawl(func_name, args)
+            else:
+                raise ValueError(f"Unknown arguments: {args}")
         else:
             raise ValueError(f"Unknown arguments: {args}")
     elif func_name == "/url" or func_name == "//url":

wxpath-0.4.1-py3-none-any.whl → wxpath-0.5.1-py3-none-any.whl

wxpath-0.4.1-py3-none-any.whl → wxpath-0.5.1-py3-none-any.whl