wxpath-0.4.0-py3-none-any.whl → wxpath-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +9 -12
- wxpath/core/parser.py +92 -23
- wxpath/core/runtime/engine.py +79 -8
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +19 -7
- wxpath/http/client/request.py +1 -1
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1204 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/METADATA +123 -19
- wxpath-0.5.0.dist-info/RECORD +44 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/WHEEL +1 -1
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.0.dist-info/RECORD +0 -35
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/top_level.txt +0 -0
wxpath/__init__.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
```
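The package root now re-exports `settings` alongside the existing entry points. A minimal sketch of what that enables (the no-argument `configure_logging()` call is an assumption, not a documented signature):

```python
import wxpath

wxpath.configure_logging()  # assumed no-arg call; exported since 0.4.0
print(wxpath.settings)      # the settings module is newly re-exported in 0.5.0
```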
wxpath/cli.py
CHANGED
```diff
@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
```
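A condensed, self-contained sketch of the new flag's plumbing, using only the argument definition shown above (the surrounding options from `main()` are omitted):

```python
import argparse

arg_parser = argparse.ArgumentParser(prog="wxpath")
arg_parser.add_argument(
    "--insecure",
    action="store_true",
    help="Disable SSL certificate verification (use for sites with broken chains)",
)

args = arg_parser.parse_args(["--insecure"])
verify_ssl = not args.insecure  # main() passes this as Crawler(verify_ssl=...)
assert verify_ssl is False
```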
wxpath/core/models.py
CHANGED
wxpath/core/ops.py
CHANGED
```diff
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +79,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
 
     yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
     elems = curr_elem.xpath3(expr)
 
     next_segments = curr_segments[1:]
```
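The fact that `(String, Depth)` and the two three-argument orderings had to be registered explicitly suggests the registry dispatches on the exact tuple of parsed argument types. A generic illustration of that pattern — the names `register`/`get_operator` match the diff, but the body is an assumption, not wxpath's actual registry:

```python
from typing import Callable

# Illustrative only: a registry keyed on (name, argument-type tuple). The real
# wxpath register/get_operator internals are not shown in this diff.
OPERATORS: dict[tuple, Callable] = {}

def register(name: str, arg_types: tuple):
    def decorator(fn: Callable) -> Callable:
        OPERATORS[(name, arg_types)] = fn  # one entry per decorated signature
        return fn
    return decorator

def get_operator(name: str, args: list) -> Callable:
    # Dispatch on the exact types of the parsed arguments, e.g.
    # (String, Xpath, Depth) for url('...', follow=//a/@href, depth=2).
    return OPERATORS[(name, tuple(type(a) for a in args))]
```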
wxpath/core/parser.py
CHANGED
```diff
@@ -13,7 +13,8 @@ except ImportError:
 
 
 TOKEN_SPEC = [
-    ("NUMBER", r"\d
+    ("NUMBER", r"\d+\.\d+"),
+    ("INTEGER", r"\d+"),
     ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""),  # TODO: Rename to URL Literal
     ("WXPATH", r"/{0,3}\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
     ("URL", r"\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
     ("FOLLOW", r",?\s{,}follow="),
+    ("DEPTH", r",?\s{,}depth="),
     ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # Added || for string concat
     ("LPAREN", r"\("),
     ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
 class Number:
     value: float
 
+@dataclass
+class Integer:
+    value: int
+
+@dataclass
+class Depth(Integer):
+    pass
+
 @dataclass
 class String:
     value: str
@@ -273,6 +283,10 @@ class Parser:
         if tok.type == "NUMBER":
            self.advance()
            return Number(float(tok.value))
+
+        if tok.type == "INTEGER":
+            self.advance()
+            return Integer(int(tok.value))
 
         if tok.type == "STRING":
             self.advance()
@@ -358,18 +372,18 @@ class Parser:
         self.advance()
 
         return result
-
 
     def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
         """Capture content inside a url() call, handling nested wxpath expressions.
 
         Supports patterns like::
 
-            url('...')
-            url('...' follow=//a/@href)
-            url(
-            url(
-            url( url(
+            url('...')                           -> [String]
+            url('...' follow=//a/@href)          -> [String, Xpath]
+            url('...' follow=//a/@href depth=2)  -> [String, Xpath, Integer]
+            url(//a/@href depth=2)               -> [Xpath, Integer]
+            url( url('..')//a/@href )            -> [Call, Xpath]
+            url( url( url('..')//a )//b )        -> [Call, Xpath]
 
         Returns:
             A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@ class Parser:
         paren_balance = 1  # We're already inside the opening paren of url()
         brace_balance = 0  # Track braces for map constructors
         reached_follow_token = False
+        reached_depth_token = False
         follow_xpath = ""
+        depth_number = ""
+
         while paren_balance > 0 and self.token.type != "EOF":
             if self.token.type == "WXPATH":
                 # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@ class Parser:
 
             elif self.token.type == "FOLLOW":
                 reached_follow_token = True
+                reached_depth_token = False
+                self.advance()
+
+            elif self.token.type == "DEPTH":
+                reached_depth_token = True
+                reached_follow_token = False
                 self.advance()
 
             elif self.token.type == "LPAREN":
                 # Opening paren that's NOT part of a url() call
                 # (it's part of an xpath function like contains(), starts-with(), etc.)
                 paren_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@ class Parser:
                 if paren_balance == 0:
                     # This is the closing paren of the outer url()
                     break
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "LBRACE":
                 # Opening brace for map constructors
                 brace_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RBRACE":
                 brace_balance -= 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             else:
                 # Accumulate all other tokens as xpath content
-                if
-                    current_xpath += self.token.value
-                else:
+                if reached_follow_token:
                     follow_xpath += self.token.value
+                elif reached_depth_token:
+                    depth_number += self.token.value
+                else:
+                    current_xpath += self.token.value
 
                 self.advance()
@@ -447,6 +484,9 @@ class Parser:
         if follow_xpath.strip():
             elements.append(Xpath(follow_xpath.strip()))
 
+        if depth_number.strip():
+            elements.append(Depth(int(depth_number.strip())))
+
         return elements
 
     def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@ class Parser:
             self.advance()
             # Handle follow=...
             if self.token.type == "FOLLOW":
-                self.advance()
                 follow_arg = self.capture_url_arg_content()
                 args.extend(follow_arg)
+            if self.token.type == "DEPTH":
+                depth_arg = self.capture_url_arg_content()
+                args.extend(depth_arg)
             elif self.token.type == "WXPATH":
                 # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
-                #
-                args = self.capture_url_arg_content()
+                # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+                # args = self.capture_url_arg_content()
+                args = self.nud()
             else:
                 # Simple xpath argument: url(//a/@href)
                 # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@ class Parser:
 
     return _specify_call_types(func_name, args)
 
-
 def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+    """
+    Specify the type of a call based on the function name and arguments.
+    TODO: Provide example wxpath expressions for each call type.
+
+    Args:
+        func_name: The name of the function.
+        args: The arguments of the function.
+
+    Returns:
+        Call | Segments: The type of the call.
+    """
     if func_name == "url":
         if len(args) == 1:
             if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
             else:
                 raise ValueError(f"Unknown argument type: {type(args[0])}")
         elif len(args) == 2:
-
+            arg0, arg1 = args
+            if isinstance(arg0, String) and isinstance(arg1, Xpath):
+                # Example: url('...', follow=//a/@href)
                 return UrlCrawl(func_name, args)
-            elif isinstance(
+            elif isinstance(arg0, String) and isinstance(arg1, Integer):
+                # Example: url('...', depth=2)
+                return UrlLiteral(func_name, args)
+            elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
                 args.append(UrlQuery('url', [ContextItem()]))
                 return Segments(args)
-            elif isinstance(
-                segs =
-                segs.append(
+            elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+                segs = arg0
+                segs.append(arg1)
                 return Segments(segs)
             else:
                 raise ValueError(f"Unknown arguments: {args}")
+        elif len(args) == 3:
+            arg0, arg1, arg2 = args
+            if (isinstance(arg0, String) and (
+                (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+                (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+            )):
+                # Example: url('...', follow=//a/@href, depth=2)
+                # Example: url('...', depth=2, follow=//a/@href)
+                return UrlCrawl(func_name, args)
+            else:
+                raise ValueError(f"Unknown arguments: {args}")
         else:
             raise ValueError(f"Unknown arguments: {args}")
     elif func_name == "/url" or func_name == "//url":
```
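Taken together, the tokenizer and `_specify_call_types` changes make inline depth limits parse. A sketch, assuming `wxpath.core.parser` exposes the module-level `parse()` that the engine diff below calls (the URL is a placeholder):

```python
from wxpath.core import parser

ast = parser.parse("url('https://example.com', follow=//a/@href, depth=2)")
# Expected per the len(args) == 3 branch above: a UrlCrawl whose args carry
# Xpath('//a/@href') and Depth(2), in either keyword order.
print(ast)
```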
wxpath/core/runtime/engine.py
CHANGED
```diff
@@ -18,7 +18,7 @@ from wxpath.core.models import (
     ProcessIntent,
 )
 from wxpath.core.ops import get_operator
-from wxpath.core.parser import Binary, Segment, Segments
+from wxpath.core.parser import Binary, Depth, Segment, Segments
 from wxpath.core.runtime.helpers import parse_html
 from wxpath.hooks.registry import FetchContext, get_hooks
 from wxpath.http.client.crawler import Crawler
@@ -158,17 +158,48 @@ class WXPathEngine(HookedEngineBase):
         if allow_redirects:
             self.allowed_response_codes |= {301, 302, 303, 307, 308}
 
+    def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+        """Get the maximum crawl depth for a given expression. Will find a Depth
+        argument at the beginning of the expression and return its value. Otherwise, returns the
+        max_depth value provided.
+        TODO: There has to be a better way to do this.
+        """
+        if isinstance(bin_or_segs, Binary):
+            if hasattr(bin_or_segs.left, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+            elif hasattr(bin_or_segs.right, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+        elif isinstance(bin_or_segs, Segments):
+            depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+            if depth_arg:
+                return int(depth_arg[0].value)
+        return max_depth
+
     async def run(
         self,
         expression: str,
         max_depth: int,
-        progress: bool = False
+        progress: bool = False,
+        yield_errors: bool = False,
     ) -> AsyncGenerator[Any, None]:
         """Execute a wxpath expression concurrently and yield results.
 
         Builds and drives a BFS-like crawl pipeline that honors robots rules,
         throttling, and hook callbacks while walking the web graph.
 
+        NOTES ON max_depth:
+            If depth is provided in the expression, it will be used to limit the depth of the
+            crawl. If depth is provided in the expression and max_depth is provided as an argument
+            to `run`, the inline depth in the expression will take precedence.
+
+            Currently, max_depth control flow logic is detected and executed in the
+            engine. In the future, the operation handlers (ops.py) could be responsible for
+            detecting max_depth, and sending a terminal intent to the engine. It's also possible
+            that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+            implies crawling only the next 2 levels). This is not yet supported.
+
         Args:
             expression: WXPath expression string to evaluate.
             max_depth: Maximum crawl depth to follow for url hops.
@@ -178,7 +209,9 @@
             Extracted values produced by the expression (HTML elements or
             wxpath-specific value types).
         """
-
+        bin_or_segs = parser.parse(expression)
+
+        max_depth = self._get_max_depth(bin_or_segs, max_depth)
 
         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
         inflight: dict[str, CrawlTask] = {}
@@ -222,7 +255,7 @@
         seed_task = CrawlTask(
             elem=None,
             url=None,
-            segments=
+            segments=bin_or_segs,
             depth=-1,
             backlink=None,
         )
@@ -248,12 +281,32 @@
 
             if task is None:
                 log.warning(f"Got unexpected response from {resp.request.url}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "unexpected_response",
+                        "status": resp.body,
+                        "body": resp.body
+                    }
+
                 if is_terminal():
                     break
                 continue
 
             if resp.error:
                 log.warning(f"Got error from {resp.request.url}: {resp.error}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "network_error",
+                        "exception": str(resp.error),
+                        "status": resp.status,
+                        "body": resp.body
+                    }
                 if is_terminal():
                     break
                 continue
@@ -261,6 +314,16 @@
             # NOTE: Consider allowing redirects
             if resp.status not in self.allowed_response_codes or not resp.body:
                 log.warning(f"Got non-200 response from {resp.request.url}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "bad_status",
+                        "status": resp.status,
+                        "body": resp.body
+                    }
+
                 if is_terminal():
                     break
                 continue
@@ -276,6 +339,7 @@
                 base_url=task.url,
                 backlink=task.backlink,
                 depth=task.depth,
+                response=resp
             )
 
             elem = await self.post_parse_hooks(elem, task)
@@ -388,10 +452,12 @@
 def wxpath_async(path_expr: str,
                  max_depth: int,
                  progress: bool = False,
-                 engine: WXPathEngine | None = None
+                 engine: WXPathEngine | None = None,
+                 yield_errors: bool = False
+                 ) -> AsyncGenerator[Any, None]:
     if engine is None:
         engine = WXPathEngine()
-    return engine.run(path_expr, max_depth, progress=progress)
+    return engine.run(path_expr, max_depth, progress=progress, yield_errors=yield_errors)
 
 
 ##### ASYNC IN SYNC #####
@@ -400,6 +466,7 @@ def wxpath_async_blocking_iter(
     max_depth: int = 1,
     progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> Iterator[Any]:
     """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
@@ -419,7 +486,8 @@
     """
     loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
-    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+                        engine=engine, yield_errors=yield_errors)
 
     try:
        while True:
@@ -437,8 +505,11 @@ def wxpath_async_blocking(
     max_depth: int = 1,
     progress: bool = False,
     engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> list[Any]:
     return list(wxpath_async_blocking_iter(path_expr,
                                            max_depth=max_depth,
                                            progress=progress,
-                                           engine=engine
+                                           engine=engine,
+                                           yield_errors=yield_errors,
+                                           ))
```
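With `yield_errors=True`, failures surface in the result stream as plain dicts tagged `"__type__": "error"` instead of only being logged. A minimal consumer sketch (the URL is a placeholder; the dict keys come from the engine diff above):

```python
from wxpath import wxpath_async_blocking_iter

expr = "url('https://example.com')//title/text()"
for item in wxpath_async_blocking_iter(expr, max_depth=1, yield_errors=True):
    if isinstance(item, dict) and item.get("__type__") == "error":
        # reason is one of "unexpected_response", "network_error", "bad_status"
        print(f"[{item['reason']}] {item['url']} status={item.get('status')}")
    else:
        print(item)
```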
wxpath/core/runtime/helpers.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
 log = get_logger(__name__)
 
 
-def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
     elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
     if base_url:
         elem.getroottree().docinfo.URL = base_url  # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
         elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
         elem.base_url = base_url  # sets both attribute and doc-level URL
 
-
+    if response:
+        elem.response = response
+        elem.getroottree().getroot().response = response
+    # NOTE: some pages may have multiple root elements, i.e.
     # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
     # This breaks elementpath. If elem has siblings, recreate the
     # root element and only the root element.
     if len(list(elem.itersiblings())) > 0:
-        elem = detach_html_root(elem, base_url)
+        elem = detach_html_root(elem, base_url)
 
     for k, v in elem_kv_pairs.items():
         elem.set(k, str(v))
```
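The new `response` parameter threads the raw `Response` onto the parsed tree, so downstream expression handlers can reach transport metadata. A small sketch (the `None` request field is a placeholder for a real wxpath `Request`, and it assumes wxpath's patched parser permits attaching Python attributes, as the code above does):

```python
from wxpath.core.runtime.helpers import parse_html
from wxpath.http.client.response import Response

resp = Response(None, 200, b"<html><body>hi</body></html>", {})
elem = parse_html(resp.body, base_url="https://example.com", response=resp)
assert elem.response is resp  # attached by the new `if response:` branch
```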
wxpath/http/client/__init__.py
CHANGED
wxpath/http/client/crawler.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import aiohttp
 
 try:
-    from aiohttp_client_cache import CachedSession
+    from aiohttp_client_cache import CachedSession
 except ImportError:
     CachedSession = None
 
@@ -42,7 +42,7 @@ def get_async_session(
     if timeout is None:
         timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
 
-    if CACHE_SETTINGS.enabled and CachedSession
+    if CACHE_SETTINGS.enabled and CachedSession:
         log.info("using aiohttp-client-cache")
         return CachedSession(
             cache=get_cache_backend(),
@@ -71,6 +71,7 @@ class Crawler:
         *,
         headers: dict | None = None,
         proxies: dict | None = None,
+        verify_ssl: bool | None = None,
         retry_policy: RetryPolicy | None = None,
         throttler: AbstractThrottler | None = None,
         auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@
 
         self.concurrency = concurrency if concurrency is not None else cfg.concurrency
         self.per_host = per_host if per_host is not None else cfg.per_host
+        self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+            cfg, "verify_ssl", True
+        )
 
         timeout = timeout if timeout is not None else cfg.timeout
         self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@
         """Construct an `aiohttp.ClientSession` with tracing and pooling."""
         trace_config = build_trace_config(self._stats)
         # Need to build the connector as late as possible as it requires the loop
-        connector = aiohttp.TCPConnector(
+        connector = aiohttp.TCPConnector(
+            limit=self.concurrency * 2,
+            ttl_dns_cache=300,
+            ssl=self._verify_ssl,
+        )
         return get_async_session(
             headers=self._headers,
             timeout=self._timeout,
@@ -274,22 +282,26 @@
             else:
                 log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
 
+            _start = time.monotonic()
             body = await resp.read()
 
-
+            end = time.monotonic()
+            latency = end - _start
             self.throttler.record_latency(host, latency)
 
             if self.retry_policy.should_retry(req, response=resp):
                 await self._retry(req)
                 return None
 
-            return Response(req, resp.status, body, dict(resp.headers)
+            return Response(req, resp.status, body, dict(resp.headers),
+                            request_start=_start, response_end=end)
         except asyncio.CancelledError:
             # Normal during shutdown / timeout propagation
             log.debug("cancelled error", extra={"url": req.url})
             raise
         except Exception as exc:
-
+            end = time.monotonic()
+            latency = end - start
             self.throttler.record_latency(host, latency)
 
             if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@
                 return None
 
             log.error("request failed", extra={"url": req.url}, exc_info=exc)
-            return Response(req, 0, b"", error=exc)
+            return Response(req, 0, b"", error=exc, request_start=start, response_end=end)
 
     async def _retry(self, req: Request) -> None:
         """Reschedule a request according to the retry policy."""
```
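A minimal sketch of disabling verification programmatically, mirroring the CLI's `--insecure` wiring; every other `Crawler` setting falls back to its configured default:

```python
from wxpath.http.client.crawler import Crawler

# verify_ssl=False flows into aiohttp.TCPConnector(ssl=False), which disables
# certificate checks; leaving it unset inherits cfg.verify_ssl (default True).
crawler = Crawler(verify_ssl=False)
```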
wxpath/http/client/request.py
CHANGED
wxpath/http/client/response.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-# wxpath/http/response.py
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -12,3 +11,10 @@ class Response:
     body: bytes
     headers: dict[str, str] | None = None
     error: Optional[Exception] = field(default=None, kw_only=True)
+
+    request_start: float | None = None
+    response_end: float | None = None
+
+    @property
+    def latency(self) -> float:
+        return self.response_end - self.request_start
```
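The two new timestamps make per-request latency a derived property rather than a stored value. A toy check (the `None` request field is a placeholder; `latency` assumes both timestamps were recorded, as the crawler now does on both the success and exception paths):

```python
from wxpath.http.client.response import Response

resp = Response(None, 200, b"", {}, request_start=100.0, response_end=100.25)
assert abs(resp.latency - 0.25) < 1e-9
```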
wxpath/http/policy/retry.py
CHANGED
```diff
@@ -19,13 +19,13 @@ class RetryPolicy:
 
         if request.max_retries is not None and request.retries >= request.max_retries:
             return False
-
+
         if request.retries >= self.max_retries:
             return False
 
         if response is not None and response.status in self.retry_statuses:
             return True
-
+
         if exception is not None:
             return True
 
```