wxpath 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.5.0/src/wxpath.egg-info → wxpath-0.5.1}/PKG-INFO +6 -5
- {wxpath-0.5.0 → wxpath-0.5.1}/README.md +3 -3
- {wxpath-0.5.0 → wxpath-0.5.1}/pyproject.toml +3 -3
- wxpath-0.5.1/src/wxpath/core/exceptions.py +53 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/ops.py +91 -7
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/parser.py +2 -1
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/runtime/engine.py +38 -7
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/tui.py +28 -7
- {wxpath-0.5.0 → wxpath-0.5.1/src/wxpath.egg-info}/PKG-INFO +6 -5
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath.egg-info/SOURCES.txt +1 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath.egg-info/requires.txt +2 -1
- {wxpath-0.5.0 → wxpath-0.5.1}/LICENSE +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/setup.cfg +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/cli.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/models.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/core/runtime/helpers.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/client/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/client/cache.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/client/crawler.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/client/request.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/client/response.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/policy/retry.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/http/stats.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/langchain/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/langchain/examples/basic_rag.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/langchain/loader.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/patches.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/settings.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/tui_settings.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/util/__init__.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/util/cleaners.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/util/common_paths.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/util/logging.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath.egg-info/entry_points.txt +0 -0
- {wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wxpath
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: wxpath - a declarative web crawler and data extractor
|
|
5
5
|
Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,7 +14,7 @@ Description-Content-Type: text/markdown
|
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
Requires-Dist: lxml>=4.0
|
|
16
16
|
Requires-Dist: elementpath<=5.0.3,>=5.0.0
|
|
17
|
-
Requires-Dist: aiohttp<=
|
|
17
|
+
Requires-Dist: aiohttp<=4.0.0,>=3.8.0
|
|
18
18
|
Requires-Dist: tqdm>=4.0.0
|
|
19
19
|
Provides-Extra: cache
|
|
20
20
|
Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
|
|
@@ -35,6 +35,7 @@ Requires-Dist: pytest>=7.0; extra == "test"
|
|
|
35
35
|
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
|
|
36
36
|
Provides-Extra: dev
|
|
37
37
|
Requires-Dist: ruff; extra == "dev"
|
|
38
|
+
Requires-Dist: tox; extra == "dev"
|
|
38
39
|
Provides-Extra: docs
|
|
39
40
|
Requires-Dist: mkdocs>=1.5; extra == "docs"
|
|
40
41
|
Requires-Dist: mkdocs-material>=9.0; extra == "docs"
|
|
@@ -54,7 +55,7 @@ Dynamic: license-file
|
|
|
54
55
|
[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
|
|
55
56
|
|
|
56
57
|
|
|
57
|
-
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart
|
|
58
|
+
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
|
|
58
59
|
|
|
59
60
|

|
|
60
61
|
|
|
@@ -65,7 +66,7 @@ Requires Python 3.10+.
|
|
|
65
66
|
```
|
|
66
67
|
pip install wxpath
|
|
67
68
|
# For TUI support
|
|
68
|
-
pip install wxpath[tui]
|
|
69
|
+
pip install "wxpath[tui]"
|
|
69
70
|
```
|
|
70
71
|
---
|
|
71
72
|
|
|
@@ -356,7 +357,7 @@ Command line options:
|
|
|
356
357
|
|
|
357
358
|
**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
|
|
358
359
|
|
|
359
|
-
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart
|
|
360
|
+
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
|
|
360
361
|
|
|
361
362
|
## Persistence and Caching
|
|
362
363
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart
|
|
6
|
+
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
|
|
7
7
|
|
|
8
8
|

|
|
9
9
|
|
|
@@ -14,7 +14,7 @@ Requires Python 3.10+.
|
|
|
14
14
|
```
|
|
15
15
|
pip install wxpath
|
|
16
16
|
# For TUI support
|
|
17
|
-
pip install wxpath[tui]
|
|
17
|
+
pip install "wxpath[tui]"
|
|
18
18
|
```
|
|
19
19
|
---
|
|
20
20
|
|
|
@@ -305,7 +305,7 @@ Command line options:
|
|
|
305
305
|
|
|
306
306
|
**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
|
|
307
307
|
|
|
308
|
-
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart
|
|
308
|
+
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
|
|
309
309
|
|
|
310
310
|
## Persistence and Caching
|
|
311
311
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "wxpath"
|
|
7
|
-
version = "0.5.0"
|
|
7
|
+
version = "0.5.1"
|
|
8
8
|
description = "wxpath - a declarative web crawler and data extractor"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -17,7 +17,7 @@ license-files = ["LICENSE"]
|
|
|
17
17
|
dependencies = [
|
|
18
18
|
"lxml>=4.0",
|
|
19
19
|
"elementpath>=5.0.0,<=5.0.3",
|
|
20
|
-
"aiohttp>=3.8.0,<=
|
|
20
|
+
"aiohttp>=3.8.0,<=4.0.0",
|
|
21
21
|
"tqdm>=4.0.0"
|
|
22
22
|
]
|
|
23
23
|
|
|
@@ -40,7 +40,7 @@ llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
|
|
|
40
40
|
"langchain-text-splitters>=1.1.0"]
|
|
41
41
|
|
|
42
42
|
test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
|
|
43
|
-
dev = ["ruff"]
|
|
43
|
+
dev = ["ruff", "tox"]
|
|
44
44
|
docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
|
|
45
45
|
tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
|
|
46
46
|
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
class XPathEvaluationError(Exception):
|
|
2
|
+
"""Errors during XPath evaluation with elementpath."""
|
|
3
|
+
|
|
4
|
+
def __init__(
|
|
5
|
+
self,
|
|
6
|
+
message: str,
|
|
7
|
+
xpath: str,
|
|
8
|
+
base_url: str | None = None,
|
|
9
|
+
element_tag: str | None = None,
|
|
10
|
+
error_code: str | None = None, # XPath error codes like XPST0003
|
|
11
|
+
position: tuple[int, int] | None = None, # (line, column)
|
|
12
|
+
original_error: Exception | None = None
|
|
13
|
+
):
|
|
14
|
+
context = {
|
|
15
|
+
"xpath": xpath,
|
|
16
|
+
"base_url": base_url,
|
|
17
|
+
"element_tag": element_tag,
|
|
18
|
+
"error_code": error_code,
|
|
19
|
+
"position": position,
|
|
20
|
+
}
|
|
21
|
+
if original_error:
|
|
22
|
+
context["original_error"] = str(original_error)
|
|
23
|
+
# Extract XPath error code if present (e.g., [err:XPST0003])
|
|
24
|
+
if hasattr(original_error, 'code'):
|
|
25
|
+
context["error_code"] = original_error.code
|
|
26
|
+
|
|
27
|
+
super().__init__(message, context)
|
|
28
|
+
|
|
29
|
+
def to_dict(self) -> dict:
|
|
30
|
+
return {
|
|
31
|
+
"message": self.message,
|
|
32
|
+
"xpath": self.xpath,
|
|
33
|
+
"base_url": self.base_url,
|
|
34
|
+
"element_tag": self.element_tag,
|
|
35
|
+
"error_code": self.error_code,
|
|
36
|
+
"position": self.position,
|
|
37
|
+
"original_error": self.original_error,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class XPathSyntaxError(XPathEvaluationError):
|
|
42
|
+
"""Invalid XPath syntax."""
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class XPathTypeError(XPathEvaluationError):
|
|
47
|
+
"""Type error in XPath expression."""
|
|
48
|
+
pass
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class XPathRuntimeError(XPathEvaluationError):
|
|
52
|
+
"""Runtime error during XPath evaluation."""
|
|
53
|
+
pass
|
|
@@ -2,11 +2,25 @@ from typing import Callable, Iterable
|
|
|
2
2
|
from urllib.parse import urljoin
|
|
3
3
|
|
|
4
4
|
import elementpath
|
|
5
|
+
from elementpath import (
|
|
6
|
+
ElementPathError,
|
|
7
|
+
ElementPathSyntaxError as EPSyntaxError,
|
|
8
|
+
ElementPathTypeError as EPTypeError,
|
|
9
|
+
ElementPathZeroDivisionError,
|
|
10
|
+
ElementPathRuntimeError as EPRuntimeError,
|
|
11
|
+
MissingContextError,
|
|
12
|
+
)
|
|
5
13
|
from elementpath.datatypes import AnyAtomicType
|
|
6
14
|
from elementpath.xpath3 import XPath3Parser
|
|
7
15
|
from lxml import html
|
|
8
16
|
|
|
9
17
|
from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
|
|
18
|
+
from wxpath.core.exceptions import (
|
|
19
|
+
XPathEvaluationError,
|
|
20
|
+
XPathSyntaxError,
|
|
21
|
+
XPathTypeError,
|
|
22
|
+
XPathRuntimeError,
|
|
23
|
+
)
|
|
10
24
|
from wxpath.core.models import (
|
|
11
25
|
CrawlIntent,
|
|
12
26
|
DataIntent,
|
|
@@ -119,7 +133,52 @@ def _handle_xpath(curr_elem: html.HtmlElement,
|
|
|
119
133
|
raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
|
|
120
134
|
base_url = getattr(curr_elem, 'base_url', None)
|
|
121
135
|
log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
|
|
122
|
-
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
elems = curr_elem.xpath3(expr)
|
|
139
|
+
except EPSyntaxError as e:
|
|
140
|
+
# Parse the error message to extract line/column if available
|
|
141
|
+
# elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
|
|
142
|
+
raise XPathSyntaxError(
|
|
143
|
+
f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
|
|
144
|
+
xpath=expr,
|
|
145
|
+
base_url=base_url,
|
|
146
|
+
element_tag=curr_elem.tag,
|
|
147
|
+
original_error=e
|
|
148
|
+
) from e
|
|
149
|
+
except EPTypeError as e:
|
|
150
|
+
raise XPathTypeError(
|
|
151
|
+
f"XPath type error: {str(e).split(': ', 1)[-1]}",
|
|
152
|
+
xpath=expr,
|
|
153
|
+
base_url=base_url,
|
|
154
|
+
element_tag=curr_elem.tag,
|
|
155
|
+
original_error=e
|
|
156
|
+
) from e
|
|
157
|
+
except ElementPathZeroDivisionError as e:
|
|
158
|
+
raise XPathRuntimeError(
|
|
159
|
+
f"Division by zero in XPath: {expr}",
|
|
160
|
+
xpath=expr,
|
|
161
|
+
base_url=base_url,
|
|
162
|
+
element_tag=curr_elem.tag,
|
|
163
|
+
original_error=e
|
|
164
|
+
) from e
|
|
165
|
+
except MissingContextError as e:
|
|
166
|
+
raise XPathRuntimeError(
|
|
167
|
+
f"XPath requires context but none provided: {expr}",
|
|
168
|
+
xpath=expr,
|
|
169
|
+
base_url=base_url,
|
|
170
|
+
element_tag=curr_elem.tag,
|
|
171
|
+
original_error=e
|
|
172
|
+
) from e
|
|
173
|
+
except ElementPathError as e:
|
|
174
|
+
# Catch-all for other elementpath errors
|
|
175
|
+
raise XPathEvaluationError(
|
|
176
|
+
f"XPath evaluation failed: {e}",
|
|
177
|
+
xpath=expr,
|
|
178
|
+
base_url=base_url,
|
|
179
|
+
element_tag=curr_elem.tag,
|
|
180
|
+
original_error=e
|
|
181
|
+
) from e
|
|
123
182
|
|
|
124
183
|
next_segments = curr_segments[1:]
|
|
125
184
|
for elem in elems:
|
|
@@ -256,12 +315,37 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
|
|
|
256
315
|
base_url = getattr(curr_elem, 'base_url', None)
|
|
257
316
|
next_segments = right
|
|
258
317
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
318
|
+
try:
|
|
319
|
+
results = elementpath.select(
|
|
320
|
+
curr_elem,
|
|
321
|
+
left.value,
|
|
322
|
+
parser=XPath3Parser,
|
|
323
|
+
item='' if curr_elem is None else None
|
|
324
|
+
)
|
|
325
|
+
except EPSyntaxError as e:
|
|
326
|
+
raise XPathSyntaxError(
|
|
327
|
+
f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
|
|
328
|
+
xpath=left.value,
|
|
329
|
+
base_url=base_url,
|
|
330
|
+
element_tag=getattr(curr_elem, 'tag', None),
|
|
331
|
+
original_error=e
|
|
332
|
+
) from e
|
|
333
|
+
except EPTypeError as e:
|
|
334
|
+
raise XPathTypeError(
|
|
335
|
+
f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
|
|
336
|
+
xpath=left.value,
|
|
337
|
+
base_url=base_url,
|
|
338
|
+
element_tag=getattr(curr_elem, 'tag', None),
|
|
339
|
+
original_error=e
|
|
340
|
+
) from e
|
|
341
|
+
except ElementPathError as e:
|
|
342
|
+
raise XPathEvaluationError(
|
|
343
|
+
f"XPath evaluation failed in binary operation: {e}",
|
|
344
|
+
xpath=left.value,
|
|
345
|
+
base_url=base_url,
|
|
346
|
+
element_tag=getattr(curr_elem, 'tag', None),
|
|
347
|
+
original_error=e
|
|
348
|
+
) from e
|
|
265
349
|
|
|
266
350
|
if isinstance(results, AnyAtomicType):
|
|
267
351
|
results = [results]
|
|
@@ -13,6 +13,7 @@ except ImportError:
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
TOKEN_SPEC = [
|
|
16
|
+
("WXLOOP", r"wx:loop"),
|
|
16
17
|
("NUMBER", r"\d+\.\d+"),
|
|
17
18
|
("INTEGER", r"\d+"),
|
|
18
19
|
("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
|
|
@@ -180,7 +181,7 @@ class Parser:
|
|
|
180
181
|
|
|
181
182
|
def parse_binary(self, min_prec: int) -> object:
|
|
182
183
|
"""Parse a binary expression chain honoring operator precedence."""
|
|
183
|
-
if self.token.type == "WXPATH":
|
|
184
|
+
if self.token.type == "WXPATH" or self.token.type == "WXLOOP":
|
|
184
185
|
left = self.parse_segments()
|
|
185
186
|
else:
|
|
186
187
|
left = self.nud()
|
|
@@ -2,13 +2,14 @@ import asyncio
|
|
|
2
2
|
import contextlib
|
|
3
3
|
import inspect
|
|
4
4
|
from collections import deque
|
|
5
|
-
from typing import Any, AsyncGenerator, Iterator
|
|
5
|
+
from typing import Any, AsyncGenerator, Iterator, Iterable
|
|
6
6
|
|
|
7
7
|
from lxml.html import HtmlElement
|
|
8
8
|
from tqdm import tqdm
|
|
9
9
|
|
|
10
10
|
from wxpath import patches # noqa: F401
|
|
11
11
|
from wxpath.core import parser
|
|
12
|
+
from wxpath.core.exceptions import XPathEvaluationError
|
|
12
13
|
from wxpath.core.models import (
|
|
13
14
|
CrawlIntent,
|
|
14
15
|
CrawlTask,
|
|
@@ -145,6 +146,7 @@ class WXPathEngine(HookedEngineBase):
|
|
|
145
146
|
respect_robots: bool = True,
|
|
146
147
|
allowed_response_codes: set[int] = None,
|
|
147
148
|
allow_redirects: bool = True,
|
|
149
|
+
yield_errors: bool = False,
|
|
148
150
|
):
|
|
149
151
|
# NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
|
|
150
152
|
self.seen_urls: set[str] = set()
|
|
@@ -157,6 +159,7 @@ class WXPathEngine(HookedEngineBase):
|
|
|
157
159
|
self.allow_redirects = allow_redirects
|
|
158
160
|
if allow_redirects:
|
|
159
161
|
self.allowed_response_codes |= {301, 302, 303, 307, 308}
|
|
162
|
+
self.yield_errors = yield_errors
|
|
160
163
|
|
|
161
164
|
def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
|
|
162
165
|
"""Get the maximum crawl depth for a given expression. Will find a Depth
|
|
@@ -182,7 +185,6 @@ class WXPathEngine(HookedEngineBase):
|
|
|
182
185
|
expression: str,
|
|
183
186
|
max_depth: int,
|
|
184
187
|
progress: bool = False,
|
|
185
|
-
yield_errors: bool = False,
|
|
186
188
|
) -> AsyncGenerator[Any, None]:
|
|
187
189
|
"""Execute a wxpath expression concurrently and yield results.
|
|
188
190
|
|
|
@@ -267,7 +269,10 @@ class WXPathEngine(HookedEngineBase):
|
|
|
267
269
|
queue=queue,
|
|
268
270
|
pbar=pbar,
|
|
269
271
|
):
|
|
270
|
-
|
|
272
|
+
if isinstance(output, dict) and output.get("__type__") == "error":
|
|
273
|
+
yield output
|
|
274
|
+
else:
|
|
275
|
+
yield await self.post_extract_hooks(output)
|
|
271
276
|
|
|
272
277
|
# While looping asynchronous generators, you MUST make sure
|
|
273
278
|
# to check terminal conditions before re-iteration.
|
|
@@ -282,7 +287,7 @@ class WXPathEngine(HookedEngineBase):
|
|
|
282
287
|
if task is None:
|
|
283
288
|
log.warning(f"Got unexpected response from {resp.request.url}")
|
|
284
289
|
|
|
285
|
-
if yield_errors:
|
|
290
|
+
if self.yield_errors:
|
|
286
291
|
yield {
|
|
287
292
|
"__type__": "error",
|
|
288
293
|
"url": resp.request.url,
|
|
@@ -298,7 +303,7 @@ class WXPathEngine(HookedEngineBase):
|
|
|
298
303
|
if resp.error:
|
|
299
304
|
log.warning(f"Got error from {resp.request.url}: {resp.error}")
|
|
300
305
|
|
|
301
|
-
if yield_errors:
|
|
306
|
+
if self.yield_errors:
|
|
302
307
|
yield {
|
|
303
308
|
"__type__": "error",
|
|
304
309
|
"url": resp.request.url,
|
|
@@ -315,7 +320,7 @@ class WXPathEngine(HookedEngineBase):
|
|
|
315
320
|
if resp.status not in self.allowed_response_codes or not resp.body:
|
|
316
321
|
log.warning(f"Got non-200 response from {resp.request.url}")
|
|
317
322
|
|
|
318
|
-
if yield_errors:
|
|
323
|
+
if self.yield_errors:
|
|
319
324
|
yield {
|
|
320
325
|
"__type__": "error",
|
|
321
326
|
"url": resp.request.url,
|
|
@@ -413,7 +418,11 @@ class WXPathEngine(HookedEngineBase):
|
|
|
413
418
|
|
|
414
419
|
binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
|
|
415
420
|
operator = get_operator(binary_or_segment)
|
|
416
|
-
|
|
421
|
+
|
|
422
|
+
if self.yield_errors:
|
|
423
|
+
intents = _safe_iterator(operator(elem, bin_or_segs, depth))
|
|
424
|
+
else:
|
|
425
|
+
intents = operator(elem, bin_or_segs, depth)
|
|
417
426
|
|
|
418
427
|
if not intents:
|
|
419
428
|
return
|
|
@@ -449,6 +458,28 @@ class WXPathEngine(HookedEngineBase):
|
|
|
449
458
|
mini_queue.append((elem, next_segments))
|
|
450
459
|
|
|
451
460
|
|
|
461
|
+
def _safe_iterator(iterable: Iterable[Any]) -> Iterator[Any]:
|
|
462
|
+
"""Wrap an iterable in a try/except block and return an iterator that yields the result or the error."""
|
|
463
|
+
it = iter(iterable)
|
|
464
|
+
while True:
|
|
465
|
+
try:
|
|
466
|
+
yield next(it)
|
|
467
|
+
except StopIteration:
|
|
468
|
+
break
|
|
469
|
+
except XPathEvaluationError as e:
|
|
470
|
+
yield {
|
|
471
|
+
"__type__": "error",
|
|
472
|
+
"reason": "xpath_evaluation_error",
|
|
473
|
+
"exception": str(e),
|
|
474
|
+
}
|
|
475
|
+
except Exception as e:
|
|
476
|
+
yield {
|
|
477
|
+
"__type__": "error",
|
|
478
|
+
"reason": "iterator_error",
|
|
479
|
+
"exception": str(e),
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
|
|
452
483
|
def wxpath_async(path_expr: str,
|
|
453
484
|
max_depth: int,
|
|
454
485
|
progress: bool = False,
|
|
@@ -20,6 +20,7 @@ Example:
|
|
|
20
20
|
import asyncio
|
|
21
21
|
import csv
|
|
22
22
|
import json
|
|
23
|
+
import traceback
|
|
23
24
|
from datetime import datetime
|
|
24
25
|
from pathlib import Path
|
|
25
26
|
from typing import Any, Iterable
|
|
@@ -27,7 +28,7 @@ from typing import Any, Iterable
|
|
|
27
28
|
from elementpath.xpath_tokens import XPathMap
|
|
28
29
|
from lxml.html import HtmlElement, tostring
|
|
29
30
|
from rich.console import RenderableType
|
|
30
|
-
from textual import work
|
|
31
|
+
from textual import events, work
|
|
31
32
|
from textual.app import App, ComposeResult
|
|
32
33
|
from textual.containers import Container, Horizontal, Vertical, VerticalScroll
|
|
33
34
|
from textual.reactive import reactive
|
|
@@ -558,6 +559,7 @@ class WXPathTUI(App):
|
|
|
558
559
|
("ctrl+r", "execute", "Execute"),
|
|
559
560
|
("escape", "cancel_crawl", "Cancel Crawl"),
|
|
560
561
|
("ctrl+c", "clear", "Clear"),
|
|
562
|
+
("ctrl+shift+backspace", "clear_editor", "Clear Editor"),
|
|
561
563
|
("ctrl+d", "clear_debug", "Clear Debug"),
|
|
562
564
|
("ctrl+shift+d", "toggle_debug", "Toggle Debug"),
|
|
563
565
|
("ctrl+e", "export", "Export"),
|
|
@@ -629,6 +631,7 @@ class WXPathTUI(App):
|
|
|
629
631
|
" • Press [bold]Escape[/bold] to cancel a running crawl\n"
|
|
630
632
|
" • Press [bold]Ctrl+E[/bold] to export table (CSV/JSON)\n"
|
|
631
633
|
" • Press [bold]Ctrl+C[/bold] to clear output\n"
|
|
634
|
+
" • Press [bold]Ctrl+Shift+Backspace[/bold] to clear expression editor\n"
|
|
632
635
|
" • Press [bold]Ctrl+Shift+D[/bold] to toggle debug panel\n"
|
|
633
636
|
" • Press [bold]Ctrl+H[/bold] to configure HTTP headers\n"
|
|
634
637
|
" • Press [bold]Ctrl+Shift+S[/bold] to edit persistent settings (concurrency, robots)\n" # noqa: E501
|
|
@@ -645,7 +648,7 @@ class WXPathTUI(App):
|
|
|
645
648
|
"""Update global settings and subtitle when cache setting changes."""
|
|
646
649
|
# Update the global settings - this is what the HTTP crawler will read
|
|
647
650
|
SETTINGS.http.client.cache.enabled = bool(new_value)
|
|
648
|
-
|
|
651
|
+
self._debug(f"Cache enabled: {SETTINGS.http.client.cache.enabled}")
|
|
649
652
|
self._update_subtitle()
|
|
650
653
|
|
|
651
654
|
def watch_custom_headers(self, new_value: dict) -> None:
|
|
@@ -658,7 +661,8 @@ class WXPathTUI(App):
|
|
|
658
661
|
|
|
659
662
|
def _update_subtitle(self) -> None:
|
|
660
663
|
"""Update subtitle with current cache, headers, and persistent settings."""
|
|
661
|
-
cache_state = "ON" if self.cache_enabled else "OFF"
|
|
664
|
+
# cache_state = "ON" if self.cache_enabled else "OFF"
|
|
665
|
+
cache_state = SETTINGS.http.client.cache.enabled
|
|
662
666
|
headers_count = len(self.custom_headers)
|
|
663
667
|
headers_info = f"{headers_count} custom" if headers_count > 0 else "default"
|
|
664
668
|
conc = self.tui_settings.get("concurrency", 16)
|
|
@@ -940,7 +944,10 @@ class WXPathTUI(App):
|
|
|
940
944
|
columns_initialized = False
|
|
941
945
|
column_keys: list[str] = []
|
|
942
946
|
|
|
943
|
-
async for result in engine.run(expression, max_depth=1, progress=False):
|
|
947
|
+
async for result in engine.run(expression, max_depth=1, progress=False, yield_errors=True):
|
|
948
|
+
if isinstance(result, dict) and result.get("__type__") == "error":
|
|
949
|
+
self._debug(f"Error: {result.get('reason')}: {result}")
|
|
950
|
+
continue
|
|
944
951
|
count += 1
|
|
945
952
|
if count % 100 == 0:
|
|
946
953
|
self._debug(f"Received result {count} of type {type(result).__name__}")
|
|
@@ -990,8 +997,16 @@ class WXPathTUI(App):
|
|
|
990
997
|
self._executing = False
|
|
991
998
|
return
|
|
992
999
|
except Exception as e:
|
|
993
|
-
#
|
|
994
|
-
self.
|
|
1000
|
+
# Log full stack trace to debug panel
|
|
1001
|
+
self._debug(traceback.format_exc())
|
|
1002
|
+
# Append error as next row of table (do not clear output panel)
|
|
1003
|
+
err_msg = f"Execution Error: {type(e).__name__}: {e}"
|
|
1004
|
+
if columns_initialized and column_keys:
|
|
1005
|
+
row = [err_msg] + [""] * (len(column_keys) - 1)
|
|
1006
|
+
data_table.add_row(*row, key=f"error-{count}")
|
|
1007
|
+
else:
|
|
1008
|
+
data_table.add_column("error", key="error")
|
|
1009
|
+
data_table.add_row(err_msg, key="error-0")
|
|
995
1010
|
self._executing = False
|
|
996
1011
|
return
|
|
997
1012
|
finally:
|
|
@@ -1086,7 +1101,13 @@ class WXPathTUI(App):
|
|
|
1086
1101
|
"""Clear the output panel."""
|
|
1087
1102
|
self._update_output("Waiting for expression...")
|
|
1088
1103
|
self._debug("Cleared output panel.")
|
|
1089
|
-
|
|
1104
|
+
|
|
1105
|
+
def action_clear_editor(self) -> None:
    """Remove all text from the expression editor and note it in the debug panel."""
    # Look the widget up by id and clear it in one step.
    self.query_one("#expression-editor").clear()
    self._debug("Expression editor cleared.")
|
|
1110
|
+
|
|
1090
1111
|
def _update_output(self, content: str | RenderableType) -> None:
|
|
1091
1112
|
"""Update the output panel with new content."""
|
|
1092
1113
|
# output_panel = self.query_one("#output-panel", OutputPanel)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wxpath
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: wxpath - a declarative web crawler and data extractor
|
|
5
5
|
Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,7 +14,7 @@ Description-Content-Type: text/markdown
|
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
Requires-Dist: lxml>=4.0
|
|
16
16
|
Requires-Dist: elementpath<=5.0.3,>=5.0.0
|
|
17
|
-
Requires-Dist: aiohttp<=
|
|
17
|
+
Requires-Dist: aiohttp<=4.0.0,>=3.8.0
|
|
18
18
|
Requires-Dist: tqdm>=4.0.0
|
|
19
19
|
Provides-Extra: cache
|
|
20
20
|
Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
|
|
@@ -35,6 +35,7 @@ Requires-Dist: pytest>=7.0; extra == "test"
|
|
|
35
35
|
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
|
|
36
36
|
Provides-Extra: dev
|
|
37
37
|
Requires-Dist: ruff; extra == "dev"
|
|
38
|
+
Requires-Dist: tox; extra == "dev"
|
|
38
39
|
Provides-Extra: docs
|
|
39
40
|
Requires-Dist: mkdocs>=1.5; extra == "docs"
|
|
40
41
|
Requires-Dist: mkdocs-material>=9.0; extra == "docs"
|
|
@@ -54,7 +55,7 @@ Dynamic: license-file
|
|
|
54
55
|
[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
|
|
55
56
|
|
|
56
57
|
|
|
57
|
-
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart
|
|
58
|
+
> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
|
|
58
59
|
|
|
59
60
|

|
|
60
61
|
|
|
@@ -65,7 +66,7 @@ Requires Python 3.10+.
|
|
|
65
66
|
```
|
|
66
67
|
pip install wxpath
|
|
67
68
|
# For TUI support
|
|
68
|
-
pip install wxpath[tui]
|
|
69
|
+
pip install "wxpath[tui]"
|
|
69
70
|
```
|
|
70
71
|
---
|
|
71
72
|
|
|
@@ -356,7 +357,7 @@ Command line options:
|
|
|
356
357
|
|
|
357
358
|
**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
|
|
358
359
|
|
|
359
|
-
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart
|
|
360
|
+
See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
|
|
360
361
|
|
|
361
362
|
## Persistence and Caching
|
|
362
363
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wxpath-0.5.0 → wxpath-0.5.1}/src/wxpath/integrations/langchain/examples/rolling_window_rag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|