wxpath 0.4.1__tar.gz → 0.5.1__tar.gz
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- {wxpath-0.4.1 → wxpath-0.5.1}/PKG-INFO +73 -9
- wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.1/README.md +46 -34
- {wxpath-0.4.1 → wxpath-0.5.1}/pyproject.toml +20 -3
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/__init__.py +2 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/cli.py +6 -0
- wxpath-0.5.1/src/wxpath/core/exceptions.py +53 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/models.py +1 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/ops.py +100 -19
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/parser.py +94 -24
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/engine.py +74 -10
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/helpers.py +6 -3
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/__init__.py +1 -1
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/crawler.py +17 -5
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/response.py +7 -1
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/retry.py +2 -2
- wxpath-0.5.1/src/wxpath/integrations/langchain/__init__.py +0 -0
- wxpath-0.5.1/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath-0.5.1/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath-0.5.1/src/wxpath/integrations/langchain/loader.py +60 -0
- wxpath-0.5.1/src/wxpath/patches.py +273 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/settings.py +3 -1
- wxpath-0.5.1/src/wxpath/tui.py +1225 -0
- wxpath-0.5.1/src/wxpath/tui_settings.py +151 -0
- wxpath-0.5.1/src/wxpath/util/__init__.py +0 -0
- wxpath-0.5.1/src/wxpath/util/cleaners.py +31 -0
- wxpath-0.5.1/src/wxpath/util/common_paths.py +22 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/util/logging.py +3 -7
- wxpath-0.4.1/README.md → wxpath-0.5.1/src/wxpath.egg-info/PKG-INFO +97 -7
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/SOURCES.txt +10 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/entry_points.txt +1 -0
- wxpath-0.5.1/src/wxpath.egg-info/requires.txt +44 -0
- wxpath-0.4.1/src/wxpath/patches.py +0 -63
- wxpath-0.4.1/src/wxpath.egg-info/requires.txt +0 -20
- {wxpath-0.4.1 → wxpath-0.5.1}/LICENSE +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/setup.cfg +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/cache.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/request.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/stats.py +0 -0
- {wxpath-0.4.1/src/wxpath/util → wxpath-0.5.1/src/wxpath/integrations}/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/top_level.txt +0 -0
{wxpath-0.4.1 → wxpath-0.5.1}/PKG-INFO

@@ -1,15 +1,20 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.4.1
+Version: 0.5.1
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
+Project-URL: Homepage, https://rodricios.github.io/wxpath
+Project-URL: Documentation, https://rodricios.github.io/wxpath
+Project-URL: Repository, https://github.com/rodricios/wxpath
+Project-URL: Issues, https://github.com/rodricios/wxpath/issues
+Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
-Requires-Dist: aiohttp<=
+Requires-Dist: aiohttp<=4.0.0,>=3.8.0
 Requires-Dist: tqdm>=4.0.0
 Provides-Extra: cache
 Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
@@ -17,16 +22,56 @@ Provides-Extra: cache-sqlite
 Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
 Provides-Extra: cache-redis
 Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
+Provides-Extra: llm
+Requires-Dist: langchain>=1.0.0; extra == "llm"
+Requires-Dist: langchain-core>=1.0.0; extra == "llm"
+Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
+Requires-Dist: langchain-community>=0.4.0; extra == "llm"
+Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
+Requires-Dist: chromadb>=1.0.0; extra == "llm"
+Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
 Provides-Extra: dev
 Requires-Dist: ruff; extra == "dev"
+Requires-Dist: tox; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
+Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
+Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
+Requires-Dist: mkdocs-glightbox; extra == "docs"
+Requires-Dist: pyyaml>=6.0; extra == "docs"
+Provides-Extra: tui
+Requires-Dist: textual>=1.0.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
 Dynamic: license-file
 
-# **wxpath** - declarative web crawling with XPath
+# **wxpath** - declarative web graph traversal with XPath
 
-[Python 3.10+ badge](https://www.python.org/downloads/release/python-3100/)
+[Python 3.10+ badge](https://www.python.org/downloads/release/python-3100/) [docs badge](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+[TUI screenshot]
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install "wxpath[tui]"
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +80,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
 ```python
 import wxpath
 
-expr = "url('https://
+expr = "url('https://quotes.toscrape.com')//a/@href"
 
 for link in wxpath.wxpath_async_blocking_iter(expr):
     print(link)
 ```
 
 
-By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
 ```python
 import wxpath
@@ -62,15 +107,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
 Most web scrapers force you to write crawl control flow first, and extraction second.
 
-**wxpath**
+**wxpath** converges those two steps into one:
 - **You describe traversal declaratively**
 - **Extraction is expressed inline**
 - **The engine handles scheduling, concurrency, and deduplication**
 
 
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+### Deterministic
+
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -80,6 +138,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +353,17 @@ Command line options:
     --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
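The hunk above ends at "install the appropriate optional dependency" without showing the commands; given the `cache-sqlite` and `cache-redis` extras declared in this PKG-INFO, the installs would presumably be (a sketch using only the extra names shown above):

```
pip install "wxpath[cache-sqlite]"   # SQLite backend: small crawls, single worker
pip install "wxpath[cache-redis]"    # Redis backend: large crawls, multiple workers
```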
wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.1/README.md

@@ -1,32 +1,25 @@
-Metadata-Version: 2.4
-Name: wxpath
-Version: 0.4.1
-Summary: wxpath - a declarative web crawler and data extractor
-Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
-License-Expression: MIT
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath<=5.0.3,>=5.0.0
-Requires-Dist: aiohttp<=
-Requires-Dist: tqdm>=4.0.0
-Provides-Extra: cache
-Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
-Provides-Extra: cache-sqlite
-Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
-Provides-Extra: cache-redis
-Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
-Provides-Extra: test
-Requires-Dist: pytest>=7.0; extra == "test"
-Requires-Dist: pytest-asyncio>=0.23; extra == "test"
-Provides-Extra: dev
-Requires-Dist: ruff; extra == "dev"
-Dynamic: license-file
-
-# **wxpath** - declarative web crawling with XPath
-
-[Python 3.10+ badge](https://www.python.org/downloads/release/python-3100/)
+# **wxpath** - declarative web graph traversal with XPath
+
+[Python 3.10+ badge](https://www.python.org/downloads/release/python-3100/) [docs badge](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+[TUI screenshot]
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install "wxpath[tui]"
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +28,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
 ```python
 import wxpath
 
-expr = "url('https://
+expr = "url('https://quotes.toscrape.com')//a/@href"
 
 for link in wxpath.wxpath_async_blocking_iter(expr):
     print(link)
 ```
 
 
-By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
 ```python
 import wxpath
@@ -62,15 +55,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
 Most web scrapers force you to write crawl control flow first, and extraction second.
 
-**wxpath**
+**wxpath** converges those two steps into one:
 - **You describe traversal declaratively**
 - **Extraction is expressed inline**
 - **The engine handles scheduling, concurrency, and deduplication**
 
 
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+### Deterministic
+
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -80,6 +86,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +301,17 @@ Command line options:
     --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
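The README examples above use the blocking iterator only, while the package also exports `wxpath_async` (see the `src/wxpath/__init__.py` diff below). A minimal sketch, assuming `wxpath_async` is the async counterpart and yields results as an async iterator — its signature is not shown in this diff:

```python
import asyncio

import wxpath

async def main():
    # Assumption: wxpath_async mirrors wxpath_async_blocking_iter and
    # streams results as they are discovered.
    async for link in wxpath.wxpath_async("url('https://quotes.toscrape.com')//a/@href"):
        print(link)

asyncio.run(main())
```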
{wxpath-0.4.1 → wxpath-0.5.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wxpath"
-version = "0.4.1"
+version = "0.5.1"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -13,23 +13,40 @@ authors = [
 ]
 license = "MIT"
 license-files = ["LICENSE"]
+
 dependencies = [
     "lxml>=4.0",
     "elementpath>=5.0.0,<=5.0.3",
-    "aiohttp>=3.8.0,<=
+    "aiohttp>=3.8.0,<=4.0.0",
     "tqdm>=4.0.0"
 ]
 
+[project.urls]
+Homepage = "https://rodricios.github.io/wxpath"
+Documentation = "https://rodricios.github.io/wxpath"
+Repository = "https://github.com/rodricios/wxpath"
+Issues = "https://github.com/rodricios/wxpath/issues"
+Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
+
+
 [project.optional-dependencies]
 cache = ["aiohttp-client-cache>=0.14.0"]
 cache-sqlite = ["aiohttp-client-cache[sqlite]"]
 cache-redis = ["aiohttp-client-cache[redis]"]
 
+# langchain langchain-ollama langchain-chroma chromadb
+llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
+       "langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
+       "langchain-text-splitters>=1.1.0"]
+
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
-dev = ["ruff"]
+dev = ["ruff", "tox"]
+docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
+tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
 
 [project.scripts]
 wxpath = "wxpath.cli:main"
+wxpath-tui = "wxpath.tui:main"
 
 [tool.pytest.ini_options]
 minversion = "6.0"
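For reference, the new optional-dependency groups and the second console script declared above translate to invocations like these (extra and script names are taken verbatim from the diff; everything else is standard pip):

```
pip install "wxpath[llm]"    # LangChain/Chroma integration dependencies
pip install "wxpath[docs]"   # MkDocs documentation toolchain
pip install "wxpath[tui]"    # Textual TUI plus the sqlite-backed response cache
wxpath-tui                   # new entry point mapped to wxpath.tui:main
```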
{wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/__init__.py

@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
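A small sketch of what the re-export changes for callers (only names visible in this diff are used; the attributes of `settings` itself are not shown here):

```python
import wxpath

# `from . import settings` plus the new __all__ entry make the settings
# module reachable from the package root.
print(wxpath.settings)
```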
{wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/cli.py

@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
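The new flag feeds the crawler via `verify_ssl=not args.insecure`, so a run against a host with a broken certificate chain would look roughly like this (the positional expression argument is an assumption about the existing CLI, which this hunk does not show):

```
wxpath --insecure "url('https://example.com')//a/@href"
```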
wxpath-0.5.1/src/wxpath/core/exceptions.py (new file)

@@ -0,0 +1,53 @@
+class XPathEvaluationError(Exception):
+    """Errors during XPath evaluation with elementpath."""
+
+    def __init__(
+        self,
+        message: str,
+        xpath: str,
+        base_url: str | None = None,
+        element_tag: str | None = None,
+        error_code: str | None = None,  # XPath error codes like XPST0003
+        position: tuple[int, int] | None = None,  # (line, column)
+        original_error: Exception | None = None
+    ):
+        context = {
+            "xpath": xpath,
+            "base_url": base_url,
+            "element_tag": element_tag,
+            "error_code": error_code,
+            "position": position,
+        }
+        if original_error:
+            context["original_error"] = str(original_error)
+            # Extract XPath error code if present (e.g., [err:XPST0003])
+            if hasattr(original_error, 'code'):
+                context["error_code"] = original_error.code
+
+        super().__init__(message, context)
+
+    def to_dict(self) -> dict:
+        return {
+            "message": self.message,
+            "xpath": self.xpath,
+            "base_url": self.base_url,
+            "element_tag": self.element_tag,
+            "error_code": self.error_code,
+            "position": self.position,
+            "original_error": self.original_error,
+        }
+
+
+class XPathSyntaxError(XPathEvaluationError):
+    """Invalid XPath syntax."""
+    pass
+
+
+class XPathTypeError(XPathEvaluationError):
+    """Type error in XPath expression."""
+    pass
+
+
+class XPathRuntimeError(XPathEvaluationError):
+    """Runtime error during XPath evaluation."""
+    pass
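A minimal sketch of consuming the new hierarchy from calling code — the module path and class names come from the file above, while the assumption that the public iterator propagates these exceptions is mine:

```python
import wxpath
from wxpath.core.exceptions import XPathEvaluationError, XPathSyntaxError

expr = "url('https://quotes.toscrape.com')//a/@href["  # deliberately malformed

try:
    for link in wxpath.wxpath_async_blocking_iter(expr):
        print(link)
except XPathSyntaxError as e:
    # Most specific subclass first; the context dict (xpath, base_url,
    # element_tag, ...) is carried in the args passed to Exception.__init__.
    print("bad expression:", e)
except XPathEvaluationError as e:
    print("evaluation failed:", e)
```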
{wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/ops.py

@@ -2,11 +2,25 @@ from typing import Callable, Iterable
 from urllib.parse import urljoin
 
 import elementpath
+from elementpath import (
+    ElementPathError,
+    ElementPathSyntaxError as EPSyntaxError,
+    ElementPathTypeError as EPTypeError,
+    ElementPathZeroDivisionError,
+    ElementPathRuntimeError as EPRuntimeError,
+    MissingContextError,
+)
 from elementpath.datatypes import AnyAtomicType
 from elementpath.xpath3 import XPath3Parser
 from lxml import html
 
 from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+from wxpath.core.exceptions import (
+    XPathEvaluationError,
+    XPathSyntaxError,
+    XPathTypeError,
+    XPathRuntimeError,
+)
 from wxpath.core.models import (
     CrawlIntent,
     DataIntent,
@@ -19,6 +33,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +93,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +105,12 @@ _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
 
         yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,17 +133,52 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-
-
-
-
-
-
-
-
-
-
+
+    try:
+        elems = curr_elem.xpath3(expr)
+    except EPSyntaxError as e:
+        # Parse the error message to extract line/column if available
+        # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
+        raise XPathSyntaxError(
+            f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error: {str(e).split(': ', 1)[-1]}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathZeroDivisionError as e:
+        raise XPathRuntimeError(
+            f"Division by zero in XPath: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except MissingContextError as e:
+        raise XPathRuntimeError(
+            f"XPath requires context but none provided: {expr}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        # Catch-all for other elementpath errors
+        raise XPathEvaluationError(
+            f"XPath evaluation failed: {e}",
+            xpath=expr,
+            base_url=base_url,
+            element_tag=curr_elem.tag,
+            original_error=e
+        ) from e
 
     next_segments = curr_segments[1:]
     for elem in elems:
@@ -259,12 +315,37 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
     base_url = getattr(curr_elem, 'base_url', None)
     next_segments = right
 
-
-
-
-
-
-
+    try:
+        results = elementpath.select(
+            curr_elem,
+            left.value,
+            parser=XPath3Parser,
+            item='' if curr_elem is None else None
+        )
+    except EPSyntaxError as e:
+        raise XPathSyntaxError(
+            f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except EPTypeError as e:
+        raise XPathTypeError(
+            f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
+    except ElementPathError as e:
+        raise XPathEvaluationError(
+            f"XPath evaluation failed in binary operation: {e}",
+            xpath=left.value,
+            base_url=base_url,
+            element_tag=getattr(curr_elem, 'tag', None),
+            original_error=e
+        ) from e
 
     if isinstance(results, AnyAtomicType):
         results = [results]