wxpath 0.4.1.tar.gz → 0.5.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.4.1 → wxpath-0.5.0}/PKG-INFO +71 -8
- wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.0/README.md +46 -34
- {wxpath-0.4.1 → wxpath-0.5.0}/pyproject.toml +18 -1
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/__init__.py +2 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/cli.py +6 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/models.py +1 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/ops.py +9 -12
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/parser.py +92 -23
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/engine.py +36 -3
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/helpers.py +6 -3
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/__init__.py +1 -1
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/crawler.py +17 -5
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/response.py +7 -1
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/retry.py +2 -2
- wxpath-0.5.0/src/wxpath/integrations/langchain/__init__.py +0 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/loader.py +60 -0
- wxpath-0.5.0/src/wxpath/patches.py +273 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/settings.py +3 -1
- wxpath-0.5.0/src/wxpath/tui.py +1204 -0
- wxpath-0.5.0/src/wxpath/tui_settings.py +151 -0
- wxpath-0.5.0/src/wxpath/util/__init__.py +0 -0
- wxpath-0.5.0/src/wxpath/util/cleaners.py +31 -0
- wxpath-0.5.0/src/wxpath/util/common_paths.py +22 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/util/logging.py +3 -7
- wxpath-0.4.1/README.md → wxpath-0.5.0/src/wxpath.egg-info/PKG-INFO +96 -7
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/SOURCES.txt +9 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/entry_points.txt +1 -0
- wxpath-0.5.0/src/wxpath.egg-info/requires.txt +43 -0
- wxpath-0.4.1/src/wxpath/patches.py +0 -63
- wxpath-0.4.1/src/wxpath.egg-info/requires.txt +0 -20
- {wxpath-0.4.1 → wxpath-0.5.0}/LICENSE +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/setup.cfg +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/cache.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/request.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/stats.py +0 -0
- {wxpath-0.4.1/src/wxpath/util → wxpath-0.5.0/src/wxpath/integrations}/__init__.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/top_level.txt +0 -0
````diff
--- wxpath-0.4.1/PKG-INFO
+++ wxpath-0.5.0/PKG-INFO
@@ -1,9 +1,14 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.4.1
+Version: 0.5.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
+Project-URL: Homepage, https://rodricios.github.io/wxpath
+Project-URL: Documentation, https://rodricios.github.io/wxpath
+Project-URL: Repository, https://github.com/rodricios/wxpath
+Project-URL: Issues, https://github.com/rodricios/wxpath/issues
+Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -17,16 +22,55 @@ Provides-Extra: cache-sqlite
 Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
 Provides-Extra: cache-redis
 Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
+Provides-Extra: llm
+Requires-Dist: langchain>=1.0.0; extra == "llm"
+Requires-Dist: langchain-core>=1.0.0; extra == "llm"
+Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
+Requires-Dist: langchain-community>=0.4.0; extra == "llm"
+Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
+Requires-Dist: chromadb>=1.0.0; extra == "llm"
+Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
 Provides-Extra: dev
 Requires-Dist: ruff; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
+Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
+Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
+Requires-Dist: mkdocs-glightbox; extra == "docs"
+Requires-Dist: pyyaml>=6.0; extra == "docs"
+Provides-Extra: tui
+Requires-Dist: textual>=1.0.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
 Dynamic: license-file
 
-# **wxpath** - declarative web crawling with XPath
+# **wxpath** - declarative web graph traversal with XPath
 
-[](https://www.python.org/downloads/release/python-3100/)
+[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install wxpath[tui]
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +79,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
 ```python
 import wxpath
 
-expr = "url('https://
+expr = "url('https://quotes.toscrape.com')//a/@href"
 
 for link in wxpath.wxpath_async_blocking_iter(expr):
     print(link)
 ```
 
 
-By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
 ```python
 import wxpath
@@ -62,15 +106,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
 Most web scrapers force you to write crawl control flow first, and extraction second.
 
-**wxpath**
+**wxpath** converges those two steps into one:
 - **You describe traversal declaratively**
 - **Extraction is expressed inline**
 - **The engine handles scheduling, concurrency, and deduplication**
 
 
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+### Deterministic
+
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -80,6 +137,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +352,17 @@ Command line options:
   --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
````
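As a concrete illustration of the `follow=` syntax and streaming iteration described in the README above, here is a minimal sketch; the pagination and quote XPaths are illustrative assumptions, not taken from the diff:

```python
import wxpath

# Sketch based on the README snippets in the diff above: follow= names the
# links to enqueue, and max_depth bounds the recursion, as in the README's
# `wxpath_async_blocking_iter(path_expr, max_depth=1)` example.
# The pagination and extraction XPaths below are illustrative assumptions.
path_expr = (
    "url('https://quotes.toscrape.com', follow=//li[@class='next']/a/@href)"
    "//div[@class='quote']/span[@class='text']/text()"
)

for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```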
````diff
--- wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO
+++ wxpath-0.5.0/README.md
@@ -1,32 +1,25 @@
-Metadata-Version: 2.4
-Name: wxpath
-Version: 0.4.1
-Summary: wxpath - a declarative web crawler and data extractor
-Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
-License-Expression: MIT
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath>=5.0.0,<=5.0.3
-Requires
-Requires-Dist: tqdm>=4.0.0
-Provides-Extra: cache
-Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
-Provides-Extra: cache-sqlite
-Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
-Provides-Extra: cache-redis
-Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
-Provides-Extra: test
-Requires-Dist: pytest>=7.0; extra == "test"
-Requires-Dist: pytest-asyncio>=0.23; extra == "test"
-Provides-Extra: dev
-Requires-Dist: ruff; extra == "dev"
-Dynamic: license-file
-
-# **wxpath** - declarative web crawling with XPath
-
-[](https://www.python.org/downloads/release/python-3100/)
+# **wxpath** - declarative web graph traversal with XPath
+
+[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install wxpath[tui]
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +28,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
 ```python
 import wxpath
 
-expr = "url('https://
+expr = "url('https://quotes.toscrape.com')//a/@href"
 
 for link in wxpath.wxpath_async_blocking_iter(expr):
     print(link)
 ```
 
 
-By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
 ```python
 import wxpath
@@ -62,15 +55,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
 Most web scrapers force you to write crawl control flow first, and extraction second.
 
-**wxpath**
+**wxpath** converges those two steps into one:
 - **You describe traversal declaratively**
 - **Extraction is expressed inline**
 - **The engine handles scheduling, concurrency, and deduplication**
 
 
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+### Deterministic
+
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -80,6 +86,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +301,17 @@ Command line options:
   --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
````
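The README above also advertises RAG-ready output via the new LangChain integration (`src/wxpath/integrations/langchain/loader.py` in the file list). That loader's API is not shown in this diff, so the glue below is hypothetical, using only the public `langchain_core` Document type that the `llm` extra pulls in:

```python
import wxpath
from langchain_core.documents import Document  # provided via the new llm extra

# Hypothetical glue, not the package's loader.py (its API is not shown in this
# diff): wrap streamed wxpath results as LangChain Documents so a downstream
# RAG pipeline can chunk and embed them.
expr = "url('https://quotes.toscrape.com')//div[@class='quote']/span[@class='text']/text()"

docs = [
    Document(page_content=str(item), metadata={"source": "quotes.toscrape.com"})
    for item in wxpath.wxpath_async_blocking_iter(expr)
]
print(f"loaded {len(docs)} documents")
```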
```diff
--- wxpath-0.4.1/pyproject.toml
+++ wxpath-0.5.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wxpath"
-version = "0.4.1"
+version = "0.5.0"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -13,6 +13,7 @@ authors = [
 ]
 license = "MIT"
 license-files = ["LICENSE"]
+
 dependencies = [
     "lxml>=4.0",
     "elementpath>=5.0.0,<=5.0.3",
@@ -20,16 +21,32 @@ dependencies = [
     "tqdm>=4.0.0"
 ]
 
+[project.urls]
+Homepage = "https://rodricios.github.io/wxpath"
+Documentation = "https://rodricios.github.io/wxpath"
+Repository = "https://github.com/rodricios/wxpath"
+Issues = "https://github.com/rodricios/wxpath/issues"
+Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
+
+
 [project.optional-dependencies]
 cache = ["aiohttp-client-cache>=0.14.0"]
 cache-sqlite = ["aiohttp-client-cache[sqlite]"]
 cache-redis = ["aiohttp-client-cache[redis]"]
 
+# langchain langchain-ollama langchain-chroma chromadb
+llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
+       "langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
+       "langchain-text-splitters>=1.1.0"]
+
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
 dev = ["ruff"]
+docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
+tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
 
 [project.scripts]
 wxpath = "wxpath.cli:main"
+wxpath-tui = "wxpath.tui:main"
 
 [tool.pytest.ini_options]
 minversion = "6.0"
```
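The new `[project.scripts]` entry wires a second console command to the TUI: `wxpath-tui` resolves to `wxpath.tui:main`, so with the `tui` extra installed the command is equivalent to this sketch:

```python
# Equivalent of invoking `wxpath-tui` from a shell, per the [project.scripts]
# entry above; assumes the tui extra is installed (pip install "wxpath[tui]").
from wxpath.tui import main

main()
```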
```diff
--- wxpath-0.4.1/src/wxpath/__init__.py
+++ wxpath-0.5.0/src/wxpath/__init__.py
@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
```
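A small but user-visible effect of the `__init__.py` change: after a bare `import wxpath`, the `settings` module is guaranteed to be present as an attribute of the package:

```python
import wxpath

# The explicit re-export above guarantees the settings module is populated on
# the package object after a bare `import wxpath`:
print(wxpath.settings)
```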
```diff
--- wxpath-0.4.1/src/wxpath/cli.py
+++ wxpath-0.5.0/src/wxpath/cli.py
@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
```
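For scripted use, the new flag maps onto a `verify_ssl` keyword at crawler construction time. A sketch follows; only `verify_ssl=not args.insecure` and `WXPathEngine(crawler=crawler)` are confirmed by the diff, while the `Crawler` class name and both import paths are assumptions inferred from the file list:

```python
# Programmatic counterpart of the new --insecure flag, sketched from the
# cli.py diff above. The class name `Crawler` and both import paths are
# assumptions, not confirmed by this diff.
from wxpath.http.client.crawler import Crawler
from wxpath.core.runtime.engine import WXPathEngine

crawler = Crawler(
    respect_robots=True,
    verify_ssl=False,  # what --insecure toggles: skip certificate verification
)
engine = WXPathEngine(crawler=crawler)
```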
```diff
--- wxpath-0.4.1/src/wxpath/core/ops.py
+++ wxpath-0.5.0/src/wxpath/core/ops.py
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +79,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
 
     yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
     elems = curr_elem.xpath3(expr)
 
     next_segments = curr_segments[1:]
```
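The added `@register` stacks extend url() operator dispatch to the argument shapes the parser can now emit. Spelled out as a sketch (expression forms are from the parser docstring later in this diff; registering both keyword orders lets either `follow=..., depth=...` or `depth=..., follow=...` resolve to the same handler):

```python
# url() argument shapes now dispatchable to _handle_url_str_lit, with the
# expression forms that produce them (forms taken from the parser docstring
# later in this diff; shapes mirror the @register calls above):
#
#   url('...')                            -> (String,)
#   url('...', depth=2)                   -> (String, Depth)
#   url('...', follow=//a/@href)          -> (String, Xpath)
#   url('...', follow=//a/@href, depth=2) -> (String, Xpath, Depth)
#   url('...', depth=2, follow=//a/@href) -> (String, Depth, Xpath)
```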
```diff
--- wxpath-0.4.1/src/wxpath/core/parser.py
+++ wxpath-0.5.0/src/wxpath/core/parser.py
@@ -13,7 +13,8 @@ except ImportError:
 
 
 TOKEN_SPEC = [
-    ("NUMBER", r"\d
+    ("NUMBER", r"\d+\.\d+"),
+    ("INTEGER", r"\d+"),
     ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
     ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
     # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
     ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
     # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
     ("FOLLOW", r",?\s{,}follow="),
+    ("DEPTH", r",?\s{,}depth="),
     ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
     ("LPAREN", r"\("),
     ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
 class Number:
     value: float
 
+@dataclass
+class Integer:
+    value: int
+
+@dataclass
+class Depth(Integer):
+    pass
+
 @dataclass
 class String:
     value: str
@@ -273,6 +283,10 @@ class Parser:
         if tok.type == "NUMBER":
             self.advance()
             return Number(float(tok.value))
+
+        if tok.type == "INTEGER":
+            self.advance()
+            return Integer(int(tok.value))
 
         if tok.type == "STRING":
             self.advance()
@@ -358,18 +372,18 @@
         self.advance()
 
         return result
-
 
     def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
         """Capture content inside a url() call, handling nested wxpath expressions.
 
         Supports patterns like::
 
-            url('...')
-            url('...' follow=//a/@href)
-            url(
-            url(
-            url( url(
+            url('...') -> [String]
+            url('...' follow=//a/@href) -> [String, Xpath]
+            url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+            url(//a/@href depth=2) -> [Xpath, Integer]
+            url( url('..')//a/@href ) -> [Call, Xpath]
+            url( url( url('..')//a )//b ) -> [Call, Xpath]
 
         Returns:
             A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@
         paren_balance = 1 # We're already inside the opening paren of url()
         brace_balance = 0 # Track braces for map constructors
         reached_follow_token = False
+        reached_depth_token = False
         follow_xpath = ""
+        depth_number = ""
+
         while paren_balance > 0 and self.token.type != "EOF":
             if self.token.type == "WXPATH":
                 # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@
 
             elif self.token.type == "FOLLOW":
                 reached_follow_token = True
+                reached_depth_token = False
+                self.advance()
+
+            elif self.token.type == "DEPTH":
+                reached_depth_token = True
+                reached_follow_token = False
                 self.advance()
 
             elif self.token.type == "LPAREN":
                 # Opening paren that's NOT part of a url() call
                 # (it's part of an xpath function like contains(), starts-with(), etc.)
                 paren_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@
                 if paren_balance == 0:
                     # This is the closing paren of the outer url()
                     break
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "LBRACE":
                 # Opening brace for map constructors
                 brace_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RBRACE":
                 brace_balance -= 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             else:
                 # Accumulate all other tokens as xpath content
-                if
-                current_xpath += self.token.value
-                else:
+                if reached_follow_token:
                     follow_xpath += self.token.value
+                elif reached_depth_token:
+                    depth_number += self.token.value
+                else:
+                    current_xpath += self.token.value
 
                 self.advance()
 
@@ -447,6 +484,9 @@
         if follow_xpath.strip():
             elements.append(Xpath(follow_xpath.strip()))
 
+        if depth_number.strip():
+            elements.append(Depth(int(depth_number.strip())))
+
         return elements
 
     def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@
             self.advance()
             # Handle follow=...
             if self.token.type == "FOLLOW":
-                self.advance()
                 follow_arg = self.capture_url_arg_content()
                 args.extend(follow_arg)
+                if self.token.type == "DEPTH":
+                    depth_arg = self.capture_url_arg_content()
+                    args.extend(depth_arg)
             elif self.token.type == "WXPATH":
                 # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
-                #
-                args = self.capture_url_arg_content()
+                # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+                # args = self.capture_url_arg_content()
+                args = self.nud()
             else:
                 # Simple xpath argument: url(//a/@href)
                 # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@
 
         return _specify_call_types(func_name, args)
 
-
 def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+    """
+    Specify the type of a call based on the function name and arguments.
+    TODO: Provide example wxpath expressions for each call type.
+
+    Args:
+        func_name: The name of the function.
+        args: The arguments of the function.
+
+    Returns:
+        Call | Segments: The type of the call.
+    """
     if func_name == "url":
         if len(args) == 1:
             if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
             else:
                 raise ValueError(f"Unknown argument type: {type(args[0])}")
         elif len(args) == 2:
-
+            arg0, arg1 = args
+            if isinstance(arg0, String) and isinstance(arg1, Xpath):
+                # Example: url('...', follow=//a/@href)
                 return UrlCrawl(func_name, args)
-            elif isinstance(
+            elif isinstance(arg0, String) and isinstance(arg1, Integer):
+                # Example: url('...', depth=2)
+                return UrlLiteral(func_name, args)
+            elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
                 args.append(UrlQuery('url', [ContextItem()]))
                 return Segments(args)
-            elif isinstance(
-                segs =
-                segs.append(
+            elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+                segs = arg0
+                segs.append(arg1)
                 return Segments(segs)
             else:
                 raise ValueError(f"Unknown arguments: {args}")
+        elif len(args) == 3:
+            arg0, arg1, arg2 = args
+            if (isinstance(arg0, String) and (
+                (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+                (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+            )):
+                # Example: url('...', follow=//a/@href, depth=2)
+                # Example: url('...', depth=2, follow=//a/@href)
+                return UrlCrawl(func_name, args)
+            else:
+                raise ValueError(f"Unknown arguments: {args}")
         else:
             raise ValueError(f"Unknown arguments: {args}")
     elif func_name == "/url" or func_name == "//url":
```
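Taken together, the tokenizer, AST, and call-typing changes make `depth=` part of the expression language itself. A usage sketch, assuming the runtime honors the parsed depth (only the parsing changes are shown in this diff):

```python
import wxpath

# depth= is tokenized (DEPTH), carried as a Depth node, and typed by
# _specify_call_types (see the diff above). How the runtime bounds the crawl
# with it is an assumption here; only the parsing changes appear in this diff.
expr = "url('https://quotes.toscrape.com', follow=//a/@href, depth=2)//title/text()"

for title in wxpath.wxpath_async_blocking_iter(expr):
    print(title)
```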
|