wxpath 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.4.0 → wxpath-0.5.0}/PKG-INFO +123 -19
- wxpath-0.4.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.0/README.md +98 -45
- {wxpath-0.4.0 → wxpath-0.5.0}/pyproject.toml +18 -1
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/__init__.py +2 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/cli.py +6 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/models.py +1 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/ops.py +9 -12
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/parser.py +92 -23
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/engine.py +79 -8
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/helpers.py +6 -3
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/__init__.py +1 -1
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/crawler.py +19 -7
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/request.py +1 -1
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/response.py +7 -1
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/retry.py +2 -2
- wxpath-0.5.0/src/wxpath/integrations/langchain/__init__.py +0 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath-0.5.0/src/wxpath/integrations/langchain/loader.py +60 -0
- wxpath-0.5.0/src/wxpath/patches.py +273 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/settings.py +3 -1
- wxpath-0.5.0/src/wxpath/tui.py +1204 -0
- wxpath-0.5.0/src/wxpath/tui_settings.py +151 -0
- wxpath-0.5.0/src/wxpath/util/__init__.py +0 -0
- wxpath-0.5.0/src/wxpath/util/cleaners.py +31 -0
- wxpath-0.5.0/src/wxpath/util/common_paths.py +22 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/util/logging.py +3 -7
- wxpath-0.4.0/README.md → wxpath-0.5.0/src/wxpath.egg-info/PKG-INFO +148 -18
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/SOURCES.txt +9 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/entry_points.txt +1 -0
- wxpath-0.5.0/src/wxpath.egg-info/requires.txt +43 -0
- wxpath-0.4.0/src/wxpath/patches.py +0 -63
- wxpath-0.4.0/src/wxpath.egg-info/requires.txt +0 -20
- {wxpath-0.4.0 → wxpath-0.5.0}/LICENSE +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/setup.cfg +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/cache.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/stats.py +0 -0
- {wxpath-0.4.0/src/wxpath/util → wxpath-0.5.0/src/wxpath/integrations}/__init__.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/top_level.txt +0 -0

--- wxpath-0.4.0/PKG-INFO
+++ wxpath-0.5.0/PKG-INFO
@@ -1,9 +1,14 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.4.0
+Version: 0.5.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
+Project-URL: Homepage, https://rodricios.github.io/wxpath
+Project-URL: Documentation, https://rodricios.github.io/wxpath
+Project-URL: Repository, https://github.com/rodricios/wxpath
+Project-URL: Issues, https://github.com/rodricios/wxpath/issues
+Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -17,27 +22,112 @@ Provides-Extra: cache-sqlite
 Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
 Provides-Extra: cache-redis
 Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
+Provides-Extra: llm
+Requires-Dist: langchain>=1.0.0; extra == "llm"
+Requires-Dist: langchain-core>=1.0.0; extra == "llm"
+Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
+Requires-Dist: langchain-community>=0.4.0; extra == "llm"
+Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
+Requires-Dist: chromadb>=1.0.0; extra == "llm"
+Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
 Provides-Extra: dev
 Requires-Dist: ruff; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
+Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
+Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
+Requires-Dist: mkdocs-glightbox; extra == "docs"
+Requires-Dist: pyyaml>=6.0; extra == "docs"
+Provides-Extra: tui
+Requires-Dist: textual>=1.0.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
 Dynamic: license-file
 
-# **wxpath** - declarative web crawling with XPath
+# **wxpath** - declarative web graph traversal with XPath
 
-[](https://www.python.org/downloads/release/python-3100/)
+[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install wxpath[tui]
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
+
+```python
+import wxpath
+
+expr = "url('https://quotes.toscrape.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+///url(//a/@href)
+//a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** converges those two steps into one:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
+
+
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
 
-
+### Deterministic
 
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
 
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -47,6 +137,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -56,7 +147,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support
+- [Commercial support/consulting](#commercial-supportconsulting)
 - [Versioning](#versioning)
 - [License](#license)
 
@@ -73,7 +164,11 @@ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.co
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///url(
+///url(
+//main//a/@href[
+starts-with(., '/wiki/') and not(contains(., ':'))
+]
+)
 /map{
 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
 'url': string(base-uri(.)),
@@ -86,15 +181,6 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -266,12 +352,17 @@ Command line options:
 --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
 
@@ -406,6 +497,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
@@ -433,13 +535,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 

--- wxpath-0.4.0/src/wxpath.egg-info/PKG-INFO
+++ wxpath-0.5.0/README.md
@@ -1,43 +1,82 @@
-
-
-
-
-
-
-
-
-
-
-
-Requires
-
-
-
-
-
-
-
-
-
-
-Provides-Extra: dev
-Requires-Dist: ruff; extra == "dev"
-Dynamic: license-file
-
-# **wxpath** - declarative web crawling with XPath
-
-[](https://www.python.org/downloads/release/python-3100/)
+# **wxpath** - declarative web graph traversal with XPath
+
+[](https://www.python.org/downloads/release/python-3100/) [](https://rodricios.github.io/wxpath)
+
+
+> NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+
+
+## Install
+
+Requires Python 3.10+.
+
+```
+pip install wxpath
+# For TUI support
+pip install wxpath[tui]
+```
+---
+
+
+## What is wxpath?
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
 
-
+```python
+import wxpath
 
+expr = "url('https://quotes.toscrape.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+///url(//a/@href)
+//a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** converges those two steps into one:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
+
+
+### RAG-Ready Output
+
+Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+### Deterministic
+
+**wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+## Documentation (WIP)
+
+Documentation is now available [here](https://rodricios.github.io/wxpath/).
 
 ## Contents
 
-- [Example](#example)
+- [Example: Knowledge Graph](#example)
 - [Language Design](DESIGN.md)
 - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
@@ -47,6 +86,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [XPath 3.1](#xpath-31-by-default)
 - [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [TUI](#tui)
 - [Persistence and Caching](#persistence-and-caching)
 - [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
@@ -56,7 +96,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support
+- [Commercial support/consulting](#commercial-supportconsulting)
 - [Versioning](#versioning)
 - [License](#license)
 
@@ -73,7 +113,11 @@ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.co
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///url(
+///url(
+//main//a/@href[
+starts-with(., '/wiki/') and not(contains(., ':'))
+]
+)
 /map{
 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
 'url': string(base-uri(.)),
@@ -86,15 +130,6 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -266,12 +301,17 @@ Command line options:
 --cache [true|false] (Default: False) Persist crawl results to a local database
 ```
 
+## TUI
+
+**wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
 
 ## Persistence and Caching
 
 **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
-**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
 To use, you must install the appropriate optional dependency:
 
@@ -406,6 +446,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
@@ -433,13 +484,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 

--- wxpath-0.4.0/pyproject.toml
+++ wxpath-0.5.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wxpath"
-version = "0.4.0"
+version = "0.5.0"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -13,6 +13,7 @@ authors = [
 ]
 license = "MIT"
 license-files = ["LICENSE"]
+
 dependencies = [
 "lxml>=4.0",
 "elementpath>=5.0.0,<=5.0.3",
@@ -20,16 +21,32 @@ dependencies = [
 "tqdm>=4.0.0"
 ]
 
+[project.urls]
+Homepage = "https://rodricios.github.io/wxpath"
+Documentation = "https://rodricios.github.io/wxpath"
+Repository = "https://github.com/rodricios/wxpath"
+Issues = "https://github.com/rodricios/wxpath/issues"
+Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
+
+
 [project.optional-dependencies]
 cache = ["aiohttp-client-cache>=0.14.0"]
 cache-sqlite = ["aiohttp-client-cache[sqlite]"]
 cache-redis = ["aiohttp-client-cache[redis]"]
 
+# langchain langchain-ollama langchain-chroma chromadb
+llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
+"langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
+"langchain-text-splitters>=1.1.0"]
+
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
 dev = ["ruff"]
+docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
+tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
 
 [project.scripts]
 wxpath = "wxpath.cli:main"
+wxpath-tui = "wxpath.tui:main"
 
 [tool.pytest.ini_options]
 minversion = "6.0"

--- wxpath-0.4.0/src/wxpath/__init__.py
+++ wxpath-0.5.0/src/wxpath/__init__.py
@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
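
Putting the `__init__.py` change above together with the new "Runtime API (`wxpath_async*`) options" section from the README diff, 0.5.0 usage presumably looks like the sketch below. It is illustrative only and built strictly from names the diff shows (`wxpath.settings`, `wxpath_async_blocking_iter`, `max_depth`, `progress`, `engine`, `yield_errors`); it is not code shipped in the package.

```python
import wxpath

# settings is re-exported by the package __init__ as of 0.5.0 (see the hunk above).
print(wxpath.settings)

expr = "url('https://quotes.toscrape.com')//a/@href"

# Keyword arguments mirror the documented defaults: max_depth=1, progress=False,
# engine=None, yield_errors=False. The diff lists the options but not their exact
# semantics, so treat this call shape as an assumption.
for link in wxpath.wxpath_async_blocking_iter(expr, max_depth=1, progress=False):
    print(link)
```
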

--- wxpath-0.4.0/src/wxpath/cli.py
+++ wxpath-0.5.0/src/wxpath/cli.py
@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
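
For context on the hunk above: `--insecure` is an ordinary argparse `store_true` flag, and the crawler simply receives its inverse as `verify_ssl`. The snippet below is a minimal, self-contained sketch of that wiring, not the actual `wxpath.cli` module.

```python
import argparse

parser = argparse.ArgumentParser(prog="wxpath")
parser.add_argument(
    "--insecure",
    action="store_true",
    help="Disable SSL certificate verification (use for sites with broken chains)",
)

args = parser.parse_args(["--insecure"])

# In the real CLI this value is handed to the crawler as verify_ssl=not args.insecure.
verify_ssl = not args.insecure
print(verify_ssl)  # False when --insecure is given, True otherwise
```
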

--- wxpath-0.4.0/src/wxpath/core/ops.py
+++ wxpath-0.5.0/src/wxpath/core/ops.py
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +79,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
     _segments = [
-        UrlCrawl('///url', [
+        UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
     ] + next_segments
 
     yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
     elems = curr_elem.xpath3(expr)
 
     next_segments = curr_segments[1:]
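
The stacked `@register('url', ...)` decorators above register one handler under several argument-type signatures, so `url()` can be called with a string plus optional depth and XPath arguments in either order. wxpath's real `register`/`get_operator` machinery is not shown in this diff; the sketch below is a hypothetical illustration of that signature-keyed dispatch pattern, with stand-in `String`/`Xpath`/`Depth` classes.

```python
from typing import Callable

# Stand-in argument node types (the real ones live in wxpath.core.parser).
class String: ...
class Xpath: ...
class Depth: ...

# Hypothetical registry keyed by (operator name, tuple of argument types).
OPERATORS: dict[tuple[str, tuple[type, ...]], Callable] = {}

def register(name: str, arg_types: tuple[type, ...]):
    def decorator(fn: Callable) -> Callable:
        OPERATORS[(name, arg_types)] = fn
        return fn
    return decorator

@register('url', (String,))
@register('url', (String, Depth))
@register('url', (String, Xpath))
@register('url', (String, Depth, Xpath))
@register('url', (String, Xpath, Depth))
def handle_url(*args):
    """One handler serves every url(...) argument arrangement."""

def get_operator(name: str, args: tuple) -> Callable:
    # Dispatch on the concrete types of the parsed arguments.
    return OPERATORS[(name, tuple(type(a) for a in args))]

assert get_operator('url', (String(), Depth())) is handle_url
```
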