wxpath 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.3.0 → wxpath-0.4.1}/PKG-INFO +140 -23
- wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.4.1/README.md +132 -41
- {wxpath-0.3.0 → wxpath-0.4.1}/pyproject.toml +7 -2
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/cli.py +57 -12
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/engine.py +87 -11
- wxpath-0.4.1/src/wxpath/http/client/cache.py +43 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/crawler.py +106 -22
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/request.py +1 -1
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/stats.py +6 -0
- wxpath-0.4.1/src/wxpath/settings.py +108 -0
- wxpath-0.3.0/README.md → wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO +158 -22
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/SOURCES.txt +2 -0
- wxpath-0.4.1/src/wxpath.egg-info/requires.txt +20 -0
- wxpath-0.3.0/src/wxpath.egg-info/requires.txt +0 -10
- {wxpath-0.3.0 → wxpath-0.4.1}/LICENSE +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/setup.cfg +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/models.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/ops.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/parser.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/helpers.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/response.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/retry.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/patches.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/logging.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/entry_points.txt +0 -0
- {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/top_level.txt +0 -0
--- wxpath-0.3.0/PKG-INFO
+++ wxpath-0.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.3.0
+Version: 0.4.1
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
 Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+Requires-Dist: tqdm>=4.0.0
+Provides-Extra: cache
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+Provides-Extra: cache-sqlite
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+Provides-Extra: cache-redis
+Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -23,9 +30,42 @@ Dynamic: license-file
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
 
-
+```python
+import wxpath
+
+expr = "url('https://example.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+    ///url(//a/@href)
+    //a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** inverts that:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
 
 
 ## Contents
@@ -38,7 +78,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -46,7 +89,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support](#commercial-support)
+- [Commercial support/consulting](#commercial-supportconsulting)
+- [Versioning](#versioning)
 - [License](#license)
 
 
@@ -54,32 +98,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 
 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}
 
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+    ///url(
+        //main//a/@href[
+            starts-with(., '/wiki/') and not(contains(., ':'))
+        ]
+    )
+    /map{
+        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+        'url': string(base-uri(.)),
+        'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+        'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+    }
 """
 
 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -195,6 +238,17 @@ path_expr = """
 # ...]
 ```
 
+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable it by passing `progress=True` to `engine.run(...)`, or to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+
 
 ## CLI
 
@@ -237,9 +291,46 @@ Command line options:
 --concurrency-per-host <concurrency> Number of concurrent fetches per host
 --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false] (Default: True) Respects robots.txt
+--cache [true|false] (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** can optionally persist crawl results to a local database. This is especially useful when you're crawling a large number of URLs and need to pause the crawl, change extraction expressions, or otherwise restart the crawl.
+
+**wxpath** supports two backends: SQLite and Redis. SQLite is great for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use it, install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```
 
 
+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)
 
 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
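Putting the Persistence and Caching and Settings changes above together, here is a minimal, illustrative sketch of a cached, politely-configured crawl. It assumes only the names that appear in the README excerpt above (`SETTINGS.http.client.cache.*`, `CRAWLER_SETTINGS.headers`, `wxpath.wxpath_async_blocking_iter`) and that the `wxpath[cache-sqlite]` extra is installed; it is not part of the diff itself.

```python
# Illustrative sketch; uses only the settings attributes shown in the README
# excerpt above. Requires `pip install wxpath[cache-sqlite]` for the default
# sqlite cache backend.
import wxpath
from wxpath.settings import SETTINGS, CRAWLER_SETTINGS

# Identify the crawler politely (some sites block requests without a proper User-Agent).
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

# Enable persistence/caching; per the README, sqlite is the default backend.
SETTINGS.http.client.cache.enabled = True

path_expr = """
url('https://quotes.toscrape.com')
    ///url(//a/@href)
    //a/@href
"""

# Re-running this after an interruption can reuse cached responses instead of refetching.
for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```

The Redis variant would swap in the `backend` and `redis.address` assignments shown in the README excerpt above.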
@@ -290,6 +381,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```
 
+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
 
 ## More Examples
 
@@ -336,6 +434,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
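For reference, the runtime options listed above are keyword arguments to the `wxpath_async*` entry points. A small sketch with every documented option spelled out; the comments restate the defaults from the list above, and constructing a custom `WXPathEngine` (covered under Advanced: Engine & Crawler Configuration, not shown in this diff) is omitted. This sketch is illustrative, not part of the diff.

```python
# Sketch of the documented wxpath_async* keyword arguments; defaults per the list above.
import wxpath

path_expr = "url('https://quotes.toscrape.com')//a/@href"

for item in wxpath.wxpath_async_blocking_iter(
    path_expr,
    max_depth=1,         # depth limit for ///-style deep crawls (default: 1)
    progress=True,       # tqdm progress bar, as described in the Progress Bar section (default: False)
    engine=None,         # optionally pass a configured WXPathEngine (default: None)
    yield_errors=False,  # documented default
):
    print(item)
```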
@@ -345,7 +454,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls
 
-###
+### Goals
 
 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +465,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 
 The following features are not yet supported:
 
-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -364,13 +472,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 
@@ -379,6 +489,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti
 
 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
 
+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License
 
 MIT
--- wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO
+++ wxpath-0.4.1/README.md
@@ -1,31 +1,45 @@
-Metadata-Version: 2.4
-Name: wxpath
-Version: 0.3.0
-Summary: wxpath - a declarative web crawler and data extractor
-Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
-License-Expression: MIT
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath<=5.0.3,>=5.0.0
-Requires-Dist: aiohttp<=3.12.15,>=3.8.0
-Provides-Extra: test
-Requires-Dist: pytest>=7.0; extra == "test"
-Requires-Dist: pytest-asyncio>=0.23; extra == "test"
-Provides-Extra: dev
-Requires-Dist: ruff; extra == "dev"
-Dynamic: license-file
-
 # **wxpath** - declarative web crawling with XPath
 
 [](https://www.python.org/downloads/release/python-3100/)
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
 
-
+```python
+import wxpath
+
+expr = "url('https://example.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+    ///url(//a/@href)
+    //a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** inverts that:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
 
 
 ## Contents
@@ -38,7 +52,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -46,7 +63,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support](#commercial-support)
+- [Commercial support/consulting](#commercial-supportconsulting)
+- [Versioning](#versioning)
 - [License](#license)
 
 
@@ -54,32 +72,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 
 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}
 
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+    ///url(
+        //main//a/@href[
+            starts-with(., '/wiki/') and not(contains(., ':'))
+        ]
+    )
+    /map{
+        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+        'url': string(base-uri(.)),
+        'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+        'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+    }
 """
 
 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -195,6 +212,17 @@ path_expr = """
 # ...]
 ```
 
+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable it by passing `progress=True` to `engine.run(...)`, or to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+
 
 ## CLI
 
@@ -237,9 +265,46 @@ Command line options:
 --concurrency-per-host <concurrency> Number of concurrent fetches per host
 --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false] (Default: True) Respects robots.txt
+--cache [true|false] (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** can optionally persist crawl results to a local database. This is especially useful when you're crawling a large number of URLs and need to pause the crawl, change extraction expressions, or otherwise restart the crawl.
+
+**wxpath** supports two backends: SQLite and Redis. SQLite is great for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use it, install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```
 
 
+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)
 
 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +355,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```
 
+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
 
 ## More Examples
 
@@ -336,6 +408,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
@@ -345,7 +428,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls
 
-###
+### Goals
 
 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +439,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 
 The following features are not yet supported:
 
-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -364,13 +446,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 
@@ -379,6 +463,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti
 
 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
 
+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License
 
 MIT
--- wxpath-0.3.0/pyproject.toml
+++ wxpath-0.4.1/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wxpath"
-version = "0.3.0"
+version = "0.4.1"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -16,10 +16,15 @@ license-files = ["LICENSE"]
 dependencies = [
     "lxml>=4.0",
     "elementpath>=5.0.0,<=5.0.3",
-    "aiohttp>=3.8.0,<=3.12.15"
+    "aiohttp>=3.8.0,<=3.12.15",
+    "tqdm>=4.0.0"
 ]
 
 [project.optional-dependencies]
+cache = ["aiohttp-client-cache>=0.14.0"]
+cache-sqlite = ["aiohttp-client-cache[sqlite]"]
+cache-redis = ["aiohttp-client-cache[redis]"]
+
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
 dev = ["ruff"]
 