wxpath 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.3.0 → wxpath-0.4.0}/PKG-INFO +86 -10
- wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.4.0/README.md +78 -28
- {wxpath-0.3.0 → wxpath-0.4.0}/pyproject.toml +7 -2
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/cli.py +57 -12
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/engine.py +48 -10
- wxpath-0.4.0/src/wxpath/http/client/cache.py +43 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/crawler.py +106 -22
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/stats.py +6 -0
- wxpath-0.4.0/src/wxpath/settings.py +108 -0
- wxpath-0.3.0/README.md → wxpath-0.4.0/src/wxpath.egg-info/PKG-INFO +104 -9
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/SOURCES.txt +2 -0
- wxpath-0.4.0/src/wxpath.egg-info/requires.txt +20 -0
- wxpath-0.3.0/src/wxpath.egg-info/requires.txt +0 -10
- {wxpath-0.3.0 → wxpath-0.4.0}/LICENSE +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/setup.cfg +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/models.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/ops.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/parser.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/helpers.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/registry.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/request.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/response.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/retry.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/robots.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/patches.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/__init__.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/logging.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/entry_points.txt +0 -0
- {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/top_level.txt +0 -0

{wxpath-0.3.0 → wxpath-0.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.3.0
+Version: 0.4.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
 Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+Requires-Dist: tqdm>=4.0.0
+Provides-Extra: cache
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+Provides-Extra: cache-sqlite
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+Provides-Extra: cache-redis
+Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -38,7 +45,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -47,6 +57,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [Commercial support / consulting](#commercial-support--consulting)
+- [Versioning](#versioning)
 - [License](#license)


@@ -54,17 +65,21 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
+/map{
+'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+'url': string(base-uri(.)),
+'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+}
 """

 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
@@ -195,6 +210,17 @@ path_expr = """
 # ...]
 ```

+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+

 ## CLI

@@ -237,9 +263,46 @@ Command line options:
 --concurrency-per-host <concurrency> Number of concurrent fetches per host
 --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false] (Default: True) Respects robots.txt
+--cache [true|false] (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
+
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use, you must install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```


+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)

 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +353,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```

+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+

 ## More Examples

@@ -345,7 +415,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls

-###
+### Goals

 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +426,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

 The following features are not yet supported:

-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -379,6 +448,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License

 MIT

wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.4.0/README.md

@@ -1,22 +1,3 @@
-Metadata-Version: 2.4
-Name: wxpath
-Version: 0.3.0
-Summary: wxpath - a declarative web crawler and data extractor
-Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
-License-Expression: MIT
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath<=5.0.3,>=5.0.0
-Requires-Dist: aiohttp<=3.12.15,>=3.8.0
-Provides-Extra: test
-Requires-Dist: pytest>=7.0; extra == "test"
-Requires-Dist: pytest-asyncio>=0.23; extra == "test"
-Provides-Extra: dev
-Requires-Dist: ruff; extra == "dev"
-Dynamic: license-file
-
 # **wxpath** - declarative web crawling with XPath

 [](https://www.python.org/downloads/release/python-3100/)
@@ -38,7 +19,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -47,6 +31,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [Commercial support / consulting](#commercial-support--consulting)
+- [Versioning](#versioning)
 - [License](#license)


@@ -54,17 +39,21 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
+/map{
+'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+'url': string(base-uri(.)),
+'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+}
 """

 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
@@ -195,6 +184,17 @@ path_expr = """
 # ...]
 ```

+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+

 ## CLI

@@ -237,9 +237,46 @@ Command line options:
 --concurrency-per-host <concurrency> Number of concurrent fetches per host
 --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false] (Default: True) Respects robots.txt
+--cache [true|false] (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
+
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use, you must install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```


+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)

 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +327,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```

+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+

 ## More Examples

@@ -345,7 +389,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls

-###
+### Goals

 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +400,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

 The following features are not yet supported:

-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -379,6 +422,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License

 MIT
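
Taken together, the README changes above introduce three user-facing knobs in 0.4.0: custom request headers via `CRAWLER_SETTINGS`, the optional response cache via `SETTINGS.http.client.cache`, and the `progress` flag. The following is a minimal sketch combining them, using only the attribute and function names shown in the snippets above, and assuming the `wxpath[cache-sqlite]` extra is installed; it is an illustration, not part of the packaged documentation.

```python
import wxpath
from wxpath.settings import CRAWLER_SETTINGS, SETTINGS

# Identify the crawler politely, as recommended in the quick-start snippet.
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

# Persist responses locally so an interrupted crawl can be resumed cheaply;
# sqlite is the default backend per the README above.
SETTINGS.http.client.cache.enabled = True

# A trimmed version of the documented quick-start expression.
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
/map{ 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.) }
"""

# progress=True draws the tqdm progress bar added in this release.
for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1, progress=True):
    print(item)
```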

{wxpath-0.3.0 → wxpath-0.4.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "wxpath"
-version = "0.3.0"
+version = "0.4.0"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -16,10 +16,15 @@ license-files = ["LICENSE"]
 dependencies = [
 "lxml>=4.0",
 "elementpath>=5.0.0,<=5.0.3",
-"aiohttp>=3.8.0,<=3.12.15"
+"aiohttp>=3.8.0,<=3.12.15",
+"tqdm>=4.0.0"
 ]

 [project.optional-dependencies]
+cache = ["aiohttp-client-cache>=0.14.0"]
+cache-sqlite = ["aiohttp-client-cache[sqlite]"]
+cache-redis = ["aiohttp-client-cache[redis]"]
+
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
 dev = ["ruff"]

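
Because the cache backends land as optional-dependency groups rather than hard dependencies, the cache machinery is not importable in a default install. A small defensive sketch, assuming the distribution's import name is `aiohttp_client_cache` (the import name is not stated in this diff):

```python
import importlib.util

from wxpath.settings import SETTINGS

# Only switch the cache on when the optional extra is actually installed,
# e.g. via `pip install "wxpath[cache-sqlite]"`.
if importlib.util.find_spec("aiohttp_client_cache") is not None:
    SETTINGS.http.client.cache.enabled = True
else:
    print("aiohttp-client-cache not installed; crawling without persistence")
```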

{wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/cli.py

@@ -6,6 +6,7 @@ from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
 from wxpath.http.client.crawler import Crawler
+from wxpath.settings import SETTINGS
 from wxpath.util.serialize import simplify


@@ -15,9 +16,11 @@ def main():
 arg_parser.add_argument("expression", help="The wxpath expression")
 arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
 # debug
-arg_parser.add_argument("--debug", action="store_true",
+arg_parser.add_argument("--debug", action="store_true",
+help="Debug mode. Provides verbose runtime output and information")
 # verbose
-arg_parser.add_argument("--verbose", action="store_true",
+arg_parser.add_argument("--verbose", action="store_true",
+help="Verbose mode. Prints CLI level information")

 arg_parser.add_argument(
 "--concurrency",
@@ -44,17 +47,27 @@ def main():
 help="Respect robots.txt",
 default=True
 )
+arg_parser.add_argument(
+"--cache",
+action="store_true",
+help="Use cache",
+default=False
+)
+arg_parser.add_argument(
+"--cache-backend",
+type=str,
+help="Cache backend. Possible values: redis, sqlite",
+default="sqlite"
+)
+arg_parser.add_argument(
+"--cache-db-path-or-url",
+type=str,
+help="Path to cache database",
+default="cache.db"
+)

 args = arg_parser.parse_args()

-if args.verbose:
-segments = wxpath_parser.parse(args.expression)
-print("parsed expression:\n\nSegments([")
-for s in segments:
-print(f"\t{s},")
-print("])")
-print()
-
 if args.debug:
 from wxpath import configure_logging
 configure_logging('DEBUG')
@@ -72,6 +85,29 @@ def main():
 print(f"Using custom headers: {custom_headers}")
 print()

+if args.cache:
+SETTINGS.http.client.cache.enabled = True
+if args.cache_backend == "redis":
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+elif args.cache_backend == "sqlite":
+SETTINGS.http.client.cache.backend = "sqlite"
+SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+if args.verbose:
+print(f"Using concurrency: {args.concurrency}")
+print(f"Using concurrency per host: {args.concurrency_per_host}")
+print(f"Using respect robots: {args.respect_robots}")
+print(f"Using cache: {args.cache}")
+
+segments = wxpath_parser.parse(args.expression)
+print("parsed expression:\n\nSegments([")
+for s in segments:
+print(f"\t{s},")
+print("])")
+print()
+print()
+
 crawler = Crawler(
 concurrency=args.concurrency,
 per_host=args.concurrency_per_host,
@@ -81,11 +117,20 @@ def main():
 engine = WXPathEngine(crawler=crawler)

 try:
-for r in wxpath_async_blocking_iter(
+for r in wxpath_async_blocking_iter(
+path_expr=args.expression,
+max_depth=args.depth,
+engine=engine):
 clean = simplify(r)
 print(json.dumps(clean, ensure_ascii=False), flush=True)
 except BrokenPipeError:
-
+if args.verbose:
+print("Pipe broken.")
+
+if args.verbose:
+print("Done. Printing crawl stats")
+print(crawler._stats)
+sys.exit(0)


 if __name__ == "__main__":

{wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/engine.py

@@ -5,6 +5,7 @@ from collections import deque
 from typing import Any, AsyncGenerator, Iterator

 from lxml.html import HtmlElement
+from tqdm import tqdm

 from wxpath import patches  # noqa: F401
 from wxpath.core import parser
@@ -157,7 +158,12 @@ class WXPathEngine(HookedEngineBase):
 if allow_redirects:
 self.allowed_response_codes |= {301, 302, 303, 307, 308}

-async def run(
+async def run(
+self,
+expression: str,
+max_depth: int,
+progress: bool = False
+) -> AsyncGenerator[Any, None]:
 """Execute a wxpath expression concurrently and yield results.

 Builds and drives a BFS-like crawl pipeline that honors robots rules,
@@ -166,6 +172,7 @@ class WXPathEngine(HookedEngineBase):
 Args:
 expression: WXPath expression string to evaluate.
 max_depth: Maximum crawl depth to follow for url hops.
+progress: Whether to display a progress bar.

 Yields:
 Extracted values produced by the expression (HTML elements or
@@ -182,6 +189,12 @@ class WXPathEngine(HookedEngineBase):
 # the current state of the engine.
 return queue.empty() and pending_tasks <= 0

+total_yielded = 0
+if progress:
+pbar = tqdm(total=0)
+else:
+pbar = None
+
 async with self.crawler as crawler:
 async def submitter():
 nonlocal pending_tasks
@@ -219,12 +232,17 @@ class WXPathEngine(HookedEngineBase):
 depth=seed_task.depth,
 max_depth=max_depth,
 queue=queue,
+pbar=pbar,
 ):
 yield await self.post_extract_hooks(output)

 # While looping asynchronous generators, you MUST make sure
 # to check terminal conditions before re-iteration.
 async for resp in crawler:
+if pbar is not None:
+pbar.update(1)
+pbar.refresh()
+
 task = inflight.pop(resp.request.url, None)
 pending_tasks -= 1

@@ -273,10 +291,18 @@ class WXPathEngine(HookedEngineBase):
 depth=task.depth,
 max_depth=max_depth,
 queue=queue,
-
+pbar=pbar
+):
+total_yielded += 1
+if pbar is not None:
+pbar.set_postfix(yielded=total_yielded, depth=task.depth,)

 yield await self.post_extract_hooks(output)
 else:
+total_yielded += 1
+if pbar is not None:
+pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
+
 yield await self.post_extract_hooks(elem)

 # Termination condition
@@ -287,6 +313,9 @@ class WXPathEngine(HookedEngineBase):
 with contextlib.suppress(asyncio.CancelledError):
 await submit_task

+if pbar is not None:
+pbar.close()
+
 async def _process_pipeline(
 self,
 task: CrawlTask,
@@ -294,6 +323,7 @@ class WXPathEngine(HookedEngineBase):
 depth: int,
 max_depth: int,
 queue: asyncio.Queue[CrawlTask],
+pbar: tqdm = None
 ) -> AsyncGenerator[Any, None]:
 """Process a queue of intents for a single crawl branch.

@@ -331,9 +361,10 @@ class WXPathEngine(HookedEngineBase):
 elif isinstance(intent, CrawlIntent):
 next_depth = task.depth + 1
 # if intent.url not in self.seen_urls and next_depth <= max_depth:
-if next_depth <= max_depth:
+if next_depth <= max_depth and intent.url not in self.seen_urls:
 # self.seen_urls.add(intent.url)
 log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
+
 queue.put_nowait(
 CrawlTask(
 elem=None,
@@ -343,6 +374,9 @@ class WXPathEngine(HookedEngineBase):
 backlink=task.url,
 )
 )
+if pbar is not None:
+pbar.total += 1
+pbar.refresh()

 elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
 # immediately traverse the extraction
@@ -351,18 +385,20 @@ class WXPathEngine(HookedEngineBase):
 mini_queue.append((elem, next_segments))


-def wxpath_async(path_expr: str,
-max_depth: int,
+def wxpath_async(path_expr: str,
+max_depth: int,
+progress: bool = False,
 engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
 if engine is None:
 engine = WXPathEngine()
-return engine.run(path_expr, max_depth)
+return engine.run(path_expr, max_depth, progress=progress)


 ##### ASYNC IN SYNC #####
 def wxpath_async_blocking_iter(
 path_expr: str,
 max_depth: int = 1,
+progress: bool = False,
 engine: WXPathEngine | None = None,
 ) -> Iterator[Any]:
 """Evaluate a wxpath expression using concurrent breadth-first traversal.
@@ -383,7 +419,7 @@ def wxpath_async_blocking_iter(
 """
 loop = asyncio.new_event_loop()
 asyncio.set_event_loop(loop)
-agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)
+agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress, engine=engine)

 try:
 while True:
@@ -399,8 +435,10 @@ def wxpath_async_blocking_iter(
 def wxpath_async_blocking(
 path_expr: str,
 max_depth: int = 1,
+progress: bool = False,
 engine: WXPathEngine | None = None,
 ) -> list[Any]:
-return list(
-
-
+return list(wxpath_async_blocking_iter(path_expr,
+max_depth=max_depth,
+progress=progress,
+engine=engine))