wxpath 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {wxpath-0.3.0 → wxpath-0.4.1}/PKG-INFO +140 -23
  2. wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.4.1/README.md +132 -41
  3. {wxpath-0.3.0 → wxpath-0.4.1}/pyproject.toml +7 -2
  4. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/cli.py +57 -12
  5. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/engine.py +87 -11
  6. wxpath-0.4.1/src/wxpath/http/client/cache.py +43 -0
  7. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/crawler.py +106 -22
  8. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/request.py +1 -1
  9. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/stats.py +6 -0
  10. wxpath-0.4.1/src/wxpath/settings.py +108 -0
  11. wxpath-0.3.0/README.md → wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO +158 -22
  12. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/SOURCES.txt +2 -0
  13. wxpath-0.4.1/src/wxpath.egg-info/requires.txt +20 -0
  14. wxpath-0.3.0/src/wxpath.egg-info/requires.txt +0 -10
  15. {wxpath-0.3.0 → wxpath-0.4.1}/LICENSE +0 -0
  16. {wxpath-0.3.0 → wxpath-0.4.1}/setup.cfg +0 -0
  17. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/__init__.py +0 -0
  18. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/__init__.py +0 -0
  19. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/dom.py +0 -0
  20. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/models.py +0 -0
  21. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/ops.py +0 -0
  22. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/parser.py +0 -0
  23. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/__init__.py +0 -0
  24. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/core/runtime/helpers.py +0 -0
  25. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/__init__.py +0 -0
  26. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/builtin.py +0 -0
  27. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/hooks/registry.py +0 -0
  28. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/__init__.py +0 -0
  29. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/__init__.py +0 -0
  30. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/client/response.py +0 -0
  31. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/backoff.py +0 -0
  32. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/retry.py +0 -0
  33. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/robots.py +0 -0
  34. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/http/policy/throttler.py +0 -0
  35. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/patches.py +0 -0
  36. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/__init__.py +0 -0
  37. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/logging.py +0 -0
  38. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath/util/serialize.py +0 -0
  39. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
  40. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/entry_points.txt +0 -0
  41. {wxpath-0.3.0 → wxpath-0.4.1}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.3.0
+ Version: 0.4.1
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
  Requires-Dist: lxml>=4.0
  Requires-Dist: elementpath<=5.0.3,>=5.0.0
  Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+ Requires-Dist: tqdm>=4.0.0
+ Provides-Extra: cache
+ Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+ Provides-Extra: cache-sqlite
+ Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+ Provides-Extra: cache-redis
+ Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -23,9 +30,42 @@ Dynamic: license-file

  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.
+ This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:

- NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+ ```python
+ import wxpath
+
+ expr = "url('https://example.com')//a/@href"
+
+ for link in wxpath.wxpath_async_blocking_iter(expr):
+     print(link)
+ ```
+
+
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+ ```python
+ import wxpath
+
+ path_expr = """
+ url('https://quotes.toscrape.com')
+ ///url(//a/@href)
+ //a/@href
+ """
+
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+     print(item)
+ ```
+
+
+ ## Why wxpath?
+
+ Most web scrapers force you to write crawl control flow first, and extraction second.
+
+ **wxpath** inverts that:
+ - **You describe traversal declaratively**
+ - **Extraction is expressed inline**
+ - **The engine handles scheduling, concurrency, and deduplication**


  ## Contents
@@ -38,7 +78,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
  - [XPath 3.1](#xpath-31-by-default)
+ - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [Persistence and Caching](#persistence-and-caching)
+ - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](EXAMPLES.md)
@@ -46,7 +89,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
- - [Commercial support / consulting](#commercial-support--consulting)
+ - [Commercial support/consulting](#commercial-supportconsulting)
+ - [Versioning](#versioning)
  - [License](#license)


@@ -54,32 +98,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

  ```python
  import wxpath
+ from wxpath.settings import CRAWLER_SETTINGS
+
+ # Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

  # Crawl, extract fields, build a knowledge graph
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
- /map{
-     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
-     'url': string(base-uri(.)),
-     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
-     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
- }
+ ///url(
+     //main//a/@href[
+         starts-with(., '/wiki/') and not(contains(., ':'))
+     ]
+ )
+ /map{
+     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+     'url': string(base-uri(.)),
+     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+ }
  """

  for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
      print(item)
  ```

- Output:
-
- ```python
- map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
- map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
- map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
- ...
- ```
-
  **Note:** Some sites (including Wikipedia) may block requests without proper headers.
  See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.

@@ -195,6 +238,17 @@ path_expr = """
  # ...]
  ```

+ ## Progress Bar
+
+ **wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+ Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+ ```python
+ items = wxpath.wxpath_async_blocking("...", progress=True)
+ > 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+ ```
+

  ## CLI

@@ -237,9 +291,46 @@ Command line options:
  --concurrency-per-host <concurrency>  Number of concurrent fetches per host
  --header "Key:Value"                  Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
  --respect-robots [true|false]         (Default: True) Respects robots.txt
+ --cache [true|false]                  (Default: False) Persist crawl results to a local database
+ ```
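For concreteness, a hedged invocation sketch: the console-script name (`wxpath`) and the positional expression argument are assumptions (the diff shows an entry_points.txt but not the usage string); the flags are the ones documented above.

```bash
# Assumed invocation shape; confirm the real usage with `wxpath --help`.
wxpath "url('https://quotes.toscrape.com')///url(//a/@href)//a/@href" \
  --concurrency-per-host 2 \
  --header "User-Agent: my-app/0.4.1 (contact: you@example.com)" \
  --respect-robots true \
  --cache true
```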
+
+
+ ## Persistence and Caching
+
+ **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs and decide to pause the crawl, change extraction expressions, or otherwise need to restart it.
+
+ **wxpath** supports two backends: SQLite and Redis. SQLite works well for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis suits large-scale crawls with multiple workers. You will see a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the SQLite backend.
+
+ To use persistence, install the appropriate optional dependency:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+
+ Once the dependency is installed, enable the cache:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ # To enable caching; sqlite is the default
+ SETTINGS.http.client.cache.enabled = True
+
+ # For redis backend
+ SETTINGS.http.client.cache.enabled = True
+ SETTINGS.http.client.cache.backend = "redis"
+ SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+ # Run wxpath as usual
+ items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
  ```


+ ## Settings
+
+ See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
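To make the settings surface concrete, a minimal sketch that combines only the objects shown elsewhere in this diff (`CRAWLER_SETTINGS` and `SETTINGS`); any attributes beyond these should be checked against settings.py:

```python
from wxpath.settings import SETTINGS, CRAWLER_SETTINGS

# Politeness headers, as in the Quick Start above
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.1 (contact: you@example.com)'}

# Response caching, as in Persistence and Caching above
SETTINGS.http.client.cache.enabled = True
SETTINGS.http.client.cache.backend = "redis"
SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
```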
  ## Hooks (Experimental)

  **wxpath** supports a pluggable hook system that lets you modify crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks are executed in the order they are registered. Hooks may impact performance.
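The registration API itself is not shown in this diff (it lives in src/wxpath/hooks/registry.py per the file list above), so the following is a purely hypothetical sketch of what an ordered URL-preprocessing hook could look like; `register` and the hook-point name `url_preprocess` are invented for illustration:

```python
# HYPOTHETICAL sketch; the real API is in wxpath.hooks.registry and may differ.
from urllib.parse import urlsplit, urlunsplit

def strip_query(url: str) -> str:
    """Example URL preprocessor: drop query strings before URLs enter the frontier."""
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))

# Invented registration call, shown only to illustrate that hooks run in
# registration order:
# registry.register('url_preprocess', strip_query)
```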
@@ -290,6 +381,13 @@ Requires Python 3.10+.
  pip install wxpath
  ```

+ For persistence/caching, wxpath supports the following backends:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+

  ## More Examples

@@ -336,6 +434,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  ```

+ ### Runtime API (`wxpath_async*`) options
+
+ - `max_depth`: int = 1
+ - `progress`: bool = False
+ - `engine`: WXPathEngine | None = None
+ - `yield_errors`: bool = False
+
+
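A usage sketch exercising all four options together; the function and its defaults come from the list above, while the expression and the chosen option values are illustrative:

```python
import wxpath

path_expr = "url('https://quotes.toscrape.com')///url(//a/@href)//a/@href"

for item in wxpath.wxpath_async_blocking_iter(
    path_expr,
    max_depth=2,        # expand the frontier at most two hops deep
    progress=True,      # show the tqdm progress bar
    engine=None,        # None selects the default engine
    yield_errors=True,  # also yield errors (exact semantics per the docs)
):
    print(item)
```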
+ ### Settings
+ You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency, and more.
+


  ## Project Philosophy

@@ -345,7 +454,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

- ### Guarantees/Goals
+ ### Goals

  - URLs are deduplicated on a best-effort, per-crawl basis.
  - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +465,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  The following features are not yet supported:

- - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
  - Strict result ordering
@@ -364,13 +472,15 @@ The following features are not yet supported:

  ## WARNINGS!!!

+ This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
  - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.


- ## Commercial support / consulting
+ ## Commercial support/consulting

  If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.

@@ -379,6 +489,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

  If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+ ## Versioning
+
+ **wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+ However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`, so a bump from 0.3.x to 0.4.0 may include breaking changes, while 0.4.0 to 0.4.1 is a minor-or-patch release.
+
  ## License

  MIT
@@ -1,31 +1,45 @@
- Metadata-Version: 2.4
- Name: wxpath
- Version: 0.3.0
- Summary: wxpath - a declarative web crawler and data extractor
- Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
- License-Expression: MIT
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath<=5.0.3,>=5.0.0
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
- Provides-Extra: test
- Requires-Dist: pytest>=7.0; extra == "test"
- Requires-Dist: pytest-asyncio>=0.23; extra == "test"
- Provides-Extra: dev
- Requires-Dist: ruff; extra == "dev"
- Dynamic: license-file
-
  # **wxpath** - declarative web crawling with XPath

  [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)

  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.
+ This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:

- NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+ ```python
+ import wxpath
+
+ expr = "url('https://example.com')//a/@href"
+
+ for link in wxpath.wxpath_async_blocking_iter(expr):
+     print(link)
+ ```
+
+
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+ ```python
+ import wxpath
+
+ path_expr = """
+ url('https://quotes.toscrape.com')
+ ///url(//a/@href)
+ //a/@href
+ """
+
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+     print(item)
+ ```
+
+
+ ## Why wxpath?
+
+ Most web scrapers force you to write crawl control flow first, and extraction second.
+
+ **wxpath** inverts that:
+ - **You describe traversal declaratively**
+ - **Extraction is expressed inline**
+ - **The engine handles scheduling, concurrency, and deduplication**


  ## Contents
@@ -38,7 +52,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
  - [XPath 3.1](#xpath-31-by-default)
+ - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [Persistence and Caching](#persistence-and-caching)
+ - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](EXAMPLES.md)
@@ -46,7 +63,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
- - [Commercial support / consulting](#commercial-support--consulting)
+ - [Commercial support/consulting](#commercial-supportconsulting)
+ - [Versioning](#versioning)
  - [License](#license)


@@ -54,32 +72,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

  ```python
  import wxpath
+ from wxpath.settings import CRAWLER_SETTINGS
+
+ # Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

  # Crawl, extract fields, build a knowledge graph
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
- /map{
-     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
-     'url': string(base-uri(.)),
-     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
-     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
- }
+ ///url(
+     //main//a/@href[
+         starts-with(., '/wiki/') and not(contains(., ':'))
+     ]
+ )
+ /map{
+     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+     'url': string(base-uri(.)),
+     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+ }
  """

  for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
      print(item)
  ```

- Output:
-
- ```python
- map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
- map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
- map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
- ...
- ```
-
  **Note:** Some sites (including Wikipedia) may block requests without proper headers.
  See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.

@@ -195,6 +212,17 @@ path_expr = """
  # ...]
  ```

+ ## Progress Bar
+
+ **wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+ Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+ ```python
+ items = wxpath.wxpath_async_blocking("...", progress=True)
+ > 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+ ```
+

  ## CLI

@@ -237,9 +265,46 @@ Command line options:
  --concurrency-per-host <concurrency>  Number of concurrent fetches per host
  --header "Key:Value"                  Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
  --respect-robots [true|false]         (Default: True) Respects robots.txt
+ --cache [true|false]                  (Default: False) Persist crawl results to a local database
+ ```
+
+
+ ## Persistence and Caching
+
+ **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs and decide to pause the crawl, change extraction expressions, or otherwise need to restart it.
+
+ **wxpath** supports two backends: SQLite and Redis. SQLite works well for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis suits large-scale crawls with multiple workers. You will see a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the SQLite backend.
+
+ To use persistence, install the appropriate optional dependency:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+
+ Once the dependency is installed, enable the cache:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ # To enable caching; sqlite is the default
+ SETTINGS.http.client.cache.enabled = True
+
+ # For redis backend
+ SETTINGS.http.client.cache.enabled = True
+ SETTINGS.http.client.cache.backend = "redis"
+ SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+ # Run wxpath as usual
+ items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
  ```


+ ## Settings
+
+ See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
  ## Hooks (Experimental)

  **wxpath** supports a pluggable hook system that lets you modify crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks are executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +355,13 @@ Requires Python 3.10+.
  pip install wxpath
  ```

+ For persistence/caching, wxpath supports the following backends:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+

  ## More Examples

@@ -336,6 +408,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  ```

+ ### Runtime API (`wxpath_async*`) options
+
+ - `max_depth`: int = 1
+ - `progress`: bool = False
+ - `engine`: WXPathEngine | None = None
+ - `yield_errors`: bool = False
+
+
+ ### Settings
+ You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency, and more.
+

  ## Project Philosophy

@@ -345,7 +428,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

- ### Guarantees/Goals
+ ### Goals

  - URLs are deduplicated on a best-effort, per-crawl basis.
  - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +439,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  The following features are not yet supported:

- - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
  - Strict result ordering
@@ -364,13 +446,15 @@ The following features are not yet supported:

  ## WARNINGS!!!

+ This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
  - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.


- ## Commercial support / consulting
+ ## Commercial support/consulting

  If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.

@@ -379,6 +463,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

  If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+ ## Versioning
+
+ **wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+ However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`, so a bump from 0.3.x to 0.4.0 may include breaking changes, while 0.4.0 to 0.4.1 is a minor-or-patch release.
+
  ## License

  MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "wxpath"
- version = "0.3.0"
+ version = "0.4.1"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -16,10 +16,15 @@ license-files = ["LICENSE"]
  dependencies = [
      "lxml>=4.0",
      "elementpath>=5.0.0,<=5.0.3",
-     "aiohttp>=3.8.0,<=3.12.15"
+     "aiohttp>=3.8.0,<=3.12.15",
+     "tqdm>=4.0.0"
  ]

  [project.optional-dependencies]
+ cache = ["aiohttp-client-cache>=0.14.0"]
+ cache-sqlite = ["aiohttp-client-cache[sqlite]"]
+ cache-redis = ["aiohttp-client-cache[redis]"]
+
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
  dev = ["ruff"]