wxpath 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {wxpath-0.3.0 → wxpath-0.4.0}/PKG-INFO +86 -10
  2. wxpath-0.3.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.4.0/README.md +78 -28
  3. {wxpath-0.3.0 → wxpath-0.4.0}/pyproject.toml +7 -2
  4. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/cli.py +57 -12
  5. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/engine.py +48 -10
  6. wxpath-0.4.0/src/wxpath/http/client/cache.py +43 -0
  7. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/crawler.py +106 -22
  8. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/stats.py +6 -0
  9. wxpath-0.4.0/src/wxpath/settings.py +108 -0
  10. wxpath-0.3.0/README.md → wxpath-0.4.0/src/wxpath.egg-info/PKG-INFO +104 -9
  11. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/SOURCES.txt +2 -0
  12. wxpath-0.4.0/src/wxpath.egg-info/requires.txt +20 -0
  13. wxpath-0.3.0/src/wxpath.egg-info/requires.txt +0 -10
  14. {wxpath-0.3.0 → wxpath-0.4.0}/LICENSE +0 -0
  15. {wxpath-0.3.0 → wxpath-0.4.0}/setup.cfg +0 -0
  16. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/__init__.py +0 -0
  17. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/__init__.py +0 -0
  18. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/dom.py +0 -0
  19. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/models.py +0 -0
  20. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/ops.py +0 -0
  21. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/parser.py +0 -0
  22. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/__init__.py +0 -0
  23. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/core/runtime/helpers.py +0 -0
  24. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/__init__.py +0 -0
  25. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/builtin.py +0 -0
  26. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/hooks/registry.py +0 -0
  27. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/__init__.py +0 -0
  28. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/__init__.py +0 -0
  29. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/request.py +0 -0
  30. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/client/response.py +0 -0
  31. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/backoff.py +0 -0
  32. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/retry.py +0 -0
  33. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/robots.py +0 -0
  34. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/http/policy/throttler.py +0 -0
  35. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/patches.py +0 -0
  36. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/__init__.py +0 -0
  37. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/logging.py +0 -0
  38. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath/util/serialize.py +0 -0
  39. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
  40. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/entry_points.txt +0 -0
  41. {wxpath-0.3.0 → wxpath-0.4.0}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.3.0
+ Version: 0.4.0
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
  Requires-Dist: lxml>=4.0
  Requires-Dist: elementpath<=5.0.3,>=5.0.0
  Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+ Requires-Dist: tqdm>=4.0.0
+ Provides-Extra: cache
+ Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+ Provides-Extra: cache-sqlite
+ Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+ Provides-Extra: cache-redis
+ Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -38,7 +45,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
  - [XPath 3.1](#xpath-31-by-default)
+ - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [Persistence and Caching](#persistence-and-caching)
+ - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](EXAMPLES.md)
@@ -47,6 +57,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
  - [Commercial support / consulting](#commercial-support--consulting)
+ - [Versioning](#versioning)
  - [License](#license)


@@ -54,17 +65,21 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

  ```python
  import wxpath
+ from wxpath.settings import CRAWLER_SETTINGS
+
+ # Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

  # Crawl, extract fields, build a knowledge graph
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
- /map{
- 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
- 'url': string(base-uri(.)),
- 'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
- 'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
- }
+ ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
+ /map{
+ 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+ 'url': string(base-uri(.)),
+ 'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+ 'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+ }
  """

  for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
@@ -195,6 +210,17 @@ path_expr = """
  # ...]
  ```

+ ## Progress Bar
+
+ **wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+ Enable it by passing `progress=True` to `engine.run(...)`, or to any of the `wxpath_async*(...)` functions.
+
+ ```python
+ items = wxpath.wxpath_async_blocking("...", progress=True)
+ > 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+ ```
+

  ## CLI

@@ -237,9 +263,46 @@ Command line options:
  --concurrency-per-host <concurrency> Number of concurrent fetches per host
  --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
  --respect-robots [true|false] (Default: True) Respects robots.txt
+ --cache [true|false] (Default: False) Persist crawl results to a local database
+ ```
+
+
+ ## Persistence and Caching
+
+ **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs and need to pause the crawl, change extraction expressions, or otherwise restart the crawl.
+
+ **wxpath** supports two backends: SQLite and Redis. SQLite is great for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+ To use it, install the appropriate optional dependency:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+
+ Once the dependency is installed, you must enable the cache:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ # To enable caching; sqlite is the default backend
+ SETTINGS.http.client.cache.enabled = True
+
+ # For the redis backend
+ SETTINGS.http.client.cache.enabled = True
+ SETTINGS.http.client.cache.backend = "redis"
+ SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+ # Run wxpath as usual
+ items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
  ```


+ ## Settings
+
+ See [settings.py](src/wxpath/settings.py) for details of the available settings.
+
+
  ## Hooks (Experimental)

  **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +353,13 @@ Requires Python 3.10+.
  pip install wxpath
  ```

+ For persistence/caching, wxpath supports the following backends:
+
+ ```
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+

  ## More Examples

@@ -345,7 +415,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

- ### Guarantees/Goals
+ ### Goals

  - URLs are deduplicated on a best-effort, per-crawl basis.
  - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +426,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  The following features are not yet supported:

- - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
  - Strict result ordering
@@ -379,6 +448,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

  If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+ ## Versioning
+
+ **wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+ However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
  ## License

  MIT
@@ -1,22 +1,3 @@
- Metadata-Version: 2.4
- Name: wxpath
- Version: 0.3.0
- Summary: wxpath - a declarative web crawler and data extractor
- Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
- License-Expression: MIT
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath<=5.0.3,>=5.0.0
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
- Provides-Extra: test
- Requires-Dist: pytest>=7.0; extra == "test"
- Requires-Dist: pytest-asyncio>=0.23; extra == "test"
- Provides-Extra: dev
- Requires-Dist: ruff; extra == "dev"
- Dynamic: license-file
-
  # **wxpath** - declarative web crawling with XPath

  [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
@@ -38,7 +19,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
  - [XPath 3.1](#xpath-31-by-default)
+ - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [Persistence and Caching](#persistence-and-caching)
+ - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](EXAMPLES.md)
@@ -47,6 +31,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
  - [Commercial support / consulting](#commercial-support--consulting)
+ - [Versioning](#versioning)
  - [License](#license)


@@ -54,17 +39,21 @@ NOTE: This project is in early development. Core concepts are stable, but the AP

  ```python
  import wxpath
+ from wxpath.settings import CRAWLER_SETTINGS
+
+ # Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

  # Crawl, extract fields, build a knowledge graph
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
- /map{
- 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
- 'url': string(base-uri(.)),
- 'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
- 'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
- }
+ ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
+ /map{
+ 'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+ 'url': string(base-uri(.)),
+ 'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+ 'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+ }
  """

  for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
@@ -195,6 +184,17 @@ path_expr = """
  # ...]
  ```

+ ## Progress Bar
+
+ **wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+ Enable it by passing `progress=True` to `engine.run(...)`, or to any of the `wxpath_async*(...)` functions.
+
+ ```python
+ items = wxpath.wxpath_async_blocking("...", progress=True)
+ > 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+ ```
+

  ## CLI

@@ -237,9 +237,46 @@ Command line options:
  --concurrency-per-host <concurrency> Number of concurrent fetches per host
  --header "Key:Value" Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
  --respect-robots [true|false] (Default: True) Respects robots.txt
+ --cache [true|false] (Default: False) Persist crawl results to a local database
  ```


+ ## Persistence and Caching
+
+ **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs and need to pause the crawl, change extraction expressions, or otherwise restart the crawl.
+
+ **wxpath** supports two backends: SQLite and Redis. SQLite is great for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+ To use it, install the appropriate optional dependency:
+
+ ```bash
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+
+ Once the dependency is installed, you must enable the cache:
+
+ ```python
+ from wxpath.settings import SETTINGS
+
+ # To enable caching; sqlite is the default backend
+ SETTINGS.http.client.cache.enabled = True
+
+ # For the redis backend
+ SETTINGS.http.client.cache.enabled = True
+ SETTINGS.http.client.cache.backend = "redis"
+ SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+ # Run wxpath as usual
+ items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
+ ```
+
+
+ ## Settings
+
+ See [settings.py](src/wxpath/settings.py) for details of the available settings.
+
+
  ## Hooks (Experimental)

  **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +327,13 @@ Requires Python 3.10+.
  pip install wxpath
  ```

+ For persistence/caching, wxpath supports the following backends:
+
+ ```
+ pip install wxpath[cache-sqlite]
+ pip install wxpath[cache-redis]
+ ```
+

  ## More Examples

@@ -345,7 +389,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

- ### Guarantees/Goals
+ ### Goals

  - URLs are deduplicated on a best-effort, per-crawl basis.
  - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +400,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  The following features are not yet supported:

- - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
  - Strict result ordering
@@ -379,6 +422,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti

  If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

+
+ ## Versioning
+
+ **wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+ However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
  ## License

  MIT
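Taken together, the 0.4.0 README additions above (custom request headers via `CRAWLER_SETTINGS`, the optional response cache via `SETTINGS.http.client.cache.*`, and the new `progress` flag) compose as in the following sketch. This is an illustration assembled from the snippets in the diff, not an example shipped with the package, and it assumes the `cache-sqlite` extra is installed.

```python
# Illustrative sketch only: combines the settings and flags shown in the README diff.
import wxpath
from wxpath.settings import CRAWLER_SETTINGS, SETTINGS

# Polite User-Agent, as in the quick-start example
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

# Persist fetched responses; sqlite is the default backend
SETTINGS.http.client.cache.enabled = True

# Expression shape reused from the quick-start example above, trimmed to two map keys
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
    ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
    /map{
        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
        'url': string(base-uri(.))
    }
"""

# progress=True renders the tqdm progress bar added in 0.4.0
for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1, progress=True):
    print(item)
```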
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "wxpath"
- version = "0.3.0"
+ version = "0.4.0"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -16,10 +16,15 @@ license-files = ["LICENSE"]
  dependencies = [
  "lxml>=4.0",
  "elementpath>=5.0.0,<=5.0.3",
- "aiohttp>=3.8.0,<=3.12.15"
+ "aiohttp>=3.8.0,<=3.12.15",
+ "tqdm>=4.0.0"
  ]

  [project.optional-dependencies]
+ cache = ["aiohttp-client-cache>=0.14.0"]
+ cache-sqlite = ["aiohttp-client-cache[sqlite]"]
+ cache-redis = ["aiohttp-client-cache[redis]"]
+
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
  dev = ["ruff"]

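All three of the new `cache*` extras above resolve to the third-party `aiohttp-client-cache` package. As a hedged illustration (this is not code from wxpath itself), a consumer can probe whether the optional backend is importable before enabling the cache settings; the class name below comes from aiohttp-client-cache's public API, not from this diff.

```python
# Illustration only: the cache extras install aiohttp-client-cache, so its backends
# can be detected with a guarded import before flipping the cache settings on.
try:
    from aiohttp_client_cache import SQLiteBackend  # present after `pip install wxpath[cache-sqlite]`
except ImportError:
    SQLiteBackend = None

if SQLiteBackend is None:
    print("cache extra not installed; the crawl will run without persistence")
```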
@@ -6,6 +6,7 @@ from wxpath.core import parser as wxpath_parser
  from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
  from wxpath.hooks import builtin, registry
  from wxpath.http.client.crawler import Crawler
+ from wxpath.settings import SETTINGS
  from wxpath.util.serialize import simplify


@@ -15,9 +16,11 @@ def main():
  arg_parser.add_argument("expression", help="The wxpath expression")
  arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
  # debug
- arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
+ arg_parser.add_argument("--debug", action="store_true",
+ help="Debug mode. Provides verbose runtime output and information")
  # verbose
- arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+ arg_parser.add_argument("--verbose", action="store_true",
+ help="Verbose mode. Prints CLI level information")

  arg_parser.add_argument(
  "--concurrency",
@@ -44,17 +47,27 @@ def main():
  help="Respect robots.txt",
  default=True
  )
+ arg_parser.add_argument(
+ "--cache",
+ action="store_true",
+ help="Use cache",
+ default=False
+ )
+ arg_parser.add_argument(
+ "--cache-backend",
+ type=str,
+ help="Cache backend. Possible values: redis, sqlite",
+ default="sqlite"
+ )
+ arg_parser.add_argument(
+ "--cache-db-path-or-url",
+ type=str,
+ help="Path to cache database",
+ default="cache.db"
+ )

  args = arg_parser.parse_args()

- if args.verbose:
- segments = wxpath_parser.parse(args.expression)
- print("parsed expression:\n\nSegments([")
- for s in segments:
- print(f"\t{s},")
- print("])")
- print()
-
  if args.debug:
  from wxpath import configure_logging
  configure_logging('DEBUG')
@@ -72,6 +85,29 @@ def main():
  print(f"Using custom headers: {custom_headers}")
  print()

+ if args.cache:
+ SETTINGS.http.client.cache.enabled = True
+ if args.cache_backend == "redis":
+ SETTINGS.http.client.cache.backend = "redis"
+ SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+ elif args.cache_backend == "sqlite":
+ SETTINGS.http.client.cache.backend = "sqlite"
+ SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+ if args.verbose:
+ print(f"Using concurrency: {args.concurrency}")
+ print(f"Using concurrency per host: {args.concurrency_per_host}")
+ print(f"Using respect robots: {args.respect_robots}")
+ print(f"Using cache: {args.cache}")
+
+ segments = wxpath_parser.parse(args.expression)
+ print("parsed expression:\n\nSegments([")
+ for s in segments:
+ print(f"\t{s},")
+ print("])")
+ print()
+ print()
+
  crawler = Crawler(
  concurrency=args.concurrency,
  per_host=args.concurrency_per_host,
@@ -81,11 +117,20 @@ def main():
  engine = WXPathEngine(crawler=crawler)

  try:
- for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
+ for r in wxpath_async_blocking_iter(
+ path_expr=args.expression,
+ max_depth=args.depth,
+ engine=engine):
  clean = simplify(r)
  print(json.dumps(clean, ensure_ascii=False), flush=True)
  except BrokenPipeError:
- sys.exit(0)
+ if args.verbose:
+ print("Pipe broken.")
+
+ if args.verbose:
+ print("Done. Printing crawl stats")
+ print(crawler._stats)
+ sys.exit(0)


  if __name__ == "__main__":
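For reference, the block below sketches the programmatic equivalent of what the updated `main()` now wires up for a cached run: the cache settings, a `Crawler`, a `WXPathEngine`, and keyword arguments to `wxpath_async_blocking_iter`. It is a sketch under the assumption that `Crawler` accepts the two constructor arguments visible in this hunk; the concurrency numbers, cache path, and placeholder expression are illustrative values, not package defaults.

```python
# Rough programmatic equivalent of a cached CLI run (--cache --cache-backend sqlite),
# using only names that appear in the cli.py diff above.
import json

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler
from wxpath.settings import SETTINGS
from wxpath.util.serialize import simplify

SETTINGS.http.client.cache.enabled = True
SETTINGS.http.client.cache.backend = "sqlite"
SETTINGS.http.client.cache.sqlite.cache_name = "cache.db"  # placeholder path

crawler = Crawler(concurrency=8, per_host=2)  # example numbers, not CLI defaults
engine = WXPathEngine(crawler=crawler)

path_expr = "..."  # any wxpath expression, e.g. the quick-start example
for r in wxpath_async_blocking_iter(path_expr=path_expr, max_depth=1, engine=engine):
    print(json.dumps(simplify(r), ensure_ascii=False), flush=True)
```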
@@ -5,6 +5,7 @@ from collections import deque
  from typing import Any, AsyncGenerator, Iterator

  from lxml.html import HtmlElement
+ from tqdm import tqdm

  from wxpath import patches # noqa: F401
  from wxpath.core import parser
@@ -157,7 +158,12 @@ class WXPathEngine(HookedEngineBase):
  if allow_redirects:
  self.allowed_response_codes |= {301, 302, 303, 307, 308}

- async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
+ async def run(
+ self,
+ expression: str,
+ max_depth: int,
+ progress: bool = False
+ ) -> AsyncGenerator[Any, None]:
  """Execute a wxpath expression concurrently and yield results.

  Builds and drives a BFS-like crawl pipeline that honors robots rules,
@@ -166,6 +172,7 @@ class WXPathEngine(HookedEngineBase):
  Args:
  expression: WXPath expression string to evaluate.
  max_depth: Maximum crawl depth to follow for url hops.
+ progress: Whether to display a progress bar.

  Yields:
  Extracted values produced by the expression (HTML elements or
@@ -182,6 +189,12 @@ class WXPathEngine(HookedEngineBase):
  # the current state of the engine.
  return queue.empty() and pending_tasks <= 0

+ total_yielded = 0
+ if progress:
+ pbar = tqdm(total=0)
+ else:
+ pbar = None
+
  async with self.crawler as crawler:
  async def submitter():
  nonlocal pending_tasks
@@ -219,12 +232,17 @@ class WXPathEngine(HookedEngineBase):
  depth=seed_task.depth,
  max_depth=max_depth,
  queue=queue,
+ pbar=pbar,
  ):
  yield await self.post_extract_hooks(output)

  # While looping asynchronous generators, you MUST make sure
  # to check terminal conditions before re-iteration.
  async for resp in crawler:
+ if pbar is not None:
+ pbar.update(1)
+ pbar.refresh()
+
  task = inflight.pop(resp.request.url, None)
  pending_tasks -= 1

@@ -273,10 +291,18 @@ class WXPathEngine(HookedEngineBase):
  depth=task.depth,
  max_depth=max_depth,
  queue=queue,
- ):
+ pbar=pbar
+ ):
+ total_yielded += 1
+ if pbar is not None:
+ pbar.set_postfix(yielded=total_yielded, depth=task.depth,)

  yield await self.post_extract_hooks(output)
  else:
+ total_yielded += 1
+ if pbar is not None:
+ pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
+
  yield await self.post_extract_hooks(elem)

  # Termination condition
@@ -287,6 +313,9 @@ class WXPathEngine(HookedEngineBase):
  with contextlib.suppress(asyncio.CancelledError):
  await submit_task

+ if pbar is not None:
+ pbar.close()
+
  async def _process_pipeline(
  self,
  task: CrawlTask,
@@ -294,6 +323,7 @@
  depth: int,
  max_depth: int,
  queue: asyncio.Queue[CrawlTask],
+ pbar: tqdm = None
  ) -> AsyncGenerator[Any, None]:
  """Process a queue of intents for a single crawl branch.

@@ -331,9 +361,10 @@
  elif isinstance(intent, CrawlIntent):
  next_depth = task.depth + 1
  # if intent.url not in self.seen_urls and next_depth <= max_depth:
- if next_depth <= max_depth:
+ if next_depth <= max_depth and intent.url not in self.seen_urls:
  # self.seen_urls.add(intent.url)
  log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
+
  queue.put_nowait(
  CrawlTask(
  elem=None,
@@ -343,6 +374,9 @@
  backlink=task.url,
  )
  )
+ if pbar is not None:
+ pbar.total += 1
+ pbar.refresh()

  elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
  # immediately traverse the extraction
@@ -351,18 +385,20 @@
  mini_queue.append((elem, next_segments))


- def wxpath_async(path_expr: str,
- max_depth: int,
+ def wxpath_async(path_expr: str,
+ max_depth: int,
+ progress: bool = False,
  engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
  if engine is None:
  engine = WXPathEngine()
- return engine.run(path_expr, max_depth)
+ return engine.run(path_expr, max_depth, progress=progress)


  ##### ASYNC IN SYNC #####
  def wxpath_async_blocking_iter(
  path_expr: str,
  max_depth: int = 1,
+ progress: bool = False,
  engine: WXPathEngine | None = None,
  ) -> Iterator[Any]:
  """Evaluate a wxpath expression using concurrent breadth-first traversal.
@@ -383,7 +419,7 @@ def wxpath_async_blocking_iter(
  """
  loop = asyncio.new_event_loop()
  asyncio.set_event_loop(loop)
- agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)
+ agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress, engine=engine)

  try:
  while True:
@@ -399,8 +435,10 @@
  def wxpath_async_blocking(
  path_expr: str,
  max_depth: int = 1,
+ progress: bool = False,
  engine: WXPathEngine | None = None,
  ) -> list[Any]:
- return list(
- wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine)
- )
+ return list(wxpath_async_blocking_iter(path_expr,
+ max_depth=max_depth,
+ progress=progress,
+ engine=engine))
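Finally, since this diff threads the new `progress` keyword through the async entry points, here is a hedged sketch of consuming `wxpath_async` from an existing asyncio program (unlike `wxpath_async_blocking_iter`, which creates its own event loop). The import path mirrors the module changed in this diff; whether the function is also re-exported at the package root is not shown here.

```python
# Sketch only: drive the async generator returned by engine.run() through
# wxpath_async, using the progress flag added in this version.
import asyncio

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async


async def main() -> None:
    engine = WXPathEngine()
    # progress=True shows the tqdm bar; keep it False when another component
    # owns the terminal output. "..." stands in for a real wxpath expression.
    async for item in wxpath_async("...", max_depth=1, progress=True, engine=engine):
        print(item)


asyncio.run(main())
```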