wxpath 0.4.0.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {wxpath-0.4.0 → wxpath-0.5.0}/PKG-INFO +123 -19
  2. wxpath-0.4.0/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.0/README.md +98 -45
  3. {wxpath-0.4.0 → wxpath-0.5.0}/pyproject.toml +18 -1
  4. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/__init__.py +2 -0
  5. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/cli.py +6 -0
  6. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/models.py +1 -0
  7. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/ops.py +9 -12
  8. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/parser.py +92 -23
  9. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/engine.py +79 -8
  10. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/helpers.py +6 -3
  11. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/__init__.py +1 -1
  12. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/crawler.py +19 -7
  13. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/request.py +1 -1
  14. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/response.py +7 -1
  15. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/retry.py +2 -2
  16. wxpath-0.5.0/src/wxpath/integrations/langchain/__init__.py +0 -0
  17. wxpath-0.5.0/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
  18. wxpath-0.5.0/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
  19. wxpath-0.5.0/src/wxpath/integrations/langchain/loader.py +60 -0
  20. wxpath-0.5.0/src/wxpath/patches.py +273 -0
  21. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/settings.py +3 -1
  22. wxpath-0.5.0/src/wxpath/tui.py +1204 -0
  23. wxpath-0.5.0/src/wxpath/tui_settings.py +151 -0
  24. wxpath-0.5.0/src/wxpath/util/__init__.py +0 -0
  25. wxpath-0.5.0/src/wxpath/util/cleaners.py +31 -0
  26. wxpath-0.5.0/src/wxpath/util/common_paths.py +22 -0
  27. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/util/logging.py +3 -7
  28. wxpath-0.4.0/README.md → wxpath-0.5.0/src/wxpath.egg-info/PKG-INFO +148 -18
  29. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/SOURCES.txt +9 -0
  30. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/entry_points.txt +1 -0
  31. wxpath-0.5.0/src/wxpath.egg-info/requires.txt +43 -0
  32. wxpath-0.4.0/src/wxpath/patches.py +0 -63
  33. wxpath-0.4.0/src/wxpath.egg-info/requires.txt +0 -20
  34. {wxpath-0.4.0 → wxpath-0.5.0}/LICENSE +0 -0
  35. {wxpath-0.4.0 → wxpath-0.5.0}/setup.cfg +0 -0
  36. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/__init__.py +0 -0
  37. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/dom.py +0 -0
  38. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/core/runtime/__init__.py +0 -0
  39. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/__init__.py +0 -0
  40. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/builtin.py +0 -0
  41. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/hooks/registry.py +0 -0
  42. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/__init__.py +0 -0
  43. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/client/cache.py +0 -0
  44. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/backoff.py +0 -0
  45. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/robots.py +0 -0
  46. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/policy/throttler.py +0 -0
  47. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/http/stats.py +0 -0
  48. {wxpath-0.4.0/src/wxpath/util → wxpath-0.5.0/src/wxpath/integrations}/__init__.py +0 -0
  49. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath/util/serialize.py +0 -0
  50. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
  51. {wxpath-0.4.0 → wxpath-0.5.0}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,9 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wxpath
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: wxpath - a declarative web crawler and data extractor
5
5
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
6
6
  License-Expression: MIT
7
+ Project-URL: Homepage, https://rodricios.github.io/wxpath
8
+ Project-URL: Documentation, https://rodricios.github.io/wxpath
9
+ Project-URL: Repository, https://github.com/rodricios/wxpath
10
+ Project-URL: Issues, https://github.com/rodricios/wxpath/issues
11
+ Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
7
12
  Requires-Python: >=3.10
8
13
  Description-Content-Type: text/markdown
9
14
  License-File: LICENSE
@@ -17,27 +22,112 @@ Provides-Extra: cache-sqlite
17
22
  Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
18
23
  Provides-Extra: cache-redis
19
24
  Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
25
+ Provides-Extra: llm
26
+ Requires-Dist: langchain>=1.0.0; extra == "llm"
27
+ Requires-Dist: langchain-core>=1.0.0; extra == "llm"
28
+ Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
29
+ Requires-Dist: langchain-community>=0.4.0; extra == "llm"
30
+ Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
31
+ Requires-Dist: chromadb>=1.0.0; extra == "llm"
32
+ Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
20
33
  Provides-Extra: test
21
34
  Requires-Dist: pytest>=7.0; extra == "test"
22
35
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
23
36
  Provides-Extra: dev
24
37
  Requires-Dist: ruff; extra == "dev"
38
+ Provides-Extra: docs
39
+ Requires-Dist: mkdocs>=1.5; extra == "docs"
40
+ Requires-Dist: mkdocs-material>=9.0; extra == "docs"
41
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
42
+ Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
43
+ Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
44
+ Requires-Dist: mkdocs-glightbox; extra == "docs"
45
+ Requires-Dist: pyyaml>=6.0; extra == "docs"
46
+ Provides-Extra: tui
47
+ Requires-Dist: textual>=1.0.0; extra == "tui"
48
+ Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
49
+ Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
25
50
  Dynamic: license-file
26
51
 
27
- # **wxpath** - declarative web crawling with XPath
52
+ # **wxpath** - declarative web graph traversal with XPath
28
53
 
29
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
54
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
55
+
56
+
57
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
58
+
59
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
60
+
61
+ ## Install
62
+
63
+ Requires Python 3.10+.
64
+
65
+ ```
66
+ pip install wxpath
67
+ # For TUI support
68
+ pip install wxpath[tui]
69
+ ```
70
+ ---
71
+
72
+
73
+ ## What is wxpath?
30
74
 
31
75
  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
32
76
 
33
- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.
77
+ This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
78
+
79
+ ```python
80
+ import wxpath
81
+
82
+ expr = "url('https://quotes.toscrape.com')//a/@href"
83
+
84
+ for link in wxpath.wxpath_async_blocking_iter(expr):
85
+     print(link)
86
+ ```
87
+
88
+
89
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
90
+
91
+ ```python
92
+ import wxpath
93
+
94
+ path_expr = """
95
+ url('https://quotes.toscrape.com')
96
+ ///url(//a/@href)
97
+ //a/@href
98
+ """
99
+
100
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
101
+     print(item)
102
+ ```
103
+
104
+
105
+ ## Why wxpath?
106
+
107
+ Most web scrapers force you to write crawl control flow first, and extraction second.
108
+
109
+ **wxpath** converges those two steps into one:
110
+ - **You describe traversal declaratively**
111
+ - **Extraction is expressed inline**
112
+ - **The engine handles scheduling, concurrency, and deduplication**
113
+
114
+
115
+ ### RAG-Ready Output
116
+
117
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
118
+
34
119
 
35
- NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
120
+ ### Deterministic
36
121
 
122
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
123
+
124
+ ## Documentation (WIP)
125
+
126
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
37
127
 
38
128
  ## Contents
39
129
 
40
- - [Example](#example)
130
+ - [Example: Knowledge Graph](#example)
41
131
  - [Language Design](DESIGN.md)
42
132
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
43
133
  - [General flow](#general-flow)
@@ -47,6 +137,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
47
137
  - [XPath 3.1](#xpath-31-by-default)
48
138
  - [Progress Bar](#progress-bar)
49
139
  - [CLI](#cli)
140
+ - [TUI](#tui)
50
141
  - [Persistence and Caching](#persistence-and-caching)
51
142
  - [Settings](#settings)
52
143
  - [Hooks (Experimental)](#hooks-experimental)
@@ -56,7 +147,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
56
147
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
57
148
  - [Project Philosophy](#project-philosophy)
58
149
  - [Warnings](#warnings)
59
- - [Commercial support / consulting](#commercial-support--consulting)
150
+ - [Commercial support/consulting](#commercial-supportconsulting)
60
151
  - [Versioning](#versioning)
61
152
  - [License](#license)
62
153
 
@@ -73,7 +164,11 @@ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.co
73
164
  # Crawl, extract fields, build a knowledge graph
74
165
  path_expr = """
75
166
  url('https://en.wikipedia.org/wiki/Expression_language')
76
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
167
+ ///url(
168
+ //main//a/@href[
169
+ starts-with(., '/wiki/') and not(contains(., ':'))
170
+ ]
171
+ )
77
172
  /map{
78
173
  'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
79
174
  'url': string(base-uri(.)),
@@ -86,15 +181,6 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
86
181
      print(item)
87
182
  ```
88
183
 
89
- Output:
90
-
91
- ```python
92
- map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
93
- map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
94
- map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
95
- ...
96
- ```
97
-
98
184
  **Note:** Some sites (including Wikipedia) may block requests without proper headers.
99
185
  See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
100
186
 
@@ -266,12 +352,17 @@ Command line options:
266
352
  --cache [true|false] (Default: False) Persist crawl results to a local database
267
353
  ```
268
354
 
355
+ ## TUI
356
+
357
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
358
+
359
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
269
360
 
270
361
  ## Persistence and Caching
271
362
 
272
363
  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
273
364
 
274
- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
365
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
275
366
 
276
367
  To use, you must install the appropriate optional dependency:
277
368
 
@@ -406,6 +497,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
406
497
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
407
498
  ```
408
499
 
500
+ ### Runtime API (`wxpath_async*`) options
501
+
502
+ - `max_depth`: int = 1
503
+ - `progress`: bool = False
504
+ - `engine`: WXPathEngine | None = None
505
+ - `yield_errors`: bool = False
506
+
507
+
508
+ ### Settings
509
+ You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
510
+
409
511
 
410
512
  ## Project Philosophy
411
513
 
@@ -433,13 +535,15 @@ The following features are not yet supported:
433
535
 
434
536
  ## WARNINGS!!!
435
537
 
538
+ This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
539
+
436
540
  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
437
541
  - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
438
542
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
439
543
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
440
544
 
441
545
 
442
- ## Commercial support / consulting
546
+ ## Commercial support/consulting
443
547
 
444
548
  If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
445
549
 
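The metadata changes above introduce three new optional dependency groups (`llm`, `docs`, `tui`) alongside the existing cache extras. For illustration only (not part of the diff), the new extras would typically be installed in the same style as the README's existing install block:

```
pip install "wxpath[tui]"   # Textual-based terminal interface
pip install "wxpath[llm]"   # LangChain/Chroma integration
pip install "wxpath[docs]"  # MkDocs documentation toolchain
```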
@@ -1,43 +1,82 @@
1
- Metadata-Version: 2.4
2
- Name: wxpath
3
- Version: 0.4.0
4
- Summary: wxpath - a declarative web crawler and data extractor
5
- Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
6
- License-Expression: MIT
7
- Requires-Python: >=3.10
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: lxml>=4.0
11
- Requires-Dist: elementpath<=5.0.3,>=5.0.0
12
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
13
- Requires-Dist: tqdm>=4.0.0
14
- Provides-Extra: cache
15
- Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
16
- Provides-Extra: cache-sqlite
17
- Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
18
- Provides-Extra: cache-redis
19
- Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
20
- Provides-Extra: test
21
- Requires-Dist: pytest>=7.0; extra == "test"
22
- Requires-Dist: pytest-asyncio>=0.23; extra == "test"
23
- Provides-Extra: dev
24
- Requires-Dist: ruff; extra == "dev"
25
- Dynamic: license-file
26
-
27
- # **wxpath** - declarative web crawling with XPath
28
-
29
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
1
+ # **wxpath** - declarative web graph traversal with XPath
2
+
3
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
4
+
5
+
6
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
7
+
8
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
9
+
10
+ ## Install
11
+
12
+ Requires Python 3.10+.
13
+
14
+ ```
15
+ pip install wxpath
16
+ # For TUI support
17
+ pip install wxpath[tui]
18
+ ```
19
+ ---
20
+
21
+
22
+ ## What is wxpath?
30
23
 
31
24
  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
32
25
 
33
- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.
26
+ This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
34
27
 
35
- NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
28
+ ```python
29
+ import wxpath
36
30
 
31
+ expr = "url('https://quotes.toscrape.com')//a/@href"
32
+
33
+ for link in wxpath.wxpath_async_blocking_iter(expr):
34
+     print(link)
35
+ ```
36
+
37
+
38
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
39
+
40
+ ```python
41
+ import wxpath
42
+
43
+ path_expr = """
44
+ url('https://quotes.toscrape.com')
45
+ ///url(//a/@href)
46
+ //a/@href
47
+ """
48
+
49
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
50
+     print(item)
51
+ ```
52
+
53
+
54
+ ## Why wxpath?
55
+
56
+ Most web scrapers force you to write crawl control flow first, and extraction second.
57
+
58
+ **wxpath** converges those two steps into one:
59
+ - **You describe traversal declaratively**
60
+ - **Extraction is expressed inline**
61
+ - **The engine handles scheduling, concurrency, and deduplication**
62
+
63
+
64
+ ### RAG-Ready Output
65
+
66
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
67
+
68
+
69
+ ### Deterministic
70
+
71
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
72
+
73
+ ## Documentation (WIP)
74
+
75
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
37
76
 
38
77
  ## Contents
39
78
 
40
- - [Example](#example)
79
+ - [Example: Knowledge Graph](#example)
41
80
  - [Language Design](DESIGN.md)
42
81
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
43
82
  - [General flow](#general-flow)
@@ -47,6 +86,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
47
86
  - [XPath 3.1](#xpath-31-by-default)
48
87
  - [Progress Bar](#progress-bar)
49
88
  - [CLI](#cli)
89
+ - [TUI](#tui)
50
90
  - [Persistence and Caching](#persistence-and-caching)
51
91
  - [Settings](#settings)
52
92
  - [Hooks (Experimental)](#hooks-experimental)
@@ -56,7 +96,7 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
56
96
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
57
97
  - [Project Philosophy](#project-philosophy)
58
98
  - [Warnings](#warnings)
59
- - [Commercial support / consulting](#commercial-support--consulting)
99
+ - [Commercial support/consulting](#commercial-supportconsulting)
60
100
  - [Versioning](#versioning)
61
101
  - [License](#license)
62
102
 
@@ -73,7 +113,11 @@ CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.co
73
113
  # Crawl, extract fields, build a knowledge graph
74
114
  path_expr = """
75
115
  url('https://en.wikipedia.org/wiki/Expression_language')
76
- ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
116
+ ///url(
117
+ //main//a/@href[
118
+ starts-with(., '/wiki/') and not(contains(., ':'))
119
+ ]
120
+ )
77
121
  /map{
78
122
  'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
79
123
  'url': string(base-uri(.)),
@@ -86,15 +130,6 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
86
130
      print(item)
87
131
  ```
88
132
 
89
- Output:
90
-
91
- ```python
92
- map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
93
- map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
94
- map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
95
- ...
96
- ```
97
-
98
133
  **Note:** Some sites (including Wikipedia) may block requests without proper headers.
99
134
  See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
100
135
 
@@ -266,12 +301,17 @@ Command line options:
266
301
  --cache [true|false] (Default: False) Persist crawl results to a local database
267
302
  ```
268
303
 
304
+ ## TUI
305
+
306
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
307
+
308
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.
269
309
 
270
310
  ## Persistence and Caching
271
311
 
272
312
  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
273
313
 
274
- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
314
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
275
315
 
276
316
  To use, you must install the appropriate optional dependency:
277
317
 
@@ -406,6 +446,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
406
446
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
407
447
  ```
408
448
 
449
+ ### Runtime API (`wxpath_async*`) options
450
+
451
+ - `max_depth`: int = 1
452
+ - `progress`: bool = False
453
+ - `engine`: WXPathEngine | None = None
454
+ - `yield_errors`: bool = False
455
+
456
+
457
+ ### Settings
458
+ You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
459
+
409
460
 
410
461
  ## Project Philosophy
411
462
 
@@ -433,13 +484,15 @@ The following features are not yet supported:
433
484
 
434
485
  ## WARNINGS!!!
435
486
 
487
+ This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
488
+
436
489
  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
437
490
  - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
438
491
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
439
492
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
440
493
 
441
494
 
442
- ## Commercial support / consulting
495
+ ## Commercial support/consulting
443
496
 
444
497
  If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
445
498
 
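The README diff above documents four options for the `wxpath_async*` entry points (`max_depth`, `progress`, `engine`, `yield_errors`). A minimal sketch of passing them; the option names come from the diff, the values and comments are illustrative:

```python
import wxpath

expr = "url('https://quotes.toscrape.com')//a/@href"

for item in wxpath.wxpath_async_blocking_iter(
    expr,
    max_depth=2,        # crawl depth limit
    progress=True,      # presumably the tqdm progress bar described under "Progress Bar"
    yield_errors=True,  # assumed to surface fetch/parse errors in the result stream
):
    print(item)
```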
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "wxpath"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "wxpath - a declarative web crawler and data extractor"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -13,6 +13,7 @@ authors = [
13
13
  ]
14
14
  license = "MIT"
15
15
  license-files = ["LICENSE"]
16
+
16
17
  dependencies = [
17
18
  "lxml>=4.0",
18
19
  "elementpath>=5.0.0,<=5.0.3",
@@ -20,16 +21,32 @@ dependencies = [
20
21
  "tqdm>=4.0.0"
21
22
  ]
22
23
 
24
+ [project.urls]
25
+ Homepage = "https://rodricios.github.io/wxpath"
26
+ Documentation = "https://rodricios.github.io/wxpath"
27
+ Repository = "https://github.com/rodricios/wxpath"
28
+ Issues = "https://github.com/rodricios/wxpath/issues"
29
+ Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
30
+
31
+
23
32
  [project.optional-dependencies]
24
33
  cache = ["aiohttp-client-cache>=0.14.0"]
25
34
  cache-sqlite = ["aiohttp-client-cache[sqlite]"]
26
35
  cache-redis = ["aiohttp-client-cache[redis]"]
27
36
 
37
+ # langchain langchain-ollama langchain-chroma chromadb
38
+ llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
39
+ "langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
40
+ "langchain-text-splitters>=1.1.0"]
41
+
28
42
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
29
43
  dev = ["ruff"]
44
+ docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
45
+ tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
30
46
 
31
47
  [project.scripts]
32
48
  wxpath = "wxpath.cli:main"
49
+ wxpath-tui = "wxpath.tui:main"
33
50
 
34
51
  [tool.pytest.ini_options]
35
52
  minversion = "6.0"
@@ -1,3 +1,4 @@
1
+ from . import settings
1
2
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
2
3
  from .util.logging import configure_logging
3
4
 
@@ -6,4 +7,5 @@ __all__ = [
6
7
  'wxpath_async_blocking',
7
8
  'wxpath_async_blocking_iter',
8
9
  'configure_logging',
10
+ 'settings',
9
11
  ]
@@ -47,6 +47,11 @@ def main():
47
47
  help="Respect robots.txt",
48
48
  default=True
49
49
  )
50
+ arg_parser.add_argument(
51
+ "--insecure",
52
+ action="store_true",
53
+ help="Disable SSL certificate verification (use for sites with broken chains)",
54
+ )
50
55
  arg_parser.add_argument(
51
56
  "--cache",
52
57
  action="store_true",
@@ -112,6 +117,7 @@ def main():
112
117
  concurrency=args.concurrency,
113
118
  per_host=args.concurrency_per_host,
114
119
  respect_robots=args.respect_robots,
120
+ verify_ssl=not args.insecure,
115
121
  headers=custom_headers
116
122
  )
117
123
  engine = WXPathEngine(crawler=crawler)
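The CLI hunk above adds an `--insecure` flag that is forwarded as `verify_ssl=not args.insecure` to the crawler. An illustrative invocation only; the positional expression argument is assumed from the README's CLI section and is not shown in this hunk:

```
wxpath "url('https://self-signed.example.test')//a/@href" --insecure --cache
```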
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):
61
61
 
62
62
  @dataclass(slots=True)
63
63
  class ExtractIntent(ProcessIntent):
64
+ """TODO: May be redundant with ProcessIntent?"""
64
65
  pass
65
66
 
66
67
 
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
19
19
  Binary,
20
20
  Call,
21
21
  ContextItem,
22
+ Depth,
22
23
  Segment,
23
24
  Segments,
24
25
  String,
@@ -78,7 +79,10 @@ def get_operator(
78
79
 
79
80
 
80
81
  @register('url', (String,))
82
+ @register('url', (String, Depth))
81
83
  @register('url', (String, Xpath))
84
+ @register('url', (String, Depth, Xpath))
85
+ @register('url', (String, Xpath, Depth))
82
86
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
83
87
  curr_segments: list[Url | Xpath],
84
88
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
87
91
 
88
92
  next_segments = curr_segments[1:]
89
93
 
90
- if len(url_call.args) == 2:
94
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
95
+ # that look like `url('...', follow=//a/@href)`
96
+ if isinstance(url_call, UrlCrawl):
97
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
91
98
  _segments = [
92
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
99
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
93
100
  ] + next_segments
94
101
 
95
102
  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
112
119
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
113
120
  base_url = getattr(curr_elem, 'base_url', None)
114
121
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
115
-
116
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
117
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
118
- # increment after each url*() hop
119
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
120
- expr = expr.replace('wx:backlink()', _backlink_str)
121
- expr = expr.replace('wx:backlink(.)', _backlink_str)
122
- expr = expr.replace('wx:depth()', _depth_str)
123
- expr = expr.replace('wx:depth(.)', _depth_str)
124
-
125
122
  elems = curr_elem.xpath3(expr)
126
123
 
127
124
  next_segments = curr_segments[1:]
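The ops.py hunk above registers additional `url(...)` argument shapes (`Depth`, `Xpath`, and their combinations) and notes that expressions like `url('...', follow=//a/@href)` are now expected to arrive as `UrlCrawl` nodes. A speculative sketch of the two-argument form, with the surface syntax taken from the in-code comment rather than documented behaviour:

```python
import wxpath

# Assumed syntax per the code comment: the follow XPath selects which links to
# crawl from the seed page before the trailing extraction path is applied.
expr = (
    "url('https://quotes.toscrape.com', follow=//li[@class='next']/a/@href)"
    "//span[@class='text']/text()"
)

for quote in wxpath.wxpath_async_blocking_iter(expr, max_depth=2):
    print(quote)
```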