wxpath 0.4.1__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {wxpath-0.4.1 → wxpath-0.5.1}/PKG-INFO +73 -9
  2. wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.1/README.md +46 -34
  3. {wxpath-0.4.1 → wxpath-0.5.1}/pyproject.toml +20 -3
  4. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/__init__.py +2 -0
  5. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/cli.py +6 -0
  6. wxpath-0.5.1/src/wxpath/core/exceptions.py +53 -0
  7. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/models.py +1 -0
  8. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/ops.py +100 -19
  9. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/parser.py +94 -24
  10. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/engine.py +74 -10
  11. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/helpers.py +6 -3
  12. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/__init__.py +1 -1
  13. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/crawler.py +17 -5
  14. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/response.py +7 -1
  15. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/retry.py +2 -2
  16. wxpath-0.5.1/src/wxpath/integrations/langchain/__init__.py +0 -0
  17. wxpath-0.5.1/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
  18. wxpath-0.5.1/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
  19. wxpath-0.5.1/src/wxpath/integrations/langchain/loader.py +60 -0
  20. wxpath-0.5.1/src/wxpath/patches.py +273 -0
  21. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/settings.py +3 -1
  22. wxpath-0.5.1/src/wxpath/tui.py +1225 -0
  23. wxpath-0.5.1/src/wxpath/tui_settings.py +151 -0
  24. wxpath-0.5.1/src/wxpath/util/__init__.py +0 -0
  25. wxpath-0.5.1/src/wxpath/util/cleaners.py +31 -0
  26. wxpath-0.5.1/src/wxpath/util/common_paths.py +22 -0
  27. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/util/logging.py +3 -7
  28. wxpath-0.4.1/README.md → wxpath-0.5.1/src/wxpath.egg-info/PKG-INFO +97 -7
  29. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/SOURCES.txt +10 -0
  30. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/entry_points.txt +1 -0
  31. wxpath-0.5.1/src/wxpath.egg-info/requires.txt +44 -0
  32. wxpath-0.4.1/src/wxpath/patches.py +0 -63
  33. wxpath-0.4.1/src/wxpath.egg-info/requires.txt +0 -20
  34. {wxpath-0.4.1 → wxpath-0.5.1}/LICENSE +0 -0
  35. {wxpath-0.4.1 → wxpath-0.5.1}/setup.cfg +0 -0
  36. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/__init__.py +0 -0
  37. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/dom.py +0 -0
  38. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/core/runtime/__init__.py +0 -0
  39. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/__init__.py +0 -0
  40. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/builtin.py +0 -0
  41. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/hooks/registry.py +0 -0
  42. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/__init__.py +0 -0
  43. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/cache.py +0 -0
  44. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/client/request.py +0 -0
  45. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/backoff.py +0 -0
  46. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/robots.py +0 -0
  47. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/policy/throttler.py +0 -0
  48. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/http/stats.py +0 -0
  49. {wxpath-0.4.1/src/wxpath/util → wxpath-0.5.1/src/wxpath/integrations}/__init__.py +0 -0
  50. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath/util/serialize.py +0 -0
  51. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
  52. {wxpath-0.4.1 → wxpath-0.5.1}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,15 +1,20 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.4.1
+ Version: 0.5.1
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
+ Project-URL: Homepage, https://rodricios.github.io/wxpath
+ Project-URL: Documentation, https://rodricios.github.io/wxpath
+ Project-URL: Repository, https://github.com/rodricios/wxpath
+ Project-URL: Issues, https://github.com/rodricios/wxpath/issues
+ Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: lxml>=4.0
  Requires-Dist: elementpath<=5.0.3,>=5.0.0
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+ Requires-Dist: aiohttp<=4.0.0,>=3.8.0
  Requires-Dist: tqdm>=4.0.0
  Provides-Extra: cache
  Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
@@ -17,16 +22,56 @@ Provides-Extra: cache-sqlite
  Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
  Provides-Extra: cache-redis
  Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
+ Provides-Extra: llm
+ Requires-Dist: langchain>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-core>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-community>=0.4.0; extra == "llm"
+ Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
+ Requires-Dist: chromadb>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
  Provides-Extra: dev
  Requires-Dist: ruff; extra == "dev"
+ Requires-Dist: tox; extra == "dev"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs>=1.5; extra == "docs"
+ Requires-Dist: mkdocs-material>=9.0; extra == "docs"
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
+ Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
+ Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
+ Requires-Dist: mkdocs-glightbox; extra == "docs"
+ Requires-Dist: pyyaml>=6.0; extra == "docs"
+ Provides-Extra: tui
+ Requires-Dist: textual>=1.0.0; extra == "tui"
+ Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
+ Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
  Dynamic: license-file
 
- # **wxpath** - declarative web crawling with XPath
+ # **wxpath** - declarative web graph traversal with XPath
 
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
+
+
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
+
+ ## Install
+
+ Requires Python 3.10+.
+
+ ```
+ pip install wxpath
+ # For TUI support
+ pip install "wxpath[tui]"
+ ```
+ ---
+
+
+ ## What is wxpath?
 
  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +80,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
  ```python
  import wxpath
 
- expr = "url('https://example.com')//a/@href"
+ expr = "url('https://quotes.toscrape.com')//a/@href"
 
  for link in wxpath.wxpath_async_blocking_iter(expr):
  print(link)
  ```
 
 
- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
  ```python
  import wxpath
@@ -62,15 +107,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
  Most web scrapers force you to write crawl control flow first, and extraction second.
 
- **wxpath** inverts that:
+ **wxpath** converges those two steps into one:
  - **You describe traversal declaratively**
  - **Extraction is expressed inline**
  - **The engine handles scheduling, concurrency, and deduplication**
 
 
+ ### RAG-Ready Output
+
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+ ### Deterministic
+
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+ ## Documentation (WIP)
+
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
  ## Contents
 
- - [Example](#example)
+ - [Example: Knowledge Graph](#example)
  - [Language Design](DESIGN.md)
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
@@ -80,6 +138,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
  - [XPath 3.1](#xpath-31-by-default)
  - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [TUI](#tui)
  - [Persistence and Caching](#persistence-and-caching)
  - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +353,17 @@ Command line options:
  --cache [true|false] (Default: False) Persist crawl results to a local database
  ```
 
+ ## TUI
+
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
 
  ## Persistence and Caching
 
  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
  To use, you must install the appropriate optional dependency:
 
@@ -1,32 +1,25 @@
- Metadata-Version: 2.4
- Name: wxpath
- Version: 0.4.1
- Summary: wxpath - a declarative web crawler and data extractor
- Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
- License-Expression: MIT
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath<=5.0.3,>=5.0.0
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
- Requires-Dist: tqdm>=4.0.0
- Provides-Extra: cache
- Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
- Provides-Extra: cache-sqlite
- Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
- Provides-Extra: cache-redis
- Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
- Provides-Extra: test
- Requires-Dist: pytest>=7.0; extra == "test"
- Requires-Dist: pytest-asyncio>=0.23; extra == "test"
- Provides-Extra: dev
- Requires-Dist: ruff; extra == "dev"
- Dynamic: license-file
-
- # **wxpath** - declarative web crawling with XPath
-
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
+ # **wxpath** - declarative web graph traversal with XPath
+
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
+
+
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
+
+ ## Install
+
+ Requires Python 3.10+.
+
+ ```
+ pip install wxpath
+ # For TUI support
+ pip install "wxpath[tui]"
+ ```
+ ---
+
+
+ ## What is wxpath?
 
  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
@@ -35,14 +28,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
  ```python
  import wxpath
 
- expr = "url('https://example.com')//a/@href"
+ expr = "url('https://quotes.toscrape.com')//a/@href"
 
  for link in wxpath.wxpath_async_blocking_iter(expr):
  print(link)
  ```
 
 
- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:
 
  ```python
  import wxpath
@@ -62,15 +55,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
 
  Most web scrapers force you to write crawl control flow first, and extraction second.
 
- **wxpath** inverts that:
+ **wxpath** converges those two steps into one:
  - **You describe traversal declaratively**
  - **Extraction is expressed inline**
  - **The engine handles scheduling, concurrency, and deduplication**
 
 
+ ### RAG-Ready Output
+
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+ ### Deterministic
+
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+ ## Documentation (WIP)
+
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
  ## Contents
 
- - [Example](#example)
+ - [Example: Knowledge Graph](#example)
  - [Language Design](DESIGN.md)
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
@@ -80,6 +86,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
  - [XPath 3.1](#xpath-31-by-default)
  - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [TUI](#tui)
  - [Persistence and Caching](#persistence-and-caching)
  - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +301,17 @@ Command line options:
  --cache [true|false] (Default: False) Persist crawl results to a local database
  ```
 
+ ## TUI
+
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart) for more details.
 
  ## Persistence and Caching
 
  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
 
- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
 
  To use, you must install the appropriate optional dependency:
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "wxpath"
- version = "0.4.1"
+ version = "0.5.1"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -13,23 +13,40 @@ authors = [
  ]
  license = "MIT"
  license-files = ["LICENSE"]
+
  dependencies = [
  "lxml>=4.0",
  "elementpath>=5.0.0,<=5.0.3",
- "aiohttp>=3.8.0,<=3.12.15",
+ "aiohttp>=3.8.0,<=4.0.0",
  "tqdm>=4.0.0"
  ]
 
+ [project.urls]
+ Homepage = "https://rodricios.github.io/wxpath"
+ Documentation = "https://rodricios.github.io/wxpath"
+ Repository = "https://github.com/rodricios/wxpath"
+ Issues = "https://github.com/rodricios/wxpath/issues"
+ Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
+
+
  [project.optional-dependencies]
  cache = ["aiohttp-client-cache>=0.14.0"]
  cache-sqlite = ["aiohttp-client-cache[sqlite]"]
  cache-redis = ["aiohttp-client-cache[redis]"]
 
+ # langchain langchain-ollama langchain-chroma chromadb
+ llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
+ "langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
+ "langchain-text-splitters>=1.1.0"]
+
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
- dev = ["ruff"]
+ dev = ["ruff", "tox"]
+ docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
+ tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]
 
  [project.scripts]
  wxpath = "wxpath.cli:main"
+ wxpath-tui = "wxpath.tui:main"
 
  [tool.pytest.ini_options]
  minversion = "6.0"
@@ -1,3 +1,4 @@
+ from . import settings
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
  from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
  'wxpath_async_blocking',
  'wxpath_async_blocking_iter',
  'configure_logging',
+ 'settings',
  ]
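
The two `__init__.py` hunks above re-export the `settings` module at package level. A minimal usage sketch; the concrete options inside `wxpath.settings` are not shown in this diff, so none are assumed here:

```python
import wxpath

# 0.5.1 exposes the settings module directly on the package; individual
# option names are not visible in this diff, so only the module is referenced.
print(wxpath.settings)
```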
@@ -47,6 +47,11 @@ def main():
  help="Respect robots.txt",
  default=True
  )
+ arg_parser.add_argument(
+ "--insecure",
+ action="store_true",
+ help="Disable SSL certificate verification (use for sites with broken chains)",
+ )
  arg_parser.add_argument(
  "--cache",
  action="store_true",
@@ -112,6 +117,7 @@ def main():
  concurrency=args.concurrency,
  per_host=args.concurrency_per_host,
  respect_robots=args.respect_robots,
+ verify_ssl=not args.insecure,
  headers=custom_headers
  )
  engine = WXPathEngine(crawler=crawler)
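
The new `--insecure` flag maps to `verify_ssl=False` on the crawler object the CLI builds and hands to `WXPathEngine`. A rough Python-level sketch of the same configuration; the crawler class name and both import paths are assumptions based on the file layout above, not confirmed APIs:

```python
from wxpath.core.runtime.engine import WXPathEngine   # assumed import path
from wxpath.http.client.crawler import Crawler        # hypothetical class name/location

# Programmatic equivalent of `wxpath ... --insecure`: the keyword arguments
# mirror those visible in the hunk above, with SSL verification disabled.
crawler = Crawler(
    concurrency=4,
    per_host=2,
    respect_robots=True,
    verify_ssl=False,
)
engine = WXPathEngine(crawler=crawler)
```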
@@ -0,0 +1,53 @@
+ class XPathEvaluationError(Exception):
+ """Errors during XPath evaluation with elementpath."""
+
+ def __init__(
+ self,
+ message: str,
+ xpath: str,
+ base_url: str | None = None,
+ element_tag: str | None = None,
+ error_code: str | None = None, # XPath error codes like XPST0003
+ position: tuple[int, int] | None = None, # (line, column)
+ original_error: Exception | None = None
+ ):
+ context = {
+ "xpath": xpath,
+ "base_url": base_url,
+ "element_tag": element_tag,
+ "error_code": error_code,
+ "position": position,
+ }
+ if original_error:
+ context["original_error"] = str(original_error)
+ # Extract XPath error code if present (e.g., [err:XPST0003])
+ if hasattr(original_error, 'code'):
+ context["error_code"] = original_error.code
+
+ super().__init__(message, context)
+
+ def to_dict(self) -> dict:
+ return {
+ "message": self.message,
+ "xpath": self.xpath,
+ "base_url": self.base_url,
+ "element_tag": self.element_tag,
+ "error_code": self.error_code,
+ "position": self.position,
+ "original_error": self.original_error,
+ }
+
+
+ class XPathSyntaxError(XPathEvaluationError):
+ """Invalid XPath syntax."""
+ pass
+
+
+ class XPathTypeError(XPathEvaluationError):
+ """Type error in XPath expression."""
+ pass
+
+
+ class XPathRuntimeError(XPathEvaluationError):
+ """Runtime error during XPath evaluation."""
+ pass
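
A minimal sketch of how calling code might handle the new exception hierarchy. The class names and import path are taken from this diff (`ops.py` imports them the same way further down); that the blocking iterator propagates them to the caller is an assumption:

```python
import wxpath
from wxpath.core.exceptions import XPathEvaluationError, XPathSyntaxError

# Deliberately malformed XPath step (unclosed predicate) to provoke a syntax error.
expr = "url('https://quotes.toscrape.com')//a[@href"

try:
    for link in wxpath.wxpath_async_blocking_iter(expr):
        print(link)
except XPathSyntaxError as exc:
    message, context = exc.args  # (message, context) per the constructor above
    print("bad expression:", message, context.get("error_code"))
except XPathEvaluationError as exc:
    print("evaluation failed:", exc)
```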
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):
 
  @dataclass(slots=True)
  class ExtractIntent(ProcessIntent):
+ """TODO: May be redundant with ProcessIntent?"""
  pass
 
 
@@ -2,11 +2,25 @@ from typing import Callable, Iterable
  from urllib.parse import urljoin
 
  import elementpath
+ from elementpath import (
+ ElementPathError,
+ ElementPathSyntaxError as EPSyntaxError,
+ ElementPathTypeError as EPTypeError,
+ ElementPathZeroDivisionError,
+ ElementPathRuntimeError as EPRuntimeError,
+ MissingContextError,
+ )
  from elementpath.datatypes import AnyAtomicType
  from elementpath.xpath3 import XPath3Parser
  from lxml import html
 
  from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
+ from wxpath.core.exceptions import (
+ XPathEvaluationError,
+ XPathSyntaxError,
+ XPathTypeError,
+ XPathRuntimeError,
+ )
  from wxpath.core.models import (
  CrawlIntent,
  DataIntent,
@@ -19,6 +33,7 @@ from wxpath.core.parser import (
  Binary,
  Call,
  ContextItem,
+ Depth,
  Segment,
  Segments,
  String,
@@ -78,7 +93,10 @@ def get_operator(
 
 
  @register('url', (String,))
+ @register('url', (String, Depth))
  @register('url', (String, Xpath))
+ @register('url', (String, Depth, Xpath))
+ @register('url', (String, Xpath, Depth))
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
  curr_segments: list[Url | Xpath],
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +105,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
 
  next_segments = curr_segments[1:]
- if len(url_call.args) == 2:
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
+ # that look like `url('...', follow=//a/@href)`
+ if isinstance(url_call, UrlCrawl):
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
  _segments = [
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
  ] + next_segments
 
  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
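
Per the NOTE in the hunk above, the 0.5.1 parser emits a `UrlCrawl` node for expressions of the form `url('...', follow=<xpath>)`. A hedged sketch of what such an expression might look like from user code; the target site and the extraction tail are illustrative, and the exact `follow=` semantics are inferred from that comment rather than documented here:

```python
import wxpath

# Recursive crawl: follow the "next page" link while extracting quote text.
expr = (
    "url('https://quotes.toscrape.com', follow=//li[@class='next']/a/@href)"
    "//span[@class='text']/text()"
)

for quote in wxpath.wxpath_async_blocking_iter(expr, max_depth=2):
    print(quote)
```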
@@ -112,17 +133,52 @@ def _handle_xpath(curr_elem: html.HtmlElement,
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
  base_url = getattr(curr_elem, 'base_url', None)
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
- # increment after each url*() hop
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
- expr = expr.replace('wx:backlink()', _backlink_str)
- expr = expr.replace('wx:backlink(.)', _backlink_str)
- expr = expr.replace('wx:depth()', _depth_str)
- expr = expr.replace('wx:depth(.)', _depth_str)
-
- elems = curr_elem.xpath3(expr)
+
+ try:
+ elems = curr_elem.xpath3(expr)
+ except EPSyntaxError as e:
+ # Parse the error message to extract line/column if available
+ # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
+ raise XPathSyntaxError(
+ f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
+ xpath=expr,
+ base_url=base_url,
+ element_tag=curr_elem.tag,
+ original_error=e
+ ) from e
+ except EPTypeError as e:
+ raise XPathTypeError(
+ f"XPath type error: {str(e).split(': ', 1)[-1]}",
+ xpath=expr,
+ base_url=base_url,
+ element_tag=curr_elem.tag,
+ original_error=e
+ ) from e
+ except ElementPathZeroDivisionError as e:
+ raise XPathRuntimeError(
+ f"Division by zero in XPath: {expr}",
+ xpath=expr,
+ base_url=base_url,
+ element_tag=curr_elem.tag,
+ original_error=e
+ ) from e
+ except MissingContextError as e:
+ raise XPathRuntimeError(
+ f"XPath requires context but none provided: {expr}",
+ xpath=expr,
+ base_url=base_url,
+ element_tag=curr_elem.tag,
+ original_error=e
+ ) from e
+ except ElementPathError as e:
+ # Catch-all for other elementpath errors
+ raise XPathEvaluationError(
+ f"XPath evaluation failed: {e}",
+ xpath=expr,
+ base_url=base_url,
+ element_tag=curr_elem.tag,
+ original_error=e
+ ) from e
 
  next_segments = curr_segments[1:]
  for elem in elems:
@@ -259,12 +315,37 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
  base_url = getattr(curr_elem, 'base_url', None)
  next_segments = right
 
- results = elementpath.select(
- curr_elem,
- left.value,
- parser=XPath3Parser,
- item='' if curr_elem is None else None
- )
+ try:
+ results = elementpath.select(
+ curr_elem,
+ left.value,
+ parser=XPath3Parser,
+ item='' if curr_elem is None else None
+ )
+ except EPSyntaxError as e:
+ raise XPathSyntaxError(
+ f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
+ xpath=left.value,
+ base_url=base_url,
+ element_tag=getattr(curr_elem, 'tag', None),
+ original_error=e
+ ) from e
+ except EPTypeError as e:
+ raise XPathTypeError(
+ f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
+ xpath=left.value,
+ base_url=base_url,
+ element_tag=getattr(curr_elem, 'tag', None),
+ original_error=e
+ ) from e
+ except ElementPathError as e:
+ raise XPathEvaluationError(
+ f"XPath evaluation failed in binary operation: {e}",
+ xpath=left.value,
+ base_url=base_url,
+ element_tag=getattr(curr_elem, 'tag', None),
+ original_error=e
+ ) from e
 
  if isinstance(results, AnyAtomicType):
  results = [results]