wxpath 0.4.1.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {wxpath-0.4.1 → wxpath-0.5.0}/PKG-INFO +71 -8
  2. wxpath-0.4.1/src/wxpath.egg-info/PKG-INFO → wxpath-0.5.0/README.md +46 -34
  3. {wxpath-0.4.1 → wxpath-0.5.0}/pyproject.toml +18 -1
  4. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/__init__.py +2 -0
  5. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/cli.py +6 -0
  6. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/models.py +1 -0
  7. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/ops.py +9 -12
  8. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/parser.py +92 -23
  9. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/engine.py +36 -3
  10. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/helpers.py +6 -3
  11. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/__init__.py +1 -1
  12. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/crawler.py +17 -5
  13. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/response.py +7 -1
  14. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/retry.py +2 -2
  15. wxpath-0.5.0/src/wxpath/integrations/langchain/__init__.py +0 -0
  16. wxpath-0.5.0/src/wxpath/integrations/langchain/examples/basic_rag.py +85 -0
  17. wxpath-0.5.0/src/wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
  18. wxpath-0.5.0/src/wxpath/integrations/langchain/loader.py +60 -0
  19. wxpath-0.5.0/src/wxpath/patches.py +273 -0
  20. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/settings.py +3 -1
  21. wxpath-0.5.0/src/wxpath/tui.py +1204 -0
  22. wxpath-0.5.0/src/wxpath/tui_settings.py +151 -0
  23. wxpath-0.5.0/src/wxpath/util/__init__.py +0 -0
  24. wxpath-0.5.0/src/wxpath/util/cleaners.py +31 -0
  25. wxpath-0.5.0/src/wxpath/util/common_paths.py +22 -0
  26. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/util/logging.py +3 -7
  27. wxpath-0.4.1/README.md → wxpath-0.5.0/src/wxpath.egg-info/PKG-INFO +96 -7
  28. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/SOURCES.txt +9 -0
  29. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/entry_points.txt +1 -0
  30. wxpath-0.5.0/src/wxpath.egg-info/requires.txt +43 -0
  31. wxpath-0.4.1/src/wxpath/patches.py +0 -63
  32. wxpath-0.4.1/src/wxpath.egg-info/requires.txt +0 -20
  33. {wxpath-0.4.1 → wxpath-0.5.0}/LICENSE +0 -0
  34. {wxpath-0.4.1 → wxpath-0.5.0}/setup.cfg +0 -0
  35. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/__init__.py +0 -0
  36. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/dom.py +0 -0
  37. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/core/runtime/__init__.py +0 -0
  38. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/__init__.py +0 -0
  39. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/builtin.py +0 -0
  40. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/hooks/registry.py +0 -0
  41. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/__init__.py +0 -0
  42. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/cache.py +0 -0
  43. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/client/request.py +0 -0
  44. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/backoff.py +0 -0
  45. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/robots.py +0 -0
  46. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/policy/throttler.py +0 -0
  47. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/http/stats.py +0 -0
  48. {wxpath-0.4.1/src/wxpath/util → wxpath-0.5.0/src/wxpath/integrations}/__init__.py +0 -0
  49. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath/util/serialize.py +0 -0
  50. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
  51. {wxpath-0.4.1 → wxpath-0.5.0}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,9 +1,14 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.4.1
+ Version: 0.5.0
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
+ Project-URL: Homepage, https://rodricios.github.io/wxpath
+ Project-URL: Documentation, https://rodricios.github.io/wxpath
+ Project-URL: Repository, https://github.com/rodricios/wxpath
+ Project-URL: Issues, https://github.com/rodricios/wxpath/issues
+ Project-URL: Changelog, https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
@@ -17,16 +22,55 @@ Provides-Extra: cache-sqlite
  Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
  Provides-Extra: cache-redis
  Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
+ Provides-Extra: llm
+ Requires-Dist: langchain>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-core>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-ollama>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-community>=0.4.0; extra == "llm"
+ Requires-Dist: langchain-chroma>=1.0.0; extra == "llm"
+ Requires-Dist: chromadb>=1.0.0; extra == "llm"
+ Requires-Dist: langchain-text-splitters>=1.1.0; extra == "llm"
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
  Provides-Extra: dev
  Requires-Dist: ruff; extra == "dev"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs>=1.5; extra == "docs"
+ Requires-Dist: mkdocs-material>=9.0; extra == "docs"
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
+ Requires-Dist: mkdocs-macros-plugin>=1.0; extra == "docs"
+ Requires-Dist: mkdocs-resize-images>=1.0; extra == "docs"
+ Requires-Dist: mkdocs-glightbox; extra == "docs"
+ Requires-Dist: pyyaml>=6.0; extra == "docs"
+ Provides-Extra: tui
+ Requires-Dist: textual>=1.0.0; extra == "tui"
+ Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "tui"
+ Requires-Dist: aiohttp-client-cache[sqlite]; extra == "tui"
  Dynamic: license-file

- # **wxpath** - declarative web crawling with XPath
+ # **wxpath** - declarative web graph traversal with XPath

- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
+
+
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
+
+ ## Install
+
+ Requires Python 3.10+.
+
+ ```
+ pip install wxpath
+ # For TUI support
+ pip install wxpath[tui]
+ ```
+ ---
+
+
+ ## What is wxpath?

  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

@@ -35,14 +79,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
  ```python
  import wxpath

- expr = "url('https://example.com')//a/@href"
+ expr = "url('https://quotes.toscrape.com')//a/@href"

  for link in wxpath.wxpath_async_blocking_iter(expr):
  print(link)
  ```


- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:

  ```python
  import wxpath
@@ -62,15 +106,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):

  Most web scrapers force you to write crawl control flow first, and extraction second.

- **wxpath** inverts that:
+ **wxpath** converges those two steps into one:
  - **You describe traversal declaratively**
  - **Extraction is expressed inline**
  - **The engine handles scheduling, concurrency, and deduplication**


+ ### RAG-Ready Output
+
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+ ### Deterministic
+
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+ ## Documentation (WIP)
+
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
  ## Contents

- - [Example](#example)
+ - [Example: Knowledge Graph](#example)
  - [Language Design](DESIGN.md)
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
@@ -80,6 +137,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
  - [XPath 3.1](#xpath-31-by-default)
  - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [TUI](#tui)
  - [Persistence and Caching](#persistence-and-caching)
  - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +352,17 @@ Command line options:
  --cache [true|false] (Default: False) Persist crawl results to a local database
  ```

+ ## TUI
+
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.

  ## Persistence and Caching

  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.

- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.

  To use, you must install the appropriate optional dependency:

@@ -1,32 +1,25 @@
- Metadata-Version: 2.4
- Name: wxpath
- Version: 0.4.1
- Summary: wxpath - a declarative web crawler and data extractor
- Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
- License-Expression: MIT
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath<=5.0.3,>=5.0.0
- Requires-Dist: aiohttp<=3.12.15,>=3.8.0
- Requires-Dist: tqdm>=4.0.0
- Provides-Extra: cache
- Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
- Provides-Extra: cache-sqlite
- Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
- Provides-Extra: cache-redis
- Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
- Provides-Extra: test
- Requires-Dist: pytest>=7.0; extra == "test"
- Requires-Dist: pytest-asyncio>=0.23; extra == "test"
- Provides-Extra: dev
- Requires-Dist: ruff; extra == "dev"
- Dynamic: license-file
-
- # **wxpath** - declarative web crawling with XPath
-
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
+ # **wxpath** - declarative web graph traversal with XPath
+
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Documentation Status](https://img.shields.io/badge/documentation-green.svg)](https://rodricios.github.io/wxpath)
+
+
+ > NEW: [TUI](https://rodricios.github.io/wxpath/tui/quickstart.md) - Interactive terminal interface (powered by Textual) for testing wxpath expressions and exporting data.
+
+ ![Wxpath TUI Demo screenshot](docs/assets/images/demo1.jpg)
+
+ ## Install
+
+ Requires Python 3.10+.
+
+ ```
+ pip install wxpath
+ # For TUI support
+ pip install wxpath[tui]
+ ```
+ ---
+
+
+ ## What is wxpath?

  **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

@@ -35,14 +28,14 @@ This expression fetches a page, extracts links, and streams them concurrently -
  ```python
  import wxpath

- expr = "url('https://example.com')//a/@href"
+ expr = "url('https://quotes.toscrape.com')//a/@href"

  for link in wxpath.wxpath_async_blocking_iter(expr):
  print(link)
  ```


- By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform recursive (or paginated) web crawling and extraction:

  ```python
  import wxpath
@@ -62,15 +55,28 @@ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):

  Most web scrapers force you to write crawl control flow first, and extraction second.

- **wxpath** inverts that:
+ **wxpath** converges those two steps into one:
  - **You describe traversal declaratively**
  - **Extraction is expressed inline**
  - **The engine handles scheduling, concurrency, and deduplication**


+ ### RAG-Ready Output
+
+ Extract clean, structured JSON hierarchies directly from the graph - feed your LLMs signal, not noise. Refer to [LangChain Integration](https://rodricios.github.io/wxpath/api/integrations/langchain/) for more details.
+
+
+ ### Deterministic
+
+ **wxpath** is deterministic (read: not powered by LLMs). While we can't guarantee the network is stable, we can guarantee the traversal is.
+
+ ## Documentation (WIP)
+
+ Documentation is now available [here](https://rodricios.github.io/wxpath/).
+
  ## Contents

- - [Example](#example)
+ - [Example: Knowledge Graph](#example)
  - [Language Design](DESIGN.md)
  - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
@@ -80,6 +86,7 @@ Most web scrapers force you to write crawl control flow first, and extraction se
  - [XPath 3.1](#xpath-31-by-default)
  - [Progress Bar](#progress-bar)
  - [CLI](#cli)
+ - [TUI](#tui)
  - [Persistence and Caching](#persistence-and-caching)
  - [Settings](#settings)
  - [Hooks (Experimental)](#hooks-experimental)
@@ -294,12 +301,17 @@ Command line options:
  --cache [true|false] (Default: False) Persist crawl results to a local database
  ```

+ ## TUI
+
+ **wxpath** provides a terminal interface (TUI) for interactive expression testing and data extraction.
+
+ See [TUI Quickstart](https://rodricios.github.io/wxpath/tui/quickstart.md) for more details.

  ## Persistence and Caching

  **wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.

- **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will be encounter a warning if you `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+ **wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.

  To use, you must install the appropriate optional dependency:

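The README hunks above show the single-hop quick start but elide the body of the deeper `path_expr` example (only the `wxpath_async_blocking_iter(path_expr, max_depth=1)` call survives in the hunk context). Below is a minimal sketch of what such an expression can look like, assuming the `///url(...)` follow-links form the README and the ops.py changes refer to; the target site and XPath steps are illustrative, not copied from the package.

```python
import wxpath

# Illustrative deep-crawl expression (not taken from the package source):
# start at a seed page, keep following links matched by //a/@href,
# and extract the <title> of every page reached.
path_expr = (
    "url('https://quotes.toscrape.com')"
    "///url(//a/@href)"
    "//title/text()"
)

# max_depth bounds how many url(...) hops the engine performs,
# mirroring the call shown in the hunk headers above.
for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```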
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "wxpath"
- version = "0.4.1"
+ version = "0.5.0"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -13,6 +13,7 @@ authors = [
  ]
  license = "MIT"
  license-files = ["LICENSE"]
+
  dependencies = [
  "lxml>=4.0",
  "elementpath>=5.0.0,<=5.0.3",
@@ -20,16 +21,32 @@ dependencies = [
  "tqdm>=4.0.0"
  ]

+ [project.urls]
+ Homepage = "https://rodricios.github.io/wxpath"
+ Documentation = "https://rodricios.github.io/wxpath"
+ Repository = "https://github.com/rodricios/wxpath"
+ Issues = "https://github.com/rodricios/wxpath/issues"
+ Changelog = "https://github.com/rodricios/wxpath/blob/main/CHANGELOG.md"
+
+
  [project.optional-dependencies]
  cache = ["aiohttp-client-cache>=0.14.0"]
  cache-sqlite = ["aiohttp-client-cache[sqlite]"]
  cache-redis = ["aiohttp-client-cache[redis]"]

+ # langchain langchain-ollama langchain-chroma chromadb
+ llm = ["langchain>=1.0.0", "langchain-core>=1.0.0", "langchain-ollama>=1.0.0",
+ "langchain-community>=0.4.0", "langchain-chroma>=1.0.0", "chromadb>=1.0.0",
+ "langchain-text-splitters>=1.1.0"]
+
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
  dev = ["ruff"]
+ docs = ["mkdocs>=1.5", "mkdocs-material>=9.0", "mkdocstrings[python]>=0.24", "mkdocs-macros-plugin>=1.0", "mkdocs-resize-images>=1.0", "mkdocs-glightbox", "pyyaml>=6.0"]
+ tui = ["textual>=1.0.0", "aiohttp-client-cache>=0.14.0", "aiohttp-client-cache[sqlite]"]

  [project.scripts]
  wxpath = "wxpath.cli:main"
+ wxpath-tui = "wxpath.tui:main"

  [tool.pytest.ini_options]
  minversion = "6.0"
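The pyproject.toml hunk above adds `llm`, `docs`, and `tui` extras alongside the existing cache extras. A small, hedged way to confirm which extras and pins an installed copy actually advertises, using only the standard library (nothing here is wxpath-specific API):

```python
from importlib.metadata import metadata

# Read the installed package's metadata; Provides-Extra and Requires-Dist
# mirror the PKG-INFO entries shown earlier in this diff.
meta = metadata("wxpath")
print(meta.get_all("Provides-Extra"))

# List the pins that apply only to the new "tui" extra.
for req in meta.get_all("Requires-Dist") or []:
    if 'extra == "tui"' in req:
        print(req)
```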
@@ -1,3 +1,4 @@
+ from . import settings
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
  from .util.logging import configure_logging

@@ -6,4 +7,5 @@ __all__ = [
  'wxpath_async_blocking',
  'wxpath_async_blocking_iter',
  'configure_logging',
+ 'settings',
  ]
@@ -47,6 +47,11 @@ def main():
  help="Respect robots.txt",
  default=True
  )
+ arg_parser.add_argument(
+ "--insecure",
+ action="store_true",
+ help="Disable SSL certificate verification (use for sites with broken chains)",
+ )
  arg_parser.add_argument(
  "--cache",
  action="store_true",
@@ -112,6 +117,7 @@ def main():
  concurrency=args.concurrency,
  per_host=args.concurrency_per_host,
  respect_robots=args.respect_robots,
+ verify_ssl=not args.insecure,
  headers=custom_headers
  )
  engine = WXPathEngine(crawler=crawler)
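The new `--insecure` flag only flips a `verify_ssl` keyword when the CLI builds its crawler. Below is a sketch of doing the same from Python, assuming `Crawler` and `WXPathEngine` live in the modules touched by this release (`wxpath.http.client.crawler`, `wxpath.core.runtime.engine`) and accept the keyword arguments visible in the hunk; this is inferred from cli.py, not from documented public API.

```python
# Sketch only: import paths and kwargs are inferred from the cli.py hunk above.
from wxpath.http.client.crawler import Crawler
from wxpath.core.runtime.engine import WXPathEngine

crawler = Crawler(
    concurrency=4,
    per_host=2,
    respect_robots=True,
    verify_ssl=False,  # the programmatic equivalent of passing --insecure
)
engine = WXPathEngine(crawler=crawler)
```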
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):

  @dataclass(slots=True)
  class ExtractIntent(ProcessIntent):
+ """TODO: May be redundant with ProcessIntent?"""
  pass


@@ -19,6 +19,7 @@ from wxpath.core.parser import (
  Binary,
  Call,
  ContextItem,
+ Depth,
  Segment,
  Segments,
  String,
@@ -78,7 +79,10 @@ def get_operator(


  @register('url', (String,))
+ @register('url', (String, Depth))
  @register('url', (String, Xpath))
+ @register('url', (String, Depth, Xpath))
+ @register('url', (String, Xpath, Depth))
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
  curr_segments: list[Url | Xpath],
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,

  next_segments = curr_segments[1:]

- if len(url_call.args) == 2:
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
+ # that look like `url('...', follow=//a/@href)`
+ if isinstance(url_call, UrlCrawl):
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
  _segments = [
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
  ] + next_segments

  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
  base_url = getattr(curr_elem, 'base_url', None)
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
- # increment after each url*() hop
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
- expr = expr.replace('wx:backlink()', _backlink_str)
- expr = expr.replace('wx:backlink(.)', _backlink_str)
- expr = expr.replace('wx:depth()', _depth_str)
- expr = expr.replace('wx:depth(.)', _depth_str)
-
  elems = curr_elem.xpath3(expr)

  next_segments = curr_segments[1:]
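The extra `@register('url', ...)` signatures mean the `url()` operator now also dispatches when the parser supplies a `Depth` argument alongside the seed string and an optional follow XPath. A hedged sketch of the expression shapes those signatures correspond to, using the `follow=`/`depth=` keywords introduced in the parser changes below; the site and XPaths are illustrative:

```python
import wxpath

# url(String)               -> fetch one page and extract
# url(String, Xpath)        -> follow= pagination crawl
# url(String, Xpath, Depth) -> the same crawl, capped at depth=2
expr = (
    "url('https://quotes.toscrape.com', "
    "follow=//li[@class='next']/a/@href, depth=2)"
    "//span[@class='text']/text()"
)

for quote in wxpath.wxpath_async_blocking_iter(expr):
    print(quote)
```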
@@ -13,7 +13,8 @@ except ImportError:


  TOKEN_SPEC = [
- ("NUMBER", r"\d+(\.\d+)?"),
+ ("NUMBER", r"\d+\.\d+"),
+ ("INTEGER", r"\d+"),
  ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
  ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
  ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
  ("FOLLOW", r",?\s{,}follow="),
+ ("DEPTH", r",?\s{,}depth="),
  ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
  ("LPAREN", r"\("),
  ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
  class Number:
  value: float

+ @dataclass
+ class Integer:
+ value: int
+
+ @dataclass
+ class Depth(Integer):
+ pass
+
  @dataclass
  class String:
  value: str
@@ -273,6 +283,10 @@ class Parser:
  if tok.type == "NUMBER":
  self.advance()
  return Number(float(tok.value))
+
+ if tok.type == "INTEGER":
+ self.advance()
+ return Integer(int(tok.value))

  if tok.type == "STRING":
  self.advance()
@@ -358,18 +372,18 @@
  self.advance()

  return result
-

  def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
  """Capture content inside a url() call, handling nested wxpath expressions.

  Supports patterns like::

- url('...') -> [String]
- url('...' follow=//a/@href) -> [String, Xpath]
- url(//a/@href) -> [Xpath]
- url( url('..')//a/@href ) -> [Call, Xpath]
- url( url( url('..')//a )//b ) -> [Call, Xpath]
+ url('...') -> [String]
+ url('...' follow=//a/@href) -> [String, Xpath]
+ url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+ url(//a/@href depth=2) -> [Xpath, Integer]
+ url( url('..')//a/@href ) -> [Call, Xpath]
+ url( url( url('..')//a )//b ) -> [Call, Xpath]

  Returns:
  A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@
  paren_balance = 1 # We're already inside the opening paren of url()
  brace_balance = 0 # Track braces for map constructors
  reached_follow_token = False
+ reached_depth_token = False
  follow_xpath = ""
+ depth_number = ""
+
  while paren_balance > 0 and self.token.type != "EOF":
  if self.token.type == "WXPATH":
  # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@

  elif self.token.type == "FOLLOW":
  reached_follow_token = True
+ reached_depth_token = False
+ self.advance()
+
+ elif self.token.type == "DEPTH":
+ reached_depth_token = True
+ reached_follow_token = False
  self.advance()

  elif self.token.type == "LPAREN":
  # Opening paren that's NOT part of a url() call
  # (it's part of an xpath function like contains(), starts-with(), etc.)
  paren_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@
  if paren_balance == 0:
  # This is the closing paren of the outer url()
  break
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "LBRACE":
  # Opening brace for map constructors
  brace_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "RBRACE":
  brace_balance -= 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  else:
  # Accumulate all other tokens as xpath content
- if not reached_follow_token:
- current_xpath += self.token.value
- else:
+ if reached_follow_token:
  follow_xpath += self.token.value
+ elif reached_depth_token:
+ depth_number += self.token.value
+ else:
+ current_xpath += self.token.value

  self.advance()

@@ -447,6 +484,9 @@
  if follow_xpath.strip():
  elements.append(Xpath(follow_xpath.strip()))

+ if depth_number.strip():
+ elements.append(Depth(int(depth_number.strip())))
+
  return elements

  def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@
  self.advance()
  # Handle follow=...
  if self.token.type == "FOLLOW":
- self.advance()
  follow_arg = self.capture_url_arg_content()
  args.extend(follow_arg)
+ if self.token.type == "DEPTH":
+ depth_arg = self.capture_url_arg_content()
+ args.extend(depth_arg)
  elif self.token.type == "WXPATH":
  # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
- # Use capture_url_arg_content to handle nested wxpath and xpath
- args = self.capture_url_arg_content()
+ # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+ # args = self.capture_url_arg_content()
+ args = self.nud()
  else:
  # Simple xpath argument: url(//a/@href)
  # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@

  return _specify_call_types(func_name, args)

-
  def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+ """
+ Specify the type of a call based on the function name and arguments.
+ TODO: Provide example wxpath expressions for each call type.
+
+ Args:
+ func_name: The name of the function.
+ args: The arguments of the function.
+
+ Returns:
+ Call | Segments: The type of the call.
+ """
  if func_name == "url":
  if len(args) == 1:
  if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
  else:
  raise ValueError(f"Unknown argument type: {type(args[0])}")
  elif len(args) == 2:
- if isinstance(args[0], String) and isinstance(args[1], Xpath):
+ arg0, arg1 = args
+ if isinstance(arg0, String) and isinstance(arg1, Xpath):
+ # Example: url('...', follow=//a/@href)
  return UrlCrawl(func_name, args)
- elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
+ elif isinstance(arg0, String) and isinstance(arg1, Integer):
+ # Example: url('...', depth=2)
+ return UrlLiteral(func_name, args)
+ elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
  args.append(UrlQuery('url', [ContextItem()]))
  return Segments(args)
- elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
- segs = args[0]
- segs.append(args[1])
+ elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+ segs = arg0
+ segs.append(arg1)
  return Segments(segs)
  else:
  raise ValueError(f"Unknown arguments: {args}")
+ elif len(args) == 3:
+ arg0, arg1, arg2 = args
+ if (isinstance(arg0, String) and (
+ (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+ (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+ )):
+ # Example: url('...', follow=//a/@href, depth=2)
+ # Example: url('...', depth=2, follow=//a/@href)
+ return UrlCrawl(func_name, args)
+ else:
+ raise ValueError(f"Unknown arguments: {args}")
  else:
  raise ValueError(f"Unknown arguments: {args}")
  elif func_name == "/url" or func_name == "//url":
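The tokenizer changes above split the old NUMBER pattern into NUMBER (floats) and INTEGER, and add a DEPTH token for the `depth=` keyword. A small sketch for exercising the new tokens directly, assuming only that `tokenize` in `wxpath.core.parser` yields token objects with the `type`/`value` attributes the Parser inspects in these hunks:

```python
from wxpath.core.parser import tokenize

expr = "url('https://example.com', follow=//a/@href, depth=2)//title/text()"

# With the new TOKEN_SPEC entries, the stream should include a DEPTH token
# for "depth=" and an INTEGER token for the bare "2".
for tok in tokenize(expr):
    print(tok.type, repr(tok.value))
```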