lean-explore 0.3.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. {lean_explore-0.3.0 → lean_explore-1.0.0}/PKG-INFO +32 -9
  2. {lean_explore-0.3.0 → lean_explore-1.0.0}/README.md +11 -5
  3. {lean_explore-0.3.0 → lean_explore-1.0.0}/pyproject.toml +36 -9
  4. lean_explore-1.0.0/src/lean_explore/__init__.py +14 -0
  5. lean_explore-1.0.0/src/lean_explore/api/__init__.py +12 -0
  6. lean_explore-1.0.0/src/lean_explore/api/client.py +104 -0
  7. lean_explore-1.0.0/src/lean_explore/cli/__init__.py +10 -0
  8. lean_explore-1.0.0/src/lean_explore/cli/data_commands.py +242 -0
  9. lean_explore-1.0.0/src/lean_explore/cli/display.py +171 -0
  10. lean_explore-1.0.0/src/lean_explore/cli/main.py +134 -0
  11. lean_explore-1.0.0/src/lean_explore/config.py +244 -0
  12. lean_explore-1.0.0/src/lean_explore/extract/__init__.py +5 -0
  13. lean_explore-1.0.0/src/lean_explore/extract/__main__.py +368 -0
  14. lean_explore-1.0.0/src/lean_explore/extract/doc_gen4.py +200 -0
  15. lean_explore-1.0.0/src/lean_explore/extract/doc_parser.py +499 -0
  16. lean_explore-1.0.0/src/lean_explore/extract/embeddings.py +371 -0
  17. lean_explore-1.0.0/src/lean_explore/extract/github.py +110 -0
  18. lean_explore-1.0.0/src/lean_explore/extract/index.py +317 -0
  19. lean_explore-1.0.0/src/lean_explore/extract/informalize.py +653 -0
  20. lean_explore-1.0.0/src/lean_explore/extract/package_config.py +59 -0
  21. lean_explore-1.0.0/src/lean_explore/extract/package_registry.py +45 -0
  22. lean_explore-1.0.0/src/lean_explore/extract/package_utils.py +105 -0
  23. lean_explore-1.0.0/src/lean_explore/extract/types.py +25 -0
  24. lean_explore-1.0.0/src/lean_explore/mcp/__init__.py +11 -0
  25. lean_explore-1.0.0/src/lean_explore/mcp/app.py +75 -0
  26. {lean_explore-0.3.0 → lean_explore-1.0.0}/src/lean_explore/mcp/server.py +20 -35
  27. lean_explore-1.0.0/src/lean_explore/mcp/tools.py +135 -0
  28. lean_explore-1.0.0/src/lean_explore/models/__init__.py +9 -0
  29. lean_explore-1.0.0/src/lean_explore/models/search_db.py +76 -0
  30. lean_explore-1.0.0/src/lean_explore/models/search_types.py +53 -0
  31. lean_explore-1.0.0/src/lean_explore/search/__init__.py +32 -0
  32. lean_explore-1.0.0/src/lean_explore/search/engine.py +655 -0
  33. lean_explore-1.0.0/src/lean_explore/search/scoring.py +156 -0
  34. lean_explore-1.0.0/src/lean_explore/search/service.py +68 -0
  35. lean_explore-1.0.0/src/lean_explore/search/tokenization.py +71 -0
  36. lean_explore-1.0.0/src/lean_explore/util/__init__.py +28 -0
  37. lean_explore-1.0.0/src/lean_explore/util/embedding_client.py +92 -0
  38. lean_explore-1.0.0/src/lean_explore/util/logging.py +22 -0
  39. lean_explore-1.0.0/src/lean_explore/util/openrouter_client.py +63 -0
  40. lean_explore-1.0.0/src/lean_explore/util/reranker_client.py +189 -0
  41. {lean_explore-0.3.0 → lean_explore-1.0.0}/src/lean_explore.egg-info/PKG-INFO +32 -9
  42. lean_explore-1.0.0/src/lean_explore.egg-info/SOURCES.txt +46 -0
  43. lean_explore-1.0.0/src/lean_explore.egg-info/entry_points.txt +2 -0
  44. lean_explore-1.0.0/src/lean_explore.egg-info/requires.txt +34 -0
  45. lean_explore-0.3.0/src/lean_explore/__init__.py +0 -1
  46. lean_explore-0.3.0/src/lean_explore/api/__init__.py +0 -1
  47. lean_explore-0.3.0/src/lean_explore/api/client.py +0 -216
  48. lean_explore-0.3.0/src/lean_explore/cli/__init__.py +0 -1
  49. lean_explore-0.3.0/src/lean_explore/cli/agent.py +0 -788
  50. lean_explore-0.3.0/src/lean_explore/cli/config_utils.py +0 -481
  51. lean_explore-0.3.0/src/lean_explore/cli/data_commands.py +0 -564
  52. lean_explore-0.3.0/src/lean_explore/cli/main.py +0 -691
  53. lean_explore-0.3.0/src/lean_explore/defaults.py +0 -114
  54. lean_explore-0.3.0/src/lean_explore/local/__init__.py +0 -1
  55. lean_explore-0.3.0/src/lean_explore/local/search.py +0 -1050
  56. lean_explore-0.3.0/src/lean_explore/local/service.py +0 -479
  57. lean_explore-0.3.0/src/lean_explore/mcp/__init__.py +0 -1
  58. lean_explore-0.3.0/src/lean_explore/mcp/app.py +0 -107
  59. lean_explore-0.3.0/src/lean_explore/mcp/tools.py +0 -270
  60. lean_explore-0.3.0/src/lean_explore/shared/__init__.py +0 -1
  61. lean_explore-0.3.0/src/lean_explore/shared/models/__init__.py +0 -1
  62. lean_explore-0.3.0/src/lean_explore/shared/models/api.py +0 -117
  63. lean_explore-0.3.0/src/lean_explore/shared/models/db.py +0 -396
  64. lean_explore-0.3.0/src/lean_explore.egg-info/SOURCES.txt +0 -30
  65. lean_explore-0.3.0/src/lean_explore.egg-info/entry_points.txt +0 -2
  66. lean_explore-0.3.0/src/lean_explore.egg-info/requires.txt +0 -15
  67. lean_explore-0.3.0/tests/test_defaults.py +0 -303
  68. {lean_explore-0.3.0 → lean_explore-1.0.0}/LICENSE +0 -0
  69. {lean_explore-0.3.0 → lean_explore-1.0.0}/setup.cfg +0 -0
  70. {lean_explore-0.3.0 → lean_explore-1.0.0}/src/lean_explore.egg-info/dependency_links.txt +0 -0
  71. {lean_explore-0.3.0 → lean_explore-1.0.0}/src/lean_explore.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lean-explore
3
- Version: 0.3.0
3
+ Version: 1.0.0
4
4
  Summary: A search engine for Lean 4 declarations.
5
5
  Author-email: Justin Asher <justinchadwickasher@gmail.com>
6
6
  License: Apache License
@@ -208,7 +208,7 @@ License: Apache License
208
208
  Project-URL: Homepage, https://www.leanexplore.com/
209
209
  Project-URL: Repository, https://github.com/justincasher/lean-explore
210
210
  Keywords: lean,lean4,search,formal methods,theorem prover,math,AI
211
- Classifier: Development Status :: 3 - Alpha
211
+ Classifier: Development Status :: 4 - Beta
212
212
  Classifier: Intended Audience :: Developers
213
213
  Classifier: Intended Audience :: Science/Research
214
214
  Classifier: License :: OSI Approved :: Apache Software License
@@ -223,12 +223,13 @@ Requires-Python: >=3.10
223
223
  Description-Content-Type: text/markdown
224
224
  License-File: LICENSE
225
225
  Requires-Dist: sqlalchemy>=2.0
226
+ Requires-Dist: aiosqlite>=0.19.0
227
+ Requires-Dist: greenlet>=3.0.0
226
228
  Requires-Dist: numpy>=1.20
227
229
  Requires-Dist: faiss-cpu>=1.7
228
- Requires-Dist: sentence-transformers>=2.2.0
229
230
  Requires-Dist: filelock>=3.0.0
230
231
  Requires-Dist: nltk>=3.6
231
- Requires-Dist: rank-bm25>=0.2.2
232
+ Requires-Dist: bm25s>=0.2.0
232
233
  Requires-Dist: httpx>=0.23.0
233
234
  Requires-Dist: pydantic>=2.0
234
235
  Requires-Dist: typer[all]>=0.9.0
@@ -236,7 +237,23 @@ Requires-Dist: toml>=0.10.0
236
237
  Requires-Dist: openai-agents>=0.0.16
237
238
  Requires-Dist: mcp>=1.9.0
238
239
  Requires-Dist: tqdm>=4.60
240
+ Requires-Dist: rich>=13.0.0
239
241
  Requires-Dist: requests>=2.25.0
242
+ Requires-Dist: tenacity>=8.0.0
243
+ Requires-Dist: pooch>=1.8.0
244
+ Provides-Extra: extract
245
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "extract"
246
+ Requires-Dist: networkx>=3.0; extra == "extract"
247
+ Requires-Dist: torch>=2.0.0; extra == "extract"
248
+ Provides-Extra: dev
249
+ Requires-Dist: pytest>=7.0; extra == "dev"
250
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
251
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
252
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
253
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
254
+ Requires-Dist: networkx>=3.0; extra == "dev"
255
+ Requires-Dist: torch>=2.0.0; extra == "dev"
256
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "dev"
240
257
  Dynamic: license-file
241
258
 
242
259
  <h1 align="center">
@@ -269,21 +286,28 @@ A search engine for Lean 4 declarations. This project provides tools and resourc
269
286
  The current indexed projects include:
270
287
 
271
288
  * Batteries
272
- * Lean
289
+ * CSLib
290
+ * FLT (Fermat's Last Theorem)
291
+ * FormalConjectures
273
292
  * Init
293
+ * Lean
274
294
  * Mathlib
275
295
  * PhysLean
276
296
  * Std
277
297
 
278
298
  This code is distributed under an Apache License (see [LICENSE](LICENSE)).
279
299
 
280
- ### Cite
300
+ ## Contributing
301
+
302
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on code style, testing, and development setup.
303
+
304
+ ## Cite
281
305
 
282
306
  If you use LeanExplore in your research or work, please cite it as follows:
283
307
 
284
308
  **General Citation:**
285
309
 
286
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
310
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. [https://arxiv.org/abs/2506.11085](https://arxiv.org/abs/2506.11085)
287
311
 
288
312
  **BibTeX Entry:**
289
313
 
@@ -292,7 +316,6 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
292
316
  author = {Asher, Justin},
293
317
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
294
318
  year = {2025},
295
- url = {http://www.leanexplore.com},
296
- note = {GitHub repository: https://github.com/justincasher/lean-explore}
319
+ url = {https://arxiv.org/abs/2506.11085}
297
320
  }
298
321
  ```
@@ -28,21 +28,28 @@ A search engine for Lean 4 declarations. This project provides tools and resourc
28
28
  The current indexed projects include:
29
29
 
30
30
  * Batteries
31
- * Lean
31
+ * CSLib
32
+ * FLT (Fermat's Last Theorem)
33
+ * FormalConjectures
32
34
  * Init
35
+ * Lean
33
36
  * Mathlib
34
37
  * PhysLean
35
38
  * Std
36
39
 
37
40
  This code is distributed under an Apache License (see [LICENSE](LICENSE)).
38
41
 
39
- ### Cite
42
+ ## Contributing
43
+
44
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on code style, testing, and development setup.
45
+
46
+ ## Cite
40
47
 
41
48
  If you use LeanExplore in your research or work, please cite it as follows:
42
49
 
43
50
  **General Citation:**
44
51
 
45
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
52
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. [https://arxiv.org/abs/2506.11085](https://arxiv.org/abs/2506.11085)
46
53
 
47
54
  **BibTeX Entry:**
48
55
 
@@ -51,7 +58,6 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
51
58
  author = {Asher, Justin},
52
59
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
53
60
  year = {2025},
54
- url = {http://www.leanexplore.com},
55
- note = {GitHub repository: https://github.com/justincasher/lean-explore}
61
+ url = {https://arxiv.org/abs/2506.11085}
56
62
  }
57
63
  ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "lean-explore"
7
- version = "0.3.0"
7
+ version = "1.0.0"
8
8
  authors = [
9
9
  { name = "Justin Asher", email = "justinchadwickasher@gmail.com" },
10
10
  ]
@@ -14,7 +14,7 @@ requires-python = ">=3.10"
14
14
  license = { file = "LICENSE" }
15
15
  keywords = ["lean", "lean4", "search", "formal methods", "theorem prover", "math", "AI"]
16
16
  classifiers = [
17
- "Development Status :: 3 - Alpha",
17
+ "Development Status :: 4 - Beta",
18
18
  "Intended Audience :: Developers",
19
19
  "Intended Audience :: Science/Research",
20
20
  "License :: OSI Approved :: Apache Software License",
@@ -28,14 +28,15 @@ classifiers = [
28
28
  ]
29
29
 
30
30
  dependencies = [
31
- # Core data and search (primarily for local backend)
31
+ # Core data and search
32
32
  "sqlalchemy>=2.0",
33
+ "aiosqlite>=0.19.0",
34
+ "greenlet>=3.0.0",
33
35
  "numpy>=1.20",
34
36
  "faiss-cpu>=1.7",
35
- "sentence-transformers>=2.2.0",
36
37
  "filelock>=3.0.0",
37
38
  "nltk>=3.6",
38
- "rank-bm25>=0.2.2",
39
+ "bm25s>=0.2.0",
39
40
 
40
41
  # API Client / Shared Data Models
41
42
  "httpx>=0.23.0",
@@ -51,7 +52,10 @@ dependencies = [
51
52
 
52
53
  # Utilities
53
54
  "tqdm>=4.60",
55
+ "rich>=13.0.0",
54
56
  "requests>=2.25.0",
57
+ "tenacity>=8.0.0",
58
+ "pooch>=1.8.0",
55
59
  ]
56
60
 
57
61
  [project.urls]
@@ -59,14 +63,37 @@ Homepage = "https://www.leanexplore.com/"
59
63
  Repository = "https://github.com/justincasher/lean-explore"
60
64
 
61
65
  [project.scripts]
62
- leanexplore = "lean_explore.cli.main:app"
66
+ lean-explore = "lean_explore.cli.main:app"
67
+
68
+ [project.optional-dependencies]
69
+ extract = [
70
+ "sentence-transformers>=2.2.0",
71
+ "networkx>=3.0",
72
+ "torch>=2.0.0",
73
+ ]
74
+
75
+ dev = [
76
+ "pytest>=7.0",
77
+ "pytest-cov>=4.0",
78
+ "pytest-asyncio>=0.21.0",
79
+ "ruff>=0.1.0",
80
+ "pre-commit>=3.0.0",
81
+ "networkx>=3.0",
82
+ "torch>=2.0.0",
83
+ "sentence-transformers>=2.2.0",
84
+ ]
63
85
 
64
86
  [tool.setuptools.packages.find]
65
87
  where = ["src"]
66
88
 
67
89
  [tool.pytest.ini_options]
68
- asyncio_mode = "strict"
90
+ asyncio_mode = "auto"
69
91
  asyncio_default_fixture_loop_scope = "function"
92
+ markers = [
93
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
94
+ "integration: marks tests as integration tests",
95
+ "external: marks tests that require external services",
96
+ ]
70
97
 
71
98
  # -- Ruff Configuration --
72
99
 
@@ -74,8 +101,8 @@ asyncio_default_fixture_loop_scope = "function"
74
101
  # Set the maximum line length.
75
102
  line-length = 88
76
103
 
77
- # Set based on your `requires-python = ">=3.8"`.
78
- target-version = "py38"
104
+ # Set based on requires-python = ">=3.10".
105
+ target-version = "py310"
79
106
 
80
107
  # Define the patterns for files Ruff should lint.
81
108
  include = [
@@ -0,0 +1,14 @@
1
+ """Lean Explore - Search and explore Lean mathematical libraries.
2
+
3
+ This package provides tools for searching Lean declarations using hybrid
4
+ semantic and lexical search, with support for both local and remote backends.
5
+
6
+ Subpackages:
7
+ api: Remote API client for the Lean Explore cloud service.
8
+ cli: Command-line interface for search and data management.
9
+ extract: Data extraction pipeline from doc-gen4 output.
10
+ mcp: Model Context Protocol server for AI assistant integration.
11
+ models: Data models for declarations and search results.
12
+ search: Local search engine with BM25 and semantic search.
13
+ util: Shared utilities for embeddings, reranking, and logging.
14
+ """
@@ -0,0 +1,12 @@
1
+ """Remote API client package for Lean Explore.
2
+
3
+ This package provides an async HTTP client for connecting to the remote
4
+ Lean Explore API service as an alternative to local search.
5
+
6
+ Modules:
7
+ client: ApiClient class for search and declaration retrieval via HTTP.
8
+ """
9
+
10
+ from lean_explore.api.client import ApiClient
11
+
12
+ __all__ = ["ApiClient"]
@@ -0,0 +1,104 @@
1
+ """Client for interacting with the remote Lean Explore API."""
2
+
3
+ import os
4
+
5
+ import httpx
6
+
7
+ from lean_explore.config import Config
8
+ from lean_explore.models import SearchResponse, SearchResult
9
+
10
+
11
class ApiClient:
    """Asynchronous HTTP client for the hosted Lean Explore API.

    Each request is sent with the API key as a bearer token, and the JSON
    payloads that come back are converted into SearchResult and
    SearchResponse objects.
    """

    def __init__(self, api_key: str | None = None, timeout: float = 10.0):
        """Store the base URL, credentials, and request timeout.

        Args:
            api_key: Key used for authentication. When it is falsy, the
                LEANEXPLORE_API_KEY environment variable is used instead.
            timeout: Timeout in seconds handed to every HTTP client created
                by this instance.

        Raises:
            ValueError: If neither the argument nor the environment variable
                provides a non-empty key.
        """
        self.base_url: str = Config.API_BASE_URL

        resolved_key = api_key or os.getenv("LEANEXPLORE_API_KEY", "")
        self.api_key: str = resolved_key
        if not resolved_key:
            raise ValueError(
                "API key required. Pass api_key parameter or set LEANEXPLORE_API_KEY "
                "environment variable."
            )

        self.timeout: float = timeout
        self._headers: dict[str, str] = {"Authorization": f"Bearer {self.api_key}"}

    async def search(
        self,
        query: str,
        limit: int = 20,
        rerank_top: int | None = None,  # Ignored for API (server handles reranking)
        packages: list[str] | None = None,
    ) -> SearchResponse:
        """Run a declaration search against the remote API.

        Args:
            query: The search query string.
            limit: Maximum number of results requested from the server.
            rerank_top: Accepted for interface consistency only; it is
                discarded without being sent.
            packages: Optional package names to restrict results to; they are
                sent as a single comma-separated parameter.

        Returns:
            A SearchResponse holding the parsed results, their count, and the
            server-reported processing time (if the payload includes one).

        Raises:
            httpx.HTTPStatusError: If the API returns an HTTP error status.
            httpx.RequestError: For network-related issues.
        """
        del rerank_top  # Unused - server handles reranking

        request_params: dict[str, str | int] = {"q": query, "limit": limit}
        if packages:
            request_params["packages"] = ",".join(packages)

        async with httpx.AsyncClient(timeout=self.timeout) as http:
            reply = await http.get(
                f"{self.base_url}/search",
                params=request_params,
                headers=self._headers,
            )
            reply.raise_for_status()
            payload = reply.json()

            # Convert each raw result mapping into a SearchResult.
            hits = []
            for raw_hit in payload.get("results", []):
                hits.append(SearchResult(**raw_hit))

            return SearchResponse(
                query=query,
                results=hits,
                count=len(hits),
                processing_time_ms=payload.get("processing_time_ms"),
            )

    async def get_by_id(self, declaration_id: int) -> SearchResult | None:
        """Fetch a single declaration by its ID from the remote API.

        Args:
            declaration_id: The declaration ID.

        Returns:
            The parsed SearchResult, or None when the API answers with 404.

        Raises:
            httpx.HTTPStatusError: If the API returns an error (except 404).
            httpx.RequestError: For network-related issues.
        """
        url = f"{self.base_url}/declarations/{declaration_id}"

        async with httpx.AsyncClient(timeout=self.timeout) as http:
            reply = await http.get(url, headers=self._headers)

            # A 404 is the expected "no such declaration" answer, not an error.
            if reply.status_code != 404:
                reply.raise_for_status()
                return SearchResult(**reply.json())

            return None
@@ -0,0 +1,10 @@
1
+ """Command-line interface package for Lean Explore.
2
+
3
+ This package provides CLI commands to search for Lean declarations via the
4
+ remote API, manage MCP servers, and download/manage local data toolchains.
5
+
6
+ Modules:
7
+ main: Core CLI application and top-level commands.
8
+ data_commands: Subcommands for managing local data toolchains.
9
+ display: Formatting and display utilities for search results.
10
+ """
@@ -0,0 +1,242 @@
1
+ # src/lean_explore/cli/data_commands.py
2
+
3
+ """Manages local Lean Explore data toolchains.
4
+
5
+ Provides CLI commands to download, install, and clean data files (database,
6
+ FAISS index, etc.) from remote storage using Pooch for checksums and caching.
7
+ """
8
+
9
+ import logging
10
+ import shutil
11
+ from typing import TypedDict
12
+
13
+ import pooch
14
+ import requests
15
+ import typer
16
+ from rich.console import Console
17
+
18
+ from lean_explore.config import Config
19
+
20
+
21
class ManifestFileEntry(TypedDict):
    """One downloadable file listed under a toolchain version in the manifest."""

    # Name of the file on the remote side; used as the Pooch registry key.
    remote_name: str
    # Name given to the decompressed file in the local cache.
    local_name: str
    # Hex SHA256 digest used to verify the downloaded file.
    sha256: str
27
+
28
+
29
class ToolchainVersionInfo(TypedDict):
    """Manifest record describing a single toolchain version."""

    # Path appended to the R2 assets base URL to locate this version's files.
    assets_base_path_r2: str
    # The files that make up this toolchain version.
    files: list[ManifestFileEntry]
34
+
35
+
36
class Manifest(TypedDict):
    """Shape of the remote data manifest."""

    # Version used when the caller asks for no version or for "stable".
    default_toolchain: str
    # Available toolchain versions, keyed by version string.
    toolchains: dict[str, ToolchainVersionInfo]
41
+
42
# Module-level logger; messages are attributed to this module's import path.
logger = logging.getLogger(__name__)

# Typer sub-application for the data commands. The help text names only the
# operations this module actually registers (`fetch` and `clean`), so users
# are not pointed at sub-commands that do not exist.
app = typer.Typer(
    name="data",
    help="Manage local data toolchains for Lean Explore (e.g., fetch, clean).",
    no_args_is_help=True,
)
50
+
51
+
52
def _get_console() -> Console:
    """Return a new Rich console used for user-facing output.

    Returns:
        A freshly constructed Console with default settings.
    """
    return Console()
55
+
56
+
57
def _fetch_manifest() -> Manifest | None:
    """Download and decode the remote data manifest.

    Any failure is logged and reported on the console instead of being
    raised, so callers only need to check for None.

    Returns:
        The decoded manifest, or None if the request fails, the server
        answers with an error status, or the body is not valid JSON.
    """
    console = _get_console()
    try:
        response = requests.get(Config.MANIFEST_URL, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # ValueError covers JSON decoding failures on requests releases where
        # the decode error is not a RequestException subclass; without it a
        # malformed manifest would escape as an unhandled exception.
        logger.error("Failed to fetch manifest: %s", error)
        console.print(f"[bold red]Error fetching manifest: {error}[/bold red]")
        return None
72
+
73
+
74
+ def _resolve_version(manifest: Manifest, version: str | None) -> str:
75
+ """Resolves the version string to an actual toolchain version.
76
+
77
+ Args:
78
+ manifest: The manifest dictionary containing toolchain information.
79
+ version: The requested version, or None/"stable" for default.
80
+
81
+ Returns:
82
+ The resolved version string.
83
+
84
+ Raises:
85
+ ValueError: If the version cannot be resolved.
86
+ """
87
+ if not version or version.lower() == "stable":
88
+ resolved = manifest.get("default_toolchain")
89
+ if not resolved:
90
+ raise ValueError("No default_toolchain specified in manifest")
91
+ return resolved
92
+ return version
93
+
94
+
95
+ def _build_file_registry(version_info: ToolchainVersionInfo) -> dict[str, str]:
96
+ """Builds a Pooch registry from version info.
97
+
98
+ Args:
99
+ version_info: The version information from the manifest.
100
+
101
+ Returns:
102
+ A dictionary mapping remote filenames to SHA256 checksums.
103
+ """
104
+ return {
105
+ file_entry["remote_name"]: f"sha256:{file_entry['sha256']}"
106
+ for file_entry in version_info.get("files", [])
107
+ if file_entry.get("remote_name") and file_entry.get("sha256")
108
+ }
109
+
110
+
111
def _write_active_version(version: str) -> None:
    """Record which toolchain version is currently active.

    The version string is written to a file named "active_version" that
    sits next to the cache directory (in its parent); that parent directory
    is created if needed.

    Args:
        version: The version string to write.
    """
    active_version_path = Config.CACHE_DIRECTORY.parent / "active_version"
    active_version_path.parent.mkdir(parents=True, exist_ok=True)
    active_version_path.write_text(version)
    logger.info("Set active version to: %s", version)
121
+
122
+
123
def _cleanup_old_versions(current_version: str) -> None:
    """Delete every cached version directory other than the current one.

    Only directories directly inside the cache directory are considered.
    A directory that cannot be removed is logged as a warning and skipped.

    Args:
        current_version: Name of the version directory to keep.
    """
    if not Config.CACHE_DIRECTORY.exists():
        return

    for entry in Config.CACHE_DIRECTORY.iterdir():
        # Leave plain files and the directory of the version being kept.
        if not entry.is_dir() or entry.name == current_version:
            continue

        logger.info("Removing old version: %s", entry.name)
        try:
            shutil.rmtree(entry)
        except OSError as error:
            logger.warning("Failed to remove %s: %s", entry.name, error)
139
+
140
+
141
def _install_toolchain(version: str | None = None) -> None:
    """Download one toolchain version's data files and make it active.

    The manifest is fetched, the requested version is resolved, and every
    file entry that has both a remote and a local name is fetched through
    Pooch with a Decompress processor. The installed version is then
    recorded as active and all other cached versions are removed.

    Args:
        version: The version to install. If None, uses the default version.

    Raises:
        ValueError: If the manifest cannot be fetched, the version cannot be
            resolved, or the resolved version is absent from the manifest.
    """
    console = _get_console()

    manifest = _fetch_manifest()
    if not manifest:
        raise ValueError("Failed to fetch manifest")

    resolved_version = _resolve_version(manifest, version)
    toolchains = manifest.get("toolchains", {})
    version_info = toolchains.get(resolved_version)
    if not version_info:
        available = list(toolchains.keys())
        raise ValueError(
            f"Version '{resolved_version}' not found. Available: {available}"
        )

    # Each version is cached in its own directory named after the version.
    assets_path = version_info.get("assets_base_path_r2", "")
    file_downloader = pooch.create(
        path=Config.CACHE_DIRECTORY / resolved_version,
        base_url=f"{Config.R2_ASSETS_BASE_URL}/{assets_path}/",
        registry=_build_file_registry(version_info),
    )

    # Download and decompress each file
    for file_entry in version_info.get("files", []):
        remote_name = file_entry.get("remote_name")
        local_name = file_entry.get("local_name")
        if not (remote_name and local_name):
            continue
        logger.info("Downloading %s -> %s", remote_name, local_name)
        file_downloader.fetch(
            remote_name, processor=pooch.Decompress(name=local_name)
        )

    # Set this version as the active version and clean up old versions
    _write_active_version(resolved_version)
    _cleanup_old_versions(resolved_version)

    console.print(f"[green]Installed data for version {resolved_version}[/green]")
193
+
194
+
195
# The docstring below is left as-is because Typer can surface a callback's
# docstring as help text; the function intentionally has no other body.
@app.callback()
def main() -> None:
    """Lean-Explore data CLI.

    This callback exists only to prevent Typer from treating the first
    sub-command as a *default* command when there is otherwise just one.
    """
203
+
204
+
205
# The docstring doubles as the command's --help text, so it is kept verbatim.
@app.command()
def fetch(
    version: str = typer.Option(
        None,
        "--version",
        "-v",
        help="Version to install (e.g., '0.1.0'). Defaults to stable/latest.",
    ),
) -> None:
    """Fetches and installs the data toolchain from the remote repository.

    Downloads the database, FAISS index, and other required data files.
    Files are verified with SHA256 checksums and automatically decompressed.
    """
    # All of the work is delegated; a None version selects the manifest default.
    _install_toolchain(version=version)
220
+
221
+
222
# The docstring doubles as the command's --help text, so it is kept verbatim.
@app.command("clean")
def clean_data_toolchains() -> None:
    """Removes all downloaded local data toolchains."""
    console = _get_console()
    cache_directory_present = Config.CACHE_DIRECTORY.exists()

    if not cache_directory_present:
        console.print("[yellow]No local data found to clean.[/yellow]")
        return

    # With abort=True a negative answer aborts the command inside confirm().
    if not typer.confirm("Delete all cached data?", default=False, abort=True):
        return

    # NOTE(review): only the cache directory is removed here; the
    # "active_version" file written next to it by the install step is left
    # in place. Confirm whether that is intended.
    try:
        shutil.rmtree(Config.CACHE_DIRECTORY)
        console.print("[green]Data cache cleared.[/green]")
    except OSError as error:
        logger.error("Failed to clean cache directory: %s", error)
        console.print(f"[bold red]Error cleaning data: {error}[/bold red]")
        raise typer.Exit(code=1)
239
+
240
+
241
# Run the data sub-application when this module is executed as a script.
if __name__ == "__main__":
    app()