lean-explore 0.3.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {lean_explore-0.3.0 → lean_explore-1.0.1}/PKG-INFO +32 -9
  2. {lean_explore-0.3.0 → lean_explore-1.0.1}/README.md +11 -5
  3. {lean_explore-0.3.0 → lean_explore-1.0.1}/pyproject.toml +36 -9
  4. lean_explore-1.0.1/src/lean_explore/__init__.py +14 -0
  5. lean_explore-1.0.1/src/lean_explore/api/__init__.py +12 -0
  6. lean_explore-1.0.1/src/lean_explore/api/client.py +104 -0
  7. lean_explore-1.0.1/src/lean_explore/cli/__init__.py +10 -0
  8. lean_explore-1.0.1/src/lean_explore/cli/data_commands.py +259 -0
  9. lean_explore-1.0.1/src/lean_explore/cli/display.py +171 -0
  10. lean_explore-1.0.1/src/lean_explore/cli/main.py +134 -0
  11. lean_explore-1.0.1/src/lean_explore/config.py +244 -0
  12. lean_explore-1.0.1/src/lean_explore/extract/__init__.py +5 -0
  13. lean_explore-1.0.1/src/lean_explore/extract/__main__.py +368 -0
  14. lean_explore-1.0.1/src/lean_explore/extract/doc_gen4.py +200 -0
  15. lean_explore-1.0.1/src/lean_explore/extract/doc_parser.py +499 -0
  16. lean_explore-1.0.1/src/lean_explore/extract/embeddings.py +369 -0
  17. lean_explore-1.0.1/src/lean_explore/extract/github.py +110 -0
  18. lean_explore-1.0.1/src/lean_explore/extract/index.py +316 -0
  19. lean_explore-1.0.1/src/lean_explore/extract/informalize.py +653 -0
  20. lean_explore-1.0.1/src/lean_explore/extract/package_config.py +59 -0
  21. lean_explore-1.0.1/src/lean_explore/extract/package_registry.py +45 -0
  22. lean_explore-1.0.1/src/lean_explore/extract/package_utils.py +105 -0
  23. lean_explore-1.0.1/src/lean_explore/extract/types.py +25 -0
  24. lean_explore-1.0.1/src/lean_explore/mcp/__init__.py +11 -0
  25. lean_explore-1.0.1/src/lean_explore/mcp/app.py +75 -0
  26. {lean_explore-0.3.0 → lean_explore-1.0.1}/src/lean_explore/mcp/server.py +20 -35
  27. lean_explore-1.0.1/src/lean_explore/mcp/tools.py +136 -0
  28. lean_explore-1.0.1/src/lean_explore/models/__init__.py +9 -0
  29. lean_explore-1.0.1/src/lean_explore/models/search_db.py +76 -0
  30. lean_explore-1.0.1/src/lean_explore/models/search_types.py +53 -0
  31. lean_explore-1.0.1/src/lean_explore/search/__init__.py +32 -0
  32. lean_explore-1.0.1/src/lean_explore/search/engine.py +651 -0
  33. lean_explore-1.0.1/src/lean_explore/search/scoring.py +156 -0
  34. lean_explore-1.0.1/src/lean_explore/search/service.py +68 -0
  35. lean_explore-1.0.1/src/lean_explore/search/tokenization.py +71 -0
  36. lean_explore-1.0.1/src/lean_explore/util/__init__.py +28 -0
  37. lean_explore-1.0.1/src/lean_explore/util/embedding_client.py +92 -0
  38. lean_explore-1.0.1/src/lean_explore/util/logging.py +22 -0
  39. lean_explore-1.0.1/src/lean_explore/util/openrouter_client.py +63 -0
  40. lean_explore-1.0.1/src/lean_explore/util/reranker_client.py +187 -0
  41. {lean_explore-0.3.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/PKG-INFO +32 -9
  42. lean_explore-1.0.1/src/lean_explore.egg-info/SOURCES.txt +46 -0
  43. lean_explore-1.0.1/src/lean_explore.egg-info/entry_points.txt +2 -0
  44. lean_explore-1.0.1/src/lean_explore.egg-info/requires.txt +34 -0
  45. lean_explore-0.3.0/src/lean_explore/__init__.py +0 -1
  46. lean_explore-0.3.0/src/lean_explore/api/__init__.py +0 -1
  47. lean_explore-0.3.0/src/lean_explore/api/client.py +0 -216
  48. lean_explore-0.3.0/src/lean_explore/cli/__init__.py +0 -1
  49. lean_explore-0.3.0/src/lean_explore/cli/agent.py +0 -788
  50. lean_explore-0.3.0/src/lean_explore/cli/config_utils.py +0 -481
  51. lean_explore-0.3.0/src/lean_explore/cli/data_commands.py +0 -564
  52. lean_explore-0.3.0/src/lean_explore/cli/main.py +0 -691
  53. lean_explore-0.3.0/src/lean_explore/defaults.py +0 -114
  54. lean_explore-0.3.0/src/lean_explore/local/__init__.py +0 -1
  55. lean_explore-0.3.0/src/lean_explore/local/search.py +0 -1050
  56. lean_explore-0.3.0/src/lean_explore/local/service.py +0 -479
  57. lean_explore-0.3.0/src/lean_explore/mcp/__init__.py +0 -1
  58. lean_explore-0.3.0/src/lean_explore/mcp/app.py +0 -107
  59. lean_explore-0.3.0/src/lean_explore/mcp/tools.py +0 -270
  60. lean_explore-0.3.0/src/lean_explore/shared/__init__.py +0 -1
  61. lean_explore-0.3.0/src/lean_explore/shared/models/__init__.py +0 -1
  62. lean_explore-0.3.0/src/lean_explore/shared/models/api.py +0 -117
  63. lean_explore-0.3.0/src/lean_explore/shared/models/db.py +0 -396
  64. lean_explore-0.3.0/src/lean_explore.egg-info/SOURCES.txt +0 -30
  65. lean_explore-0.3.0/src/lean_explore.egg-info/entry_points.txt +0 -2
  66. lean_explore-0.3.0/src/lean_explore.egg-info/requires.txt +0 -15
  67. lean_explore-0.3.0/tests/test_defaults.py +0 -303
  68. {lean_explore-0.3.0 → lean_explore-1.0.1}/LICENSE +0 -0
  69. {lean_explore-0.3.0 → lean_explore-1.0.1}/setup.cfg +0 -0
  70. {lean_explore-0.3.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/dependency_links.txt +0 -0
  71. {lean_explore-0.3.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lean-explore
3
- Version: 0.3.0
3
+ Version: 1.0.1
4
4
  Summary: A search engine for Lean 4 declarations.
5
5
  Author-email: Justin Asher <justinchadwickasher@gmail.com>
6
6
  License: Apache License
@@ -208,7 +208,7 @@ License: Apache License
208
208
  Project-URL: Homepage, https://www.leanexplore.com/
209
209
  Project-URL: Repository, https://github.com/justincasher/lean-explore
210
210
  Keywords: lean,lean4,search,formal methods,theorem prover,math,AI
211
- Classifier: Development Status :: 3 - Alpha
211
+ Classifier: Development Status :: 4 - Beta
212
212
  Classifier: Intended Audience :: Developers
213
213
  Classifier: Intended Audience :: Science/Research
214
214
  Classifier: License :: OSI Approved :: Apache Software License
@@ -223,12 +223,13 @@ Requires-Python: >=3.10
223
223
  Description-Content-Type: text/markdown
224
224
  License-File: LICENSE
225
225
  Requires-Dist: sqlalchemy>=2.0
226
+ Requires-Dist: aiosqlite>=0.19.0
227
+ Requires-Dist: greenlet>=3.0.0
226
228
  Requires-Dist: numpy>=1.20
227
229
  Requires-Dist: faiss-cpu>=1.7
228
- Requires-Dist: sentence-transformers>=2.2.0
229
230
  Requires-Dist: filelock>=3.0.0
230
231
  Requires-Dist: nltk>=3.6
231
- Requires-Dist: rank-bm25>=0.2.2
232
+ Requires-Dist: bm25s>=0.2.0
232
233
  Requires-Dist: httpx>=0.23.0
233
234
  Requires-Dist: pydantic>=2.0
234
235
  Requires-Dist: typer[all]>=0.9.0
@@ -236,7 +237,23 @@ Requires-Dist: toml>=0.10.0
236
237
  Requires-Dist: openai-agents>=0.0.16
237
238
  Requires-Dist: mcp>=1.9.0
238
239
  Requires-Dist: tqdm>=4.60
240
+ Requires-Dist: rich>=13.0.0
239
241
  Requires-Dist: requests>=2.25.0
242
+ Requires-Dist: tenacity>=8.0.0
243
+ Requires-Dist: pooch>=1.8.0
244
+ Provides-Extra: extract
245
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "extract"
246
+ Requires-Dist: networkx>=3.0; extra == "extract"
247
+ Requires-Dist: torch>=2.0.0; extra == "extract"
248
+ Provides-Extra: dev
249
+ Requires-Dist: pytest>=7.0; extra == "dev"
250
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
251
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
252
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
253
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
254
+ Requires-Dist: networkx>=3.0; extra == "dev"
255
+ Requires-Dist: torch>=2.0.0; extra == "dev"
256
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "dev"
240
257
  Dynamic: license-file
241
258
 
242
259
  <h1 align="center">
@@ -269,21 +286,28 @@ A search engine for Lean 4 declarations. This project provides tools and resourc
269
286
  The current indexed projects include:
270
287
 
271
288
  * Batteries
272
- * Lean
289
+ * CSLib
290
+ * FLT (Fermat's Last Theorem)
291
+ * FormalConjectures
273
292
  * Init
293
+ * Lean
274
294
  * Mathlib
275
295
  * PhysLean
276
296
  * Std
277
297
 
278
298
  This code is distributed under an Apache License (see [LICENSE](LICENSE)).
279
299
 
280
- ### Cite
300
+ ## Contributing
301
+
302
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on code style, testing, and development setup.
303
+
304
+ ## Cite
281
305
 
282
306
  If you use LeanExplore in your research or work, please cite it as follows:
283
307
 
284
308
  **General Citation:**
285
309
 
286
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
310
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. [https://arxiv.org/abs/2506.11085](https://arxiv.org/abs/2506.11085)
287
311
 
288
312
  **BibTeX Entry:**
289
313
 
@@ -292,7 +316,6 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
292
316
  author = {Asher, Justin},
293
317
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
294
318
  year = {2025},
295
- url = {http://www.leanexplore.com},
296
- note = {GitHub repository: https://github.com/justincasher/lean-explore}
319
+ url = {https://arxiv.org/abs/2506.11085}
297
320
  }
298
321
  ```
@@ -28,21 +28,28 @@ A search engine for Lean 4 declarations. This project provides tools and resourc
28
28
  The current indexed projects include:
29
29
 
30
30
  * Batteries
31
- * Lean
31
+ * CSLib
32
+ * FLT (Fermat's Last Theorem)
33
+ * FormalConjectures
32
34
  * Init
35
+ * Lean
33
36
  * Mathlib
34
37
  * PhysLean
35
38
  * Std
36
39
 
37
40
  This code is distributed under an Apache License (see [LICENSE](LICENSE)).
38
41
 
39
- ### Cite
42
+ ## Contributing
43
+
44
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on code style, testing, and development setup.
45
+
46
+ ## Cite
40
47
 
41
48
  If you use LeanExplore in your research or work, please cite it as follows:
42
49
 
43
50
  **General Citation:**
44
51
 
45
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
52
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. [https://arxiv.org/abs/2506.11085](https://arxiv.org/abs/2506.11085)
46
53
 
47
54
  **BibTeX Entry:**
48
55
 
@@ -51,7 +58,6 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
51
58
  author = {Asher, Justin},
52
59
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
53
60
  year = {2025},
54
- url = {http://www.leanexplore.com},
55
- note = {GitHub repository: https://github.com/justincasher/lean-explore}
61
+ url = {https://arxiv.org/abs/2506.11085}
56
62
  }
57
63
  ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "lean-explore"
7
- version = "0.3.0"
7
+ version = "1.0.1"
8
8
  authors = [
9
9
  { name = "Justin Asher", email = "justinchadwickasher@gmail.com" },
10
10
  ]
@@ -14,7 +14,7 @@ requires-python = ">=3.10"
14
14
  license = { file = "LICENSE" }
15
15
  keywords = ["lean", "lean4", "search", "formal methods", "theorem prover", "math", "AI"]
16
16
  classifiers = [
17
- "Development Status :: 3 - Alpha",
17
+ "Development Status :: 4 - Beta",
18
18
  "Intended Audience :: Developers",
19
19
  "Intended Audience :: Science/Research",
20
20
  "License :: OSI Approved :: Apache Software License",
@@ -28,14 +28,15 @@ classifiers = [
28
28
  ]
29
29
 
30
30
  dependencies = [
31
- # Core data and search (primarily for local backend)
31
+ # Core data and search
32
32
  "sqlalchemy>=2.0",
33
+ "aiosqlite>=0.19.0",
34
+ "greenlet>=3.0.0",
33
35
  "numpy>=1.20",
34
36
  "faiss-cpu>=1.7",
35
- "sentence-transformers>=2.2.0",
36
37
  "filelock>=3.0.0",
37
38
  "nltk>=3.6",
38
- "rank-bm25>=0.2.2",
39
+ "bm25s>=0.2.0",
39
40
 
40
41
  # API Client / Shared Data Models
41
42
  "httpx>=0.23.0",
@@ -51,7 +52,10 @@ dependencies = [
51
52
 
52
53
  # Utilities
53
54
  "tqdm>=4.60",
55
+ "rich>=13.0.0",
54
56
  "requests>=2.25.0",
57
+ "tenacity>=8.0.0",
58
+ "pooch>=1.8.0",
55
59
  ]
56
60
 
57
61
  [project.urls]
@@ -59,14 +63,37 @@ Homepage = "https://www.leanexplore.com/"
59
63
  Repository = "https://github.com/justincasher/lean-explore"
60
64
 
61
65
  [project.scripts]
62
- leanexplore = "lean_explore.cli.main:app"
66
+ lean-explore = "lean_explore.cli.main:app"
67
+
68
+ [project.optional-dependencies]
69
+ extract = [
70
+ "sentence-transformers>=2.2.0",
71
+ "networkx>=3.0",
72
+ "torch>=2.0.0",
73
+ ]
74
+
75
+ dev = [
76
+ "pytest>=7.0",
77
+ "pytest-cov>=4.0",
78
+ "pytest-asyncio>=0.21.0",
79
+ "ruff>=0.1.0",
80
+ "pre-commit>=3.0.0",
81
+ "networkx>=3.0",
82
+ "torch>=2.0.0",
83
+ "sentence-transformers>=2.2.0",
84
+ ]
63
85
 
64
86
  [tool.setuptools.packages.find]
65
87
  where = ["src"]
66
88
 
67
89
  [tool.pytest.ini_options]
68
- asyncio_mode = "strict"
90
+ asyncio_mode = "auto"
69
91
  asyncio_default_fixture_loop_scope = "function"
92
+ markers = [
93
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
94
+ "integration: marks tests as integration tests",
95
+ "external: marks tests that require external services",
96
+ ]
70
97
 
71
98
  # -- Ruff Configuration --
72
99
 
@@ -74,8 +101,8 @@ asyncio_default_fixture_loop_scope = "function"
74
101
  # Set the maximum line length.
75
102
  line-length = 88
76
103
 
77
- # Set based on your `requires-python = ">=3.8"`.
78
- target-version = "py38"
104
+ # Set based on requires-python = ">=3.10".
105
+ target-version = "py310"
79
106
 
80
107
  # Define the patterns for files Ruff should lint.
81
108
  include = [
@@ -0,0 +1,14 @@
1
+ """Lean Explore - Search and explore Lean mathematical libraries.
2
+
3
+ This package provides tools for searching Lean declarations using hybrid
4
+ semantic and lexical search, with support for both local and remote backends.
5
+
6
+ Subpackages:
7
+ api: Remote API client for the Lean Explore cloud service.
8
+ cli: Command-line interface for search and data management.
9
+ extract: Data extraction pipeline from doc-gen4 output.
10
+ mcp: Model Context Protocol server for AI assistant integration.
11
+ models: Data models for declarations and search results.
12
+ search: Local search engine with BM25 and semantic search.
13
+ util: Shared utilities for embeddings, reranking, and logging.
14
+ """
@@ -0,0 +1,12 @@
1
+ """Remote API client package for Lean Explore.
2
+
3
+ This package provides an async HTTP client for connecting to the remote
4
+ Lean Explore API service as an alternative to local search.
5
+
6
+ Modules:
7
+ client: ApiClient class for search and declaration retrieval via HTTP.
8
+ """
9
+
10
+ from lean_explore.api.client import ApiClient
11
+
12
+ __all__ = ["ApiClient"]
@@ -0,0 +1,104 @@
1
+ """Client for interacting with the remote Lean Explore API."""
2
+
3
+ import os
4
+
5
+ import httpx
6
+
7
+ from lean_explore.config import Config
8
+ from lean_explore.models import SearchResponse, SearchResult
9
+
10
+
11
+ class ApiClient:
12
+ """Async client for the remote Lean Explore API.
13
+
14
+ This client handles making HTTP requests to the API, authenticating
15
+ with an API key, and parsing responses into SearchResult objects.
16
+ """
17
+
18
+ def __init__(self, api_key: str | None = None, timeout: float = 10.0):
19
+ """Initialize the API client.
20
+
21
+ Args:
22
+ api_key: The API key for authentication. If None, reads from
23
+ LEANEXPLORE_API_KEY environment variable.
24
+ timeout: Default timeout for HTTP requests in seconds.
25
+
26
+ Raises:
27
+ ValueError: If no API key is provided and LEANEXPLORE_API_KEY is not set.
28
+ """
29
+ self.base_url: str = Config.API_BASE_URL
30
+ self.api_key: str = api_key or os.getenv("LEANEXPLORE_API_KEY", "")
31
+ if not self.api_key:
32
+ raise ValueError(
33
+ "API key required. Pass api_key parameter or set LEANEXPLORE_API_KEY "
34
+ "environment variable."
35
+ )
36
+ self.timeout: float = timeout
37
+ self._headers: dict[str, str] = {"Authorization": f"Bearer {self.api_key}"}
38
+
39
+ async def search(
40
+ self,
41
+ query: str,
42
+ limit: int = 20,
43
+ rerank_top: int | None = None, # Ignored for API (server handles reranking)
44
+ packages: list[str] | None = None,
45
+ ) -> SearchResponse:
46
+ """Search for Lean declarations via the API.
47
+
48
+ Args:
49
+ query: The search query string.
50
+ limit: Maximum number of results to return.
51
+ rerank_top: Ignored for API backend (included for interface consistency).
52
+ packages: Filter results to specific packages (e.g., ["Mathlib"]).
53
+
54
+ Returns:
55
+ SearchResponse containing results and metadata.
56
+
57
+ Raises:
58
+ httpx.HTTPStatusError: If the API returns an HTTP error status.
59
+ httpx.RequestError: For network-related issues.
60
+ """
61
+ del rerank_top # Unused - server handles reranking
62
+ endpoint = f"{self.base_url}/search"
63
+ params: dict[str, str | int] = {"q": query, "limit": limit}
64
+ if packages:
65
+ params["packages"] = ",".join(packages)
66
+
67
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
68
+ response = await client.get(endpoint, params=params, headers=self._headers)
69
+ response.raise_for_status()
70
+ data = response.json()
71
+
72
+ # Parse API response into our types
73
+ results = [SearchResult(**item) for item in data.get("results", [])]
74
+
75
+ return SearchResponse(
76
+ query=query,
77
+ results=results,
78
+ count=len(results),
79
+ processing_time_ms=data.get("processing_time_ms"),
80
+ )
81
+
82
+ async def get_by_id(self, declaration_id: int) -> SearchResult | None:
83
+ """Retrieve a declaration by ID via the API.
84
+
85
+ Args:
86
+ declaration_id: The declaration ID.
87
+
88
+ Returns:
89
+ SearchResult if found, None otherwise.
90
+
91
+ Raises:
92
+ httpx.HTTPStatusError: If the API returns an error (except 404).
93
+ httpx.RequestError: For network-related issues.
94
+ """
95
+ endpoint = f"{self.base_url}/declarations/{declaration_id}"
96
+
97
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
98
+ response = await client.get(endpoint, headers=self._headers)
99
+
100
+ if response.status_code == 404:
101
+ return None
102
+
103
+ response.raise_for_status()
104
+ return SearchResult(**response.json())
@@ -0,0 +1,10 @@
1
+ """Command-line interface package for Lean Explore.
2
+
3
+ This package provides CLI commands to search for Lean declarations via the
4
+ remote API, manage MCP servers, and download/manage local data toolchains.
5
+
6
+ Modules:
7
+ main: Core CLI application and top-level commands.
8
+ data_commands: Subcommands for managing local data toolchains.
9
+ display: Formatting and display utilities for search results.
10
+ """
@@ -0,0 +1,259 @@
1
+ # src/lean_explore/cli/data_commands.py
2
+
3
+ """Manages local Lean Explore data toolchains.
4
+
5
+ Provides CLI commands to download, install, and clean data files (database,
6
+ FAISS index, BM25 indexes, etc.) from remote storage.
7
+ """
8
+
9
+ import logging
10
+ import shutil
11
+ from pathlib import Path
12
+
13
+ import requests
14
+ import typer
15
+ from rich.console import Console
16
+ from rich.progress import (
17
+ BarColumn,
18
+ DownloadColumn,
19
+ Progress,
20
+ TextColumn,
21
+ TransferSpeedColumn,
22
+ )
23
+
24
+ from lean_explore.config import Config
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ app = typer.Typer(
29
+ name="data",
30
+ help="Manage local data toolchains for Lean Explore (e.g., download, list, "
31
+ "select, clean).",
32
+ no_args_is_help=True,
33
+ )
34
+
35
+ # Files required for the search engine (relative to version directory)
36
+ REQUIRED_FILES: list[str] = [
37
+ "lean_explore.db",
38
+ "informalization_faiss.index",
39
+ "informalization_faiss_ids_map.json",
40
+ "bm25_ids_map.json",
41
+ ]
42
+
43
+ # BM25 index directories and their contents
44
+ BM25_DIRECTORIES: dict[str, list[str]] = {
45
+ "bm25_name_raw": [
46
+ "data.csc.index.npy",
47
+ "indices.csc.index.npy",
48
+ "indptr.csc.index.npy",
49
+ "nonoccurrence_array.index.npy",
50
+ "params.index.json",
51
+ "vocab.index.json",
52
+ ],
53
+ "bm25_name_spaced": [
54
+ "data.csc.index.npy",
55
+ "indices.csc.index.npy",
56
+ "indptr.csc.index.npy",
57
+ "nonoccurrence_array.index.npy",
58
+ "params.index.json",
59
+ "vocab.index.json",
60
+ ],
61
+ }
62
+
63
+
64
+ def _get_console() -> Console:
65
+ """Create a Rich console instance for output."""
66
+ return Console()
67
+
68
+
69
+ def _fetch_latest_version() -> str:
70
+ """Fetch the latest version identifier from remote storage.
71
+
72
+ Returns:
73
+ The version string (e.g., "20260127_103630").
74
+
75
+ Raises:
76
+ ValueError: If the latest version cannot be fetched.
77
+ """
78
+ latest_url = f"{Config.R2_ASSETS_BASE_URL}/assets/latest.txt"
79
+ try:
80
+ response = requests.get(latest_url, timeout=10)
81
+ response.raise_for_status()
82
+ return response.text.strip()
83
+ except requests.exceptions.RequestException as error:
84
+ logger.error("Failed to fetch latest version: %s", error)
85
+ raise ValueError(f"Failed to fetch latest version: {error}") from error
86
+
87
+
88
+ def _download_file(url: str, destination: Path, progress: Progress) -> None:
89
+ """Download a file with progress tracking.
90
+
91
+ Args:
92
+ url: The URL to download from.
93
+ destination: The local path to save the file.
94
+ progress: Rich progress instance for tracking.
95
+ """
96
+ destination.parent.mkdir(parents=True, exist_ok=True)
97
+
98
+ response = requests.get(url, stream=True, timeout=300)
99
+ response.raise_for_status()
100
+
101
+ total_size = int(response.headers.get("content-length", 0))
102
+ task_id = progress.add_task(destination.name, total=total_size)
103
+
104
+ with open(destination, "wb") as file:
105
+ for chunk in response.iter_content(chunk_size=8192):
106
+ file.write(chunk)
107
+ progress.update(task_id, advance=len(chunk))
108
+
109
+
110
+ def _write_active_version(version: str) -> None:
111
+ """Write the active version to the version file.
112
+
113
+ Args:
114
+ version: The version string to write.
115
+ """
116
+ version_file = Config.CACHE_DIRECTORY.parent / "active_version"
117
+ version_file.parent.mkdir(parents=True, exist_ok=True)
118
+ version_file.write_text(version)
119
+ logger.info("Set active version to: %s", version)
120
+
121
+
122
+ def _cleanup_old_versions(current_version: str) -> None:
123
+ """Remove all cached versions except the current one.
124
+
125
+ Args:
126
+ current_version: The version to keep.
127
+ """
128
+ if not Config.CACHE_DIRECTORY.exists():
129
+ return
130
+
131
+ for item in Config.CACHE_DIRECTORY.iterdir():
132
+ if item.is_dir() and item.name != current_version:
133
+ logger.info("Removing old version: %s", item.name)
134
+ try:
135
+ shutil.rmtree(item)
136
+ except OSError as error:
137
+ logger.warning("Failed to remove %s: %s", item.name, error)
138
+
139
+
140
+ def _install_toolchain(version: str | None = None) -> None:
141
+ """Install the data toolchain for the specified version.
142
+
143
+ Downloads all required data files (database, FAISS index, BM25 indexes)
144
+ from remote storage. After successful installation, sets this version
145
+ as the active version and cleans up old versions.
146
+
147
+ Args:
148
+ version: The version to install. If None, fetches the latest version.
149
+
150
+ Raises:
151
+ ValueError: If version fetch fails or download errors occur.
152
+ """
153
+ console = _get_console()
154
+
155
+ if version:
156
+ resolved_version = version
157
+ else:
158
+ console.print("Fetching latest version...")
159
+ resolved_version = _fetch_latest_version()
160
+
161
+ console.print(f"Installing version: [bold]{resolved_version}[/bold]")
162
+
163
+ base_url = f"{Config.R2_ASSETS_BASE_URL}/assets/{resolved_version}"
164
+ cache_path = Config.CACHE_DIRECTORY / resolved_version
165
+
166
+ # Build list of all files to download
167
+ files_to_download: list[tuple[str, Path]] = []
168
+
169
+ for filename in REQUIRED_FILES:
170
+ url = f"{base_url}/{filename}"
171
+ destination = cache_path / filename
172
+ files_to_download.append((url, destination))
173
+
174
+ for directory_name, directory_files in BM25_DIRECTORIES.items():
175
+ for filename in directory_files:
176
+ url = f"{base_url}/{directory_name}/{filename}"
177
+ destination = cache_path / directory_name / filename
178
+ files_to_download.append((url, destination))
179
+
180
+ # Download all files with progress
181
+ with Progress(
182
+ TextColumn("[bold blue]{task.description}"),
183
+ BarColumn(),
184
+ DownloadColumn(),
185
+ TransferSpeedColumn(),
186
+ console=console,
187
+ ) as progress:
188
+ for url, destination in files_to_download:
189
+ if destination.exists():
190
+ logger.info("Skipping existing file: %s", destination.name)
191
+ continue
192
+ try:
193
+ _download_file(url, destination, progress)
194
+ except requests.exceptions.RequestException as error:
195
+ logger.error("Failed to download %s: %s", url, error)
196
+ raise ValueError(f"Failed to download {url}: {error}") from error
197
+
198
+ # Set this version as active and clean up old versions
199
+ _write_active_version(resolved_version)
200
+ _cleanup_old_versions(resolved_version)
201
+
202
+ console.print(f"[green]Installed data for version {resolved_version}[/green]")
203
+
204
+
205
+ @app.callback()
206
+ def main() -> None:
207
+ """Lean-Explore data CLI.
208
+
209
+ This callback exists only to prevent Typer from treating the first
210
+ sub-command as a *default* command when there is otherwise just one.
211
+ """
212
+ pass
213
+
214
+
215
+ @app.command()
216
+ def fetch(
217
+ version: str = typer.Option(
218
+ None,
219
+ "--version",
220
+ "-v",
221
+ help="Version to install (e.g., '20260127_103630'). Defaults to latest.",
222
+ ),
223
+ ) -> None:
224
+ """Fetch and install the data toolchain from remote storage.
225
+
226
+ Downloads the database, FAISS index, and BM25 indexes required for
227
+ local search. Automatically cleans up old cached versions.
228
+ """
229
+ _install_toolchain(version)
230
+
231
+
232
+ @app.command("clean")
233
+ def clean_data_toolchains() -> None:
234
+ """Remove all downloaded local data toolchains."""
235
+ console = _get_console()
236
+
237
+ cache_exists = Config.CACHE_DIRECTORY.exists()
238
+ version_file = Config.CACHE_DIRECTORY.parent / "active_version"
239
+ version_exists = version_file.exists()
240
+
241
+ if not cache_exists and not version_exists:
242
+ console.print("[yellow]No local data found to clean.[/yellow]")
243
+ return
244
+
245
+ if typer.confirm("Delete all cached data?", default=False, abort=True):
246
+ try:
247
+ if cache_exists:
248
+ shutil.rmtree(Config.CACHE_DIRECTORY)
249
+ if version_exists:
250
+ version_file.unlink()
251
+ console.print("[green]Data cache cleared.[/green]")
252
+ except OSError as error:
253
+ logger.error("Failed to clean cache directory: %s", error)
254
+ console.print(f"[bold red]Error cleaning data: {error}[/bold red]")
255
+ raise typer.Exit(code=1)
256
+
257
+
258
+ if __name__ == "__main__":
259
+ app()