lean-explore 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lean_explore-1.0.0 → lean_explore-1.0.1}/PKG-INFO +1 -1
- {lean_explore-1.0.0 → lean_explore-1.0.1}/pyproject.toml +1 -1
- lean_explore-1.0.1/src/lean_explore/cli/data_commands.py +259 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/doc_gen4.py +1 -1
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/embeddings.py +1 -3
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/index.py +1 -2
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/package_utils.py +2 -2
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/mcp/tools.py +1 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/search/engine.py +2 -6
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/util/reranker_client.py +1 -3
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/PKG-INFO +1 -1
- lean_explore-1.0.0/src/lean_explore/cli/data_commands.py +0 -242
- {lean_explore-1.0.0 → lean_explore-1.0.1}/LICENSE +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/README.md +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/setup.cfg +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/api/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/api/client.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/cli/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/cli/display.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/cli/main.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/config.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/__main__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/doc_parser.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/github.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/informalize.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/package_config.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/package_registry.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/extract/types.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/mcp/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/mcp/app.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/mcp/server.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/models/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/models/search_db.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/models/search_types.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/search/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/search/scoring.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/search/service.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/search/tokenization.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/util/__init__.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/util/embedding_client.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/util/logging.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore/util/openrouter_client.py +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/SOURCES.txt +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/dependency_links.txt +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/entry_points.txt +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/requires.txt +0 -0
- {lean_explore-1.0.0 → lean_explore-1.0.1}/src/lean_explore.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# src/lean_explore/cli/data_commands.py
|
|
2
|
+
|
|
3
|
+
"""Manages local Lean Explore data toolchains.
|
|
4
|
+
|
|
5
|
+
Provides CLI commands to download, install, and clean data files (database,
|
|
6
|
+
FAISS index, BM25 indexes, etc.) from remote storage.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import shutil
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
import typer
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
from rich.progress import (
|
|
17
|
+
BarColumn,
|
|
18
|
+
DownloadColumn,
|
|
19
|
+
Progress,
|
|
20
|
+
TextColumn,
|
|
21
|
+
TransferSpeedColumn,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
from lean_explore.config import Config
|
|
25
|
+
|
|
26
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Typer sub-application that groups the data-management commands below.
app = typer.Typer(
    name="data",
    help=(
        "Manage local data toolchains for Lean Explore "
        "(e.g., download, list, select, clean)."
    ),
    no_args_is_help=True,
)
|
|
34
|
+
|
|
35
|
+
# Files required for the search engine, given relative to the version directory.
REQUIRED_FILES: list[str] = [
    "lean_explore.db",
    "informalization_faiss.index",
    "informalization_faiss_ids_map.json",
    "bm25_ids_map.json",
]

# File names found inside each BM25 index directory (same set for every index).
_BM25_INDEX_FILES: tuple[str, ...] = (
    "data.csc.index.npy",
    "indices.csc.index.npy",
    "indptr.csc.index.npy",
    "nonoccurrence_array.index.npy",
    "params.index.json",
    "vocab.index.json",
)

# BM25 index directories mapped to the files each one contains. Every
# directory gets its own list object so mutating one entry cannot affect another.
BM25_DIRECTORIES: dict[str, list[str]] = {
    directory_name: list(_BM25_INDEX_FILES)
    for directory_name in ("bm25_name_raw", "bm25_name_spaced")
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_console() -> Console:
    """Build a new Rich console instance used for CLI output."""
    console = Console()
    return console
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _fetch_latest_version() -> str:
    """Fetch the latest version identifier from remote storage.

    Reads ``assets/latest.txt`` below ``Config.R2_ASSETS_BASE_URL`` and returns
    its content with surrounding whitespace stripped.

    Returns:
        The version string (e.g., "20260127_103630").

    Raises:
        ValueError: If the request fails or the remote file contains no
            version identifier.
    """
    latest_url = f"{Config.R2_ASSETS_BASE_URL}/assets/latest.txt"
    try:
        response = requests.get(latest_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        logger.error("Failed to fetch latest version: %s", error)
        raise ValueError(f"Failed to fetch latest version: {error}") from error

    latest_version = response.text.strip()
    if not latest_version:
        # An empty identifier would yield malformed asset URLs and an install
        # path equal to the cache root, so fail early with the documented error.
        logger.error("Latest version file is empty: %s", latest_url)
        raise ValueError(
            f"Failed to fetch latest version: {latest_url} contains no version"
        )
    return latest_version
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _download_file(url: str, destination: Path, progress: Progress) -> None:
    """Download a file with progress tracking.

    The payload is streamed into a sibling ``<name>.part`` file and renamed
    onto ``destination`` only after the transfer completes, so an interrupted
    download never leaves a truncated file at the final path.

    Args:
        url: The URL to download from.
        destination: The local path to save the file.
        progress: Rich progress instance for tracking.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status.
        OSError: If the file cannot be written or renamed.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    partial_path = destination.with_name(f"{destination.name}.part")

    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))
            task_id = progress.add_task(destination.name, total=total_size)

            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        # Skip empty keep-alive chunks.
                        continue
                    file.write(chunk)
                    progress.update(task_id, advance=len(chunk))

        # Publish the file under its final name only once it is complete.
        partial_path.replace(destination)
    except BaseException:
        # Also covers Ctrl-C: never leave a partial download behind.
        partial_path.unlink(missing_ok=True)
        raise
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _write_active_version(version: str) -> None:
    """Record ``version`` as the active data version.

    The marker is a file named ``active_version`` stored next to the cache
    directory (in ``Config.CACHE_DIRECTORY.parent``).

    Args:
        version: The version string to write.
    """
    state_directory = Config.CACHE_DIRECTORY.parent
    state_directory.mkdir(parents=True, exist_ok=True)
    (state_directory / "active_version").write_text(version)
    logger.info("Set active version to: %s", version)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _cleanup_old_versions(current_version: str) -> None:
    """Delete every cached version directory except ``current_version``.

    Only sub-directories of ``Config.CACHE_DIRECTORY`` are considered; plain
    files are left alone. A directory that cannot be removed is logged as a
    warning and skipped.

    Args:
        current_version: The version to keep.
    """
    cache_root = Config.CACHE_DIRECTORY
    if not cache_root.exists():
        return

    for entry in cache_root.iterdir():
        if not entry.is_dir() or entry.name == current_version:
            continue

        logger.info("Removing old version: %s", entry.name)
        try:
            shutil.rmtree(entry)
        except OSError as error:
            logger.warning("Failed to remove %s: %s", entry.name, error)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _install_toolchain(version: str | None = None) -> None:
    """Install the data toolchain for the requested version.

    Downloads every file in ``REQUIRED_FILES`` and ``BM25_DIRECTORIES`` into
    ``Config.CACHE_DIRECTORY / <version>``, skipping files that already exist
    locally. Afterwards the version is marked active and other cached
    versions are removed.

    Args:
        version: The version to install. If None (or empty), the latest
            version is fetched from remote storage.

    Raises:
        ValueError: If version fetch fails or download errors occur.
    """
    console = _get_console()

    resolved_version = version
    if not resolved_version:
        console.print("Fetching latest version...")
        resolved_version = _fetch_latest_version()

    console.print(f"Installing version: [bold]{resolved_version}[/bold]")

    remote_root = f"{Config.R2_ASSETS_BASE_URL}/assets/{resolved_version}"
    local_root = Config.CACHE_DIRECTORY / resolved_version

    # Relative paths of all artifacts: top-level files first, then BM25 files.
    relative_paths: list[str] = list(REQUIRED_FILES)
    for directory_name, directory_files in BM25_DIRECTORIES.items():
        relative_paths.extend(
            f"{directory_name}/{filename}" for filename in directory_files
        )

    files_to_download: list[tuple[str, Path]] = [
        (f"{remote_root}/{relative_path}", local_root / relative_path)
        for relative_path in relative_paths
    ]

    # Download all files with progress
    progress_columns = (
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        DownloadColumn(),
        TransferSpeedColumn(),
    )
    with Progress(*progress_columns, console=console) as progress:
        for url, destination in files_to_download:
            if destination.exists():
                logger.info("Skipping existing file: %s", destination.name)
                continue

            try:
                _download_file(url, destination, progress)
            except requests.exceptions.RequestException as error:
                logger.error("Failed to download %s: %s", url, error)
                raise ValueError(f"Failed to download {url}: {error}") from error

    # Set this version as active and clean up old versions
    _write_active_version(resolved_version)
    _cleanup_old_versions(resolved_version)

    console.print(f"[green]Installed data for version {resolved_version}[/green]")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# NOTE: the docstring below is kept verbatim because Typer may surface it as
# help text; the function body is intentionally empty.
@app.callback()
def main() -> None:
    """Lean-Explore data CLI.

    This callback exists only to prevent Typer from treating the first
    sub-command as a *default* command when there is otherwise just one.
    """
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# CLI entry point for `data fetch`. The docstring is shown by Typer as the
# command's help text, so it stays user-oriented.
#   version: optional version identifier; when omitted the latest is installed.
#   Exits with status 1 (typer.Exit) when the installation fails.
@app.command()
def fetch(
    version: str = typer.Option(
        None,
        "--version",
        "-v",
        help="Version to install (e.g., '20260127_103630'). Defaults to latest.",
    ),
) -> None:
    """Fetch and install the data toolchain from remote storage.

    Downloads the database, FAISS index, and BM25 indexes required for
    local search. Automatically cleans up old cached versions.
    """
    try:
        _install_toolchain(version)
    except ValueError as error:
        # Same failure style as the `clean` command: print a readable message
        # and exit non-zero instead of dumping a traceback on the user.
        logger.error("Failed to install data toolchain: %s", error)
        _get_console().print(f"[bold red]Error fetching data: {error}[/bold red]")
        raise typer.Exit(code=1) from error
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# CLI entry point for `data clean`: after confirmation, removes the cache
# directory and the `active_version` marker file stored next to it.
# Exits with status 1 (typer.Exit) if deletion raises OSError.
@app.command("clean")
def clean_data_toolchains() -> None:
    """Remove all downloaded local data toolchains."""
    console = _get_console()

    has_cache = Config.CACHE_DIRECTORY.exists()
    active_version_file = Config.CACHE_DIRECTORY.parent / "active_version"
    has_version_file = active_version_file.exists()

    if not (has_cache or has_version_file):
        console.print("[yellow]No local data found to clean.[/yellow]")
        return

    # With abort=True a negative answer aborts inside typer.confirm itself.
    if not typer.confirm("Delete all cached data?", default=False, abort=True):
        return

    try:
        if has_cache:
            shutil.rmtree(Config.CACHE_DIRECTORY)
        if has_version_file:
            active_version_file.unlink()
        console.print("[green]Data cache cleared.[/green]")
    except OSError as error:
        logger.error("Failed to clean cache directory: %s", error)
        console.print(f"[bold red]Error cleaning data: {error}[/bold red]")
        raise typer.Exit(code=1)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
|
|
@@ -186,7 +186,7 @@ async def run_doc_gen4(
|
|
|
186
186
|
|
|
187
187
|
config = PACKAGE_REGISTRY[package_name]
|
|
188
188
|
workspace_path = Path("lean") / package_name
|
|
189
|
-
logger.info(f"\n{'='*50}\nPackage: {package_name}\n{'='*50}")
|
|
189
|
+
logger.info(f"\n{'=' * 50}\nPackage: {package_name}\n{'=' * 50}")
|
|
190
190
|
|
|
191
191
|
if fresh:
|
|
192
192
|
_clear_workspace_cache(workspace_path)
|
|
@@ -341,9 +341,7 @@ async def generate_embeddings(
|
|
|
341
341
|
# Phase 2: Generate embeddings for remaining declarations
|
|
342
342
|
logger.info("Phase 2: Generating embeddings for remaining declarations...")
|
|
343
343
|
client = EmbeddingClient(model_name=model_name, max_length=max_seq_length)
|
|
344
|
-
logger.info(
|
|
345
|
-
f"Using {client.model_name} on {client.device}"
|
|
346
|
-
)
|
|
344
|
+
logger.info(f"Using {client.model_name} on {client.device}")
|
|
347
345
|
|
|
348
346
|
total = len(remaining)
|
|
349
347
|
total_embeddings = 0
|
|
@@ -95,8 +95,7 @@ def _build_faiss_index(embeddings: np.ndarray, device: str) -> faiss.Index:
|
|
|
95
95
|
nlist = max(256, int(np.sqrt(num_vectors)))
|
|
96
96
|
|
|
97
97
|
logger.info(
|
|
98
|
-
f"Building FAISS IVF index for {num_vectors} vectors "
|
|
99
|
-
f"with {nlist} clusters..."
|
|
98
|
+
f"Building FAISS IVF index for {num_vectors} vectors with {nlist} clusters..."
|
|
100
99
|
)
|
|
101
100
|
|
|
102
101
|
# Use inner product (cosine similarity on normalized vectors)
|
|
@@ -91,11 +91,11 @@ def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> No
|
|
|
91
91
|
content = lakefile_path.read_text()
|
|
92
92
|
|
|
93
93
|
pattern = (
|
|
94
|
-
r
|
|
94
|
+
r"require «doc-gen4» from git\s+"
|
|
95
95
|
r'"https://github\.com/leanprover/doc-gen4"(?:\s+@\s+"[^"]*")?'
|
|
96
96
|
)
|
|
97
97
|
replacement = (
|
|
98
|
-
f
|
|
98
|
+
f"require «doc-gen4» from git\n"
|
|
99
99
|
f' "https://github.com/leanprover/doc-gen4" @ "{lean_version}"'
|
|
100
100
|
)
|
|
101
101
|
new_content = re.sub(pattern, replacement, content)
|
|
@@ -232,9 +232,7 @@ class SearchEngine:
|
|
|
232
232
|
Map of declaration ID to semantic similarity score.
|
|
233
233
|
"""
|
|
234
234
|
embedding_response = await self.embedding_client.embed([query], is_query=True)
|
|
235
|
-
query_embedding = np.array(
|
|
236
|
-
[embedding_response.embeddings[0]], dtype=np.float32
|
|
237
|
-
)
|
|
235
|
+
query_embedding = np.array([embedding_response.embeddings[0]], dtype=np.float32)
|
|
238
236
|
|
|
239
237
|
import faiss as faiss_module
|
|
240
238
|
|
|
@@ -581,9 +579,7 @@ class SearchEngine:
|
|
|
581
579
|
declarations_map = self._filter_by_packages(declarations_map, packages)
|
|
582
580
|
# Filter boosted_scores to only include filtered declarations
|
|
583
581
|
boosted_scores = [
|
|
584
|
-
(cid, score)
|
|
585
|
-
for cid, score in boosted_scores
|
|
586
|
-
if cid in declarations_map
|
|
582
|
+
(cid, score) for cid, score in boosted_scores if cid in declarations_map
|
|
587
583
|
]
|
|
588
584
|
logger.info(f"Filtered to {len(declarations_map)} in {packages}")
|
|
589
585
|
|
|
@@ -86,9 +86,7 @@ class RerankerClient:
|
|
|
86
86
|
Formatted string for the reranker model.
|
|
87
87
|
"""
|
|
88
88
|
return (
|
|
89
|
-
f"<Instruct>: {self.instruction}\n"
|
|
90
|
-
f"<Query>: {query}\n"
|
|
91
|
-
f"<Document>: {document}"
|
|
89
|
+
f"<Instruct>: {self.instruction}\n<Query>: {query}\n<Document>: {document}"
|
|
92
90
|
)
|
|
93
91
|
|
|
94
92
|
@torch.no_grad()
|
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
# src/lean_explore/cli/data_commands.py
|
|
2
|
-
|
|
3
|
-
"""Manages local Lean Explore data toolchains.
|
|
4
|
-
|
|
5
|
-
Provides CLI commands to download, install, and clean data files (database,
|
|
6
|
-
FAISS index, etc.) from remote storage using Pooch for checksums and caching.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import logging
|
|
10
|
-
import shutil
|
|
11
|
-
from typing import TypedDict
|
|
12
|
-
|
|
13
|
-
import pooch
|
|
14
|
-
import requests
|
|
15
|
-
import typer
|
|
16
|
-
from rich.console import Console
|
|
17
|
-
|
|
18
|
-
from lean_explore.config import Config
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class ManifestFileEntry(TypedDict):
|
|
22
|
-
"""A file entry in the manifest's toolchain version."""
|
|
23
|
-
|
|
24
|
-
remote_name: str
|
|
25
|
-
local_name: str
|
|
26
|
-
sha256: str
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ToolchainVersionInfo(TypedDict):
|
|
30
|
-
"""Version information for a specific toolchain in the manifest."""
|
|
31
|
-
|
|
32
|
-
assets_base_path_r2: str
|
|
33
|
-
files: list[ManifestFileEntry]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class Manifest(TypedDict):
|
|
37
|
-
"""Remote data manifest structure."""
|
|
38
|
-
|
|
39
|
-
default_toolchain: str
|
|
40
|
-
toolchains: dict[str, ToolchainVersionInfo]
|
|
41
|
-
|
|
42
|
-
logger = logging.getLogger(__name__)
|
|
43
|
-
|
|
44
|
-
app = typer.Typer(
|
|
45
|
-
name="data",
|
|
46
|
-
help="Manage local data toolchains for Lean Explore (e.g., download, list, "
|
|
47
|
-
"select, clean).",
|
|
48
|
-
no_args_is_help=True,
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _get_console() -> Console:
|
|
53
|
-
"""Create a Rich console instance for output."""
|
|
54
|
-
return Console()
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def _fetch_manifest() -> Manifest | None:
|
|
58
|
-
"""Fetches the remote data manifest.
|
|
59
|
-
|
|
60
|
-
Returns:
|
|
61
|
-
The manifest dictionary, or None if fetch fails.
|
|
62
|
-
"""
|
|
63
|
-
console = _get_console()
|
|
64
|
-
try:
|
|
65
|
-
response = requests.get(Config.MANIFEST_URL, timeout=10)
|
|
66
|
-
response.raise_for_status()
|
|
67
|
-
return response.json()
|
|
68
|
-
except requests.exceptions.RequestException as error:
|
|
69
|
-
logger.error("Failed to fetch manifest: %s", error)
|
|
70
|
-
console.print(f"[bold red]Error fetching manifest: {error}[/bold red]")
|
|
71
|
-
return None
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def _resolve_version(manifest: Manifest, version: str | None) -> str:
|
|
75
|
-
"""Resolves the version string to an actual toolchain version.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
manifest: The manifest dictionary containing toolchain information.
|
|
79
|
-
version: The requested version, or None/"stable" for default.
|
|
80
|
-
|
|
81
|
-
Returns:
|
|
82
|
-
The resolved version string.
|
|
83
|
-
|
|
84
|
-
Raises:
|
|
85
|
-
ValueError: If the version cannot be resolved.
|
|
86
|
-
"""
|
|
87
|
-
if not version or version.lower() == "stable":
|
|
88
|
-
resolved = manifest.get("default_toolchain")
|
|
89
|
-
if not resolved:
|
|
90
|
-
raise ValueError("No default_toolchain specified in manifest")
|
|
91
|
-
return resolved
|
|
92
|
-
return version
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def _build_file_registry(version_info: ToolchainVersionInfo) -> dict[str, str]:
|
|
96
|
-
"""Builds a Pooch registry from version info.
|
|
97
|
-
|
|
98
|
-
Args:
|
|
99
|
-
version_info: The version information from the manifest.
|
|
100
|
-
|
|
101
|
-
Returns:
|
|
102
|
-
A dictionary mapping remote filenames to SHA256 checksums.
|
|
103
|
-
"""
|
|
104
|
-
return {
|
|
105
|
-
file_entry["remote_name"]: f"sha256:{file_entry['sha256']}"
|
|
106
|
-
for file_entry in version_info.get("files", [])
|
|
107
|
-
if file_entry.get("remote_name") and file_entry.get("sha256")
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def _write_active_version(version: str) -> None:
|
|
112
|
-
"""Write the active version to the version file.
|
|
113
|
-
|
|
114
|
-
Args:
|
|
115
|
-
version: The version string to write.
|
|
116
|
-
"""
|
|
117
|
-
version_file = Config.CACHE_DIRECTORY.parent / "active_version"
|
|
118
|
-
version_file.parent.mkdir(parents=True, exist_ok=True)
|
|
119
|
-
version_file.write_text(version)
|
|
120
|
-
logger.info("Set active version to: %s", version)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
def _cleanup_old_versions(current_version: str) -> None:
|
|
124
|
-
"""Remove all cached versions except the current one.
|
|
125
|
-
|
|
126
|
-
Args:
|
|
127
|
-
current_version: The version to keep.
|
|
128
|
-
"""
|
|
129
|
-
if not Config.CACHE_DIRECTORY.exists():
|
|
130
|
-
return
|
|
131
|
-
|
|
132
|
-
for item in Config.CACHE_DIRECTORY.iterdir():
|
|
133
|
-
if item.is_dir() and item.name != current_version:
|
|
134
|
-
logger.info("Removing old version: %s", item.name)
|
|
135
|
-
try:
|
|
136
|
-
shutil.rmtree(item)
|
|
137
|
-
except OSError as error:
|
|
138
|
-
logger.warning("Failed to remove %s: %s", item.name, error)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def _install_toolchain(version: str | None = None) -> None:
|
|
142
|
-
"""Installs the data toolchain for the specified version.
|
|
143
|
-
|
|
144
|
-
Downloads and verifies all required data files (database, FAISS index, etc.)
|
|
145
|
-
using Pooch. Files are automatically decompressed and cached locally.
|
|
146
|
-
After successful installation, sets this version as the active version.
|
|
147
|
-
|
|
148
|
-
Args:
|
|
149
|
-
version: The version to install. If None, uses the default version.
|
|
150
|
-
|
|
151
|
-
Raises:
|
|
152
|
-
ValueError: If manifest fetch fails or version is not found.
|
|
153
|
-
"""
|
|
154
|
-
console = _get_console()
|
|
155
|
-
|
|
156
|
-
manifest = _fetch_manifest()
|
|
157
|
-
if not manifest:
|
|
158
|
-
raise ValueError("Failed to fetch manifest")
|
|
159
|
-
|
|
160
|
-
resolved_version = _resolve_version(manifest, version)
|
|
161
|
-
version_info = manifest.get("toolchains", {}).get(resolved_version)
|
|
162
|
-
if not version_info:
|
|
163
|
-
available = list(manifest.get("toolchains", {}).keys())
|
|
164
|
-
raise ValueError(
|
|
165
|
-
f"Version '{resolved_version}' not found. Available: {available}"
|
|
166
|
-
)
|
|
167
|
-
|
|
168
|
-
registry = _build_file_registry(version_info)
|
|
169
|
-
base_path = version_info.get("assets_base_path_r2", "")
|
|
170
|
-
base_url = f"{Config.R2_ASSETS_BASE_URL}/{base_path}/"
|
|
171
|
-
|
|
172
|
-
file_downloader = pooch.create(
|
|
173
|
-
path=Config.CACHE_DIRECTORY / resolved_version,
|
|
174
|
-
base_url=base_url,
|
|
175
|
-
registry=registry,
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
# Download and decompress each file
|
|
179
|
-
for file_entry in version_info.get("files", []):
|
|
180
|
-
remote_name = file_entry.get("remote_name")
|
|
181
|
-
local_name = file_entry.get("local_name")
|
|
182
|
-
if remote_name and local_name:
|
|
183
|
-
logger.info("Downloading %s -> %s", remote_name, local_name)
|
|
184
|
-
file_downloader.fetch(
|
|
185
|
-
remote_name, processor=pooch.Decompress(name=local_name)
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
# Set this version as the active version and clean up old versions
|
|
189
|
-
_write_active_version(resolved_version)
|
|
190
|
-
_cleanup_old_versions(resolved_version)
|
|
191
|
-
|
|
192
|
-
console.print(f"[green]Installed data for version {resolved_version}[/green]")
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
@app.callback()
|
|
196
|
-
def main() -> None:
|
|
197
|
-
"""Lean-Explore data CLI.
|
|
198
|
-
|
|
199
|
-
This callback exists only to prevent Typer from treating the first
|
|
200
|
-
sub-command as a *default* command when there is otherwise just one.
|
|
201
|
-
"""
|
|
202
|
-
pass
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
@app.command()
|
|
206
|
-
def fetch(
|
|
207
|
-
version: str = typer.Option(
|
|
208
|
-
None,
|
|
209
|
-
"--version",
|
|
210
|
-
"-v",
|
|
211
|
-
help="Version to install (e.g., '0.1.0'). Defaults to stable/latest.",
|
|
212
|
-
),
|
|
213
|
-
) -> None:
|
|
214
|
-
"""Fetches and installs the data toolchain from the remote repository.
|
|
215
|
-
|
|
216
|
-
Downloads the database, FAISS index, and other required data files.
|
|
217
|
-
Files are verified with SHA256 checksums and automatically decompressed.
|
|
218
|
-
"""
|
|
219
|
-
_install_toolchain(version)
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
@app.command("clean")
|
|
223
|
-
def clean_data_toolchains() -> None:
|
|
224
|
-
"""Removes all downloaded local data toolchains."""
|
|
225
|
-
console = _get_console()
|
|
226
|
-
|
|
227
|
-
if not Config.CACHE_DIRECTORY.exists():
|
|
228
|
-
console.print("[yellow]No local data found to clean.[/yellow]")
|
|
229
|
-
return
|
|
230
|
-
|
|
231
|
-
if typer.confirm("Delete all cached data?", default=False, abort=True):
|
|
232
|
-
try:
|
|
233
|
-
shutil.rmtree(Config.CACHE_DIRECTORY)
|
|
234
|
-
console.print("[green]Data cache cleared.[/green]")
|
|
235
|
-
except OSError as error:
|
|
236
|
-
logger.error("Failed to clean cache directory: %s", error)
|
|
237
|
-
console.print(f"[bold red]Error cleaning data: {error}[/bold red]")
|
|
238
|
-
raise typer.Exit(code=1)
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
if __name__ == "__main__":
|
|
242
|
-
app()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|