lean-explore 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lean_explore/__init__.py +14 -1
- lean_explore/api/__init__.py +12 -1
- lean_explore/api/client.py +60 -80
- lean_explore/cli/__init__.py +10 -1
- lean_explore/cli/data_commands.py +157 -479
- lean_explore/cli/display.py +171 -0
- lean_explore/cli/main.py +51 -608
- lean_explore/config.py +244 -0
- lean_explore/extract/__init__.py +5 -0
- lean_explore/extract/__main__.py +368 -0
- lean_explore/extract/doc_gen4.py +200 -0
- lean_explore/extract/doc_parser.py +499 -0
- lean_explore/extract/embeddings.py +371 -0
- lean_explore/extract/github.py +110 -0
- lean_explore/extract/index.py +317 -0
- lean_explore/extract/informalize.py +653 -0
- lean_explore/extract/package_config.py +59 -0
- lean_explore/extract/package_registry.py +45 -0
- lean_explore/extract/package_utils.py +105 -0
- lean_explore/extract/types.py +25 -0
- lean_explore/mcp/__init__.py +11 -1
- lean_explore/mcp/app.py +14 -46
- lean_explore/mcp/server.py +20 -35
- lean_explore/mcp/tools.py +70 -177
- lean_explore/models/__init__.py +9 -0
- lean_explore/models/search_db.py +76 -0
- lean_explore/models/search_types.py +53 -0
- lean_explore/search/__init__.py +32 -0
- lean_explore/search/engine.py +655 -0
- lean_explore/search/scoring.py +156 -0
- lean_explore/search/service.py +68 -0
- lean_explore/search/tokenization.py +71 -0
- lean_explore/util/__init__.py +28 -0
- lean_explore/util/embedding_client.py +92 -0
- lean_explore/util/logging.py +22 -0
- lean_explore/util/openrouter_client.py +63 -0
- lean_explore/util/reranker_client.py +189 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/METADATA +55 -10
- lean_explore-1.0.0.dist-info/RECORD +43 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
- lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
- lean_explore/cli/agent.py +0 -781
- lean_explore/cli/config_utils.py +0 -481
- lean_explore/defaults.py +0 -114
- lean_explore/local/__init__.py +0 -1
- lean_explore/local/search.py +0 -1050
- lean_explore/local/service.py +0 -392
- lean_explore/shared/__init__.py +0 -1
- lean_explore/shared/models/__init__.py +0 -1
- lean_explore/shared/models/api.py +0 -117
- lean_explore/shared/models/db.py +0 -396
- lean_explore-0.2.2.dist-info/RECORD +0 -26
- lean_explore-0.2.2.dist-info/entry_points.txt +0 -2
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Build search indices from declaration data.
|
|
2
|
+
|
|
3
|
+
This module creates:
|
|
4
|
+
1. FAISS IVF index for semantic search from embeddings
|
|
5
|
+
2. BM25 indices for lexical search on declaration names
|
|
6
|
+
|
|
7
|
+
IVF (Inverted File) uses k-means clustering for efficient approximate
|
|
8
|
+
nearest neighbor search with controllable recall.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import bm25s
|
|
17
|
+
import faiss
|
|
18
|
+
import numpy as np
|
|
19
|
+
from sqlalchemy import create_engine, select
|
|
20
|
+
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
21
|
+
from sqlalchemy.orm import Session
|
|
22
|
+
|
|
23
|
+
from lean_explore.config import Config
|
|
24
|
+
from lean_explore.models import Declaration
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _get_device() -> str:
    """Detect if CUDA GPU is available for FAISS.

    Returns:
        Device string: 'cuda' if CUDA GPU available, otherwise 'cpu'.
        Note: FAISS doesn't support MPS, so Apple Silicon uses CPU.
    """
    has_cuda = faiss.get_num_gpus() > 0
    if has_cuda:
        logger.info("Using CUDA GPU for FAISS")
    else:
        logger.info("Using CPU for FAISS")
    return "cuda" if has_cuda else "cpu"
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _load_embeddings_from_database(
    session: Session, embedding_field: str
) -> tuple[list[int], np.ndarray]:
    """Load embeddings and IDs from the database.

    Args:
        session: Sync database session.
        embedding_field: Name of the embedding field to load
            (e.g., 'informalization_embedding').

    Returns:
        Tuple of (declaration_ids, embeddings_array) where embeddings_array
        is a numpy array of shape (num_declarations, embedding_dimension).
    """
    column = getattr(Declaration, embedding_field)
    rows = session.execute(
        select(Declaration.id, column).where(column.isnot(None))
    ).all()

    if not rows:
        logger.warning(f"No declarations found with {embedding_field}")
        return [], np.array([])

    ids: list[int] = []
    vectors = []
    for row in rows:
        ids.append(row.id)
        vectors.append(row[1])
    matrix = np.array(vectors, dtype=np.float32)

    logger.info(
        f"Loaded {len(ids)} embeddings with dimension "
        f"{matrix.shape[1]}"
    )

    return ids, matrix
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _build_faiss_index(embeddings: np.ndarray, device: str) -> faiss.Index:
    """Build a FAISS IVF index from embeddings.

    Args:
        embeddings: Numpy array of embeddings, shape (num_vectors, dimension).
        device: Device to use ('cuda' or 'cpu').

    Returns:
        FAISS IVF index (always on CPU, ready for serialization) for fast
        approximate nearest neighbor search.
    """
    num_vectors = embeddings.shape[0]
    dimension = embeddings.shape[1]

    # Number of clusters: sqrt(n) is a good heuristic, minimum 256.
    # Clamp to num_vectors: FAISS k-means requires at least as many
    # training points as clusters, so small collections would otherwise
    # fail to train.
    nlist = min(max(256, int(np.sqrt(num_vectors))), num_vectors)

    logger.info(
        f"Building FAISS IVF index for {num_vectors} vectors "
        f"with {nlist} clusters..."
    )

    # Use inner product (cosine similarity on normalized vectors).
    # NOTE(review): assumes stored embeddings are already L2-normalized —
    # confirm against the embedding pipeline.
    quantizer = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)

    if device == "cuda" and faiss.get_num_gpus() > 0:
        logger.info("Training IVF index on GPU")
        resource = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(resource, 0, index)
        gpu_index.train(embeddings)
        gpu_index.add(embeddings)
        # Convert back so the returned index is always a CPU index.
        index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        logger.info("Training IVF index on CPU")
        index.train(embeddings)
        index.add(embeddings)

    logger.info("FAISS IVF index built successfully")
    return index
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
async def build_faiss_indices(
    engine: AsyncEngine,
    output_directory: Path | None = None,
) -> None:
    """Build FAISS index for informalization embeddings.

    This function creates a FAISS IVF index for informalization embeddings
    and saves it to disk along with ID mappings.

    Args:
        engine: Async database engine (URL extracted for sync access).
        output_directory: Directory to save indices. Defaults to active data path.
    """
    if output_directory is None:
        output_directory = Config.ACTIVE_DATA_PATH

    output_directory.mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving indices to {output_directory}")

    device = _get_device()

    embedding_fields = [
        "informalization_embedding",
    ]

    # Use sync engine to avoid aiosqlite issues with binary data
    sync_url = str(engine.url).replace("sqlite+aiosqlite", "sqlite")
    sync_engine = create_engine(sync_url)

    try:
        with Session(sync_engine) as session:
            for i, embedding_field in enumerate(embedding_fields, 1):
                logger.info(
                    f"Processing {embedding_field} ({i}/{len(embedding_fields)})..."
                )

                declaration_ids, embeddings = _load_embeddings_from_database(
                    session, embedding_field
                )

                if len(declaration_ids) == 0:
                    logger.warning(f"Skipping {embedding_field} (no data)")
                    continue

                # _build_faiss_index returns a CPU index on both code paths
                # (the GPU branch converts back before returning), so the
                # index is directly serializable here.
                index = _build_faiss_index(embeddings, device)

                index_filename = embedding_field.replace("_embedding", "_faiss.index")
                index_path = output_directory / index_filename
                faiss.write_index(index, str(index_path))
                logger.info(f"Saved FAISS index to {index_path}")

                # Row i of the FAISS index corresponds to declaration_ids[i];
                # persist that mapping alongside the index.
                ids_map_filename = embedding_field.replace(
                    "_embedding", "_faiss_ids_map.json"
                )
                ids_map_path = output_directory / ids_map_filename
                with open(ids_map_path, "w") as file:
                    json.dump(declaration_ids, file)
                logger.info(f"Saved ID mapping to {ids_map_path}")
    finally:
        # Dispose even when indexing raises, so pooled connections close.
        sync_engine.dispose()
    logger.info("All FAISS indices built successfully")
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _tokenize_spaced(text: str) -> list[str]:
|
|
189
|
+
"""Tokenize text with spacing on dots, underscores, and camelCase.
|
|
190
|
+
|
|
191
|
+
Args:
|
|
192
|
+
text: Input text to tokenize.
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
List of lowercase word tokens.
|
|
196
|
+
"""
|
|
197
|
+
if not text:
|
|
198
|
+
return []
|
|
199
|
+
text = text.replace(".", " ").replace("_", " ")
|
|
200
|
+
text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
|
|
201
|
+
return re.findall(r"\w+", text.lower())
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _tokenize_raw(text: str) -> list[str]:
|
|
205
|
+
"""Tokenize text as single token (preserves dots).
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
text: Input text to tokenize.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
List with the full text as a single lowercase token.
|
|
212
|
+
"""
|
|
213
|
+
if not text:
|
|
214
|
+
return []
|
|
215
|
+
return [text.lower()]
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _load_declaration_names(session: Session) -> tuple[list[int], list[str]]:
    """Load all declaration IDs and names from the database.

    Args:
        session: Sync database session.

    Returns:
        Tuple of (declaration_ids, declaration_names).
    """
    rows = session.execute(select(Declaration.id, Declaration.name)).all()

    ids: list[int] = []
    names: list[str] = []
    for row in rows:
        ids.append(row.id)
        # Coerce NULL names to "" so the tokenizers always get a string.
        names.append(row.name or "")

    logger.info(f"Loaded {len(ids)} declarations for BM25 indexing")
    return ids, names
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _build_bm25_indices(
    declaration_names: list[str],
) -> tuple[bm25s.BM25, bm25s.BM25]:
    """Build BM25 indices over declaration names.

    Creates two indices:
    1. Spaced tokenization (splits on dots, underscores, camelCase)
    2. Raw tokenization (full name as single token)

    Args:
        declaration_names: List of declaration names.

    Returns:
        Tuple of (bm25_spaced, bm25_raw) indices.
    """
    logger.info("Building BM25 indices over declaration names...")

    # Deduplicate tokens per document before indexing.
    spaced_corpus = [list(set(_tokenize_spaced(name))) for name in declaration_names]
    raw_corpus = [list(set(_tokenize_raw(name))) for name in declaration_names]

    spaced_index = bm25s.BM25(method="bm25+")
    spaced_index.index(spaced_corpus)
    logger.info("Built BM25 spaced index")

    raw_index = bm25s.BM25(method="bm25+")
    raw_index.index(raw_corpus)
    logger.info("Built BM25 raw index")

    return spaced_index, raw_index
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
async def build_bm25_indices(
    engine: AsyncEngine,
    output_directory: Path | None = None,
) -> None:
    """Build BM25 indices for declaration name search.

    This function creates BM25 indices for lexical search on declaration
    names and saves them to disk along with ID mappings.

    Args:
        engine: Async database engine (URL extracted for sync access).
        output_directory: Directory to save indices. Defaults to active data path.
    """
    if output_directory is None:
        output_directory = Config.ACTIVE_DATA_PATH

    output_directory.mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving BM25 indices to {output_directory}")

    # Use a sync engine for the read; indexing itself needs no session.
    sync_url = str(engine.url).replace("sqlite+aiosqlite", "sqlite")
    sync_engine = create_engine(sync_url)

    try:
        with Session(sync_engine) as session:
            declaration_ids, declaration_names = _load_declaration_names(session)

        if not declaration_ids:
            logger.warning("No declarations found for BM25 indexing")
            return

        bm25_spaced, bm25_raw = _build_bm25_indices(declaration_names)

        # Save BM25 indices
        bm25_spaced_path = output_directory / "bm25_name_spaced"
        bm25_spaced.save(str(bm25_spaced_path))
        logger.info(f"Saved BM25 spaced index to {bm25_spaced_path}")

        bm25_raw_path = output_directory / "bm25_name_raw"
        bm25_raw.save(str(bm25_raw_path))
        logger.info(f"Saved BM25 raw index to {bm25_raw_path}")

        # Save ID mapping (shared by both indices)
        ids_map_path = output_directory / "bm25_ids_map.json"
        with open(ids_map_path, "w") as file:
            json.dump(declaration_ids, file)
        logger.info(f"Saved BM25 ID mapping to {ids_map_path}")
    finally:
        # Dispose on every exit path (early return or exception), so
        # pooled connections are always released.
        sync_engine.dispose()
    logger.info("All BM25 indices built successfully")
|