lean-explore 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (55)
  1. lean_explore/__init__.py +14 -1
  2. lean_explore/api/__init__.py +12 -1
  3. lean_explore/api/client.py +64 -176
  4. lean_explore/cli/__init__.py +10 -1
  5. lean_explore/cli/data_commands.py +184 -489
  6. lean_explore/cli/display.py +171 -0
  7. lean_explore/cli/main.py +51 -608
  8. lean_explore/config.py +244 -0
  9. lean_explore/extract/__init__.py +5 -0
  10. lean_explore/extract/__main__.py +368 -0
  11. lean_explore/extract/doc_gen4.py +200 -0
  12. lean_explore/extract/doc_parser.py +499 -0
  13. lean_explore/extract/embeddings.py +369 -0
  14. lean_explore/extract/github.py +110 -0
  15. lean_explore/extract/index.py +316 -0
  16. lean_explore/extract/informalize.py +653 -0
  17. lean_explore/extract/package_config.py +59 -0
  18. lean_explore/extract/package_registry.py +45 -0
  19. lean_explore/extract/package_utils.py +105 -0
  20. lean_explore/extract/types.py +25 -0
  21. lean_explore/mcp/__init__.py +11 -1
  22. lean_explore/mcp/app.py +14 -46
  23. lean_explore/mcp/server.py +20 -35
  24. lean_explore/mcp/tools.py +71 -205
  25. lean_explore/models/__init__.py +9 -0
  26. lean_explore/models/search_db.py +76 -0
  27. lean_explore/models/search_types.py +53 -0
  28. lean_explore/search/__init__.py +32 -0
  29. lean_explore/search/engine.py +651 -0
  30. lean_explore/search/scoring.py +156 -0
  31. lean_explore/search/service.py +68 -0
  32. lean_explore/search/tokenization.py +71 -0
  33. lean_explore/util/__init__.py +28 -0
  34. lean_explore/util/embedding_client.py +92 -0
  35. lean_explore/util/logging.py +22 -0
  36. lean_explore/util/openrouter_client.py +63 -0
  37. lean_explore/util/reranker_client.py +187 -0
  38. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/METADATA +32 -9
  39. lean_explore-1.0.1.dist-info/RECORD +43 -0
  40. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/WHEEL +1 -1
  41. lean_explore-1.0.1.dist-info/entry_points.txt +2 -0
  42. lean_explore/cli/agent.py +0 -788
  43. lean_explore/cli/config_utils.py +0 -481
  44. lean_explore/defaults.py +0 -114
  45. lean_explore/local/__init__.py +0 -1
  46. lean_explore/local/search.py +0 -1050
  47. lean_explore/local/service.py +0 -479
  48. lean_explore/shared/__init__.py +0 -1
  49. lean_explore/shared/models/__init__.py +0 -1
  50. lean_explore/shared/models/api.py +0 -117
  51. lean_explore/shared/models/db.py +0 -396
  52. lean_explore-0.3.0.dist-info/RECORD +0 -26
  53. lean_explore-0.3.0.dist-info/entry_points.txt +0 -2
  54. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/licenses/LICENSE +0 -0
  55. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/top_level.txt +0 -0
lean_explore/extract/index.py (new file)
@@ -0,0 +1,316 @@
+ """Build search indices from declaration data.
+
+ This module creates:
+ 1. FAISS IVF index for semantic search from embeddings
+ 2. BM25 indices for lexical search on declaration names
+
+ IVF (Inverted File) uses k-means clustering for efficient approximate
+ nearest neighbor search with controllable recall.
+ """
+
+ import json
+ import logging
+ import re
+ from pathlib import Path
+
+ import bm25s
+ import faiss
+ import numpy as np
+ from sqlalchemy import create_engine, select
+ from sqlalchemy.ext.asyncio import AsyncEngine
+ from sqlalchemy.orm import Session
+
+ from lean_explore.config import Config
+ from lean_explore.models import Declaration
+
+ logger = logging.getLogger(__name__)
+
+
+ def _get_device() -> str:
+     """Detect if CUDA GPU is available for FAISS.
+
+     Returns:
+         Device string: 'cuda' if CUDA GPU available, otherwise 'cpu'.
+         Note: FAISS doesn't support MPS, so Apple Silicon uses CPU.
+     """
+     if faiss.get_num_gpus() > 0:
+         device = "cuda"
+         logger.info("Using CUDA GPU for FAISS")
+     else:
+         device = "cpu"
+         logger.info("Using CPU for FAISS")
+     return device
+
+
+ def _load_embeddings_from_database(
+     session: Session, embedding_field: str
+ ) -> tuple[list[int], np.ndarray]:
+     """Load embeddings and IDs from the database.
+
+     Args:
+         session: Sync database session.
+         embedding_field: Name of the embedding field to load
+             (e.g., 'informalization_embedding').
+
+     Returns:
+         Tuple of (declaration_ids, embeddings_array) where embeddings_array
+         is a numpy array of shape (num_declarations, embedding_dimension).
+     """
+     stmt = select(Declaration.id, getattr(Declaration, embedding_field)).where(
+         getattr(Declaration, embedding_field).isnot(None)
+     )
+     result = session.execute(stmt)
+     rows = list(result.all())
+
+     if not rows:
+         logger.warning(f"No declarations found with {embedding_field}")
+         return [], np.array([])
+
+     declaration_ids = [row.id for row in rows]
+     embeddings_list = [row[1] for row in rows]
+     embeddings_array = np.array(embeddings_list, dtype=np.float32)
+
+     logger.info(
+         f"Loaded {len(declaration_ids)} embeddings with dimension "
+         f"{embeddings_array.shape[1]}"
+     )
+
+     return declaration_ids, embeddings_array
+
+
+ def _build_faiss_index(embeddings: np.ndarray, device: str) -> faiss.Index:
+     """Build a FAISS IVF index from embeddings.
+
+     Args:
+         embeddings: Numpy array of embeddings, shape (num_vectors, dimension).
+         device: Device to use ('cuda' or 'cpu').
+
+     Returns:
+         FAISS IVF index for fast approximate nearest neighbor search.
+     """
+     num_vectors = embeddings.shape[0]
+     dimension = embeddings.shape[1]
+
+     # Number of clusters: sqrt(n) is a good heuristic, minimum 256
+     nlist = max(256, int(np.sqrt(num_vectors)))
+
+     logger.info(
+         f"Building FAISS IVF index for {num_vectors} vectors with {nlist} clusters..."
+     )
+
+     # Use inner product (cosine similarity on normalized vectors)
+     quantizer = faiss.IndexFlatIP(dimension)
+     index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)
+
+     if device == "cuda" and faiss.get_num_gpus() > 0:
+         logger.info("Training IVF index on GPU")
+         resource = faiss.StandardGpuResources()
+         gpu_index = faiss.index_cpu_to_gpu(resource, 0, index)
+         gpu_index.train(embeddings)
+         gpu_index.add(embeddings)
+         index = faiss.index_gpu_to_cpu(gpu_index)
+     else:
+         logger.info("Training IVF index on CPU")
+         index.train(embeddings)
+         index.add(embeddings)
+
+     logger.info("FAISS IVF index built successfully")
+     return index
+
+
+ async def build_faiss_indices(
+     engine: AsyncEngine,
+     output_directory: Path | None = None,
+ ) -> None:
+     """Build the FAISS index for informalization embeddings.
+
+     This function creates a FAISS IVF index for informalization embeddings
+     and saves it to disk along with ID mappings.
+
+     Args:
+         engine: Async database engine (URL extracted for sync access).
+         output_directory: Directory to save indices. Defaults to active data path.
+     """
+     if output_directory is None:
+         output_directory = Config.ACTIVE_DATA_PATH
+
+     output_directory.mkdir(parents=True, exist_ok=True)
+     logger.info(f"Saving indices to {output_directory}")
+
+     device = _get_device()
+
+     embedding_fields = [
+         "informalization_embedding",
+     ]
+
+     # Use sync engine to avoid aiosqlite issues with binary data
+     sync_url = str(engine.url).replace("sqlite+aiosqlite", "sqlite")
+     sync_engine = create_engine(sync_url)
+
+     with Session(sync_engine) as session:
+         for i, embedding_field in enumerate(embedding_fields, 1):
+             logger.info(
+                 f"Processing {embedding_field} ({i}/{len(embedding_fields)})..."
+             )
+
+             declaration_ids, embeddings = _load_embeddings_from_database(
+                 session, embedding_field
+             )
+
+             if len(declaration_ids) == 0:
+                 logger.warning(f"Skipping {embedding_field} (no data)")
+                 continue
+
+             index = _build_faiss_index(embeddings, device)
+
+             # Defensive: ensure the index is on CPU before serialization (it already is)
+             if device == "cuda" and isinstance(index, faiss.GpuIndex):
+                 index = faiss.index_gpu_to_cpu(index)
+
+             index_filename = embedding_field.replace("_embedding", "_faiss.index")
+             index_path = output_directory / index_filename
+             faiss.write_index(index, str(index_path))
+             logger.info(f"Saved FAISS index to {index_path}")
+
+             ids_map_filename = embedding_field.replace(
+                 "_embedding", "_faiss_ids_map.json"
+             )
+             ids_map_path = output_directory / ids_map_filename
+             with open(ids_map_path, "w") as file:
+                 json.dump(declaration_ids, file)
+             logger.info(f"Saved ID mapping to {ids_map_path}")
+
+     sync_engine.dispose()
+     logger.info("All FAISS indices built successfully")
+
+
+ def _tokenize_spaced(text: str) -> list[str]:
+     """Tokenize text by splitting on dots, underscores, and camelCase.
+
+     Args:
+         text: Input text to tokenize.
+
+     Returns:
+         List of lowercase word tokens.
+     """
+     if not text:
+         return []
+     text = text.replace(".", " ").replace("_", " ")
+     text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
+     return re.findall(r"\w+", text.lower())
+
+
+ def _tokenize_raw(text: str) -> list[str]:
+     """Tokenize text as a single token (preserves dots).
+
+     Args:
+         text: Input text to tokenize.
+
+     Returns:
+         List with the full text as a single lowercase token.
+     """
+     if not text:
+         return []
+     return [text.lower()]
+
+
+ def _load_declaration_names(session: Session) -> tuple[list[int], list[str]]:
+     """Load all declaration IDs and names from the database.
+
+     Args:
+         session: Sync database session.
+
+     Returns:
+         Tuple of (declaration_ids, declaration_names).
+     """
+     stmt = select(Declaration.id, Declaration.name)
+     result = session.execute(stmt)
+     rows = list(result.all())
+
+     declaration_ids = [row.id for row in rows]
+     declaration_names = [row.name or "" for row in rows]
+
+     logger.info(f"Loaded {len(declaration_ids)} declarations for BM25 indexing")
+     return declaration_ids, declaration_names
+
+
+ def _build_bm25_indices(
+     declaration_names: list[str],
+ ) -> tuple[bm25s.BM25, bm25s.BM25]:
+     """Build BM25 indices over declaration names.
+
+     Creates two indices:
+     1. Spaced tokenization (splits on dots, underscores, camelCase)
+     2. Raw tokenization (full name as single token)
+
+     Args:
+         declaration_names: List of declaration names.
+
+     Returns:
+         Tuple of (bm25_spaced, bm25_raw) indices.
+     """
+     logger.info("Building BM25 indices over declaration names...")
+
+     corpus_spaced = [list(set(_tokenize_spaced(n))) for n in declaration_names]
+     corpus_raw = [list(set(_tokenize_raw(n))) for n in declaration_names]
+
+     bm25_spaced = bm25s.BM25(method="bm25+")
+     bm25_spaced.index(corpus_spaced)
+     logger.info("Built BM25 spaced index")
+
+     bm25_raw = bm25s.BM25(method="bm25+")
+     bm25_raw.index(corpus_raw)
+     logger.info("Built BM25 raw index")
+
+     return bm25_spaced, bm25_raw
+
+
+ async def build_bm25_indices(
+     engine: AsyncEngine,
+     output_directory: Path | None = None,
+ ) -> None:
+     """Build BM25 indices for declaration name search.
+
+     This function creates BM25 indices for lexical search on declaration
+     names and saves them to disk along with ID mappings.
+
+     Args:
+         engine: Async database engine (URL extracted for sync access).
+         output_directory: Directory to save indices. Defaults to active data path.
+     """
+     if output_directory is None:
+         output_directory = Config.ACTIVE_DATA_PATH
+
+     output_directory.mkdir(parents=True, exist_ok=True)
+     logger.info(f"Saving BM25 indices to {output_directory}")
+
+     sync_url = str(engine.url).replace("sqlite+aiosqlite", "sqlite")
+     sync_engine = create_engine(sync_url)
+
+     with Session(sync_engine) as session:
+         declaration_ids, declaration_names = _load_declaration_names(session)
+
+         if not declaration_ids:
+             logger.warning("No declarations found for BM25 indexing")
+             sync_engine.dispose()
+             return
+
+         bm25_spaced, bm25_raw = _build_bm25_indices(declaration_names)
+
+         # Save BM25 indices
+         bm25_spaced_path = output_directory / "bm25_name_spaced"
+         bm25_spaced.save(str(bm25_spaced_path))
+         logger.info(f"Saved BM25 spaced index to {bm25_spaced_path}")
+
+         bm25_raw_path = output_directory / "bm25_name_raw"
+         bm25_raw.save(str(bm25_raw_path))
+         logger.info(f"Saved BM25 raw index to {bm25_raw_path}")
+
+         # Save ID mapping (shared by both indices)
+         ids_map_path = output_directory / "bm25_ids_map.json"
+         with open(ids_map_path, "w") as file:
+             json.dump(declaration_ids, file)
+         logger.info(f"Saved BM25 ID mapping to {ids_map_path}")
+
+     sync_engine.dispose()
+     logger.info("All BM25 indices built successfully")