eth-mcp 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eth_mcp-0.2.0.dist-info/METADATA +332 -0
- eth_mcp-0.2.0.dist-info/RECORD +21 -0
- eth_mcp-0.2.0.dist-info/WHEEL +4 -0
- eth_mcp-0.2.0.dist-info/entry_points.txt +3 -0
- ethereum_mcp/__init__.py +3 -0
- ethereum_mcp/cli.py +589 -0
- ethereum_mcp/clients.py +363 -0
- ethereum_mcp/config.py +324 -0
- ethereum_mcp/expert/__init__.py +1 -0
- ethereum_mcp/expert/guidance.py +300 -0
- ethereum_mcp/indexer/__init__.py +8 -0
- ethereum_mcp/indexer/chunker.py +563 -0
- ethereum_mcp/indexer/client_compiler.py +725 -0
- ethereum_mcp/indexer/compiler.py +245 -0
- ethereum_mcp/indexer/downloader.py +521 -0
- ethereum_mcp/indexer/embedder.py +627 -0
- ethereum_mcp/indexer/manifest.py +411 -0
- ethereum_mcp/logging.py +85 -0
- ethereum_mcp/models.py +126 -0
- ethereum_mcp/server.py +555 -0
- ethereum_mcp/tools/__init__.py +1 -0
|
@@ -0,0 +1,627 @@
|
|
|
1
|
+
"""Generate embeddings and store in LanceDB.
|
|
2
|
+
|
|
3
|
+
Supports both full and incremental indexing:
|
|
4
|
+
- Full: Drop existing table and rebuild from scratch
|
|
5
|
+
- Incremental: Add/delete only changed chunks based on manifest
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import lancedb
|
|
13
|
+
from sentence_transformers import SentenceTransformer
|
|
14
|
+
|
|
15
|
+
from ..config import DEFAULT_EMBEDDING_MODEL, load_config
|
|
16
|
+
from ..logging import get_logger
|
|
17
|
+
from .chunker import Chunk, chunk_single_file
|
|
18
|
+
from .manifest import (
|
|
19
|
+
FileEntry,
|
|
20
|
+
Manifest,
|
|
21
|
+
ManifestCorruptedError,
|
|
22
|
+
compute_changes,
|
|
23
|
+
compute_file_hash,
|
|
24
|
+
get_file_mtime_ns,
|
|
25
|
+
load_manifest,
|
|
26
|
+
needs_full_rebuild,
|
|
27
|
+
save_manifest,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
logger = get_logger("embedder")
|
|
31
|
+
|
|
32
|
+
# Default embedding model - good balance of quality and speed
|
|
33
|
+
DEFAULT_MODEL = DEFAULT_EMBEDDING_MODEL
|
|
34
|
+
|
|
35
|
+
# LanceDB table name
|
|
36
|
+
TABLE_NAME = "ethereum_specs"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def embed_and_store(
    chunks: list[Chunk],
    db_path: Path,
    model_name: str = DEFAULT_MODEL,
    table_name: str = TABLE_NAME,
) -> int:
    """
    Generate embeddings for chunks and store in LanceDB.

    This is the legacy full-rebuild function, preserved for backwards compatibility.
    For incremental indexing, use IncrementalEmbedder instead.

    Args:
        chunks: List of document chunks
        db_path: Path to LanceDB database
        model_name: Sentence transformer model name
        table_name: Name of table to create/update

    Returns:
        Number of chunks embedded and stored
    """
    if not chunks:
        logger.warning("No chunks to embed")
        return 0

    logger.info("Loading embedding model: %s", model_name)
    model = SentenceTransformer(model_name)

    logger.info("Generating embeddings for %d chunks...", len(chunks))
    texts = [chunk.content for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True)

    # Prepare records for LanceDB
    records = [
        _chunk_to_record(chunk, embedding.tolist())
        for chunk, embedding in zip(chunks, embeddings, strict=True)
    ]

    # Store in LanceDB
    db_path.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(db_path))

    # Drop existing table if it exists (full-rebuild semantics)
    try:
        db.drop_table(table_name)
    except Exception as e:
        logger.debug("Table %s does not exist or could not be dropped: %s", table_name, e)

    logger.info("Storing %d records in LanceDB...", len(records))
    table = db.create_table(table_name, records)

    # Create vector index for fast search. IVF-PQ training needs a minimum
    # number of rows per partition, so on small corpora index creation can
    # raise; treat it as best-effort -- brute-force (flat) search still works
    # without an index, it is just slower.
    logger.info("Creating vector index...")
    try:
        table.create_index(metric="cosine", num_partitions=16, num_sub_vectors=32)
    except Exception as e:
        logger.warning("Vector index creation failed (%s); continuing with flat search", e)

    logger.info("Successfully stored %d chunks in %s", len(records), db_path)
    return len(records)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _chunk_to_record(chunk: Chunk, embedding: list[float]) -> dict[str, Any]:
    """Flatten a Chunk plus its embedding vector into a LanceDB row dict."""
    meta = chunk.metadata
    record: dict[str, Any] = {
        "chunk_id": chunk.chunk_id,  # For incremental updates
        "content": chunk.content,
        "source": chunk.source,
        "fork": chunk.fork or "",
        "section": chunk.section or "",
        "chunk_type": chunk.chunk_type,
        "vector": embedding,
    }
    # Flatten metadata - specs/EIPs
    for key in ("eip", "title", "function_name", "h1", "h2", "h3"):
        record[key] = meta.get(key, "")
    # Client code metadata
    for key in ("client", "language"):
        record[key] = meta.get(key, "")
    return record
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class IndexStats:
    """Statistics from an indexing operation."""

    total_files: int = 0
    files_added: int = 0
    files_modified: int = 0
    files_deleted: int = 0
    chunks_added: int = 0
    chunks_deleted: int = 0
    full_rebuild: bool = False
    rebuild_reason: str = ""

    @property
    def files_changed(self) -> int:
        """Total number of files touched (added + modified + deleted)."""
        return sum((self.files_added, self.files_modified, self.files_deleted))

    @property
    def is_incremental(self) -> bool:
        """True when a non-rebuild update actually changed files."""
        return self.files_changed > 0 and not self.full_rebuild

    @property
    def is_noop(self) -> bool:
        """True when no rebuild happened and no files changed."""
        return self.files_changed == 0 and not self.full_rebuild

    def summary(self) -> str:
        """Human-readable summary."""
        if self.full_rebuild:
            return f"Full rebuild: {self.chunks_added} chunks indexed ({self.rebuild_reason})"
        if self.is_noop:
            return "No changes detected"
        return (
            f"Incremental update: {self.files_added} added, "
            f"{self.files_modified} modified, {self.files_deleted} deleted "
            f"({self.chunks_added} chunks added, {self.chunks_deleted} deleted)"
        )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass
class DryRunResult:
    """Result of a dry-run showing what would change."""

    would_rebuild: bool = False
    rebuild_reason: str = ""
    files_to_add: list[str] = field(default_factory=list)
    files_to_modify: list[str] = field(default_factory=list)
    files_to_delete: list[str] = field(default_factory=list)
    estimated_chunks_add: int = 0
    estimated_chunks_delete: int = 0

    def summary(self) -> str:
        """Render a human-readable description of the pending changes."""
        if self.would_rebuild:
            return f"Would perform full rebuild: {self.rebuild_reason}"
        change_sets = (
            ("Add", self.files_to_add),
            ("Modify", self.files_to_modify),
            ("Delete", self.files_to_delete),
        )
        if not any(files for _, files in change_sets):
            return "No changes detected"
        lines = ["Would perform incremental update:"]
        lines.extend(
            f"  {verb} {len(files)} files" for verb, files in change_sets if files
        )
        lines.append(f"  ~{self.estimated_chunks_add} chunks to add")
        lines.append(f"  ~{self.estimated_chunks_delete} chunks to delete")
        return "\n".join(lines)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class IncrementalEmbedder:
    """
    Incremental embedding and indexing for Ethereum specs.

    Uses a manifest to track indexed files and their chunk IDs,
    enabling incremental updates that only re-embed changed content.
    """

    def __init__(
        self,
        data_dir: Path,
        model_name: str | None = None,
        table_name: str = TABLE_NAME,
        batch_size: int = 32,
    ):
        """
        Initialize the incremental embedder.

        Args:
            data_dir: Base data directory (~/.ethereum-mcp)
            model_name: Embedding model name (None = use config/default)
            table_name: LanceDB table name
            batch_size: Batch size for embedding
        """
        self.data_dir = data_dir
        self.db_path = data_dir / "lancedb"
        self.manifest_path = data_dir / "manifest.json"
        self.table_name = table_name
        self.batch_size = batch_size

        # Load config if no model specified; config also supplies batch_size
        # in that case, overriding the constructor argument.
        if model_name is None:
            config = load_config(data_dir=data_dir)
            model_name = config.embedding.model
            self.batch_size = config.embedding.batch_size

        self.model_name = model_name
        self._model: SentenceTransformer | None = None

    @property
    def model(self) -> SentenceTransformer:
        """Lazy-load the embedding model (construction is expensive)."""
        if self._model is None:
            logger.info("Loading embedding model: %s", self.model_name)
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def get_current_config(self) -> dict[str, Any]:
        """Get current config for manifest comparison."""
        config = load_config(data_dir=self.data_dir)
        return {
            "embedding_model": self.model_name,
            "chunk_config": config.chunking.to_dict(),
        }

    def dry_run(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
    ) -> DryRunResult:
        """
        Check what would change without actually indexing.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types ('spec', 'eip', 'builder')

        Returns:
            DryRunResult describing what would happen
        """
        try:
            manifest = load_manifest(self.manifest_path)
        except ManifestCorruptedError:
            return DryRunResult(
                would_rebuild=True,
                rebuild_reason="Manifest corrupted",
            )

        config = self.get_current_config()
        needs_rebuild, reason = needs_full_rebuild(manifest, config)

        if needs_rebuild:
            return DryRunResult(
                would_rebuild=True,
                rebuild_reason=reason,
            )

        changes = compute_changes(manifest, current_files)

        result = DryRunResult()
        for change in changes:
            if change.change_type == "add":
                result.files_to_add.append(change.path)
                # Estimate chunks (rough: ~10 chunks per file)
                result.estimated_chunks_add += 10
            elif change.change_type == "modify":
                result.files_to_modify.append(change.path)
                result.estimated_chunks_add += 10
                result.estimated_chunks_delete += len(change.old_chunk_ids)
            elif change.change_type == "delete":
                result.files_to_delete.append(change.path)
                result.estimated_chunks_delete += len(change.old_chunk_ids)

        return result

    def index(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        force_full: bool = False,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> IndexStats:
        """
        Index files, using incremental updates when possible.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types ('spec', 'eip', 'builder')
            force_full: Force full rebuild even if incremental is possible
            chunk_size: Chunk size for text splitting
            chunk_overlap: Overlap between chunks

        Returns:
            IndexStats with details of what was done
        """
        stats = IndexStats(total_files=len(current_files))

        # Load manifest; a corrupt manifest forces a full rebuild.
        try:
            manifest = load_manifest(self.manifest_path)
        except ManifestCorruptedError as e:
            logger.warning("Manifest corrupted, performing full rebuild: %s", e)
            manifest = None
            force_full = True
            stats.rebuild_reason = "Manifest corrupted"

        # Check if full rebuild needed (e.g. model or chunk config changed).
        config = self.get_current_config()
        if not force_full:
            needs_rebuild, reason = needs_full_rebuild(manifest, config)
            if needs_rebuild:
                force_full = True
                stats.rebuild_reason = reason

        if force_full:
            return self._full_rebuild(
                current_files,
                file_types,
                chunk_size,
                chunk_overlap,
                stats,
            )

        # Incremental update
        return self._incremental_update(
            manifest,
            current_files,
            file_types,
            chunk_size,
            chunk_overlap,
            stats,
        )

    def _full_rebuild(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_size: int,
        chunk_overlap: int,
        stats: IndexStats,
    ) -> IndexStats:
        """Perform full rebuild of the index."""
        stats.full_rebuild = True
        if not stats.rebuild_reason:
            stats.rebuild_reason = "Forced rebuild"

        logger.info("Performing full rebuild: %s", stats.rebuild_reason)

        # Chunk all files and build a fresh manifest alongside.
        all_chunks = []
        manifest = Manifest(
            embedding_model=self.model_name,
            chunk_config={"chunk_size": chunk_size, "chunk_overlap": chunk_overlap},
        )

        for rel_path, abs_path in current_files.items():
            file_type = file_types.get(rel_path, "spec")
            chunks = chunk_single_file(
                abs_path,
                file_type,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                base_path=self.data_dir,
            )
            all_chunks.extend(chunks)

            # Update manifest
            manifest.files[rel_path] = FileEntry(
                sha256=compute_file_hash(abs_path),
                mtime_ns=get_file_mtime_ns(abs_path),
                chunk_ids=[c.chunk_id for c in chunks],
            )

        # Embed and store
        if all_chunks:
            stats.chunks_added = embed_and_store(
                all_chunks,
                self.db_path,
                model_name=self.model_name,
                table_name=self.table_name,
            )

        # Save manifest only after a successful store.
        save_manifest(manifest, self.manifest_path)

        stats.files_added = len(current_files)
        return stats

    def _incremental_update(
        self,
        manifest: Manifest,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_size: int,
        chunk_overlap: int,
        stats: IndexStats,
    ) -> IndexStats:
        """Perform incremental update of the index."""
        changes = compute_changes(manifest, current_files)

        if not changes:
            logger.info("No changes detected, index is up to date")
            return stats

        logger.info(
            "Incremental update: %d files changed",
            len(changes),
        )

        # Open LanceDB
        db = lancedb.connect(str(self.db_path))
        try:
            table = db.open_table(self.table_name)
        except Exception:
            # Table doesn't exist, fall back to full rebuild
            logger.warning("Table doesn't exist, performing full rebuild")
            stats.rebuild_reason = "Table not found"
            return self._full_rebuild(
                current_files,
                file_types,
                chunk_size,
                chunk_overlap,
                stats,
            )

        chunks_to_add = []
        chunk_ids_to_delete = []

        for change in changes:
            if change.change_type == "add":
                stats.files_added += 1
            elif change.change_type == "modify":
                stats.files_modified += 1
                chunk_ids_to_delete.extend(change.old_chunk_ids)
            elif change.change_type == "delete":
                stats.files_deleted += 1
                chunk_ids_to_delete.extend(change.old_chunk_ids)
                # Remove from manifest
                del manifest.files[change.path]
                continue

            # Process add/modify: chunk the file
            abs_path = current_files.get(change.path)
            if abs_path is None:
                continue

            file_type = file_types.get(change.path, "spec")
            chunks = chunk_single_file(
                abs_path,
                file_type,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                base_path=self.data_dir,
            )
            chunks_to_add.extend(chunks)

            # Update manifest
            manifest.files[change.path] = FileEntry(
                sha256=compute_file_hash(abs_path),
                mtime_ns=get_file_mtime_ns(abs_path),
                chunk_ids=[c.chunk_id for c in chunks],
            )

        # Delete old chunks
        if chunk_ids_to_delete:
            logger.info("Deleting %d old chunks...", len(chunk_ids_to_delete))
            self._delete_chunks(table, chunk_ids_to_delete)
            stats.chunks_deleted = len(chunk_ids_to_delete)

        # Add new chunks
        if chunks_to_add:
            logger.info("Adding %d new chunks...", len(chunks_to_add))
            self._add_chunks(table, chunks_to_add)
            stats.chunks_added = len(chunks_to_add)

        # Update manifest config
        manifest.embedding_model = self.model_name
        manifest.chunk_config = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}

        # Save manifest
        save_manifest(manifest, self.manifest_path)

        return stats

    def _delete_chunks(self, table: Any, chunk_ids: list[str]) -> None:
        """Delete chunks from LanceDB by chunk_id.

        Uses batched ``IN`` predicates instead of one delete per id, so the
        table is scanned O(n / batch) times rather than O(n) times. String
        literals are single-quoted with embedded single quotes doubled --
        the standard SQL escape -- which closes the injection/fragility hole
        of interpolating raw ids into a double-quoted (identifier-quoted)
        filter.
        """
        batch = 500  # keep each SQL predicate a manageable size
        for start in range(0, len(chunk_ids), batch):
            ids = chunk_ids[start : start + batch]
            quoted = ", ".join("'" + cid.replace("'", "''") + "'" for cid in ids)
            try:
                table.delete(f"chunk_id IN ({quoted})")
            except Exception as e:
                logger.warning("Failed to delete batch of %d chunks: %s", len(ids), e)

    def _add_chunks(self, table: Any, chunks: list[Chunk]) -> None:
        """Add chunks to LanceDB."""
        # Generate embeddings in batches
        texts = [c.content for c in chunks]
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            batch_size=self.batch_size,
        )

        # Create records
        records = []
        for chunk, embedding in zip(chunks, embeddings, strict=True):
            records.append(_chunk_to_record(chunk, embedding.tolist()))

        # Add to table
        table.add(records)
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
class EmbeddingSearcher:
    """Search interface for embedded documents."""

    def __init__(
        self,
        db_path: Path,
        model_name: str = DEFAULT_MODEL,
        table_name: str = TABLE_NAME,
    ):
        # Open the database/table and load the model eagerly; a missing
        # table raises here rather than on first search.
        self.db = lancedb.connect(str(db_path))
        self.table = self.db.open_table(table_name)
        self.model = SentenceTransformer(model_name)

    def search(
        self,
        query: str,
        limit: int = 5,
        fork: str | None = None,
        chunk_type: str | None = None,
    ) -> list[dict]:
        """
        Search for relevant chunks.

        Args:
            query: Search query
            limit: Maximum results to return
            fork: Filter by fork name
            chunk_type: Filter by chunk type

        Returns:
            List of matching chunks with scores
        """
        query_embedding = self.model.encode(query).tolist()

        # When post-filtering, over-fetch so enough rows survive the filter;
        # filtering happens in Python, keeping SQL (and injection risk) out.
        fetch_limit = limit * 2 if (fork or chunk_type) else limit
        candidates = self.table.search(query_embedding).limit(fetch_limit).to_list()

        matches: list[dict] = []
        for row in candidates:
            fork_ok = fork is None or not fork or row.get("fork") == fork
            type_ok = chunk_type is None or not chunk_type or row.get("chunk_type") == chunk_type
            if not (fork_ok and type_ok):
                continue

            matches.append({
                "content": row["content"],
                "source": row["source"],
                "fork": row["fork"],
                "section": row["section"],
                "chunk_type": row["chunk_type"],
                "score": 1 - row["_distance"],  # Convert distance to similarity
                "eip": row.get("eip"),
                "function_name": row.get("function_name"),
                "client": row.get("client"),
                "language": row.get("language"),
                "chunk_id": row.get("chunk_id"),
            })

            if len(matches) >= limit:
                break

        return matches

    def search_constant(self, constant_name: str) -> list[dict]:
        """Search specifically for a constant definition."""
        return self.search(
            f"{constant_name} constant value",
            limit=10,
            chunk_type="constant",
        )

    def search_function(self, function_name: str, fork: str | None = None) -> list[dict]:
        """Search specifically for a function implementation."""
        return self.search(
            f"def {function_name}",
            limit=5,
            fork=fork,
            chunk_type="function",
        )

    def search_eip(self, query: str) -> list[dict]:
        """Search only EIPs."""
        return self.search(query, limit=10, chunk_type="eip")

    def get_stats(self) -> dict[str, Any]:
        """Get index statistics."""
        # Row count comes straight from the table.
        return {
            "total_chunks": self.table.count_rows(),
            "table_name": self.table.name,
        }
|