eth-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,627 @@
1
+ """Generate embeddings and store in LanceDB.
2
+
3
+ Supports both full and incremental indexing:
4
+ - Full: Drop existing table and rebuild from scratch
5
+ - Incremental: Add/delete only changed chunks based on manifest
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import lancedb
13
+ from sentence_transformers import SentenceTransformer
14
+
15
+ from ..config import DEFAULT_EMBEDDING_MODEL, load_config
16
+ from ..logging import get_logger
17
+ from .chunker import Chunk, chunk_single_file
18
+ from .manifest import (
19
+ FileEntry,
20
+ Manifest,
21
+ ManifestCorruptedError,
22
+ compute_changes,
23
+ compute_file_hash,
24
+ get_file_mtime_ns,
25
+ load_manifest,
26
+ needs_full_rebuild,
27
+ save_manifest,
28
+ )
29
+
30
+ logger = get_logger("embedder")
31
+
32
+ # Default embedding model - good balance of quality and speed
33
+ DEFAULT_MODEL = DEFAULT_EMBEDDING_MODEL
34
+
35
+ # LanceDB table name
36
+ TABLE_NAME = "ethereum_specs"
37
+
38
+
39
def embed_and_store(
    chunks: list[Chunk],
    db_path: Path,
    model_name: str = DEFAULT_MODEL,
    table_name: str = TABLE_NAME,
) -> int:
    """
    Generate embeddings for chunks and store in LanceDB.

    This is the legacy full-rebuild function, preserved for backwards compatibility.
    For incremental indexing, use IncrementalEmbedder instead.

    Args:
        chunks: List of document chunks
        db_path: Path to LanceDB database
        model_name: Sentence transformer model name
        table_name: Name of table to create/update

    Returns:
        Number of chunks embedded and stored
    """
    if not chunks:
        logger.warning("No chunks to embed")
        return 0

    logger.info("Loading embedding model: %s", model_name)
    model = SentenceTransformer(model_name)

    logger.info("Generating embeddings for %d chunks...", len(chunks))
    texts = [chunk.content for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True)

    # Prepare records for LanceDB (strict zip: one embedding per chunk)
    records = [
        _chunk_to_record(chunk, embedding.tolist())
        for chunk, embedding in zip(chunks, embeddings, strict=True)
    ]

    # Store in LanceDB
    db_path.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(db_path))

    # Drop existing table if it exists (full-rebuild semantics)
    try:
        db.drop_table(table_name)
    except Exception as e:
        logger.debug("Table %s does not exist or could not be dropped: %s", table_name, e)

    logger.info("Storing %d records in LanceDB...", len(records))
    table = db.create_table(table_name, records)

    # Create an ANN index for fast search. IVF-PQ index training requires a
    # minimum number of vectors, so for small corpora index creation can
    # raise; in that case skip the index rather than failing the whole
    # store -- LanceDB falls back to brute-force (exact) search.
    logger.info("Creating vector index...")
    try:
        table.create_index(metric="cosine", num_partitions=16, num_sub_vectors=32)
    except Exception as e:
        logger.warning(
            "Skipping vector index creation (%s); search will be brute-force", e
        )

    logger.info("Successfully stored %d chunks in %s", len(records), db_path)
    return len(records)
96
+
97
+
98
def _chunk_to_record(chunk: Chunk, embedding: list[float]) -> dict[str, Any]:
    """Flatten a Chunk plus its embedding vector into a LanceDB record dict."""
    meta = chunk.metadata
    record = {
        "chunk_id": chunk.chunk_id,  # For incremental updates
        "content": chunk.content,
        "source": chunk.source,
        "fork": chunk.fork or "",
        "section": chunk.section or "",
        "chunk_type": chunk.chunk_type,
        "vector": embedding,
    }
    # Flatten metadata - specs/EIPs
    for key in ("eip", "title", "function_name", "h1", "h2", "h3"):
        record[key] = meta.get(key, "")
    # Client code metadata
    for key in ("client", "language"):
        record[key] = meta.get(key, "")
    return record
119
+
120
+
121
@dataclass
class IndexStats:
    """Statistics from an indexing operation."""

    total_files: int = 0
    files_added: int = 0
    files_modified: int = 0
    files_deleted: int = 0
    chunks_added: int = 0
    chunks_deleted: int = 0
    full_rebuild: bool = False
    rebuild_reason: str = ""

    @property
    def files_changed(self) -> int:
        # Total number of files touched in any way by this run.
        return self.files_added + self.files_modified + self.files_deleted

    @property
    def is_incremental(self) -> bool:
        # True when we updated in place rather than rebuilding from scratch.
        return self.files_changed > 0 and not self.full_rebuild

    @property
    def is_noop(self) -> bool:
        # True when nothing changed at all.
        return self.files_changed == 0 and not self.full_rebuild

    def summary(self) -> str:
        """Human-readable summary."""
        if self.full_rebuild:
            return f"Full rebuild: {self.chunks_added} chunks indexed ({self.rebuild_reason})"
        if self.is_noop:
            return "No changes detected"
        parts = [
            f"Incremental update: {self.files_added} added, ",
            f"{self.files_modified} modified, {self.files_deleted} deleted ",
            f"({self.chunks_added} chunks added, {self.chunks_deleted} deleted)",
        ]
        return "".join(parts)
157
+
158
+
159
@dataclass
class DryRunResult:
    """Result of a dry-run showing what would change."""

    would_rebuild: bool = False
    rebuild_reason: str = ""
    files_to_add: list[str] = field(default_factory=list)
    files_to_modify: list[str] = field(default_factory=list)
    files_to_delete: list[str] = field(default_factory=list)
    estimated_chunks_add: int = 0
    estimated_chunks_delete: int = 0

    def summary(self) -> str:
        """Human-readable description of the pending changes."""
        if self.would_rebuild:
            return f"Would perform full rebuild: {self.rebuild_reason}"
        any_changes = bool(self.files_to_add or self.files_to_modify or self.files_to_delete)
        if not any_changes:
            return "No changes detected"
        lines = ["Would perform incremental update:"]
        # One line per non-empty change category.
        for label, files in (
            ("Add", self.files_to_add),
            ("Modify", self.files_to_modify),
            ("Delete", self.files_to_delete),
        ):
            if files:
                lines.append(f"  {label} {len(files)} files")
        lines.append(f"  ~{self.estimated_chunks_add} chunks to add")
        lines.append(f"  ~{self.estimated_chunks_delete} chunks to delete")
        return "\n".join(lines)
186
+
187
+
188
class IncrementalEmbedder:
    """
    Incremental embedding and indexing for Ethereum specs.

    Uses a manifest to track indexed files and their chunk IDs,
    enabling incremental updates that only re-embed changed content.
    """

    # Rough per-file chunk count used only for dry-run estimates.
    _EST_CHUNKS_PER_FILE = 10

    def __init__(
        self,
        data_dir: Path,
        model_name: str | None = None,
        table_name: str = TABLE_NAME,
        batch_size: int = 32,
    ):
        """
        Initialize the incremental embedder.

        Args:
            data_dir: Base data directory (~/.ethereum-mcp)
            model_name: Embedding model name (None = use config/default)
            table_name: LanceDB table name
            batch_size: Batch size for embedding
        """
        self.data_dir = data_dir
        self.db_path = data_dir / "lancedb"
        self.manifest_path = data_dir / "manifest.json"
        self.table_name = table_name
        self.batch_size = batch_size

        # Load config if no model specified; config also overrides batch_size
        if model_name is None:
            config = load_config(data_dir=data_dir)
            model_name = config.embedding.model
            self.batch_size = config.embedding.batch_size

        self.model_name = model_name
        self._model: SentenceTransformer | None = None

    @property
    def model(self) -> SentenceTransformer:
        """Lazy-load the embedding model (deferred until first embedding call)."""
        if self._model is None:
            logger.info("Loading embedding model: %s", self.model_name)
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def get_current_config(self) -> dict[str, Any]:
        """Get current config for manifest comparison."""
        config = load_config(data_dir=self.data_dir)
        return {
            "embedding_model": self.model_name,
            "chunk_config": config.chunking.to_dict(),
        }

    def dry_run(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
    ) -> DryRunResult:
        """
        Check what would change without actually indexing.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types ('spec', 'eip', 'builder')

        Returns:
            DryRunResult describing what would happen
        """
        try:
            manifest = load_manifest(self.manifest_path)
        except ManifestCorruptedError:
            return DryRunResult(
                would_rebuild=True,
                rebuild_reason="Manifest corrupted",
            )

        config = self.get_current_config()
        needs_rebuild, reason = needs_full_rebuild(manifest, config)

        if needs_rebuild:
            return DryRunResult(
                would_rebuild=True,
                rebuild_reason=reason,
            )

        changes = compute_changes(manifest, current_files)

        result = DryRunResult()
        for change in changes:
            if change.change_type == "add":
                result.files_to_add.append(change.path)
                # Chunk counts for new files are unknown until chunking runs,
                # so use a rough average per file.
                result.estimated_chunks_add += self._EST_CHUNKS_PER_FILE
            elif change.change_type == "modify":
                result.files_to_modify.append(change.path)
                result.estimated_chunks_add += self._EST_CHUNKS_PER_FILE
                result.estimated_chunks_delete += len(change.old_chunk_ids)
            elif change.change_type == "delete":
                result.files_to_delete.append(change.path)
                result.estimated_chunks_delete += len(change.old_chunk_ids)

        return result

    def index(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        force_full: bool = False,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> IndexStats:
        """
        Index files, using incremental updates when possible.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types ('spec', 'eip', 'builder')
            force_full: Force full rebuild even if incremental is possible
            chunk_size: Chunk size for text splitting
            chunk_overlap: Overlap between chunks

        Returns:
            IndexStats with details of what was done
        """
        stats = IndexStats(total_files=len(current_files))

        # Load manifest; a corrupted manifest forces a full rebuild
        try:
            manifest = load_manifest(self.manifest_path)
        except ManifestCorruptedError as e:
            logger.warning("Manifest corrupted, performing full rebuild: %s", e)
            manifest = None
            force_full = True
            stats.rebuild_reason = "Manifest corrupted"

        # Check if full rebuild needed (model/chunking config changed, etc.)
        config = self.get_current_config()
        if not force_full:
            needs_rebuild, reason = needs_full_rebuild(manifest, config)
            if needs_rebuild:
                force_full = True
                stats.rebuild_reason = reason

        if force_full:
            return self._full_rebuild(
                current_files,
                file_types,
                chunk_size,
                chunk_overlap,
                stats,
            )

        # Incremental update (manifest is guaranteed non-None here)
        return self._incremental_update(
            manifest,
            current_files,
            file_types,
            chunk_size,
            chunk_overlap,
            stats,
        )

    def _full_rebuild(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_size: int,
        chunk_overlap: int,
        stats: IndexStats,
    ) -> IndexStats:
        """Perform full rebuild of the index: chunk everything, re-embed, rewrite manifest."""
        stats.full_rebuild = True
        if not stats.rebuild_reason:
            stats.rebuild_reason = "Forced rebuild"

        logger.info("Performing full rebuild: %s", stats.rebuild_reason)

        # Chunk all files and build a fresh manifest as we go
        all_chunks = []
        manifest = Manifest(
            embedding_model=self.model_name,
            chunk_config={"chunk_size": chunk_size, "chunk_overlap": chunk_overlap},
        )

        for rel_path, abs_path in current_files.items():
            file_type = file_types.get(rel_path, "spec")
            chunks = chunk_single_file(
                abs_path,
                file_type,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                base_path=self.data_dir,
            )
            all_chunks.extend(chunks)

            # Record hash/mtime/chunk ids so future runs can diff against this state
            manifest.files[rel_path] = FileEntry(
                sha256=compute_file_hash(abs_path),
                mtime_ns=get_file_mtime_ns(abs_path),
                chunk_ids=[c.chunk_id for c in chunks],
            )

        # Embed and store (drops and recreates the table)
        if all_chunks:
            stats.chunks_added = embed_and_store(
                all_chunks,
                self.db_path,
                model_name=self.model_name,
                table_name=self.table_name,
            )

        # Save manifest
        save_manifest(manifest, self.manifest_path)

        stats.files_added = len(current_files)
        return stats

    def _incremental_update(
        self,
        manifest: Manifest,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_size: int,
        chunk_overlap: int,
        stats: IndexStats,
    ) -> IndexStats:
        """Perform incremental update of the index."""
        changes = compute_changes(manifest, current_files)

        if not changes:
            logger.info("No changes detected, index is up to date")
            return stats

        logger.info(
            "Incremental update: %d files changed",
            len(changes),
        )

        # Open LanceDB
        db = lancedb.connect(str(self.db_path))
        try:
            table = db.open_table(self.table_name)
        except Exception:
            # Table doesn't exist, fall back to full rebuild
            logger.warning("Table doesn't exist, performing full rebuild")
            stats.rebuild_reason = "Table not found"
            return self._full_rebuild(
                current_files,
                file_types,
                chunk_size,
                chunk_overlap,
                stats,
            )

        chunks_to_add = []
        chunk_ids_to_delete = []

        for change in changes:
            if change.change_type == "add":
                stats.files_added += 1
            elif change.change_type == "modify":
                stats.files_modified += 1
                chunk_ids_to_delete.extend(change.old_chunk_ids)
            elif change.change_type == "delete":
                stats.files_deleted += 1
                chunk_ids_to_delete.extend(change.old_chunk_ids)
                # Remove from manifest; nothing to re-chunk for deletions
                del manifest.files[change.path]
                continue

            # Process add/modify: chunk the file
            abs_path = current_files.get(change.path)
            if abs_path is None:
                continue

            file_type = file_types.get(change.path, "spec")
            chunks = chunk_single_file(
                abs_path,
                file_type,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                base_path=self.data_dir,
            )
            chunks_to_add.extend(chunks)

            # Update manifest
            manifest.files[change.path] = FileEntry(
                sha256=compute_file_hash(abs_path),
                mtime_ns=get_file_mtime_ns(abs_path),
                chunk_ids=[c.chunk_id for c in chunks],
            )

        # Delete old chunks before adding replacements so modified files
        # never end up with both versions in the table
        if chunk_ids_to_delete:
            logger.info("Deleting %d old chunks...", len(chunk_ids_to_delete))
            self._delete_chunks(table, chunk_ids_to_delete)
            stats.chunks_deleted = len(chunk_ids_to_delete)

        # Add new chunks
        if chunks_to_add:
            logger.info("Adding %d new chunks...", len(chunks_to_add))
            self._add_chunks(table, chunks_to_add)
            stats.chunks_added = len(chunks_to_add)

        # Update manifest config
        manifest.embedding_model = self.model_name
        manifest.chunk_config = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}

        # Save manifest
        save_manifest(manifest, self.manifest_path)

        return stats

    def _delete_chunks(self, table: Any, chunk_ids: list[str], batch_size: int = 500) -> None:
        """Delete chunks from LanceDB by chunk_id.

        chunk_ids are interpolated into a SQL delete predicate, so single
        quotes are escaped SQL-style (doubled) to keep the filter well-formed
        even for hostile or unusual ids -- the previous implementation
        interpolated the raw id, which broke (and was injectable) for ids
        containing quotes. Deletes are batched into IN(...) predicates
        instead of one round-trip per chunk.

        Args:
            table: Open LanceDB table
            chunk_ids: IDs of chunks to remove
            batch_size: Max ids per delete predicate (keeps filters short)
        """
        for start in range(0, len(chunk_ids), batch_size):
            batch = chunk_ids[start : start + batch_size]
            quoted = ", ".join("'" + cid.replace("'", "''") + "'" for cid in batch)
            try:
                table.delete(f"chunk_id IN ({quoted})")
            except Exception as e:
                logger.warning("Failed to delete batch of %d chunks: %s", len(batch), e)

    def _add_chunks(self, table: Any, chunks: list[Chunk]) -> None:
        """Embed the given chunks and append them to the LanceDB table."""
        # Generate embeddings in batches
        texts = [c.content for c in chunks]
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            batch_size=self.batch_size,
        )

        # Create records (strict zip: one embedding per chunk)
        records = []
        for chunk, embedding in zip(chunks, embeddings, strict=True):
            records.append(_chunk_to_record(chunk, embedding.tolist()))

        # Add to table
        table.add(records)
532
+
533
+
534
class EmbeddingSearcher:
    """Search interface for embedded documents."""

    # Oversampling factor applied to the vector search when results are
    # post-filtered in Python: a selective filter (rare fork / chunk_type)
    # needs many more raw candidates than `limit` to fill the result set.
    # The previous fixed 2x oversample frequently under-returned.
    FILTER_OVERSAMPLE = 10

    def __init__(
        self,
        db_path: Path,
        model_name: str = DEFAULT_MODEL,
        table_name: str = TABLE_NAME,
    ):
        """Open the LanceDB table and load the embedding model.

        Args:
            db_path: Path to the LanceDB database directory
            model_name: Sentence transformer model name
            table_name: Name of the table to search
        """
        self.db = lancedb.connect(str(db_path))
        self.table = self.db.open_table(table_name)
        self.model = SentenceTransformer(model_name)

    def search(
        self,
        query: str,
        limit: int = 5,
        fork: str | None = None,
        chunk_type: str | None = None,
    ) -> list[dict]:
        """
        Search for relevant chunks.

        Args:
            query: Search query
            limit: Maximum results to return
            fork: Filter by fork name
            chunk_type: Filter by chunk type

        Returns:
            List of matching chunks with scores
        """
        query_embedding = self.model.encode(query).tolist()

        # Fetch extra results if filtering, then filter in Python (safer than
        # building a SQL `where` clause from caller-supplied strings)
        fetch_limit = limit * self.FILTER_OVERSAMPLE if (fork or chunk_type) else limit
        results = self.table.search(query_embedding).limit(fetch_limit).to_list()

        # Filter in Python - no SQL injection risk
        matches = []
        for r in results:
            if fork and r.get("fork") != fork:
                continue
            if chunk_type and r.get("chunk_type") != chunk_type:
                continue

            matches.append({
                "content": r["content"],
                "source": r["source"],
                "fork": r["fork"],
                "section": r["section"],
                "chunk_type": r["chunk_type"],
                "score": 1 - r["_distance"],  # Convert distance to similarity
                "eip": r.get("eip"),
                "function_name": r.get("function_name"),
                "client": r.get("client"),
                "language": r.get("language"),
                "chunk_id": r.get("chunk_id"),
            })

            if len(matches) >= limit:
                break

        return matches

    def search_constant(self, constant_name: str) -> list[dict]:
        """Search specifically for a constant definition."""
        return self.search(
            f"{constant_name} constant value",
            limit=10,
            chunk_type="constant",
        )

    def search_function(self, function_name: str, fork: str | None = None) -> list[dict]:
        """Search specifically for a function implementation."""
        return self.search(
            f"def {function_name}",
            limit=5,
            fork=fork,
            chunk_type="function",
        )

    def search_eip(self, query: str) -> list[dict]:
        """Search only EIPs."""
        return self.search(query, limit=10, chunk_type="eip")

    def get_stats(self) -> dict[str, Any]:
        """Get index statistics (row count and table name)."""
        count = self.table.count_rows()
        return {
            "total_chunks": count,
            "table_name": self.table.name,
        }
+ }