sol-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,755 @@
+ """Build vector embeddings and store in LanceDB.
+
+ Creates a searchable vector index from chunked content.
+ Supports incremental indexing to avoid full rebuilds.
+ """
+
+ import json
+ import logging
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Callable
+
+ from .chunker import Chunk
+ from .manifest import (
+     FileChange,
+     FileEntry,
+     Manifest,
+     compute_changes,
+     compute_file_hash,
+     get_file_mtime_ns,
+     load_manifest,
+     needs_full_rebuild,
+     save_manifest,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Try to import embedding dependencies
+ try:
+     import lancedb
+     from sentence_transformers import SentenceTransformer
+
+     DEPS_AVAILABLE = True
+ except ImportError:
+     DEPS_AVAILABLE = False
+
+
+ # Default embedding model
+ DEFAULT_MODEL = "all-MiniLM-L6-v2"
+
+ # Default data directory
+ DEFAULT_DATA_DIR = Path.home() / ".solana-mcp"
+
+
+ @dataclass
+ class IndexStats:
+     """Statistics from an indexing operation."""
+
+     full_rebuild: bool = False
+     rebuild_reason: str = ""
+     files_added: int = 0
+     files_modified: int = 0
+     files_deleted: int = 0
+     chunks_added: int = 0
+     chunks_deleted: int = 0
+     errors: list[str] = field(default_factory=list)
+
+     @property
+     def files_changed(self) -> int:
+         return self.files_added + self.files_modified + self.files_deleted
+
+     @property
+     def is_incremental(self) -> bool:
+         return not self.full_rebuild and self.files_changed > 0
+
+     @property
+     def is_noop(self) -> bool:
+         return not self.full_rebuild and self.files_changed == 0
+
+     def summary(self) -> str:
+         """Generate human-readable summary."""
+         if self.full_rebuild:
+             return (
+                 f"Full rebuild ({self.rebuild_reason}): "
+                 f"{self.chunks_added} chunks indexed"
+             )
+         if self.is_noop:
+             return "No changes detected"
+         parts = []
+         if self.files_added:
+             parts.append(f"{self.files_added} added")
+         if self.files_modified:
+             parts.append(f"{self.files_modified} modified")
+         if self.files_deleted:
+             parts.append(f"{self.files_deleted} deleted")
+         return (
+             f"Incremental update: {', '.join(parts)} "
+             f"(+{self.chunks_added}/-{self.chunks_deleted} chunks)"
+         )
+
+
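For orientation, a minimal sketch of how `IndexStats` reads back to callers. The counts are invented for illustration, and the `sol_mcp.embedder` import path is assumed from the wheel name, not confirmed by the package:

    from sol_mcp.embedder import IndexStats  # assumed import path

    stats = IndexStats(files_added=2, files_modified=1,
                       chunks_added=31, chunks_deleted=12)
    print(stats.is_incremental)  # True
    print(stats.summary())
    # Incremental update: 2 added, 1 modified (+31/-12 chunks)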
+ @dataclass
+ class DryRunResult:
+     """Result of a dry-run analysis."""
+
+     would_rebuild: bool = False
+     rebuild_reason: str = ""
+     files_to_add: list[str] = field(default_factory=list)
+     files_to_modify: list[str] = field(default_factory=list)
+     files_to_delete: list[str] = field(default_factory=list)
+     estimated_chunks_add: int = 0
+     estimated_chunks_delete: int = 0
+
+     def summary(self) -> str:
+         """Generate human-readable summary."""
+         if self.would_rebuild:
+             return f"Would perform full rebuild: {self.rebuild_reason}"
+
+         if not self.files_to_add and not self.files_to_modify and not self.files_to_delete:
+             return "No changes detected"
+
+         parts = []
+         if self.files_to_add:
+             parts.append(f"Add {len(self.files_to_add)} files")
+         if self.files_to_modify:
+             parts.append(f"Modify {len(self.files_to_modify)} files")
+         if self.files_to_delete:
+             parts.append(f"Delete {len(self.files_to_delete)} files")
+
+         return (
+             f"Would update: {', '.join(parts)} "
+             f"(~{self.estimated_chunks_add} add, ~{self.estimated_chunks_delete} delete)"
+         )
+
+
+ class IncrementalEmbedder:
+     """
+     Embedder with incremental indexing support.
+
+     Tracks file state via manifest and only re-embeds changed files.
+     """
+
+     def __init__(
+         self,
+         data_dir: Path,
+         model_name: str = DEFAULT_MODEL,
+         batch_size: int = 32,
+     ):
+         if not DEPS_AVAILABLE:
+             raise ImportError(
+                 "Embedding dependencies not installed. "
+                 "Run: pip install lancedb sentence-transformers"
+             )
+
+         self.data_dir = Path(data_dir)
+         self.model_name = model_name
+         self.batch_size = batch_size
+         self.db_path = self.data_dir / "lancedb"
+         self.manifest_path = self.data_dir / "manifest.json"
+         self.table_name = "solana_index"
+
+         self._model: SentenceTransformer | None = None
+         self._db: Any = None
+
+     @property
+     def model(self) -> "SentenceTransformer":
+         if self._model is None:
+             self._model = SentenceTransformer(self.model_name)
+         return self._model
+
+     @property
+     def db(self) -> Any:
+         if self._db is None:
+             self._db = lancedb.connect(str(self.db_path))
+         return self._db
+
+     def get_current_config(self) -> dict[str, Any]:
+         """Get current configuration for manifest comparison."""
+         return {
+             "embedding_model": self.model_name,
+             "chunk_config": {
+                 "chunk_size": 1000,
+                 "chunk_overlap": 200,
+             },
+         }
+
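Rebuild detection hinges on this config: `needs_full_rebuild` (from `.manifest`) compares it against what the stored manifest was built with, as the call sites below imply. A hedged illustration, assuming that comparison covers the model name (`data_dir`, `files`, and `types` are placeholders):

    emb = IncrementalEmbedder(data_dir, model_name="all-MiniLM-L6-v2")
    emb.index(files, types)  # manifest now records all-MiniLM-L6-v2

    other = IncrementalEmbedder(data_dir, model_name="all-mpnet-base-v2")
    print(other.dry_run(files, types).summary())
    # Would perform full rebuild: <reason string supplied by .manifest>

Note that `chunk_size` and `chunk_overlap` are hardcoded here; if the chunker's real settings drift from these values, the manifest comparison will not notice.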
+     def dry_run(
+         self,
+         current_files: dict[str, Path],
+         file_types: dict[str, str],
+     ) -> DryRunResult:
+         """
+         Analyze what would change without actually indexing.
+
+         Args:
+             current_files: Dict mapping relative paths to absolute Paths
+             file_types: Dict mapping relative paths to file types
+                 (unused here; kept for symmetry with index())
+
+         Returns:
+             DryRunResult describing pending changes
+         """
+         manifest = load_manifest(self.manifest_path)
+         config = self.get_current_config()
+
+         # Check if full rebuild needed
+         rebuild_needed, reason = needs_full_rebuild(manifest, config)
+         if rebuild_needed:
+             return DryRunResult(would_rebuild=True, rebuild_reason=reason)
+
+         # Compute changes
+         changes = compute_changes(manifest, current_files)
+
+         result = DryRunResult()
+         for change in changes:
+             if change.change_type == "add":
+                 result.files_to_add.append(change.path)
+                 result.estimated_chunks_add += 10  # Rough estimate
+             elif change.change_type == "modify":
+                 result.files_to_modify.append(change.path)
+                 result.estimated_chunks_add += 10
+                 result.estimated_chunks_delete += len(change.old_chunk_ids)
+             elif change.change_type == "delete":
+                 result.files_to_delete.append(change.path)
+                 result.estimated_chunks_delete += len(change.old_chunk_ids)
+
+         return result
+
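A usage sketch for `dry_run`; the paths, corpus contents, and import path are assumptions for illustration:

    from pathlib import Path
    from sol_mcp.embedder import IncrementalEmbedder  # assumed import path

    embedder = IncrementalEmbedder(data_dir=Path.home() / ".solana-mcp")
    files = {"docs/fees.md": Path("/repo/docs/fees.md")}  # hypothetical corpus
    types = {"docs/fees.md": "docs"}
    print(embedder.dry_run(files, types).summary())
    # e.g. "Would update: Add 1 files (~10 add, ~0 delete)" against an existing manifest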
+     def index(
+         self,
+         current_files: dict[str, Path],
+         file_types: dict[str, str],
+         chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None = None,
+         force_full: bool = False,
+         progress_callback: Callable[[str], None] | None = None,
+     ) -> IndexStats:
+         """
+         Index files incrementally.
+
+         Args:
+             current_files: Dict mapping relative paths to absolute Paths
+             file_types: Dict mapping relative paths to file types
+             chunk_fn: Function to chunk a single file
+             force_full: Force full rebuild
+             progress_callback: Optional progress callback
+
+         Returns:
+             IndexStats with operation details
+         """
+
+         def log(msg: str) -> None:
+             if progress_callback:
+                 progress_callback(msg)
+             else:
+                 logger.info(msg)
+
+         manifest = load_manifest(self.manifest_path)
+         config = self.get_current_config()
+
+         # Check if full rebuild needed
+         if force_full:
+             return self._full_rebuild(
+                 current_files, file_types, chunk_fn, "Forced rebuild", log
+             )
+
+         rebuild_needed, reason = needs_full_rebuild(manifest, config)
+         if rebuild_needed:
+             return self._full_rebuild(
+                 current_files, file_types, chunk_fn, reason, log
+             )
+
+         # Incremental update
+         changes = compute_changes(manifest, current_files)
+         if not changes:
+             log("No changes detected")
+             return IndexStats()
+
+         return self._incremental_update(
+             manifest, current_files, file_types, changes, chunk_fn, log
+         )
+
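And the corresponding write path, continuing the sketch above. Only the `chunk_fn` signature comes from the code; the chunker helper named here is hypothetical:

    def chunk_one(path: Path, file_type: str, data_dir: Path) -> list[Chunk]:
        from sol_mcp.chunker import chunk_file  # hypothetical helper name
        return chunk_file(path, file_type)

    stats = embedder.index(files, types, chunk_fn=chunk_one, progress_callback=print)
    print(stats.summary())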
+     def _full_rebuild(
+         self,
+         current_files: dict[str, Path],
+         file_types: dict[str, str],
+         chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None,
+         reason: str,
+         log: Callable[[str], None],
+     ) -> IndexStats:
+         """Perform a full index rebuild."""
+         log(f"Full rebuild: {reason}")
+         stats = IndexStats(full_rebuild=True, rebuild_reason=reason)
+
+         # Collect all chunks
+         all_chunks: list[Chunk] = []
+
+         if chunk_fn:
+             for rel_path, abs_path in current_files.items():
+                 file_type = file_types.get(rel_path, "docs")
+                 try:
+                     chunks = chunk_fn(abs_path, file_type, self.data_dir)
+                     all_chunks.extend(chunks)
+                 except Exception as e:
+                     stats.errors.append(f"{rel_path}: {e}")
+
+         if not all_chunks:
+             log("No chunks to index")
+             return stats
+
+         log(f"Generating embeddings for {len(all_chunks)} chunks...")
+
+         # Generate embeddings
+         embeddings = self._embed_chunks(all_chunks, log)
+
+         # Build records
+         records = self._build_records(all_chunks, embeddings)
+
+         # Drop and recreate table
+         log("Writing to LanceDB...")
+         try:
+             self.db.drop_table(self.table_name)
+         except Exception:
+             pass  # Table may not exist
+
+         self.db.create_table(self.table_name, records)
+         stats.chunks_added = len(records)
+
+         # Build new manifest
+         manifest = Manifest(
+             embedding_model=self.model_name,
+             chunk_config=self.get_current_config()["chunk_config"],
+         )
+
+         # Track files
+         for rel_path, abs_path in current_files.items():
+             file_hash = compute_file_hash(abs_path)
+             mtime = get_file_mtime_ns(abs_path)
+             chunk_ids = [
+                 c.chunk_id for c in all_chunks if c.source_file == rel_path
+             ]
+             manifest.files[rel_path] = FileEntry(
+                 sha256=file_hash,
+                 mtime_ns=mtime,
+                 chunk_ids=chunk_ids,
+             )
+
+         save_manifest(manifest, self.manifest_path)
+         log(f"Indexed {stats.chunks_added} chunks")
+
+         return stats
+
+     def _incremental_update(
+         self,
+         manifest: Manifest,
+         current_files: dict[str, Path],
+         file_types: dict[str, str],
+         changes: list[FileChange],
+         chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None,
+         log: Callable[[str], None],
+     ) -> IndexStats:
+         """Apply incremental updates."""
+         stats = IndexStats()
+
+         # Collect chunks to add and IDs to delete
+         chunks_to_add: list[Chunk] = []
+         ids_to_delete: list[str] = []
+
+         for change in changes:
+             if change.change_type == "add":
+                 stats.files_added += 1
+                 if chunk_fn and change.path in current_files:
+                     abs_path = current_files[change.path]
+                     file_type = file_types.get(change.path, "docs")
+                     try:
+                         chunks = chunk_fn(abs_path, file_type, self.data_dir)
+                         chunks_to_add.extend(chunks)
+                     except Exception as e:
+                         stats.errors.append(f"{change.path}: {e}")
+
+             elif change.change_type == "modify":
+                 stats.files_modified += 1
+                 ids_to_delete.extend(change.old_chunk_ids)
+                 if chunk_fn and change.path in current_files:
+                     abs_path = current_files[change.path]
+                     file_type = file_types.get(change.path, "docs")
+                     try:
+                         chunks = chunk_fn(abs_path, file_type, self.data_dir)
+                         chunks_to_add.extend(chunks)
+                     except Exception as e:
+                         stats.errors.append(f"{change.path}: {e}")
+
+             elif change.change_type == "delete":
+                 stats.files_deleted += 1
+                 ids_to_delete.extend(change.old_chunk_ids)
+
+         # Apply deletions
+         if ids_to_delete:
+             log(f"Deleting {len(ids_to_delete)} old chunks...")
+             self._delete_chunks(ids_to_delete)
+             stats.chunks_deleted = len(ids_to_delete)
+
+         # Apply additions
+         if chunks_to_add:
+             log(f"Adding {len(chunks_to_add)} new chunks...")
+             embeddings = self._embed_chunks(chunks_to_add, log)
+             records = self._build_records(chunks_to_add, embeddings)
+             self._add_chunks(records)
+             stats.chunks_added = len(records)
+
+         # Update manifest
+         for change in changes:
+             if change.change_type == "delete":
+                 del manifest.files[change.path]
+             elif change.change_type in ("add", "modify"):
+                 if change.path in current_files:
+                     abs_path = current_files[change.path]
+                     file_hash = compute_file_hash(abs_path)
+                     mtime = get_file_mtime_ns(abs_path)
+                     chunk_ids = [
+                         c.chunk_id
+                         for c in chunks_to_add
+                         if c.source_file == change.path
+                     ]
+                     manifest.files[change.path] = FileEntry(
+                         sha256=file_hash,
+                         mtime_ns=mtime,
+                         chunk_ids=chunk_ids,
+                     )
+
+         save_manifest(manifest, self.manifest_path)
+         log(stats.summary())
+
+         return stats
+
+     def _embed_chunks(
+         self,
+         chunks: list[Chunk],
+         log: Callable[[str], None],
+     ) -> list[list[float]]:
+         """Generate embeddings for chunks."""
+         all_embeddings: list[list[float]] = []
+
+         for i in range(0, len(chunks), self.batch_size):
+             batch = chunks[i : i + self.batch_size]
+             texts = [c.content for c in batch]
+             embeddings = self.model.encode(texts).tolist()
+             all_embeddings.extend(embeddings)
+
+             done = min(i + self.batch_size, len(chunks))
+             # Log whenever a batch crosses a multiple of 100, and at the end
+             # (a plain modulo test never fires when batch_size doesn't divide 100)
+             if done >= len(chunks) or done // 100 > i // 100:
+                 log(f"  Embedded {done}/{len(chunks)}")
+
+         return all_embeddings
+
+     def _build_records(
+         self,
+         chunks: list[Chunk],
+         embeddings: list[list[float]],
+     ) -> list[dict[str, Any]]:
+         """Build LanceDB records from chunks and embeddings."""
+         records = []
+         for chunk, embedding in zip(chunks, embeddings, strict=True):
+             records.append({
+                 "chunk_id": chunk.chunk_id,
+                 "content": chunk.content,
+                 "source_type": chunk.source_type,
+                 "source_file": chunk.source_file,
+                 "source_name": chunk.source_name,
+                 "line_number": chunk.line_number or 0,
+                 "metadata": json.dumps(chunk.metadata),
+                 "vector": embedding,
+             })
+         return records
+
+     def _delete_chunks(self, chunk_ids: list[str]) -> None:
+         """Delete chunks by ID from the index."""
+         try:
+             table = self.db.open_table(self.table_name)
+             # LanceDB delete with filter
+             for chunk_id in chunk_ids:
+                 # Sanitized in manifest.py, but be extra safe
+                 safe_id = chunk_id.replace("'", "''")
+                 table.delete(f"chunk_id = '{safe_id}'")
+         except Exception as e:
+             logger.warning("Failed to delete chunks: %s", e)
+
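One delete predicate per chunk means one table scan per ID. LanceDB's `delete()` accepts a SQL predicate string, so batching IDs into an `IN` list looks like a plausible optimization; a sketch under that assumption, not the package's own code:

    def _delete_chunks_batched(self, chunk_ids: list[str]) -> None:
        # Same quote-escaping as above; one delete call per 500 IDs.
        table = self.db.open_table(self.table_name)
        for i in range(0, len(chunk_ids), 500):
            batch = chunk_ids[i : i + 500]
            id_list = ", ".join("'" + c.replace("'", "''") + "'" for c in batch)
            table.delete(f"chunk_id IN ({id_list})")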
+     def _add_chunks(self, records: list[dict[str, Any]]) -> None:
+         """Add new chunks to the index."""
+         try:
+             table = self.db.open_table(self.table_name)
+             table.add(records)
+         except Exception as e:
+             logger.error("Failed to add chunks: %s", e)
+             raise
+
+
+ class Embedder:
+     """Generate embeddings and manage LanceDB index."""
+
+     def __init__(
+         self,
+         model_name: str = DEFAULT_MODEL,
+         data_dir: Path | None = None,
+     ):
+         if not DEPS_AVAILABLE:
+             raise ImportError(
+                 "Embedding dependencies not installed. "
+                 "Run: pip install lancedb sentence-transformers"
+             )
+
+         self.model_name = model_name
+         self.model = SentenceTransformer(model_name)
+         self.data_dir = data_dir or DEFAULT_DATA_DIR
+         self.db_path = self.data_dir / "lancedb"
+
+         # Initialize LanceDB
+         self.db = lancedb.connect(str(self.db_path))
+
+     def embed_text(self, text: str) -> list[float]:
+         """Generate embedding for a single text."""
+         return self.model.encode(text).tolist()
+
+     def embed_texts(self, texts: list[str]) -> list[list[float]]:
+         """Generate embeddings for multiple texts."""
+         return self.model.encode(texts).tolist()
+
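A quick smoke test of the embedding helpers; the 384-dimension figure is a property of all-MiniLM-L6-v2 itself, not of this package (query text invented):

    emb = Embedder()  # downloads the model on first use
    vec = emb.embed_text("rent-exempt minimum balance")
    print(len(vec))  # 384 for all-MiniLM-L6-v2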
+     def build_index(
+         self,
+         chunks: list[Chunk],
+         table_name: str = "solana_index",
+         progress_callback: Callable[[str], None] | None = None,
+     ) -> dict:
+         """
+         Build vector index from chunks.
+
+         Args:
+             chunks: List of content chunks
+             table_name: Name of the LanceDB table
+             progress_callback: Optional progress callback
+
+         Returns:
+             Statistics about the index
+         """
+
+         def log(msg: str) -> None:
+             if progress_callback:
+                 progress_callback(msg)
+             else:
+                 print(msg)
+
+         if not chunks:
+             log("No chunks to index")
+             return {"chunks_indexed": 0}
+
+         log(f"Generating embeddings for {len(chunks)} chunks...")
+
+         # Generate embeddings in batches
+         batch_size = 32
+         all_embeddings = []
+
+         for i in range(0, len(chunks), batch_size):
+             batch = chunks[i : i + batch_size]
+             texts = [c.content for c in batch]
+             embeddings = self.embed_texts(texts)
+             all_embeddings.extend(embeddings)
+
+             done = min(i + batch_size, len(chunks))
+             # Log whenever a batch crosses a multiple of 100, and at the end
+             if done >= len(chunks) or done // 100 > i // 100:
+                 log(f"  Embedded {done}/{len(chunks)} chunks")
+
+         # Build records for LanceDB
+         records = []
+         for chunk, embedding in zip(chunks, all_embeddings, strict=True):
+             record = {
+                 "chunk_id": chunk.chunk_id,
+                 "content": chunk.content,
+                 "source_type": chunk.source_type,
+                 "source_file": chunk.source_file,
+                 "source_name": chunk.source_name,
+                 "line_number": chunk.line_number or 0,
+                 "metadata": json.dumps(chunk.metadata),
+                 "vector": embedding,
+             }
+             records.append(record)
+
+         log(f"Writing {len(records)} records to LanceDB...")
+
+         # Drop the existing table if it exists
+         try:
+             self.db.drop_table(table_name)
+         except Exception as e:
+             logger.debug("Table %s does not exist or could not be dropped: %s", table_name, e)
+
+         # Create new table
+         table = self.db.create_table(table_name, records)
+
+         # Create index for faster search
+         log("Building vector index...")
+         try:
+             table.create_index(
+                 metric="cosine",
+                 num_partitions=min(256, len(records) // 10 + 1),
+                 num_sub_vectors=min(96, len(records) // 100 + 1),
+             )
+         except Exception as e:
+             log(f"  Index creation failed (will use brute force): {e}")
+
+         log("Index built successfully")
+
+         return {
+             "chunks_indexed": len(chunks),
+             "table_name": table_name,
+             "db_path": str(self.db_path),
+             "model": self.model_name,
+         }
+
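Putting it together; `my_chunks` stands in for whatever `sol_mcp.chunker` produces upstream:

    my_chunks: list[Chunk] = ...  # produced elsewhere by the chunker
    emb = Embedder()
    info = emb.build_index(my_chunks, progress_callback=print)
    print(info["chunks_indexed"], info["db_path"])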
+     def search(
+         self,
+         query: str,
+         table_name: str = "solana_index",
+         limit: int = 10,
+         source_type: str | None = None,
+     ) -> list[dict]:
+         """
+         Search the index for relevant content.
+
+         Args:
+             query: Search query
+             table_name: Name of the LanceDB table
+             limit: Maximum results to return
+             source_type: Filter by source type (rust, simd, docs)
+
+         Returns:
+             List of matching results with scores
+         """
+         try:
+             table = self.db.open_table(table_name)
+         except Exception:
+             return []
+
+         # Generate query embedding
+         query_embedding = self.embed_text(query)
+
+         # Search; over-fetch when post-filtering by source type
+         results = table.search(query_embedding).limit(limit * 2 if source_type else limit)
+
+         # Convert to list of dicts
+         matches = []
+         for row in results.to_list():
+             # Filter by source type if specified
+             if source_type and row.get("source_type") != source_type:
+                 continue
+
+             matches.append({
+                 "content": row["content"],
+                 "source_type": row["source_type"],
+                 "source_file": row["source_file"],
+                 "source_name": row["source_name"],
+                 "line_number": row["line_number"],
+                 "metadata": json.loads(row["metadata"]) if row.get("metadata") else {},
+                 "score": float(row.get("_distance", 0)),  # a distance: lower is closer
+             })
+
+             if len(matches) >= limit:
+                 break
+
+         return matches
+
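Reading results back, continuing the sketch above. Since `score` carries LanceDB's `_distance` under the cosine metric, lower values mean closer matches (query text invented):

    for hit in emb.search("how is rent calculated", source_type="docs", limit=3):
        print(f"{hit['score']:.3f}  {hit['source_file']}:{hit['line_number']}")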
+     def search_runtime(self, query: str, limit: int = 10) -> list[dict]:
+         """Search only Rust runtime code."""
+         return self.search(query, source_type="rust", limit=limit)
+
+     def search_simds(self, query: str, limit: int = 10) -> list[dict]:
+         """Search only SIMDs."""
+         return self.search(query, source_type="simd", limit=limit)
+
+
+ def build_index(
+     chunks: list[Chunk],
+     data_dir: Path | None = None,
+     model_name: str = DEFAULT_MODEL,
+     progress_callback: Callable[[str], None] | None = None,
+ ) -> dict:
+     """
+     Convenience function to build the index.
+
+     Args:
+         chunks: List of content chunks
+         data_dir: Base data directory
+         model_name: Embedding model name
+         progress_callback: Optional progress callback
+
+     Returns:
+         Statistics about the index
+     """
+     embedder = Embedder(model_name=model_name, data_dir=data_dir)
+     return embedder.build_index(chunks, progress_callback=progress_callback)
+
+
+ def search(
+     query: str,
+     data_dir: Path | None = None,
+     model_name: str = DEFAULT_MODEL,
+     limit: int = 10,
+     source_type: str | None = None,
+ ) -> list[dict]:
+     """
+     Convenience function to search the index.
+
+     Args:
+         query: Search query
+         data_dir: Base data directory
+         model_name: Embedding model name
+         limit: Maximum results
+         source_type: Filter by source type
+
+     Returns:
+         List of matching results
+     """
+     embedder = Embedder(model_name=model_name, data_dir=data_dir)
+     return embedder.search(query, limit=limit, source_type=source_type)
+
+
+ def get_index_stats(data_dir: Path | None = None) -> dict | None:
+     """Get statistics about the current index."""
+     if not DEPS_AVAILABLE:
+         return None
+
+     data_dir = data_dir or DEFAULT_DATA_DIR
+     db_path = data_dir / "lancedb"
+
+     if not db_path.exists():
+         return None
+
+     try:
+         db = lancedb.connect(str(db_path))
+         table = db.open_table("solana_index")
+
+         # Count by source type (loads the whole table; fine for modest indexes)
+         all_rows = table.to_pandas()
+         source_counts = all_rows["source_type"].value_counts().to_dict()
+
+         return {
+             "total_chunks": len(all_rows),
+             "by_source_type": source_counts,
+             "db_path": str(db_path),
+         }
+     except Exception as e:
+         return {"error": str(e)}
+
+
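The shape `get_index_stats` returns, with invented counts and path for illustration:

    stats = get_index_stats()
    # {"total_chunks": 1234,
    #  "by_source_type": {"rust": 900, "simd": 200, "docs": 134},
    #  "db_path": "/home/user/.solana-mcp/lancedb"}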
+ if __name__ == "__main__":
+     # Test search
+     import sys
+
+     if len(sys.argv) < 2:
+         print("Usage: embedder.py <query>")
+         sys.exit(1)
+
+     query = " ".join(sys.argv[1:])
+     print(f"Searching for: {query}")
+
+     results = search(query, limit=5)
+     for i, result in enumerate(results):
+         print(f"\n{i + 1}. {result['source_name']} ({result['source_type']})")
+         print(f"   File: {result['source_file']}:{result['line_number']}")
+         print(f"   Score: {result['score']:.4f}")
+         print(f"   Content: {result['content'][:200]}...")