sol_mcp-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sol_mcp-0.2.0.dist-info/METADATA +218 -0
- sol_mcp-0.2.0.dist-info/RECORD +20 -0
- sol_mcp-0.2.0.dist-info/WHEEL +4 -0
- sol_mcp-0.2.0.dist-info/entry_points.txt +3 -0
- solana_mcp/__init__.py +3 -0
- solana_mcp/cli.py +527 -0
- solana_mcp/config.py +324 -0
- solana_mcp/expert/__init__.py +5 -0
- solana_mcp/expert/guidance.py +452 -0
- solana_mcp/indexer/__init__.py +8 -0
- solana_mcp/indexer/chunker.py +457 -0
- solana_mcp/indexer/compiler.py +1101 -0
- solana_mcp/indexer/downloader.py +304 -0
- solana_mcp/indexer/embedder.py +755 -0
- solana_mcp/indexer/manifest.py +411 -0
- solana_mcp/logging.py +85 -0
- solana_mcp/models.py +62 -0
- solana_mcp/server.py +746 -0
- solana_mcp/tools/__init__.py +1 -0
- solana_mcp/versions.py +391 -0
solana_mcp/indexer/embedder.py

@@ -0,0 +1,755 @@
"""Build vector embeddings and store in LanceDB.

Creates a searchable vector index from chunked content.
Supports incremental indexing to avoid full rebuilds.
"""

import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable

from .chunker import Chunk
from .manifest import (
    FileChange,
    FileEntry,
    Manifest,
    compute_changes,
    compute_file_hash,
    get_file_mtime_ns,
    load_manifest,
    needs_full_rebuild,
    save_manifest,
)

logger = logging.getLogger(__name__)

# Try to import embedding dependencies
try:
    import lancedb
    from sentence_transformers import SentenceTransformer

    DEPS_AVAILABLE = True
except ImportError:
    DEPS_AVAILABLE = False


# Default embedding model
DEFAULT_MODEL = "all-MiniLM-L6-v2"

# Default data directory
DEFAULT_DATA_DIR = Path.home() / ".solana-mcp"

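# Editor's note -- illustrative sketch, not part of the published file.
# The DEPS_AVAILABLE flag makes the heavy dependencies optional: callers
# can probe it before constructing an embedder instead of catching the
# ImportError raised in __init__, e.g.:
#
#     from solana_mcp.indexer import embedder
#
#     if not embedder.DEPS_AVAILABLE:
#         print("Install extras first: pip install lancedb sentence-transformers")
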
@dataclass
class IndexStats:
    """Statistics from an indexing operation."""

    full_rebuild: bool = False
    rebuild_reason: str = ""
    files_added: int = 0
    files_modified: int = 0
    files_deleted: int = 0
    chunks_added: int = 0
    chunks_deleted: int = 0
    errors: list[str] = field(default_factory=list)

    @property
    def files_changed(self) -> int:
        return self.files_added + self.files_modified + self.files_deleted

    @property
    def is_incremental(self) -> bool:
        return not self.full_rebuild and self.files_changed > 0

    @property
    def is_noop(self) -> bool:
        return not self.full_rebuild and self.files_changed == 0

    def summary(self) -> str:
        """Generate human-readable summary."""
        if self.full_rebuild:
            return (
                f"Full rebuild ({self.rebuild_reason}): "
                f"{self.chunks_added} chunks indexed"
            )
        if self.is_noop:
            return "No changes detected"
        parts = []
        if self.files_added:
            parts.append(f"{self.files_added} added")
        if self.files_modified:
            parts.append(f"{self.files_modified} modified")
        if self.files_deleted:
            parts.append(f"{self.files_deleted} deleted")
        return (
            f"Incremental update: {', '.join(parts)} "
            f"(+{self.chunks_added}/-{self.chunks_deleted} chunks)"
        )

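# Editor's note -- illustrative sketch, not part of the published file.
# Tracing summary() through a sample stats object shows the format:
#
#     stats = IndexStats(files_added=2, files_modified=1,
#                        chunks_added=30, chunks_deleted=12)
#     stats.summary()
#     # -> "Incremental update: 2 added, 1 modified (+30/-12 chunks)"
#
# DryRunResult.summary() below follows the same pattern with
# "Would update: ..." phrasing.
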
@dataclass
class DryRunResult:
    """Result of a dry-run analysis."""

    would_rebuild: bool = False
    rebuild_reason: str = ""
    files_to_add: list[str] = field(default_factory=list)
    files_to_modify: list[str] = field(default_factory=list)
    files_to_delete: list[str] = field(default_factory=list)
    estimated_chunks_add: int = 0
    estimated_chunks_delete: int = 0

    def summary(self) -> str:
        """Generate human-readable summary."""
        if self.would_rebuild:
            return f"Would perform full rebuild: {self.rebuild_reason}"

        if not self.files_to_add and not self.files_to_modify and not self.files_to_delete:
            return "No changes detected"

        parts = []
        if self.files_to_add:
            parts.append(f"Add {len(self.files_to_add)} files")
        if self.files_to_modify:
            parts.append(f"Modify {len(self.files_to_modify)} files")
        if self.files_to_delete:
            parts.append(f"Delete {len(self.files_to_delete)} files")

        return (
            f"Would update: {', '.join(parts)} "
            f"(~{self.estimated_chunks_add} add, ~{self.estimated_chunks_delete} delete)"
        )

class IncrementalEmbedder:
    """
    Embedder with incremental indexing support.

    Tracks file state via manifest and only re-embeds changed files.
    """

    def __init__(
        self,
        data_dir: Path,
        model_name: str = DEFAULT_MODEL,
        batch_size: int = 32,
    ):
        if not DEPS_AVAILABLE:
            raise ImportError(
                "Embedding dependencies not installed. "
                "Run: pip install lancedb sentence-transformers"
            )

        self.data_dir = Path(data_dir)
        self.model_name = model_name
        self.batch_size = batch_size
        self.db_path = self.data_dir / "lancedb"
        self.manifest_path = self.data_dir / "manifest.json"
        self.table_name = "solana_index"

        self._model: SentenceTransformer | None = None
        self._db: Any = None

    @property
    def model(self) -> "SentenceTransformer":
        if self._model is None:
            self._model = SentenceTransformer(self.model_name)
        return self._model

    @property
    def db(self) -> Any:
        if self._db is None:
            self._db = lancedb.connect(str(self.db_path))
        return self._db

    def get_current_config(self) -> dict[str, Any]:
        """Get current configuration for manifest comparison."""
        return {
            "embedding_model": self.model_name,
            "chunk_config": {
                "chunk_size": 1000,
                "chunk_overlap": 200,
            },
        }

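    # Editor's note -- illustrative sketch, not part of the published file.
    # get_current_config() feeds needs_full_rebuild(): if the stored
    # manifest was built with a different model or chunking parameters,
    # every stored vector is stale and the whole index must be rebuilt.
    # Roughly:
    #
    #     manifest = load_manifest(path)   # built with "all-MiniLM-L6-v2"
    #     rebuild, reason = needs_full_rebuild(
    #         manifest, {"embedding_model": "all-mpnet-base-v2", ...}
    #     )
    #     # rebuild is True; reason explains the mismatch
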
    def dry_run(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
    ) -> DryRunResult:
        """
        Analyze what would change without actually indexing.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types

        Returns:
            DryRunResult describing pending changes
        """
        manifest = load_manifest(self.manifest_path)
        config = self.get_current_config()

        # Check if full rebuild needed
        rebuild_needed, reason = needs_full_rebuild(manifest, config)
        if rebuild_needed:
            return DryRunResult(would_rebuild=True, rebuild_reason=reason)

        # Compute changes
        changes = compute_changes(manifest, current_files)

        result = DryRunResult()
        for change in changes:
            if change.change_type == "add":
                result.files_to_add.append(change.path)
                result.estimated_chunks_add += 10  # Rough estimate
            elif change.change_type == "modify":
                result.files_to_modify.append(change.path)
                result.estimated_chunks_add += 10
                result.estimated_chunks_delete += len(change.old_chunk_ids)
            elif change.change_type == "delete":
                result.files_to_delete.append(change.path)
                result.estimated_chunks_delete += len(change.old_chunk_ids)

        return result

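    # Editor's note -- illustrative sketch, not part of the published file.
    # Typical dry-run usage (paths and types here are made up):
    #
    #     emb = IncrementalEmbedder(Path.home() / ".solana-mcp")
    #     files = {"docs/fees.md": Path("/repo/docs/fees.md")}
    #     result = emb.dry_run(files, {"docs/fees.md": "docs"})
    #     print(result.summary())  # e.g. "Would update: Modify 1 files ..."
    #
    # Note the asymmetry in the estimates: additions are guessed at ~10
    # chunks per file, while deletions are exact because the old chunk IDs
    # come from the manifest.
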
    def index(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None = None,
        force_full: bool = False,
        progress_callback: Callable[[str], None] | None = None,
    ) -> IndexStats:
        """
        Index files incrementally.

        Args:
            current_files: Dict mapping relative paths to absolute Paths
            file_types: Dict mapping relative paths to file types
            chunk_fn: Function to chunk a single file
            force_full: Force full rebuild
            progress_callback: Optional progress callback

        Returns:
            IndexStats with operation details
        """

        def log(msg: str) -> None:
            if progress_callback:
                progress_callback(msg)
            else:
                logger.info(msg)

        manifest = load_manifest(self.manifest_path)
        config = self.get_current_config()

        # Check if full rebuild needed
        if force_full:
            return self._full_rebuild(
                current_files, file_types, chunk_fn, "Forced rebuild", log
            )

        rebuild_needed, reason = needs_full_rebuild(manifest, config)
        if rebuild_needed:
            return self._full_rebuild(
                current_files, file_types, chunk_fn, reason, log
            )

        # Incremental update
        changes = compute_changes(manifest, current_files)
        if not changes:
            log("No changes detected")
            return IndexStats()

        return self._incremental_update(
            manifest, current_files, file_types, changes, chunk_fn, log
        )

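    # Editor's note -- illustrative sketch, not part of the published file.
    # index() picks one of three paths: full rebuild (forced, or stale
    # config), no-op (empty change set), or incremental update. A caller
    # wires it up roughly like this (chunk_file is hypothetical; the real
    # chunking logic lives in .chunker):
    #
    #     def chunk_file(path: Path, file_type: str, data_dir: Path) -> list[Chunk]:
    #         ...
    #
    #     stats = emb.index(files, types, chunk_fn=chunk_file,
    #                       progress_callback=print)
    #     print(stats.summary())
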
    def _full_rebuild(
        self,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None,
        reason: str,
        log: Callable[[str], None],
    ) -> IndexStats:
        """Perform a full index rebuild."""
        log(f"Full rebuild: {reason}")
        stats = IndexStats(full_rebuild=True, rebuild_reason=reason)

        # Collect all chunks
        all_chunks: list[Chunk] = []

        if chunk_fn:
            for rel_path, abs_path in current_files.items():
                file_type = file_types.get(rel_path, "docs")
                try:
                    chunks = chunk_fn(abs_path, file_type, self.data_dir)
                    all_chunks.extend(chunks)
                except Exception as e:
                    stats.errors.append(f"{rel_path}: {e}")

        if not all_chunks:
            log("No chunks to index")
            return stats

        log(f"Generating embeddings for {len(all_chunks)} chunks...")

        # Generate embeddings
        embeddings = self._embed_chunks(all_chunks, log)

        # Build records
        records = self._build_records(all_chunks, embeddings)

        # Drop and recreate table
        log("Writing to LanceDB...")
        try:
            self.db.drop_table(self.table_name)
        except Exception:
            pass  # Table may not exist

        self.db.create_table(self.table_name, records)
        stats.chunks_added = len(records)

        # Build new manifest
        manifest = Manifest(
            embedding_model=self.model_name,
            chunk_config=self.get_current_config()["chunk_config"],
        )

        # Track files
        for rel_path, abs_path in current_files.items():
            file_hash = compute_file_hash(abs_path)
            mtime = get_file_mtime_ns(abs_path)
            chunk_ids = [
                c.chunk_id for c in all_chunks if c.source_file == rel_path
            ]
            manifest.files[rel_path] = FileEntry(
                sha256=file_hash,
                mtime_ns=mtime,
                chunk_ids=chunk_ids,
            )

        save_manifest(manifest, self.manifest_path)
        log(f"Indexed {stats.chunks_added} chunks")

        return stats

    def _incremental_update(
        self,
        manifest: Manifest,
        current_files: dict[str, Path],
        file_types: dict[str, str],
        changes: list[FileChange],
        chunk_fn: Callable[[Path, str, Path], list[Chunk]] | None,
        log: Callable[[str], None],
    ) -> IndexStats:
        """Apply incremental updates."""
        stats = IndexStats()

        # Collect chunks to add and IDs to delete
        chunks_to_add: list[Chunk] = []
        ids_to_delete: list[str] = []

        for change in changes:
            if change.change_type == "add":
                stats.files_added += 1
                if chunk_fn and change.path in current_files:
                    abs_path = current_files[change.path]
                    file_type = file_types.get(change.path, "docs")
                    try:
                        chunks = chunk_fn(abs_path, file_type, self.data_dir)
                        chunks_to_add.extend(chunks)
                    except Exception as e:
                        stats.errors.append(f"{change.path}: {e}")

            elif change.change_type == "modify":
                stats.files_modified += 1
                ids_to_delete.extend(change.old_chunk_ids)
                if chunk_fn and change.path in current_files:
                    abs_path = current_files[change.path]
                    file_type = file_types.get(change.path, "docs")
                    try:
                        chunks = chunk_fn(abs_path, file_type, self.data_dir)
                        chunks_to_add.extend(chunks)
                    except Exception as e:
                        stats.errors.append(f"{change.path}: {e}")

            elif change.change_type == "delete":
                stats.files_deleted += 1
                ids_to_delete.extend(change.old_chunk_ids)

        # Apply deletions
        if ids_to_delete:
            log(f"Deleting {len(ids_to_delete)} old chunks...")
            self._delete_chunks(ids_to_delete)
            stats.chunks_deleted = len(ids_to_delete)

        # Apply additions
        if chunks_to_add:
            log(f"Adding {len(chunks_to_add)} new chunks...")
            embeddings = self._embed_chunks(chunks_to_add, log)
            records = self._build_records(chunks_to_add, embeddings)
            self._add_chunks(records)
            stats.chunks_added = len(records)

        # Update manifest
        for change in changes:
            if change.change_type == "delete":
                del manifest.files[change.path]
            elif change.change_type in ("add", "modify"):
                if change.path in current_files:
                    abs_path = current_files[change.path]
                    file_hash = compute_file_hash(abs_path)
                    mtime = get_file_mtime_ns(abs_path)
                    chunk_ids = [
                        c.chunk_id
                        for c in chunks_to_add
                        if c.source_file == change.path
                    ]
                    manifest.files[change.path] = FileEntry(
                        sha256=file_hash,
                        mtime_ns=mtime,
                        chunk_ids=chunk_ids,
                    )

        save_manifest(manifest, self.manifest_path)
        log(stats.summary())

        return stats

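    # Editor's note -- illustrative observation, not part of the published
    # file. Deletions are applied before additions so a modified file's
    # old chunks never coexist with its new ones, even if chunk IDs are
    # reused. The manifest is saved last; a crash mid-update therefore
    # leaves stale manifest entries that the next run detects as changes
    # and re-applies.
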
    def _embed_chunks(
        self,
        chunks: list[Chunk],
        log: Callable[[str], None],
    ) -> list[list[float]]:
        """Generate embeddings for chunks."""
        all_embeddings: list[list[float]] = []

        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i : i + self.batch_size]
            texts = [c.content for c in batch]
            embeddings = self.model.encode(texts).tolist()
            all_embeddings.extend(embeddings)

            if (i + self.batch_size) % 100 == 0 or i + self.batch_size >= len(chunks):
                log(f"  Embedded {min(i + self.batch_size, len(chunks))}/{len(chunks)}")

        return all_embeddings

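    # Editor's note -- illustrative sketch, not part of the published file.
    # The manual loop exists mainly for progress logging;
    # sentence-transformers batches internally, so an equivalent one-liner
    # using encode()'s standard parameters would be:
    #
    #     self.model.encode(texts, batch_size=self.batch_size,
    #                       show_progress_bar=True).tolist()
    #
    # Also note the progress condition: with batch_size=32,
    # (i + 32) % 100 == 0 fires only rarely (first at 800), so most runs
    # log just the final "Embedded N/N" line.
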
    def _build_records(
        self,
        chunks: list[Chunk],
        embeddings: list[list[float]],
    ) -> list[dict[str, Any]]:
        """Build LanceDB records from chunks and embeddings."""
        records = []
        for chunk, embedding in zip(chunks, embeddings, strict=True):
            records.append({
                "chunk_id": chunk.chunk_id,
                "content": chunk.content,
                "source_type": chunk.source_type,
                "source_file": chunk.source_file,
                "source_name": chunk.source_name,
                "line_number": chunk.line_number or 0,
                "metadata": json.dumps(chunk.metadata),
                "vector": embedding,
            })
        return records

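    # Editor's note -- not part of the published file. LanceDB infers the
    # table schema from these dicts; the column named "vector" matters
    # because table.search() looks for a column of that name by default.
    # Nested chunk metadata is serialized to a JSON string so the schema
    # stays flat.
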
    def _delete_chunks(self, chunk_ids: list[str]) -> None:
        """Delete chunks by ID from the index."""
        try:
            table = self.db.open_table(self.table_name)
            # LanceDB delete with filter
            for chunk_id in chunk_ids:
                # Sanitized in manifest.py, but be extra safe
                safe_id = chunk_id.replace("'", "''")
                table.delete(f"chunk_id = '{safe_id}'")
        except Exception as e:
            logger.warning("Failed to delete chunks: %s", e)

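    # Editor's note -- illustrative sketch, not part of the published
    # file. table.delete() takes a SQL-style predicate, so the loop above
    # issues one delete per chunk. If that ever becomes a bottleneck, a
    # single batched predicate should work:
    #
    #     quoted = ", ".join("'" + cid.replace("'", "''") + "'"
    #                        for cid in chunk_ids)
    #     table.delete(f"chunk_id IN ({quoted})")
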
    def _add_chunks(self, records: list[dict[str, Any]]) -> None:
        """Add new chunks to the index."""
        try:
            table = self.db.open_table(self.table_name)
            table.add(records)
        except Exception as e:
            logger.error("Failed to add chunks: %s", e)
            raise


class Embedder:
    """Generate embeddings and manage LanceDB index."""

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        data_dir: Path | None = None,
    ):
        if not DEPS_AVAILABLE:
            raise ImportError(
                "Embedding dependencies not installed. "
                "Run: pip install lancedb sentence-transformers"
            )

        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.data_dir = data_dir or DEFAULT_DATA_DIR
        self.db_path = self.data_dir / "lancedb"

        # Initialize LanceDB
        self.db = lancedb.connect(str(self.db_path))

    def embed_text(self, text: str) -> list[float]:
        """Generate embedding for a single text."""
        return self.model.encode(text).tolist()

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts."""
        return self.model.encode(texts).tolist()

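    # Editor's note -- not part of the published file. Unlike
    # IncrementalEmbedder, which lazy-loads the model and DB connection,
    # Embedder pays both costs eagerly in __init__. With the default
    # all-MiniLM-L6-v2 model, embed_text() returns a 384-dimensional
    # vector:
    #
    #     emb = Embedder()
    #     len(emb.embed_text("rent-exempt minimum"))  # -> 384
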
    def build_index(
        self,
        chunks: list[Chunk],
        table_name: str = "solana_index",
        progress_callback: Callable[[str], None] | None = None,
    ) -> dict:
        """
        Build vector index from chunks.

        Args:
            chunks: List of content chunks
            table_name: Name of the LanceDB table
            progress_callback: Optional progress callback

        Returns:
            Statistics about the index
        """

        def log(msg: str):
            if progress_callback:
                progress_callback(msg)
            else:
                print(msg)

        if not chunks:
            log("No chunks to index")
            return {"chunks_indexed": 0}

        log(f"Generating embeddings for {len(chunks)} chunks...")

        # Generate embeddings in batches
        batch_size = 32
        all_embeddings = []

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i : i + batch_size]
            texts = [c.content for c in batch]
            embeddings = self.embed_texts(texts)
            all_embeddings.extend(embeddings)

            if (i + batch_size) % 100 == 0 or i + batch_size >= len(chunks):
                log(f"  Embedded {min(i + batch_size, len(chunks))}/{len(chunks)} chunks")

        # Build records for LanceDB
        records = []
        for chunk, embedding in zip(chunks, all_embeddings, strict=True):
            record = {
                "chunk_id": chunk.chunk_id,
                "content": chunk.content,
                "source_type": chunk.source_type,
                "source_file": chunk.source_file,
                "source_name": chunk.source_name,
                "line_number": chunk.line_number or 0,
                "metadata": json.dumps(chunk.metadata),
                "vector": embedding,
            }
            records.append(record)

        log(f"Writing {len(records)} records to LanceDB...")

        # Drop existing table if exists
        try:
            self.db.drop_table(table_name)
        except Exception as e:
            logger.debug("Table %s does not exist or could not be dropped: %s", table_name, e)

        # Create new table
        table = self.db.create_table(table_name, records)

        # Create index for faster search
        log("Building vector index...")
        try:
            table.create_index(
                metric="cosine",
                num_partitions=min(256, len(records) // 10 + 1),
                num_sub_vectors=min(96, len(records) // 100 + 1),
            )
        except Exception as e:
            log(f"  Index creation failed (will use brute force): {e}")

        log("Index built successfully")

        return {
            "chunks_indexed": len(chunks),
            "table_name": table_name,
            "db_path": str(self.db_path),
            "model": self.model_name,
        }

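    # Editor's note -- not part of the published file. create_index()
    # builds an approximate (IVF-style) index whose training can fail on
    # very small tables, which is why failure falls back to exact
    # brute-force scans -- perfectly adequate at small corpus sizes. The
    # partition and sub-vector counts are scaled down with the record
    # count for the same reason.
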
    def search(
        self,
        query: str,
        table_name: str = "solana_index",
        limit: int = 10,
        source_type: str | None = None,
    ) -> list[dict]:
        """
        Search the index for relevant content.

        Args:
            query: Search query
            table_name: Name of the LanceDB table
            limit: Maximum results to return
            source_type: Filter by source type (rust, simd, docs)

        Returns:
            List of matching results with scores
        """
        try:
            table = self.db.open_table(table_name)
        except Exception:
            return []

        # Generate query embedding
        query_embedding = self.embed_text(query)

        # Search
        results = table.search(query_embedding).limit(limit * 2 if source_type else limit)

        # Convert to list of dicts
        matches = []
        for row in results.to_list():
            # Filter by source type if specified
            if source_type and row.get("source_type") != source_type:
                continue

            matches.append({
                "content": row["content"],
                "source_type": row["source_type"],
                "source_file": row["source_file"],
                "source_name": row["source_name"],
                "line_number": row["line_number"],
                "metadata": json.loads(row["metadata"]) if row.get("metadata") else {},
                "score": float(row.get("_distance", 0)),
            })

            if len(matches) >= limit:
                break

        return matches

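    # Editor's note -- illustrative sketch, not part of the published
    # file. The source_type filter is applied client-side after fetching
    # 2x the limit, so a heavily filtered query can still return fewer
    # than `limit` rows. LanceDB can push the predicate into the query
    # instead (assuming the standard query-builder API):
    #
    #     table.search(query_embedding) \
    #          .where(f"source_type = '{source_type}'") \
    #          .limit(limit)
    #
    # Also note "score" is the raw _distance, so lower means closer.
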
    def search_runtime(self, query: str, limit: int = 10) -> list[dict]:
        """Search only Rust runtime code."""
        return self.search(query, source_type="rust", limit=limit)

    def search_simds(self, query: str, limit: int = 10) -> list[dict]:
        """Search only SIMDs."""
        return self.search(query, source_type="simd", limit=limit)


def build_index(
    chunks: list[Chunk],
    data_dir: Path | None = None,
    model_name: str = DEFAULT_MODEL,
    progress_callback: Callable[[str], None] | None = None,
) -> dict:
    """
    Convenience function to build the index.

    Args:
        chunks: List of content chunks
        data_dir: Base data directory
        model_name: Embedding model name
        progress_callback: Optional progress callback

    Returns:
        Statistics about the index
    """
    embedder = Embedder(model_name=model_name, data_dir=data_dir)
    return embedder.build_index(chunks, progress_callback=progress_callback)


def search(
    query: str,
    data_dir: Path | None = None,
    model_name: str = DEFAULT_MODEL,
    limit: int = 10,
    source_type: str | None = None,
) -> list[dict]:
    """
    Convenience function to search the index.

    Args:
        query: Search query
        data_dir: Base data directory
        model_name: Embedding model name
        limit: Maximum results
        source_type: Filter by source type

    Returns:
        List of matching results
    """
    embedder = Embedder(model_name=model_name, data_dir=data_dir)
    return embedder.search(query, limit=limit, source_type=source_type)

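# Editor's note -- not part of the published file. Each call to these
# convenience wrappers constructs a fresh Embedder, which reloads the
# SentenceTransformer model from disk. That is fine for one-off CLI use,
# but code issuing repeated queries should create one Embedder and call
# its methods directly.
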
def get_index_stats(data_dir: Path | None = None) -> dict | None:
    """Get statistics about the current index."""
    if not DEPS_AVAILABLE:
        return None

    data_dir = data_dir or DEFAULT_DATA_DIR
    db_path = data_dir / "lancedb"

    if not db_path.exists():
        return None

    try:
        db = lancedb.connect(str(db_path))
        table = db.open_table("solana_index")

        # Count by source type
        all_rows = table.to_pandas()
        source_counts = all_rows["source_type"].value_counts().to_dict()

        return {
            "total_chunks": len(all_rows),
            "by_source_type": source_counts,
            "db_path": str(db_path),
        }
    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    # Test search
    import sys

    if len(sys.argv) < 2:
        print("Usage: embedder.py <query>")
        sys.exit(1)

    query = " ".join(sys.argv[1:])
    print(f"Searching for: {query}")

    results = search(query, limit=5)
    for i, result in enumerate(results):
        print(f"\n{i + 1}. {result['source_name']} ({result['source_type']})")
        print(f"  File: {result['source_file']}:{result['line_number']}")
        print(f"  Score: {result['score']:.4f}")
        print(f"  Content: {result['content'][:200]}...")
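
# Editor's note -- not part of the published file. Because this module
# uses relative imports (from .chunker import Chunk), the script entry
# point above only works when run as a module from an environment where
# the package is installed:
#
#     python -m solana_mcp.indexer.embedder "compute budget"
#
# Running the file directly by path would fail with an ImportError.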