sol-mcp 0.2.0 (sol_mcp-0.2.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sol_mcp-0.2.0.dist-info/METADATA +218 -0
- sol_mcp-0.2.0.dist-info/RECORD +20 -0
- sol_mcp-0.2.0.dist-info/WHEEL +4 -0
- sol_mcp-0.2.0.dist-info/entry_points.txt +3 -0
- solana_mcp/__init__.py +3 -0
- solana_mcp/cli.py +527 -0
- solana_mcp/config.py +324 -0
- solana_mcp/expert/__init__.py +5 -0
- solana_mcp/expert/guidance.py +452 -0
- solana_mcp/indexer/__init__.py +8 -0
- solana_mcp/indexer/chunker.py +457 -0
- solana_mcp/indexer/compiler.py +1101 -0
- solana_mcp/indexer/downloader.py +304 -0
- solana_mcp/indexer/embedder.py +755 -0
- solana_mcp/indexer/manifest.py +411 -0
- solana_mcp/logging.py +85 -0
- solana_mcp/models.py +62 -0
- solana_mcp/server.py +746 -0
- solana_mcp/tools/__init__.py +1 -0
- solana_mcp/versions.py +391 -0
solana_mcp/indexer/chunker.py
@@ -0,0 +1,457 @@
"""Chunk content for embedding.

Handles:
- Rust code (functions, structs, enums)
- SIMD markdown documents
- Documentation markdown

Supports incremental indexing via deterministic chunk IDs.
"""

import re
from dataclasses import dataclass, replace
from pathlib import Path

from .compiler import ExtractedConstant, ExtractedItem
from .manifest import generate_chunk_id


@dataclass
class Chunk:
    """A chunk of content ready for embedding."""

    content: str
    source_type: str  # "rust", "simd", "docs"
    source_file: str
    source_name: str  # function name, SIMD number, doc title
    line_number: int | None
    metadata: dict
    chunk_id: str = ""  # Unique ID for incremental indexing


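# Illustrative sketch, not part of the published module: a hand-built Chunk
# with hypothetical values, showing the shape the embedder consumes.
def _example_chunk() -> Chunk:
    return Chunk(
        content="pub fn slot(&self) -> Slot { self.slot }",  # hypothetical Rust body
        source_type="rust",
        source_file="sdk/src/bank.rs",  # hypothetical path
        source_name="slot",
        line_number=120,
        metadata={"kind": "function", "repo": "agave"},
    )

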
def chunk_rust_item(item: ExtractedItem, repo_name: str = "agave") -> Chunk:
    """Convert an extracted Rust item into a chunk."""
    # Build content with context
    parts = []

    # Add doc comment if present
    if item.doc_comment:
        parts.append(f"/// {item.doc_comment}")

    # Add attributes
    for attr in item.attributes:
        parts.append(attr)

    # Add the code
    parts.append(item.body)

    content = "\n".join(parts)

    return Chunk(
        content=content,
        source_type="rust",
        source_file=item.file_path,
        source_name=item.name,
        line_number=item.line_number,
        metadata={
            "kind": item.kind,
            "signature": item.signature,
            "visibility": item.visibility,
            "repo": repo_name,
        },
    )


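# Hypothetical helper, not in the package: render an ExtractedItem through
# chunk_rust_item and print the assembled doc comment + attributes + body,
# which is exactly the string that gets embedded.
def _preview_rust_item(item: ExtractedItem, repo_name: str = "agave") -> None:
    chunk = chunk_rust_item(item, repo_name)
    print(f"[{chunk.metadata['kind']}] {chunk.source_file}:{chunk.line_number}")
    print(chunk.content)

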
def chunk_rust_constant(const: ExtractedConstant, repo_name: str = "agave") -> Chunk:
    """Convert an extracted constant into a chunk."""
    parts = []

    if const.doc_comment:
        parts.append(f"/// {const.doc_comment}")

    type_str = f": {const.type_annotation}" if const.type_annotation else ""
    parts.append(f"const {const.name}{type_str} = {const.value};")

    content = "\n".join(parts)

    return Chunk(
        content=content,
        source_type="rust",
        source_file=const.file_path,
        source_name=const.name,
        line_number=const.line_number,
        metadata={
            "kind": "constant",
            "value": const.value,
            "type": const.type_annotation,
            "repo": repo_name,
        },
    )


def chunk_simd(file_path: Path, simd_dir: Path) -> list[Chunk]:
    """
    Chunk a SIMD markdown file into sections.

    SIMDs have a standard structure:
    - Title and metadata
    - Abstract
    - Motivation
    - Specification (often the longest)
    - Security Considerations
    - Backwards Compatibility
    """
    try:
        content = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    chunks = []
    relative_path = str(file_path.relative_to(simd_dir))

    # Extract SIMD number from filename (e.g., "0326-alpenglow.md" -> "SIMD-0326")
    simd_match = re.match(r"(\d+)-(.+)\.md", file_path.name)
    if simd_match:
        simd_number = f"SIMD-{simd_match.group(1)}"
        simd_name = simd_match.group(2).replace("-", " ").title()
    else:
        simd_number = file_path.stem
        simd_name = file_path.stem

    # Split by headers
    sections = re.split(r"^(#{1,3}\s+.+)$", content, flags=re.MULTILINE)

    current_header = f"# {simd_number}: {simd_name}"
    current_content = []
    current_line = 1

    for section in sections:
        if re.match(r"^#{1,3}\s+", section):
            # This is a header
            if current_content:
                # Save previous section
                section_text = "\n".join(current_content).strip()
                if section_text and len(section_text) > 50:  # Skip tiny sections
                    chunks.append(
                        Chunk(
                            content=f"{current_header}\n\n{section_text}",
                            source_type="simd",
                            source_file=relative_path,
                            source_name=f"{simd_number} - {current_header.lstrip('#').strip()}",
                            line_number=current_line,
                            metadata={
                                "simd_number": simd_number,
                                "section": current_header.lstrip("#").strip(),
                            },
                        )
                    )
            current_header = section.strip()
            current_content = []
            current_line = content[: content.find(section)].count("\n") + 1
        else:
            current_content.append(section)

    # Don't forget the last section
    if current_content:
        section_text = "\n".join(current_content).strip()
        if section_text and len(section_text) > 50:
            chunks.append(
                Chunk(
                    content=f"{current_header}\n\n{section_text}",
                    source_type="simd",
                    source_file=relative_path,
                    source_name=f"{simd_number} - {current_header.lstrip('#').strip()}",
                    line_number=current_line,
                    metadata={
                        "simd_number": simd_number,
                        "section": current_header.lstrip("#").strip(),
                    },
                )
            )

    return chunks


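# Minimal sketch, not in the package: list the section chunks chunk_simd
# produces for one proposal. For a file named "0326-alpenglow.md" the
# source_name values look like "SIMD-0326 - Abstract", one per markdown header.
def _list_simd_sections(file_path: Path, simd_dir: Path) -> None:
    for chunk in chunk_simd(file_path, simd_dir):
        print(f"{chunk.source_name} (line {chunk.line_number})")

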
def chunk_markdown(file_path: Path, base_dir: Path, source_type: str = "docs") -> list[Chunk]:
    """
    Chunk a generic markdown file into sections.
    """
    try:
        content = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    chunks = []
    relative_path = str(file_path.relative_to(base_dir))

    # Split by headers
    sections = re.split(r"^(#{1,3}\s+.+)$", content, flags=re.MULTILINE)

    current_header = f"# {file_path.stem}"
    current_content = []
    current_line = 1

    for section in sections:
        if re.match(r"^#{1,3}\s+", section):
            if current_content:
                section_text = "\n".join(current_content).strip()
                if section_text and len(section_text) > 50:
                    chunks.append(
                        Chunk(
                            content=f"{current_header}\n\n{section_text}",
                            source_type=source_type,
                            source_file=relative_path,
                            source_name=current_header.lstrip("#").strip(),
                            line_number=current_line,
                            metadata={
                                "section": current_header.lstrip("#").strip(),
                            },
                        )
                    )
            current_header = section.strip()
            current_content = []
            current_line = content[: content.find(section)].count("\n") + 1
        else:
            current_content.append(section)

    if current_content:
        section_text = "\n".join(current_content).strip()
        if section_text and len(section_text) > 50:
            chunks.append(
                Chunk(
                    content=f"{current_header}\n\n{section_text}",
                    source_type=source_type,
                    source_file=relative_path,
                    source_name=current_header.lstrip("#").strip(),
                    line_number=current_line,
                    metadata={
                        "section": current_header.lstrip("#").strip(),
                    },
                )
            )

    return chunks


def chunk_all_simds(simd_dir: Path) -> list[Chunk]:
    """Chunk all SIMDs in a directory."""
    proposals_dir = simd_dir / "proposals"
    if not proposals_dir.exists():
        return []

    chunks = []
    for md_file in proposals_dir.glob("*.md"):
        chunks.extend(chunk_simd(md_file, simd_dir))

    return chunks


def chunk_rust_items(
    items: list[ExtractedItem],
    constants: list[ExtractedConstant],
    repo_name: str = "agave",
) -> list[Chunk]:
    """Chunk all Rust items and constants."""
    chunks = []

    for item in items:
        chunks.append(chunk_rust_item(item, repo_name))

    for const in constants:
        chunks.append(chunk_rust_constant(const, repo_name))

    return chunks


def estimate_tokens(text: str) -> int:
    """Rough estimate of token count (approx 4 chars per token)."""
    return len(text) // 4


|
+
def split_large_chunk(chunk: Chunk, max_tokens: int = 1000) -> list[Chunk]:
|
|
272
|
+
"""Split a chunk that's too large into smaller pieces."""
|
|
273
|
+
if estimate_tokens(chunk.content) <= max_tokens:
|
|
274
|
+
return [chunk]
|
|
275
|
+
|
|
276
|
+
# Split by lines, trying to keep logical groups
|
|
277
|
+
lines = chunk.content.split("\n")
|
|
278
|
+
sub_chunks = []
|
|
279
|
+
current_lines = []
|
|
280
|
+
current_tokens = 0
|
|
281
|
+
|
|
282
|
+
for line in lines:
|
|
283
|
+
line_tokens = estimate_tokens(line)
|
|
284
|
+
if current_tokens + line_tokens > max_tokens and current_lines:
|
|
285
|
+
# Save current chunk
|
|
286
|
+
sub_content = "\n".join(current_lines)
|
|
287
|
+
sub_chunks.append(
|
|
288
|
+
Chunk(
|
|
289
|
+
content=sub_content,
|
|
290
|
+
source_type=chunk.source_type,
|
|
291
|
+
source_file=chunk.source_file,
|
|
292
|
+
source_name=f"{chunk.source_name} (part {len(sub_chunks) + 1})",
|
|
293
|
+
line_number=chunk.line_number,
|
|
294
|
+
metadata={**chunk.metadata, "part": len(sub_chunks) + 1},
|
|
295
|
+
)
|
|
296
|
+
)
|
|
297
|
+
current_lines = [line]
|
|
298
|
+
current_tokens = line_tokens
|
|
299
|
+
else:
|
|
300
|
+
current_lines.append(line)
|
|
301
|
+
current_tokens += line_tokens
|
|
302
|
+
|
|
303
|
+
# Don't forget the last part
|
|
304
|
+
if current_lines:
|
|
305
|
+
sub_content = "\n".join(current_lines)
|
|
306
|
+
sub_chunks.append(
|
|
307
|
+
Chunk(
|
|
308
|
+
content=sub_content,
|
|
309
|
+
source_type=chunk.source_type,
|
|
310
|
+
source_file=chunk.source_file,
|
|
311
|
+
source_name=f"{chunk.source_name} (part {len(sub_chunks) + 1})" if len(sub_chunks) > 0 else chunk.source_name,
|
|
312
|
+
line_number=chunk.line_number,
|
|
313
|
+
metadata={**chunk.metadata, "part": len(sub_chunks) + 1} if len(sub_chunks) > 0 else chunk.metadata,
|
|
314
|
+
)
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
return sub_chunks
|
|
318
|
+
|
|
319
|
+
|
|
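# Worked example, not in the package: with the ~4-chars-per-token estimate,
# max_tokens=1000 caps each piece near 4000 characters, so a ~10,000-character
# chunk splits into three parts named "... (part 1)" through "... (part 3)".
def _demo_split() -> None:
    big = Chunk(
        content="\n".join("x" * 80 for _ in range(125)),  # ~10,000 characters
        source_type="docs",
        source_file="example.md",  # hypothetical
        source_name="Example",
        line_number=1,
        metadata={},
    )
    for part in split_large_chunk(big, max_tokens=1000):
        print(part.source_name, "~", estimate_tokens(part.content), "tokens")

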
def chunk_content(
    items: list[ExtractedItem] | None = None,
    constants: list[ExtractedConstant] | None = None,
    simd_dir: Path | None = None,
    docs_dir: Path | None = None,
    repo_name: str = "agave",
    max_tokens: int = 1000,
) -> list[Chunk]:
    """
    Chunk all content for embedding.

    Args:
        items: Extracted Rust items
        constants: Extracted Rust constants
        simd_dir: Directory containing SIMDs
        docs_dir: Directory containing documentation
        repo_name: Name of the source repo
        max_tokens: Maximum tokens per chunk

    Returns:
        List of chunks ready for embedding
    """
    all_chunks = []

    # Chunk Rust items
    if items or constants:
        rust_chunks = chunk_rust_items(items or [], constants or [], repo_name)
        for chunk in rust_chunks:
            all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Chunk SIMDs
    if simd_dir and simd_dir.exists():
        simd_chunks = chunk_all_simds(simd_dir)
        for chunk in simd_chunks:
            all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Chunk docs
    if docs_dir and docs_dir.exists():
        for md_file in docs_dir.glob("**/*.md"):
            doc_chunks = chunk_markdown(md_file, docs_dir, "docs")
            for chunk in doc_chunks:
                all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Assign chunk IDs
    all_chunks = _assign_chunk_ids(all_chunks)

    return all_chunks


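# End-to-end usage sketch; both paths are hypothetical, not from the package.
# Chunks SIMD proposals and a docs tree in one pass, with oversized sections
# already split and chunk IDs assigned.
def _demo_chunk_content() -> None:
    chunks = chunk_content(
        simd_dir=Path("checkouts/solana-improvement-documents"),
        docs_dir=Path("checkouts/docs"),
        max_tokens=1000,
    )
    print(f"{len(chunks)} chunks ready for embedding")

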
def _assign_chunk_ids(chunks: list[Chunk], project: str = "sol") -> list[Chunk]:
    """
    Assign unique chunk IDs to a list of chunks.

    Groups chunks by source file and assigns sequential IDs within each file.
    """
    # Group chunks by source file
    file_chunks: dict[str, list[tuple[int, Chunk]]] = {}
    for i, chunk in enumerate(chunks):
        key = f"{chunk.source_type}:{chunk.source_file}"
        if key not in file_chunks:
            file_chunks[key] = []
        file_chunks[key].append((i, chunk))

    # Assign IDs within each file
    result = list(chunks)  # Copy to avoid modifying the original
    for indexed_chunks in file_chunks.values():
        for file_idx, (original_idx, chunk) in enumerate(indexed_chunks):
            chunk_id = generate_chunk_id(
                project=project,
                source_type=chunk.source_type,
                source_file=chunk.source_file,
                chunk_index=file_idx,
                content=chunk.content,
            )
            result[original_idx] = replace(chunk, chunk_id=chunk_id)

    return result


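# Sketch of the incremental-indexing property, not in the package: because
# generate_chunk_id is deterministic over (project, source_type, source_file,
# chunk_index, content), unchanged chunks keep their IDs across runs, so a
# re-index only needs to embed the IDs it has not seen before.
def _unseen_chunks(known_ids: set[str], chunks: list[Chunk]) -> list[Chunk]:
    return [chunk for chunk in chunks if chunk.chunk_id not in known_ids]

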
def chunk_single_file(
    file_path: Path,
    file_type: str,
    base_path: Path,
    project: str = "sol",
    max_tokens: int = 1000,
) -> list[Chunk]:
    """
    Chunk a single file and assign chunk IDs.

    This is used for incremental indexing to process individual files.

    Args:
        file_path: Absolute path to the file
        file_type: Type of file ("rust", "simd", "docs")
        base_path: Base path for relative path calculation
        project: Project identifier for chunk IDs
        max_tokens: Maximum tokens per chunk

    Returns:
        List of chunks with assigned chunk IDs
    """
    chunks = []

    if file_type == "simd":
        raw_chunks = chunk_simd(file_path, base_path)
    elif file_type == "docs":
        raw_chunks = chunk_markdown(file_path, base_path, "docs")
    elif file_type == "rust":
        # For Rust, we expect pre-compiled items, not raw files;
        # those should be handled via chunk_rust_items.
        return []
    else:
        return []

    # Split large chunks
    for chunk in raw_chunks:
        chunks.extend(split_large_chunk(chunk, max_tokens))

    # Assign chunk IDs
    return _assign_chunk_ids(chunks, project)


if __name__ == "__main__":
    # Test chunking
    import sys

    if len(sys.argv) < 2:
        print("Usage: chunker.py <simd_dir>")
        sys.exit(1)

    simd_path = Path(sys.argv[1])
    chunks = chunk_all_simds(simd_path)

    print(f"Chunked {len(chunks)} sections from SIMDs")
    for chunk in chunks[:5]:
        print(f"  - {chunk.source_name}: {len(chunk.content)} chars")