sol_mcp-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
"""Chunk content for embedding.

Handles:
- Rust code (functions, structs, enums)
- SIMD (Solana Improvement Document) markdown files
- Documentation markdown

Supports incremental indexing via deterministic chunk IDs.
"""

import re
from dataclasses import dataclass, replace
from pathlib import Path

from .compiler import ExtractedConstant, ExtractedItem
from .manifest import generate_chunk_id


@dataclass
class Chunk:
    """A chunk of content ready for embedding."""

    content: str
    source_type: str  # "rust", "simd", "docs"
    source_file: str
    source_name: str  # function name, SIMD number, doc title
    line_number: int | None
    metadata: dict
    chunk_id: str = ""  # Unique ID for incremental indexing


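# For illustration only, a populated chunk might look like this (all values
# are hypothetical, not taken from a real index):
#
#   Chunk(
#       content='/// Max processing age...\npub const MAX_PROCESSING_AGE: usize = 150;',
#       source_type="rust",
#       source_file="sdk/src/clock.rs",
#       source_name="MAX_PROCESSING_AGE",
#       line_number=12,
#       metadata={"kind": "constant", "repo": "agave"},
#       chunk_id="",  # filled in later by _assign_chunk_ids()
#   )

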
def chunk_rust_item(item: ExtractedItem, repo_name: str = "agave") -> Chunk:
    """Convert an extracted Rust item into a chunk."""
    # Build content with context
    parts = []

    # Add doc comment if present
    if item.doc_comment:
        parts.append(f"/// {item.doc_comment}")

    # Add attributes (e.g. #[derive(...)] or #[inline])
    parts.extend(item.attributes)

    # Add the code
    parts.append(item.body)

    content = "\n".join(parts)

    return Chunk(
        content=content,
        source_type="rust",
        source_file=item.file_path,
        source_name=item.name,
        line_number=item.line_number,
        metadata={
            "kind": item.kind,
            "signature": item.signature,
            "visibility": item.visibility,
            "repo": repo_name,
        },
    )


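# Sketch of the resulting embedding text for a doc-commented function
# (illustrative input, not from a real extraction run):
#
#   /// Returns the current bank slot.
#   #[inline]
#   pub fn slot(&self) -> Slot { self.slot }
#
# i.e. doc comment, attributes, and body joined by newlines, with the
# kind/signature/visibility preserved in `metadata` rather than in the text.

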
def chunk_rust_constant(const: ExtractedConstant, repo_name: str = "agave") -> Chunk:
    """Convert an extracted constant into a chunk."""
    parts = []

    if const.doc_comment:
        parts.append(f"/// {const.doc_comment}")

    type_str = f": {const.type_annotation}" if const.type_annotation else ""
    parts.append(f"const {const.name}{type_str} = {const.value};")

    content = "\n".join(parts)

    return Chunk(
        content=content,
        source_type="rust",
        source_file=const.file_path,
        source_name=const.name,
        line_number=const.line_number,
        metadata={
            "kind": "constant",
            "value": const.value,
            "type": const.type_annotation,
            "repo": repo_name,
        },
    )


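# Example of the rendered constant text, assuming an ExtractedConstant with
# name="DEFAULT_TICKS_PER_SLOT", type_annotation="u64", value="64":
#
#   const DEFAULT_TICKS_PER_SLOT: u64 = 64;
#
# When type_annotation is None the annotation is simply omitted.

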
def chunk_simd(file_path: Path, simd_dir: Path) -> list[Chunk]:
    """
    Chunk a SIMD markdown file into sections.

    SIMDs have a standard structure:
    - Title and metadata
    - Abstract
    - Motivation
    - Specification (often the longest)
    - Security Considerations
    - Backwards Compatibility
    """
    try:
        content = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    chunks = []
    relative_path = str(file_path.relative_to(simd_dir))

    # Extract SIMD number from filename (e.g., "0326-alpenglow.md" -> "SIMD-0326")
    simd_match = re.match(r"(\d+)-(.+)\.md", file_path.name)
    if simd_match:
        simd_number = f"SIMD-{simd_match.group(1)}"
        simd_name = simd_match.group(2).replace("-", " ").title()
    else:
        simd_number = file_path.stem
        simd_name = file_path.stem

    # Split by headers; the capturing group keeps each matched header as its
    # own list element, so the result alternates between body text and headers
    sections = re.split(r"^(#{1,3}\s+.+)$", content, flags=re.MULTILINE)

    current_header = f"# {simd_number}: {simd_name}"
    current_content = []
    current_line = 1
    search_pos = 0  # running offset so duplicate headers resolve to the right line

    for section in sections:
        if re.match(r"^#{1,3}\s+", section):
            # This is a header
            if current_content:
                # Save previous section
                section_text = "\n".join(current_content).strip()
                if len(section_text) > 50:  # Skip tiny sections
                    chunks.append(
                        Chunk(
                            content=f"{current_header}\n\n{section_text}",
                            source_type="simd",
                            source_file=relative_path,
                            source_name=f"{simd_number} - {current_header.lstrip('#').strip()}",
                            line_number=current_line,
                            metadata={
                                "simd_number": simd_number,
                                "section": current_header.lstrip("#").strip(),
                            },
                        )
                    )
            current_header = section.strip()
            current_content = []
            # Search from the previous match onward, so a header whose text
            # repeats in the document doesn't always map to the first hit
            header_pos = content.find(section, search_pos)
            search_pos = header_pos + len(section)
            current_line = content[:header_pos].count("\n") + 1
        else:
            current_content.append(section)

    # Don't forget the last section
    if current_content:
        section_text = "\n".join(current_content).strip()
        if len(section_text) > 50:
            chunks.append(
                Chunk(
                    content=f"{current_header}\n\n{section_text}",
                    source_type="simd",
                    source_file=relative_path,
                    source_name=f"{simd_number} - {current_header.lstrip('#').strip()}",
                    line_number=current_line,
                    metadata={
                        "simd_number": simd_number,
                        "section": current_header.lstrip("#").strip(),
                    },
                )
            )

    return chunks


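# How the header split behaves: because the pattern has a capturing group,
# re.split keeps the matched headers in the output list. A minimal check,
# runnable in a REPL (illustrative input):
#
#   >>> re.split(r"^(#{1,3}\s+.+)$", "intro\n## Abstract\nbody", flags=re.MULTILINE)
#   ['intro\n', '## Abstract', '\nbody']
#
# which is why chunk_simd() above and chunk_markdown() below treat header and
# body elements alternately.

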
def chunk_markdown(file_path: Path, base_dir: Path, source_type: str = "docs") -> list[Chunk]:
    """
    Chunk a generic markdown file into sections.
    """
    try:
        content = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    chunks = []
    relative_path = str(file_path.relative_to(base_dir))

    # Split by headers (the capturing group keeps the headers in the result)
    sections = re.split(r"^(#{1,3}\s+.+)$", content, flags=re.MULTILINE)

    current_header = f"# {file_path.stem}"
    current_content = []
    current_line = 1
    search_pos = 0  # running offset so duplicate headers resolve to the right line

    for section in sections:
        if re.match(r"^#{1,3}\s+", section):
            if current_content:
                section_text = "\n".join(current_content).strip()
                if len(section_text) > 50:
                    chunks.append(
                        Chunk(
                            content=f"{current_header}\n\n{section_text}",
                            source_type=source_type,
                            source_file=relative_path,
                            source_name=current_header.lstrip("#").strip(),
                            line_number=current_line,
                            metadata={
                                "section": current_header.lstrip("#").strip(),
                            },
                        )
                    )
            current_header = section.strip()
            current_content = []
            header_pos = content.find(section, search_pos)
            search_pos = header_pos + len(section)
            current_line = content[:header_pos].count("\n") + 1
        else:
            current_content.append(section)

    if current_content:
        section_text = "\n".join(current_content).strip()
        if len(section_text) > 50:
            chunks.append(
                Chunk(
                    content=f"{current_header}\n\n{section_text}",
                    source_type=source_type,
                    source_file=relative_path,
                    source_name=current_header.lstrip("#").strip(),
                    line_number=current_line,
                    metadata={
                        "section": current_header.lstrip("#").strip(),
                    },
                )
            )

    return chunks


def chunk_all_simds(simd_dir: Path) -> list[Chunk]:
    """Chunk all SIMDs in a directory."""
    proposals_dir = simd_dir / "proposals"
    if not proposals_dir.exists():
        return []

    chunks = []
    for md_file in proposals_dir.glob("*.md"):
        chunks.extend(chunk_simd(md_file, simd_dir))

    return chunks


def chunk_rust_items(
    items: list[ExtractedItem],
    constants: list[ExtractedConstant],
    repo_name: str = "agave",
) -> list[Chunk]:
    """Chunk all Rust items and constants."""
    chunks = []

    for item in items:
        chunks.append(chunk_rust_item(item, repo_name))

    for const in constants:
        chunks.append(chunk_rust_constant(const, repo_name))

    return chunks


def estimate_tokens(text: str) -> int:
    """Rough estimate of token count (approx. 4 characters per token)."""
    return len(text) // 4


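# Quick sanity check of the heuristic: a 2,000-character section estimates to
# 2000 // 4 = 500 tokens, comfortably under the default 1,000-token budget,
# so it would pass through split_large_chunk() below unchanged.

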
def split_large_chunk(chunk: Chunk, max_tokens: int = 1000) -> list[Chunk]:
    """Split a chunk that's too large into smaller pieces."""
    if estimate_tokens(chunk.content) <= max_tokens:
        return [chunk]

    # Split by lines, trying to keep logical groups together
    lines = chunk.content.split("\n")
    sub_chunks = []
    current_lines = []
    current_tokens = 0

    for line in lines:
        line_tokens = estimate_tokens(line)
        if current_tokens + line_tokens > max_tokens and current_lines:
            # Save current piece
            sub_content = "\n".join(current_lines)
            sub_chunks.append(
                Chunk(
                    content=sub_content,
                    source_type=chunk.source_type,
                    source_file=chunk.source_file,
                    source_name=f"{chunk.source_name} (part {len(sub_chunks) + 1})",
                    line_number=chunk.line_number,
                    metadata={**chunk.metadata, "part": len(sub_chunks) + 1},
                )
            )
            current_lines = [line]
            current_tokens = line_tokens
        else:
            current_lines.append(line)
            current_tokens += line_tokens

    # Don't forget the last part
    if current_lines:
        sub_content = "\n".join(current_lines)
        if sub_chunks:
            # A later part of a split chunk: tag it like the earlier parts
            part = len(sub_chunks) + 1
            sub_chunks.append(
                Chunk(
                    content=sub_content,
                    source_type=chunk.source_type,
                    source_file=chunk.source_file,
                    source_name=f"{chunk.source_name} (part {part})",
                    line_number=chunk.line_number,
                    metadata={**chunk.metadata, "part": part},
                )
            )
        else:
            # Everything fit into a single piece after all; keep original naming
            sub_chunks.append(
                Chunk(
                    content=sub_content,
                    source_type=chunk.source_type,
                    source_file=chunk.source_file,
                    source_name=chunk.source_name,
                    line_number=chunk.line_number,
                    metadata=chunk.metadata,
                )
            )

    return sub_chunks


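# Example of the naming scheme, assuming a chunk named "Specification" that
# splits into three pieces: the parts become "Specification (part 1)",
# "Specification (part 2)", "Specification (part 3)", each carrying a "part"
# key in its metadata. Every part keeps the original chunk's line_number,
# since per-part line offsets aren't tracked.

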
def chunk_content(
    items: list[ExtractedItem] | None = None,
    constants: list[ExtractedConstant] | None = None,
    simd_dir: Path | None = None,
    docs_dir: Path | None = None,
    repo_name: str = "agave",
    max_tokens: int = 1000,
) -> list[Chunk]:
    """
    Chunk all content for embedding.

    Args:
        items: Extracted Rust items
        constants: Extracted Rust constants
        simd_dir: Directory containing SIMDs
        docs_dir: Directory containing documentation
        repo_name: Name of the source repo
        max_tokens: Maximum tokens per chunk

    Returns:
        List of chunks ready for embedding
    """
    all_chunks = []

    # Chunk Rust items
    if items or constants:
        rust_chunks = chunk_rust_items(items or [], constants or [], repo_name)
        for chunk in rust_chunks:
            all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Chunk SIMDs
    if simd_dir and simd_dir.exists():
        simd_chunks = chunk_all_simds(simd_dir)
        for chunk in simd_chunks:
            all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Chunk docs
    if docs_dir and docs_dir.exists():
        for md_file in docs_dir.glob("**/*.md"):
            doc_chunks = chunk_markdown(md_file, docs_dir, "docs")
            for chunk in doc_chunks:
                all_chunks.extend(split_large_chunk(chunk, max_tokens))

    # Assign chunk IDs
    all_chunks = _assign_chunk_ids(all_chunks)

    return all_chunks


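# Typical call for a full (non-incremental) index build; the paths here are
# hypothetical examples:
#
#   chunks = chunk_content(
#       items=extracted_items,          # from the .compiler extraction step
#       constants=extracted_constants,
#       simd_dir=Path("~/repos/solana-improvement-documents").expanduser(),
#       docs_dir=Path("~/repos/agave/docs").expanduser(),
#   )
#
# Every returned chunk is already size-limited and carries a deterministic
# chunk_id, so unchanged chunks can be skipped on re-indexing.

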
def _assign_chunk_ids(chunks: list[Chunk], project: str = "sol") -> list[Chunk]:
    """
    Assign unique chunk IDs to a list of chunks.

    Groups chunks by source file and assigns sequential IDs within each file.
    """
    # Group chunks by source file
    file_chunks: dict[str, list[tuple[int, Chunk]]] = {}
    for i, chunk in enumerate(chunks):
        key = f"{chunk.source_type}:{chunk.source_file}"
        file_chunks.setdefault(key, []).append((i, chunk))

    # Assign IDs within each file
    result = list(chunks)  # Copy to avoid modifying the original list
    for indexed_chunks in file_chunks.values():
        for file_idx, (original_idx, chunk) in enumerate(indexed_chunks):
            chunk_id = generate_chunk_id(
                project=project,
                source_type=chunk.source_type,
                source_file=chunk.source_file,
                chunk_index=file_idx,
                content=chunk.content,
            )
            result[original_idx] = replace(chunk, chunk_id=chunk_id)

    return result


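# generate_chunk_id() lives in .manifest and its exact format isn't shown in
# this file. For intuition only, a deterministic scheme over the same inputs
# could look like this hypothetical sketch:
#
#   import hashlib
#
#   def _sketch_chunk_id(project, source_type, source_file, chunk_index, content):
#       digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:12]
#       return f"{project}:{source_type}:{source_file}:{chunk_index}:{digest}"
#
# The key property is that identical inputs always produce the same ID, which
# is what makes incremental indexing possible.

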
def chunk_single_file(
    file_path: Path,
    file_type: str,
    base_path: Path,
    project: str = "sol",
    max_tokens: int = 1000,
) -> list[Chunk]:
    """
    Chunk a single file and assign chunk IDs.

    This is used for incremental indexing to process individual files.

    Args:
        file_path: Absolute path to the file
        file_type: Type of file ("rust", "simd", "docs")
        base_path: Base path for relative path calculation
        project: Project identifier for chunk IDs
        max_tokens: Maximum tokens per chunk

    Returns:
        List of chunks with assigned chunk IDs
    """
    chunks = []

    if file_type == "simd":
        raw_chunks = chunk_simd(file_path, base_path)
    elif file_type == "docs":
        raw_chunks = chunk_markdown(file_path, base_path, "docs")
    elif file_type == "rust":
        # For Rust we expect pre-compiled items, not raw files;
        # those are handled via chunk_rust_items instead
        return []
    else:
        return []

    # Split large chunks
    for chunk in raw_chunks:
        chunks.extend(split_large_chunk(chunk, max_tokens))

    # Assign chunk IDs
    return _assign_chunk_ids(chunks, project)


if __name__ == "__main__":
    # Ad-hoc test: chunk a SIMD directory and print a sample
    import sys

    if len(sys.argv) < 2:
        print("Usage: chunker.py <simd_dir>")
        sys.exit(1)

    simd_path = Path(sys.argv[1])
    chunks = chunk_all_simds(simd_path)

    print(f"Chunked {len(chunks)} sections from SIMDs")
    for chunk in chunks[:5]:
        print(f"  - {chunk.source_name}: {len(chunk.content)} chars")