sol_mcp-0.2.0-py3-none-any.whl

@@ -0,0 +1,411 @@
+ """Manifest for tracking indexed files and enabling incremental updates.
+
+ The manifest tracks:
+ - File paths with their SHA256 hashes and mtimes
+ - Chunk IDs generated for each file
+ - Embedding model and chunking configuration
+ - Repository versions (git commit hashes)
+
+ This enables incremental indexing by detecting changed files.
+ """
+
+ import hashlib
+ import json
+ import os
+ import re
+ import shutil
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ from ..logging import get_logger
+
+ logger = get_logger("manifest")
+
+ # Maximum manifest file size (10MB)
+ MAX_MANIFEST_SIZE = 10 * 1024 * 1024
+
+ # Maximum chunks per file (sanity check)
+ MAX_CHUNKS_PER_FILE = 10000
+
+ # Version for manifest format migrations
+ MANIFEST_VERSION = "1.0.0"
+
+
+ class ManifestError(Exception):
+     """Base exception for manifest errors."""
+
+     pass
+
+
+ class ManifestValidationError(ManifestError):
+     """Raised when manifest validation fails."""
+
+     pass
+
+
+ class ManifestCorruptedError(ManifestError):
+     """Raised when manifest file is corrupted."""
+
+     pass
+
+
+ @dataclass
+ class FileEntry:
+     """Metadata for a tracked file."""
+
+     sha256: str
+     mtime_ns: int
+     chunk_ids: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "sha256": self.sha256,
+             "mtime_ns": self.mtime_ns,
+             "chunk_ids": self.chunk_ids,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "FileEntry":
+         return cls(
+             sha256=data["sha256"],
+             mtime_ns=data["mtime_ns"],
+             chunk_ids=data.get("chunk_ids", []),
+         )
+
+
+ @dataclass
+ class Manifest:
+     """
+     Tracks indexed files for incremental updates.
+
+     The manifest is persisted to JSON and loaded on startup to detect
+     which files have changed and need re-indexing.
+     """
+
+     version: str = MANIFEST_VERSION
+     updated_at: str = ""
+     embedding_model: str = ""
+     chunk_config: dict[str, int] = field(default_factory=dict)
+     files: dict[str, FileEntry] = field(default_factory=dict)
+     repo_versions: dict[str, str] = field(default_factory=dict)
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "version": self.version,
+             "updated_at": self.updated_at,
+             "embedding_model": self.embedding_model,
+             "chunk_config": self.chunk_config,
+             "files": {path: entry.to_dict() for path, entry in self.files.items()},
+             "repo_versions": self.repo_versions,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "Manifest":
+         """Parse manifest from dictionary with validation."""
+         # Validate required fields
+         if "version" not in data:
+             raise ManifestValidationError("Missing required field: version")
+         if "files" not in data:
+             raise ManifestValidationError("Missing required field: files")
+         if not isinstance(data.get("files"), dict):
+             raise ManifestValidationError("'files' must be a dictionary")
+
+         # Validate version format
+         version = data["version"]
+         if not re.match(r"^\d+\.\d+\.\d+$", version):
+             raise ManifestValidationError(f"Invalid version format: {version}")
+
+         files = {}
+         for path, entry_data in data["files"].items():
+             # Security: validate path doesn't contain traversal
+             _validate_path(path)
+
+             # Validate SHA256 format
+             sha256 = entry_data.get("sha256", "")
+             if not re.match(r"^[a-f0-9]{64}$", sha256):
+                 raise ManifestValidationError(f"Invalid SHA256 for {path}: {sha256}")
+
+             # Validate chunk count
+             chunk_ids = entry_data.get("chunk_ids", [])
+             if len(chunk_ids) > MAX_CHUNKS_PER_FILE:
+                 raise ManifestValidationError(
+                     f"Too many chunks for {path}: {len(chunk_ids)} > {MAX_CHUNKS_PER_FILE}"
+                 )
+
+             files[path] = FileEntry.from_dict(entry_data)
+
+         return cls(
+             version=data["version"],
+             updated_at=data.get("updated_at", ""),
+             embedding_model=data.get("embedding_model", ""),
+             chunk_config=data.get("chunk_config", {}),
+             files=files,
+             repo_versions=data.get("repo_versions", {}),
+         )
+
+
+ def _validate_path(path: str) -> None:
+     """Validate a file path for security.
+
+     Raises ManifestValidationError if path is invalid or potentially malicious.
+     """
+     # Check for path traversal
+     if ".." in path:
+         raise ManifestValidationError(f"Path traversal detected in: {path}")
+
+     # Check for absolute paths
+     if path.startswith("/") or (len(path) > 1 and path[1] == ":"):
+         raise ManifestValidationError(f"Absolute paths not allowed: {path}")
+
+     # Check for null bytes
+     if "\x00" in path:
+         raise ManifestValidationError(f"Null bytes in path: {path}")
+
+     # Check for special characters that could cause issues
+     if any(c in path for c in ["\n", "\r", "\t"]):
+         raise ManifestValidationError(f"Invalid characters in path: {path}")
+
+
+ def _sanitize_chunk_id(chunk_id: str) -> str:
+     """Sanitize a chunk ID for safe use in queries.
+
+     Returns sanitized ID or raises ManifestValidationError if invalid.
+     """
+     # Only allow alphanumeric, underscore, and hyphen
+     if not re.match(r"^[a-zA-Z0-9_-]+$", chunk_id):
+         raise ManifestValidationError(f"Invalid chunk ID format: {chunk_id}")
+
+     # Reasonable length limit
+     if len(chunk_id) > 200:
+         raise ManifestValidationError(f"Chunk ID too long: {len(chunk_id)}")
+
+     return chunk_id
+
+
+ def compute_file_hash(file_path: Path) -> str:
+     """Compute SHA256 hash of a file."""
+     hasher = hashlib.sha256()
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(65536), b""):
+             hasher.update(chunk)
+     return hasher.hexdigest()
+
+
+ def get_file_mtime_ns(file_path: Path) -> int:
+     """Get file modification time in nanoseconds."""
+     return file_path.stat().st_mtime_ns
+
+
+ def load_manifest(manifest_path: Path) -> Manifest | None:
+     """
+     Load manifest from disk.
+
+     Returns None if manifest doesn't exist.
+     Raises ManifestCorruptedError if the manifest is corrupted (a backup is
+     created first) and ManifestValidationError if it is oversized or fails
+     schema validation.
+     """
+     if not manifest_path.exists():
+         return None
+
+     # Check file size
+     file_size = manifest_path.stat().st_size
+     if file_size > MAX_MANIFEST_SIZE:
+         raise ManifestValidationError(
+             f"Manifest file too large: {file_size} > {MAX_MANIFEST_SIZE}"
+         )
+
+     try:
+         with open(manifest_path, encoding="utf-8") as f:
+             data = json.load(f)
+         return Manifest.from_dict(data)
+     except json.JSONDecodeError as e:
+         # Create backup of corrupted file
+         backup_path = manifest_path.with_suffix(".json.corrupted")
+         logger.warning(
+             "Manifest corrupted, creating backup at %s: %s", backup_path, e
+         )
+         shutil.copy(manifest_path, backup_path)
+         raise ManifestCorruptedError(f"Manifest JSON corrupted: {e}") from e
+     except ManifestValidationError:
+         # Re-raise validation errors
+         raise
+     except Exception as e:
+         raise ManifestCorruptedError(f"Failed to load manifest: {e}") from e
+
+
+ def save_manifest(manifest: Manifest, manifest_path: Path) -> None:
+     """
+     Save manifest to disk atomically.
+
+     Uses write-to-temp-then-rename pattern to prevent corruption
+     from interrupted writes.
+     """
+     manifest.updated_at = datetime.now(UTC).isoformat()
+
+     # Ensure parent directory exists
+     manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Write to temp file first
+     temp_path = manifest_path.with_suffix(".json.tmp")
+
+     try:
+         with open(temp_path, "w", encoding="utf-8") as f:
+             json.dump(manifest.to_dict(), f, indent=2)
+             f.flush()
+             os.fsync(f.fileno())
+
+         # Atomic rename
+         temp_path.replace(manifest_path)
+         logger.debug("Saved manifest to %s", manifest_path)
+
+     except Exception:
+         # Clean up temp file on failure
+         if temp_path.exists():
+             temp_path.unlink()
+         raise
+
+
+ def generate_chunk_id(
+     project: str,
+     source_type: str,
+     source_file: str,
+     chunk_index: int,
+     content: str,
+ ) -> str:
+     """
+     Generate a unique, deterministic chunk ID.
+
+     Format: {project}_{source_type}_{path_hash}_{index:04d}_{content_hash}
+
+     The content hash ensures that if content changes, the ID changes,
+     enabling proper delta updates.
+
+     Args:
+         project: Project identifier (e.g., "sol")
+         source_type: Type of source (e.g., "rust", "simd", "docs")
+         source_file: Relative path to source file
+         chunk_index: Index of this chunk within the file
+         content: Chunk content for hashing
+
+     Returns:
+         Unique chunk ID string
+     """
+     # Validate inputs
+     _validate_path(source_file)
+
+     path_hash = hashlib.sha256(source_file.encode()).hexdigest()[:8]
+     content_hash = hashlib.sha256(content.encode()).hexdigest()[:8]
+
+     chunk_id = f"{project}_{source_type}_{path_hash}_{chunk_index:04d}_{content_hash}"
+
+     # Validate the generated ID
+     _sanitize_chunk_id(chunk_id)
+
+     return chunk_id
+
+
+ @dataclass
+ class FileChange:
+     """Represents a detected file change."""
+
+     path: str
+     change_type: str  # 'add', 'modify', 'delete'
+     old_chunk_ids: list[str] = field(default_factory=list)
+
+
+ def compute_changes(
+     manifest: Manifest | None,
+     current_files: dict[str, Path],
+     check_hashes: bool = True,
+ ) -> list[FileChange]:
+     """
+     Compute file changes between manifest and current state.
+
+     Uses a two-phase check:
+     1. Fast path: Compare mtime - if unchanged, skip file
+     2. Slow path: If mtime changed, compare hash to detect actual changes
+
+     Args:
+         manifest: Previous manifest (None for fresh index)
+         current_files: Dict mapping relative paths to absolute Paths
+         check_hashes: If False, skip hash checking (assume mtime change = content change)
+
+     Returns:
+         List of FileChange objects describing what changed
+     """
+     changes = []
+     manifest_files = manifest.files if manifest else {}
+
+     # Find new and modified files
+     for rel_path, abs_path in current_files.items():
+         _validate_path(rel_path)
+
+         if rel_path not in manifest_files:
+             # New file
+             changes.append(FileChange(path=rel_path, change_type="add"))
+             continue
+
+         entry = manifest_files[rel_path]
+         current_mtime = get_file_mtime_ns(abs_path)
+
+         if current_mtime == entry.mtime_ns:
+             # Fast path: mtime unchanged, assume no change
+             continue
+
+         if check_hashes:
+             # Slow path: mtime changed, check hash
+             current_hash = compute_file_hash(abs_path)
+             if current_hash == entry.sha256:
+                 # Content unchanged, just update mtime (no re-embedding needed)
+                 # We'll handle this separately to avoid unnecessary work
+                 continue
+
+         # File modified
+         changes.append(
+             FileChange(
+                 path=rel_path,
+                 change_type="modify",
+                 old_chunk_ids=entry.chunk_ids.copy(),
+             )
+         )
+
+     # Find deleted files
+     for rel_path, entry in manifest_files.items():
+         if rel_path not in current_files:
+             changes.append(
+                 FileChange(
+                     path=rel_path,
+                     change_type="delete",
+                     old_chunk_ids=entry.chunk_ids.copy(),
+                 )
+             )
+
+     return changes
+
+
+ def needs_full_rebuild(manifest: Manifest | None, config: dict[str, Any]) -> tuple[bool, str]:
+     """
+     Check if a full rebuild is needed due to config changes.
+
+     Args:
+         manifest: Previous manifest
+         config: Current configuration dict with 'embedding_model' and 'chunk_config'
+
+     Returns:
+         Tuple of (needs_rebuild, reason)
+     """
+     if manifest is None:
+         return True, "No manifest exists"
+
+     # Check embedding model change
+     current_model = config.get("embedding_model", "")
+     if manifest.embedding_model and manifest.embedding_model != current_model:
+         return True, f"Embedding model changed: {manifest.embedding_model} -> {current_model}"
+
+     # Check chunk config change
+     current_chunk_config = config.get("chunk_config", {})
+     if manifest.chunk_config and manifest.chunk_config != current_chunk_config:
+         return True, f"Chunk config changed: {manifest.chunk_config} -> {current_chunk_config}"
+
+     return False, ""
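
Taken together, these pieces support a load → diff → re-embed → save loop. The following is a minimal sketch of that loop, not code from the package: the import path is a guess (this diff omits the manifest module's file header), and the delete/embed helpers are hypothetical stubs standing in for the real indexer.

from pathlib import Path

from solana_mcp.index.manifest import (  # hypothetical path; header not shown above
    Manifest,
    compute_changes,
    load_manifest,
    needs_full_rebuild,
    save_manifest,
)

MANIFEST_PATH = Path(".index/manifest.json")  # hypothetical location


def delete_chunks(chunk_ids: list[str]) -> None:
    """Hypothetical stand-in for vector-store deletion."""


def embed_and_store(path: Path) -> None:
    """Hypothetical stand-in for the chunk/embed/upsert pipeline."""


def reindex(current_files: dict[str, Path], config: dict) -> None:
    manifest = load_manifest(MANIFEST_PATH)

    # A changed embedding model or chunking config invalidates all stored vectors.
    rebuild, reason = needs_full_rebuild(manifest, config)
    if rebuild:
        manifest = None  # compute_changes will then report every file as "add"

    for change in compute_changes(manifest, current_files):
        if change.change_type in ("modify", "delete"):
            delete_chunks(change.old_chunk_ids)
        if change.change_type in ("add", "modify"):
            embed_and_store(current_files[change.path])

    # A real indexer would also refresh manifest.files with new FileEntry
    # records (sha256, mtime_ns, chunk_ids); omitted for brevity.
    save_manifest(
        manifest
        or Manifest(
            embedding_model=config.get("embedding_model", ""),
            chunk_config=config.get("chunk_config", {}),
        ),
        MANIFEST_PATH,
    )
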
solana_mcp/logging.py ADDED
@@ -0,0 +1,85 @@
+ """Structured logging configuration for solana-mcp."""
+
+ import logging
+ import sys
+ from typing import Any
+
+
+ def setup_logging(
+     level: int = logging.INFO,
+     json_format: bool = False,
+ ) -> logging.Logger:
+     """
+     Configure structured logging for the application.
+
+     Args:
+         level: Logging level (default: INFO)
+         json_format: If True, output JSON-formatted logs
+
+     Returns:
+         Configured logger instance
+     """
+     logger = logging.getLogger("solana_mcp")
+     logger.setLevel(level)
+
+     # Clear existing handlers
+     logger.handlers.clear()
+
+     # Create handler
+     handler = logging.StreamHandler(sys.stderr)
+     handler.setLevel(level)
+
+     if json_format:
+         formatter = JsonFormatter()
+     else:
+         formatter = logging.Formatter(
+             "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+             datefmt="%Y-%m-%d %H:%M:%S",
+         )
+
+     handler.setFormatter(formatter)
+     logger.addHandler(handler)
+
+     return logger
+
+
+ class JsonFormatter(logging.Formatter):
+     """JSON log formatter for structured logging."""
+
+     def format(self, record: logging.LogRecord) -> str:
+         import json
+
+         log_data: dict[str, Any] = {
+             "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S"),
+             "level": record.levelname,
+             "logger": record.name,
+             "message": record.getMessage(),
+         }
+
+         if record.exc_info:
+             log_data["exception"] = self.formatException(record.exc_info)
+
+         # Merge caller-supplied fields; note this only picks up records logged
+         # with extra={"extra": {...}}, which attaches an "extra" attribute
+         if hasattr(record, "extra"):
+             log_data.update(record.extra)
+
+         return json.dumps(log_data)
+
+
+ def get_logger(name: str | None = None) -> logging.Logger:
+     """
+     Get a logger instance.
+
+     Args:
+         name: Logger name (will be prefixed with 'solana_mcp.')
+
+     Returns:
+         Logger instance
+     """
+     if name:
+         return logging.getLogger(f"solana_mcp.{name}")
+     return logging.getLogger("solana_mcp")
+
+
+ # Module-level logger for convenience
+ logger = get_logger()
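
A short usage sketch (illustrative, not from the package) showing the two entry points together:

import logging

from solana_mcp.logging import get_logger, setup_logging

# JSON output suits log aggregation; the default (json_format=False) is the
# human-readable "%(asctime)s [%(levelname)s] ..." format.
setup_logging(level=logging.DEBUG, json_format=True)

log = get_logger("indexer")  # named "solana_mcp.indexer"; inherits the handler

try:
    raise ValueError("boom")
except ValueError:
    # .exception() attaches exc_info, which JsonFormatter renders under "exception".
    log.exception("failed to index file")
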
solana_mcp/models.py ADDED
@@ -0,0 +1,62 @@
+ """Pydantic models for MCP tool input validation."""
+
+ from pydantic import BaseModel, Field, field_validator
+
+
+ class SearchInput(BaseModel):
+     """Input validation for search operations."""
+
+     query: str = Field(..., min_length=1, max_length=1000, description="Search query")
+     limit: int = Field(default=5, ge=1, le=50, description="Maximum results")
+     source_type: str | None = Field(None, description="Filter by source type")
+
+     @field_validator("source_type")
+     @classmethod
+     def validate_source_type(cls, v: str | None) -> str | None:
+         if v is None:
+             return v
+         valid_types = ["rust", "simd", "c", "docs"]
+         if v.lower() not in valid_types:
+             raise ValueError(f"Invalid source_type '{v}'. Must be one of: {valid_types}")
+         return v.lower()
+
+
+ class ConstantLookupInput(BaseModel):
+     """Input validation for constant lookup."""
+
+     name: str = Field(
+         ...,
+         min_length=1,
+         max_length=100,
+         pattern=r"^[A-Z][A-Z0-9_]*$",
+         description="Constant name (UPPER_SNAKE_CASE)",
+     )
+
+
+ class FunctionLookupInput(BaseModel):
+     """Input validation for function lookup."""
+
+     name: str = Field(
+         ...,
+         min_length=1,
+         max_length=100,
+         pattern=r"^[a-z_][a-z0-9_]*$",
+         description="Function name (snake_case)",
+     )
+
+
+ class GuidanceInput(BaseModel):
+     """Input validation for expert guidance lookup."""
+
+     topic: str = Field(..., min_length=1, max_length=50, description="Guidance topic")
+
+
+ class ClientLookupInput(BaseModel):
+     """Input validation for client lookup."""
+
+     name: str = Field(..., min_length=1, max_length=50, description="Client name")
+
+     @field_validator("name")
+     @classmethod
+     def validate_name(cls, v: str) -> str:
+         return v.lower().strip()
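
These models are meant to be constructed from raw MCP tool arguments, with pydantic raising on anything out of range. An illustrative sketch (the query string is arbitrary):

from pydantic import ValidationError

from solana_mcp.models import SearchInput

try:
    args = SearchInput(query="rent exemption threshold", limit=10, source_type="SIMD")
    print(args.source_type)  # "simd": the validator lowercases accepted types
except ValidationError as exc:
    # e.g. an empty query, limit > 50, or an unknown source_type
    print(exc.errors())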