mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/chunking/language_handlers.py
@@ -0,0 +1,478 @@
+ """
+ Language-specific handlers for AST parsing and code chunking.
+
+ Provides specialized handling for different programming languages using
+ Tree-sitter parsers with language-specific semantic understanding.
+ """
+
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import List, Dict, Any, Optional, Set, Tuple
+ from pathlib import Path
+ from dataclasses import dataclass
+ from enum import Enum
+
+ logger = logging.getLogger(__name__)
+
+ # Try to import tree-sitter; fall back to line-based chunking if unavailable
+ try:
+     import tree_sitter
+     from tree_sitter import Language, Parser, Node
+     TREE_SITTER_AVAILABLE = True
+ except ImportError:
+     TREE_SITTER_AVAILABLE = False
+     Language = None
+     Parser = None
+     Node = None
+
+ from ...database.models import ChunkType
+
+ @dataclass
+ class ParsedChunk:
+     """Represents a parsed code chunk with metadata."""
+     content: str
+     chunk_type: ChunkType
+     name: Optional[str]
+     start_line: int
+     end_line: int
+     start_byte: int
+     end_byte: int
+     metadata: Dict[str, Any]
+     language: str
+     parent_context: Optional[str] = None
+     imports: Optional[List[str]] = None
+
+     def __post_init__(self):
+         if self.imports is None:
+             self.imports = []
+
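For illustration, a minimal sketch (the values are hypothetical, not part of the wheel) of constructing a chunk directly and what __post_init__ guarantees:

    from mcp_code_indexer.vector_mode.chunking.language_handlers import ParsedChunk
    from mcp_code_indexer.database.models import ChunkType

    src = "def greet():\n    return 'hi'"
    chunk = ParsedChunk(
        content=src,
        chunk_type=ChunkType.FUNCTION,
        name="greet",
        start_line=1,
        end_line=2,
        start_byte=0,
        end_byte=len(src.encode("utf-8")),
        metadata={"has_docstring": False},
        language="python",
    )
    assert chunk.imports == []  # __post_init__ replaces the None default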
+ class LanguageHandler(ABC):
+     """Base class for language-specific AST parsing."""
+
+     def __init__(self, language_name: str):
+         """Initialize the language handler."""
+         self.language_name = language_name
+         self.parser: Optional[Parser] = None
+         self.language: Optional[Language] = None
+
+         # Language-specific configuration
+         self.function_nodes: Set[str] = set()
+         self.class_nodes: Set[str] = set()
+         self.import_nodes: Set[str] = set()
+         self.comment_nodes: Set[str] = set()
+         self.docstring_nodes: Set[str] = set()
+
+         self._setup_node_types()
+
+     @abstractmethod
+     def _setup_node_types(self) -> None:
+         """Set up language-specific node types."""
+         pass
+
+     def initialize_parser(self) -> bool:
+         """Initialize the Tree-sitter parser for this language."""
+         if not TREE_SITTER_AVAILABLE:
+             logger.warning("Tree-sitter not available, falling back to line-based chunking")
+             return False
+
+         try:
+             # This would need actual Tree-sitter language binaries;
+             # for now, we simulate the interface.
+             logger.info(f"Parser for {self.language_name} would be initialized here")
+             return True
+         except Exception as e:
+             logger.warning(f"Failed to initialize {self.language_name} parser: {e}")
+             return False
+
+     def parse_code(self, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Parse source code into semantic chunks."""
+         if not self.parser:
+             # Fall back to simple line-based chunking
+             return self._fallback_chunking(source_code, file_path)
+
+         try:
+             tree = self.parser.parse(bytes(source_code, "utf8"))
+             return self._extract_chunks(tree.root_node, source_code, file_path)
+         except Exception as e:
+             logger.error(f"Failed to parse {file_path}: {e}")
+             return self._fallback_chunking(source_code, file_path)
+
+     def _fallback_chunking(self, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Fall back to line-based chunking when AST parsing fails."""
+         lines = source_code.split('\n')
+         chunks = []
+
+         # Simple heuristic-based chunking
+         current_chunk_lines = []
+         current_start_line = 1
+
+         for i, line in enumerate(lines, 1):
+             current_chunk_lines.append(line)
+
+             # End the chunk on empty lines or when it gets too large
+             if (not line.strip() and len(current_chunk_lines) > 5) or len(current_chunk_lines) >= 50:
+                 if current_chunk_lines:
+                     content = '\n'.join(current_chunk_lines)
+                     if content.strip():  # Only add non-empty chunks
+                         chunk = ParsedChunk(
+                             content=content,
+                             chunk_type=ChunkType.GENERIC,
+                             name=None,
+                             start_line=current_start_line,
+                             end_line=i,
+                             start_byte=0,
+                             end_byte=len(content.encode('utf-8')),
+                             metadata={"fallback": True},
+                             language=self.language_name,
+                         )
+                         chunks.append(chunk)
+
+                 current_chunk_lines = []
+                 current_start_line = i + 1
+
+         # Add the final chunk, if any
+         if current_chunk_lines:
+             content = '\n'.join(current_chunk_lines)
+             if content.strip():
+                 chunk = ParsedChunk(
+                     content=content,
+                     chunk_type=ChunkType.GENERIC,
+                     name=None,
+                     start_line=current_start_line,
+                     end_line=len(lines),
+                     start_byte=0,
+                     end_byte=len(content.encode('utf-8')),
+                     metadata={"fallback": True},
+                     language=self.language_name,
+                 )
+                 chunks.append(chunk)
+
+         return chunks
+
+     @abstractmethod
+     def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Extract semantic chunks from the AST."""
+         pass
+
+     def _get_node_text(self, node: Any, source_code: str) -> str:
+         """Get the text content of a node."""
+         if hasattr(node, 'start_byte') and hasattr(node, 'end_byte'):
+             return source_code[node.start_byte:node.end_byte]
+         return ""
+
+     def _get_line_numbers(self, node: Any) -> Tuple[int, int]:
+         """Get start and end line numbers for a node."""
+         if hasattr(node, 'start_point') and hasattr(node, 'end_point'):
+             return node.start_point[0] + 1, node.end_point[0] + 1
+         return 1, 1
+
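Note that initialize_parser above is a stub: it logs and returns True without constructing a parser, so parse_code always takes the fallback path in this release. A minimal sketch of what a real implementation could look like, assuming the third-party tree_sitter_languages grammar bundle (an assumption; it is not a dependency of this wheel):

    # Hypothetical replacement for LanguageHandler.initialize_parser.
    def initialize_parser(self) -> bool:
        if not TREE_SITTER_AVAILABLE:
            logger.warning("Tree-sitter not available, falling back to line-based chunking")
            return False
        try:
            # tree_sitter_languages ships precompiled grammars; assumed installed
            from tree_sitter_languages import get_language, get_parser
            self.language = get_language(self.language_name)
            self.parser = get_parser(self.language_name)
            return True
        except Exception as e:
            logger.warning(f"Failed to initialize {self.language_name} parser: {e}")
            return False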
+ class PythonHandler(LanguageHandler):
+     """Handler for Python code."""
+
+     def __init__(self):
+         super().__init__("python")
+
+     def _setup_node_types(self) -> None:
+         """Set up Python-specific node types."""
+         self.function_nodes = {"function_definition", "async_function_definition"}
+         self.class_nodes = {"class_definition"}
+         self.import_nodes = {"import_statement", "import_from_statement"}
+         self.comment_nodes = {"comment"}
+         self.docstring_nodes = {"expression_statement"}  # May contain docstrings
+
+     def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Extract chunks from the Python AST."""
+         chunks = []
+
+         # Extract imports first
+         imports = self._extract_imports(root_node, source_code)
+
+         # Extract top-level constructs
+         for child in root_node.children:
+             if child.type in self.function_nodes:
+                 chunk = self._extract_function(child, source_code, imports)
+                 if chunk:
+                     chunks.append(chunk)
+             elif child.type in self.class_nodes:
+                 class_chunks = self._extract_class(child, source_code, imports)
+                 chunks.extend(class_chunks)
+
+         # Add an import chunk if imports exist
+         if imports:
+             import_content = '\n'.join(imports)
+             import_chunk = ParsedChunk(
+                 content=import_content,
+                 chunk_type=ChunkType.IMPORT,
+                 name="imports",
+                 start_line=1,
+                 end_line=len(imports),
+                 start_byte=0,
+                 end_byte=len(import_content.encode('utf-8')),
+                 metadata={"import_count": len(imports)},
+                 language=self.language_name,
+                 imports=imports,
+             )
+             chunks.insert(0, import_chunk)  # Imports go first
+
+         return chunks
+
+     def _extract_imports(self, root_node: Any, source_code: str) -> List[str]:
+         """Extract import statements."""
+         imports = []
+         for child in root_node.children:
+             if child.type in self.import_nodes:
+                 import_text = self._get_node_text(child, source_code)
+                 imports.append(import_text.strip())
+         return imports
+
+     def _extract_function(self, node: Any, source_code: str, imports: List[str]) -> Optional[ParsedChunk]:
+         """Extract a function definition."""
+         content = self._get_node_text(node, source_code)
+         start_line, end_line = self._get_line_numbers(node)
+
+         # Extract the function name
+         name = "unknown_function"
+         for child in node.children:
+             if child.type == "identifier":
+                 name = self._get_node_text(child, source_code)
+                 break
+
+         # Check for a docstring
+         docstring = self._extract_docstring(node, source_code)
+
+         metadata = {
+             "is_async": node.type == "async_function_definition",
+             "has_docstring": docstring is not None,
+             "line_count": end_line - start_line + 1,
+         }
+         if docstring:
+             metadata["docstring"] = docstring
+
+         return ParsedChunk(
+             content=content,
+             chunk_type=ChunkType.FUNCTION,
+             name=name,
+             start_line=start_line,
+             end_line=end_line,
+             start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
+             end_byte=node.end_byte if hasattr(node, 'end_byte') else len(content.encode('utf-8')),
+             metadata=metadata,
+             language=self.language_name,
+             imports=imports,
+         )
+
+     def _extract_class(self, node: Any, source_code: str, imports: List[str]) -> List[ParsedChunk]:
+         """Extract a class definition and its methods."""
+         chunks = []
+
+         # Extract the class name
+         class_name = "unknown_class"
+         for child in node.children:
+             if child.type == "identifier":
+                 class_name = self._get_node_text(child, source_code)
+                 break
+
+         # Extract the class docstring
+         class_docstring = self._extract_docstring(node, source_code)
+
+         # Extract the class-level chunk
+         class_content = self._get_node_text(node, source_code)
+         start_line, end_line = self._get_line_numbers(node)
+
+         # In the tree-sitter Python grammar, class members live inside a
+         # nested block node rather than as direct children.
+         body = next((c for c in node.children if c.type == "block"), node)
+
+         class_metadata = {
+             "has_docstring": class_docstring is not None,
+             "method_count": len([c for c in body.children if c.type in self.function_nodes]),
+         }
+         if class_docstring:
+             class_metadata["docstring"] = class_docstring
+
+         class_chunk = ParsedChunk(
+             content=class_content,
+             chunk_type=ChunkType.CLASS,
+             name=class_name,
+             start_line=start_line,
+             end_line=end_line,
+             start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
+             end_byte=node.end_byte if hasattr(node, 'end_byte') else len(class_content.encode('utf-8')),
+             metadata=class_metadata,
+             language=self.language_name,
+             parent_context=None,
+             imports=imports,
+         )
+         chunks.append(class_chunk)
+
+         # Extract methods separately for better granularity
+         for child in body.children:
+             if child.type in self.function_nodes:
+                 method_chunk = self._extract_function(child, source_code, imports)
+                 if method_chunk:
+                     method_chunk.chunk_type = ChunkType.METHOD
+                     method_chunk.parent_context = class_name
+                     chunks.append(method_chunk)
+
+         return chunks
+
+     def _extract_docstring(self, node: Any, source_code: str) -> Optional[str]:
+         """Extract the docstring from a function or class."""
+         # Look for a string literal as the first statement in the body
+         for child in node.children:
+             if child.type == "block":
+                 for stmt in child.children:
+                     if stmt.type == "expression_statement":
+                         for expr in stmt.children:
+                             if expr.type == "string":
+                                 return self._get_node_text(expr, source_code).strip('"\'')
+         return None
+
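Since no parser is ever attached, PythonHandler currently exercises the inherited line-based fallback end to end. A small runnable check, assuming the package is installed:

    from mcp_code_indexer.vector_mode.chunking.language_handlers import PythonHandler

    source = "\n".join(f"x{i} = {i}" for i in range(60))
    chunks = PythonHandler().parse_code(source, "example.py")
    # The 50-line cap yields two GENERIC chunks: lines 1-50 and 51-60
    for c in chunks:
        print(c.chunk_type, c.start_line, c.end_line, c.metadata)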
+ class JavaScriptHandler(LanguageHandler):
+     """Handler for JavaScript/TypeScript code."""
+
+     def __init__(self, language_name: str = "javascript"):
+         super().__init__(language_name)
+
+     def _setup_node_types(self) -> None:
+         """Set up JavaScript-specific node types."""
+         self.function_nodes = {
+             "function_declaration", "arrow_function", "function_expression",
+             "method_definition", "generator_function_declaration",
+         }
+         self.class_nodes = {"class_declaration"}
+         self.import_nodes = {"import_statement", "import_declaration"}
+         self.comment_nodes = {"comment", "line_comment", "block_comment"}
+
+     def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Extract chunks from the JavaScript AST."""
+         chunks = []
+
+         # Extract imports
+         imports = self._extract_imports(root_node, source_code)
+
+         # Extract top-level constructs
+         for child in root_node.children:
+             if child.type in self.function_nodes:
+                 chunk = self._extract_function(child, source_code, imports)
+                 if chunk:
+                     chunks.append(chunk)
+             elif child.type in self.class_nodes:
+                 class_chunks = self._extract_class(child, source_code, imports)
+                 chunks.extend(class_chunks)
+
+         return chunks
+
+     def _extract_imports(self, root_node: Any, source_code: str) -> List[str]:
+         """Extract import statements."""
+         imports = []
+         for child in root_node.children:
+             if child.type in self.import_nodes:
+                 import_text = self._get_node_text(child, source_code)
+                 imports.append(import_text.strip())
+         return imports
+
+     def _extract_function(self, node: Any, source_code: str, imports: List[str]) -> Optional[ParsedChunk]:
+         """Extract a function definition."""
+         content = self._get_node_text(node, source_code)
+         start_line, end_line = self._get_line_numbers(node)
+
+         # Extract the function name
+         name = "anonymous_function"
+         for child in node.children:
+             if child.type == "identifier":
+                 name = self._get_node_text(child, source_code)
+                 break
+
+         metadata = {
+             "is_arrow": node.type == "arrow_function",
+             "is_async": "async" in content,
+             "line_count": end_line - start_line + 1,
+         }
+
+         return ParsedChunk(
+             content=content,
+             chunk_type=ChunkType.FUNCTION,
+             name=name,
+             start_line=start_line,
+             end_line=end_line,
+             start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
+             end_byte=node.end_byte if hasattr(node, 'end_byte') else len(content.encode('utf-8')),
+             metadata=metadata,
+             language=self.language_name,
+             imports=imports,
+         )
+
+     def _extract_class(self, node: Any, source_code: str, imports: List[str]) -> List[ParsedChunk]:
+         """Extract a class definition."""
+         chunks = []
+
+         # Extract the class name
+         class_name = "unknown_class"
+         for child in node.children:
+             if child.type == "identifier":
+                 class_name = self._get_node_text(child, source_code)
+                 break
+
+         # Extract the class chunk
+         class_content = self._get_node_text(node, source_code)
+         start_line, end_line = self._get_line_numbers(node)
+
+         class_chunk = ParsedChunk(
+             content=class_content,
+             chunk_type=ChunkType.CLASS,
+             name=class_name,
+             start_line=start_line,
+             end_line=end_line,
+             start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
+             end_byte=node.end_byte if hasattr(node, 'end_byte') else len(class_content.encode('utf-8')),
+             metadata={"line_count": end_line - start_line + 1},
+             language=self.language_name,
+             imports=imports,
+         )
+         chunks.append(class_chunk)
+
+         return chunks
+
+ # Language handler registry
+ LANGUAGE_HANDLERS = {
+     ".py": PythonHandler,
+     ".js": JavaScriptHandler,
+     ".ts": lambda: JavaScriptHandler("typescript"),
+     ".jsx": lambda: JavaScriptHandler("jsx"),
+     ".tsx": lambda: JavaScriptHandler("tsx"),
+ }
+
+ def get_language_handler(file_path: str) -> Optional[LanguageHandler]:
+     """Get the appropriate language handler for a file."""
+     path = Path(file_path)
+     extension = path.suffix.lower()
+
+     handler_class = LANGUAGE_HANDLERS.get(extension)
+     if handler_class:
+         try:
+             handler = handler_class()
+             # Return the handler even if parser initialization fails,
+             # so it can still do line-based fallback chunking.
+             handler.initialize_parser()
+             return handler
+         except Exception as e:
+             logger.warning(f"Failed to create handler for {extension}: {e}")
+
+     # Return a generic handler for unknown extensions
+     return GenericHandler(extension)
+
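The registry dispatches on the lowercased file extension, and unknown extensions fall through to the GenericHandler defined below, so every file can at least be chunked line by line. Illustrative lookups (the paths are hypothetical):

    from mcp_code_indexer.vector_mode.chunking.language_handlers import get_language_handler

    get_language_handler("pkg/models.py").language_name    # "python"
    get_language_handler("web/app.tsx").language_name      # "tsx"
    get_language_handler("notes/README.md").language_name  # "generic.md"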
+ class GenericHandler(LanguageHandler):
+     """Generic handler for unsupported languages."""
+
+     def __init__(self, extension: str):
+         super().__init__(f"generic{extension}")
+
+     def _setup_node_types(self) -> None:
+         """No specific node types for the generic handler."""
+         pass
+
+     def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
+         """Generic chunking always falls back to line-based."""
+         return self._fallback_chunking(source_code, file_path)
mcp_code_indexer/vector_mode/config.py
@@ -0,0 +1,167 @@
+ """
+ Vector Mode Configuration.
+
+ Manages configuration settings for vector mode features including API keys,
+ batch sizes, similarity thresholds, and daemon settings.
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Optional
+ import yaml
+
+ @dataclass
+ class VectorConfig:
+     """Configuration for vector mode operations."""
+
+     # API Configuration
+     voyage_api_key: Optional[str] = None
+     turbopuffer_api_key: Optional[str] = None
+     turbopuffer_region: str = "gcp-europe-west3"
+
+     # Embedding Configuration
+     embedding_model: str = "voyage-code-2"
+     batch_size: int = 128
+     max_tokens_per_chunk: int = 1024
+
+     # Search Configuration
+     similarity_threshold: float = 0.5
+     max_search_results: int = 20
+     enable_recency_boost: bool = True
+
+     # Chunking Configuration
+     max_chunk_size: int = 1500
+     chunk_overlap: int = 100
+     prefer_semantic_chunks: bool = True
+
+     # File Monitoring Configuration
+     watch_debounce_ms: int = 100
+     ignore_patterns: list[str] = field(default_factory=lambda: [
+         "*.log", "*.tmp", "*~", ".git/*", "__pycache__/*", "node_modules/*",
+         "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db",
+     ])
+
+     # Daemon Configuration
+     daemon_enabled: bool = True
+     daemon_poll_interval: int = 5
+     max_queue_size: int = 1000
+     worker_count: int = 3
+
+     # Security Configuration
+     redact_secrets: bool = True
+     redaction_patterns_file: Optional[str] = None
+
+
55
+ @classmethod
56
+ def from_env(cls) -> "VectorConfig":
57
+ """Create config from environment variables."""
58
+ return cls(
59
+ voyage_api_key=os.getenv("VOYAGE_API_KEY"),
60
+ turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
61
+ turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
62
+ embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
63
+ batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
64
+ max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
65
+ similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
66
+ max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
67
+ enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
68
+ max_chunk_size=int(os.getenv("VECTOR_CHUNK_SIZE", "1500")),
69
+ chunk_overlap=int(os.getenv("VECTOR_CHUNK_OVERLAP", "100")),
70
+ watch_debounce_ms=int(os.getenv("VECTOR_DEBOUNCE_MS", "100")),
71
+ daemon_enabled=os.getenv("VECTOR_DAEMON_ENABLED", "true").lower() == "true",
72
+ daemon_poll_interval=int(os.getenv("VECTOR_POLL_INTERVAL", "5")),
73
+ max_queue_size=int(os.getenv("VECTOR_MAX_QUEUE", "1000")),
74
+ worker_count=int(os.getenv("VECTOR_WORKERS", "3")),
75
+ redact_secrets=os.getenv("VECTOR_REDACT_SECRETS", "true").lower() == "true",
76
+ )
77
+
78
+ @classmethod
79
+ def from_file(cls, config_path: Path) -> "VectorConfig":
80
+ """Load config from YAML file."""
81
+ if not config_path.exists():
82
+ return cls.from_env()
83
+
84
+ try:
85
+ with open(config_path, "r") as f:
86
+ data = yaml.safe_load(f) or {}
87
+
88
+ # Merge with environment variables (env takes precedence)
89
+ env_config = cls.from_env()
90
+
91
+ # Update with file values only if env variable not set
92
+ for key, value in data.items():
93
+ if hasattr(env_config, key):
94
+ env_value = getattr(env_config, key)
95
+ # Use file value if env value is None or default
96
+ if env_value is None or (key == "voyage_api_key" and env_value is None):
97
+ setattr(env_config, key, value)
98
+
99
+ return env_config
100
+
101
+ except Exception as e:
102
+ raise ValueError(f"Failed to load config from {config_path}: {e}")
103
+
+     def to_file(self, config_path: Path) -> None:
+         """Save the config to a YAML file."""
+         config_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Don't save API keys to the file, for security
+         data = {
+             k: v for k, v in self.__dict__.items()
+             if not k.endswith("_api_key")
+         }
+
+         with open(config_path, "w") as f:
+             yaml.dump(data, f, default_flow_style=False, sort_keys=True)
+
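Because keys ending in _api_key are filtered out, a saved file is safe to share but must be paired with environment variables to form a working setup. A round-trip sketch (the path is hypothetical):

    from pathlib import Path
    from mcp_code_indexer.vector_mode.config import VectorConfig

    cfg = VectorConfig(voyage_api_key="secret", batch_size=64)
    path = Path("/tmp/vector_config.yaml")  # hypothetical location
    cfg.to_file(path)
    assert "voyage_api_key" not in path.read_text()  # keys never hit disk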
+     def validate(self) -> list[str]:
+         """Validate the configuration and return a list of errors."""
+         errors = []
+
+         if self.daemon_enabled:
+             if not self.voyage_api_key:
+                 errors.append("VOYAGE_API_KEY environment variable required for vector mode")
+             if not self.turbopuffer_api_key:
+                 errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
+
+         # Validate the TurboPuffer region
+         supported_regions = [
+             'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
+             'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
+             'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3',
+         ]
+         if self.turbopuffer_region not in supported_regions:
+             errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. "
+                           f"Supported regions: {', '.join(supported_regions)}")
+
+         if self.batch_size <= 0:
+             errors.append("batch_size must be positive")
+         if self.max_tokens_per_chunk <= 0:
+             errors.append("max_tokens_per_chunk must be positive")
+         if not 0 <= self.similarity_threshold <= 1:
+             errors.append("similarity_threshold must be between 0 and 1")
+         if self.max_search_results <= 0:
+             errors.append("max_search_results must be positive")
+         if self.max_chunk_size <= 0:
+             errors.append("max_chunk_size must be positive")
+         if self.chunk_overlap < 0:
+             errors.append("chunk_overlap cannot be negative")
+         if self.worker_count <= 0:
+             errors.append("worker_count must be positive")
+
+         return errors
+
+ def load_vector_config(config_path: Optional[Path] = None) -> VectorConfig:
+     """Load the vector configuration from file or environment."""
+     if config_path is None:
+         from . import get_vector_config_path
+         config_path = get_vector_config_path()
+
+     config = VectorConfig.from_file(config_path)
+
+     # Validate the configuration
+     errors = config.validate()
+     if errors:
+         raise ValueError(f"Invalid vector configuration: {'; '.join(errors)}")
+
+     return config
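End to end, loading resolves the path, merges file values with the environment, and fails fast on validation errors. A usage sketch (get_vector_config_path comes from the vector_mode package __init__, per the file list above):

    from mcp_code_indexer.vector_mode.config import load_vector_config

    try:
        config = load_vector_config()  # resolves the default config path
    except ValueError as e:
        # e.g. missing VOYAGE_API_KEY / TURBOPUFFER_API_KEY while daemon_enabled
        print(f"Vector mode not configured: {e}")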