mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff shows the changes between two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- mcp_code_indexer/database/models.py +125 -1
- mcp_code_indexer/main.py +60 -0
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/server/mcp_server.py +3 -0
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +167 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/chunking/language_handlers.py (new file)

@@ -0,0 +1,478 @@

```python
"""
Language-specific handlers for AST parsing and code chunking.

Provides specialized handling for different programming languages using
Tree-sitter parsers with language-specific semantic understanding.
"""

import logging
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Set, Tuple
from pathlib import Path
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

# Try to import tree-sitter, fallback if not available
try:
    import tree_sitter
    from tree_sitter import Language, Parser, Node
    TREE_SITTER_AVAILABLE = True
except ImportError:
    TREE_SITTER_AVAILABLE = False
    Language = None
    Parser = None
    Node = None

from ...database.models import ChunkType

@dataclass
class ParsedChunk:
    """Represents a parsed code chunk with metadata."""
    content: str
    chunk_type: ChunkType
    name: Optional[str]
    start_line: int
    end_line: int
    start_byte: int
    end_byte: int
    metadata: Dict[str, Any]
    language: str
    parent_context: Optional[str] = None
    imports: List[str] = None

    def __post_init__(self):
        if self.imports is None:
            self.imports = []

class LanguageHandler(ABC):
    """Base class for language-specific AST parsing."""

    def __init__(self, language_name: str):
        """Initialize language handler."""
        self.language_name = language_name
        self.parser: Optional[Parser] = None
        self.language: Optional[Language] = None

        # Language-specific configuration
        self.function_nodes = set()
        self.class_nodes = set()
        self.import_nodes = set()
        self.comment_nodes = set()
        self.docstring_nodes = set()

        self._setup_node_types()

    @abstractmethod
    def _setup_node_types(self) -> None:
        """Set up language-specific node types."""
        pass

    def initialize_parser(self) -> bool:
        """Initialize Tree-sitter parser for this language."""
        if not TREE_SITTER_AVAILABLE:
            logger.warning("Tree-sitter not available, falling back to line-based chunking")
            return False

        try:
            # This would need actual Tree-sitter language binaries
            # For now, we'll simulate the interface
            logger.info(f"Parser for {self.language_name} would be initialized here")
            return True
        except Exception as e:
            logger.warning(f"Failed to initialize {self.language_name} parser: {e}")
            return False

    def parse_code(self, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Parse source code into semantic chunks."""
        if not self.parser:
            # Fallback to simple line-based chunking
            return self._fallback_chunking(source_code, file_path)

        try:
            tree = self.parser.parse(bytes(source_code, "utf8"))
            return self._extract_chunks(tree.root_node, source_code, file_path)
        except Exception as e:
            logger.error(f"Failed to parse {file_path}: {e}")
            return self._fallback_chunking(source_code, file_path)

    def _fallback_chunking(self, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Fallback to line-based chunking when AST parsing fails."""
        lines = source_code.split('\n')
        chunks = []

        # Simple heuristic-based chunking
        current_chunk_lines = []
        current_start_line = 1

        for i, line in enumerate(lines, 1):
            current_chunk_lines.append(line)

            # End chunk on empty lines or when chunk gets too large
            if (not line.strip() and len(current_chunk_lines) > 5) or len(current_chunk_lines) >= 50:
                if current_chunk_lines:
                    content = '\n'.join(current_chunk_lines)
                    if content.strip():  # Only add non-empty chunks
                        chunk = ParsedChunk(
                            content=content,
                            chunk_type=ChunkType.GENERIC,
                            name=None,
                            start_line=current_start_line,
                            end_line=i,
                            start_byte=0,
                            end_byte=len(content.encode('utf-8')),
                            metadata={"fallback": True},
                            language=self.language_name,
                        )
                        chunks.append(chunk)

                    current_chunk_lines = []
                    current_start_line = i + 1

        # Add final chunk if any
        if current_chunk_lines:
            content = '\n'.join(current_chunk_lines)
            if content.strip():
                chunk = ParsedChunk(
                    content=content,
                    chunk_type=ChunkType.GENERIC,
                    name=None,
                    start_line=current_start_line,
                    end_line=len(lines),
                    start_byte=0,
                    end_byte=len(content.encode('utf-8')),
                    metadata={"fallback": True},
                    language=self.language_name,
                )
                chunks.append(chunk)

        return chunks

    @abstractmethod
    def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Extract semantic chunks from AST."""
        pass

    def _get_node_text(self, node: Any, source_code: str) -> str:
        """Get text content of a node."""
        if hasattr(node, 'start_byte') and hasattr(node, 'end_byte'):
            return source_code[node.start_byte:node.end_byte]
        return ""

    def _get_line_numbers(self, node: Any) -> Tuple[int, int]:
        """Get start and end line numbers for a node."""
        if hasattr(node, 'start_point') and hasattr(node, 'end_point'):
            return node.start_point[0] + 1, node.end_point[0] + 1
        return 1, 1
```
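The two abstract methods above, `_setup_node_types` and `_extract_chunks`, are the full surface a new language needs to cover; everything else (fallback chunking, node text and line-number helpers) is inherited. As a rough sketch of what a third-party extension could look like, here is a hypothetical Go handler. It is not part of this release; the node type names follow the tree-sitter-go grammar, and the import paths assume the module layout shown above:

```python
# Hypothetical example, not part of this package: a Go handler built on the
# LanguageHandler base class from the listing above.
from mcp_code_indexer.vector_mode.chunking.language_handlers import (
    LanguageHandler,
    ParsedChunk,
)
from mcp_code_indexer.database.models import ChunkType


class GoHandler(LanguageHandler):
    """Sketch of a handler for Go code."""

    def __init__(self):
        super().__init__("go")

    def _setup_node_types(self) -> None:
        # Node type names as used by the tree-sitter-go grammar.
        self.function_nodes = {"function_declaration", "method_declaration"}
        self.class_nodes = {"type_declaration"}
        self.import_nodes = {"import_declaration"}
        self.comment_nodes = {"comment"}

    def _extract_chunks(self, root_node, source_code, file_path):
        # Top-level functions only; anything else falls through to the
        # line-based fallback inherited from LanguageHandler.
        chunks = []
        for child in root_node.children:
            if child.type in self.function_nodes:
                start_line, end_line = self._get_line_numbers(child)
                chunks.append(ParsedChunk(
                    content=self._get_node_text(child, source_code),
                    chunk_type=ChunkType.FUNCTION,
                    name=None,
                    start_line=start_line,
                    end_line=end_line,
                    start_byte=child.start_byte,
                    end_byte=child.end_byte,
                    metadata={},
                    language=self.language_name,
                ))
        return chunks or self._fallback_chunking(source_code, file_path)
```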
The listing continues with the concrete handlers and the registry:

```python
class PythonHandler(LanguageHandler):
    """Handler for Python code."""

    def __init__(self):
        super().__init__("python")

    def _setup_node_types(self) -> None:
        """Set up Python-specific node types."""
        self.function_nodes = {"function_definition", "async_function_definition"}
        self.class_nodes = {"class_definition"}
        self.import_nodes = {"import_statement", "import_from_statement"}
        self.comment_nodes = {"comment"}
        self.docstring_nodes = {"expression_statement"}  # May contain docstrings

    def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Extract chunks from Python AST."""
        chunks = []

        # Extract imports first
        imports = self._extract_imports(root_node, source_code)

        # Extract top-level constructs
        for child in root_node.children:
            if child.type in self.function_nodes:
                chunk = self._extract_function(child, source_code, imports)
                if chunk:
                    chunks.append(chunk)

            elif child.type in self.class_nodes:
                class_chunks = self._extract_class(child, source_code, imports)
                chunks.extend(class_chunks)

        # Add import chunk if imports exist
        if imports:
            import_content = '\n'.join(imports)
            import_chunk = ParsedChunk(
                content=import_content,
                chunk_type=ChunkType.IMPORT,
                name="imports",
                start_line=1,
                end_line=len(imports),
                start_byte=0,
                end_byte=len(import_content.encode('utf-8')),
                metadata={"import_count": len(imports)},
                language=self.language_name,
                imports=imports,
            )
            chunks.insert(0, import_chunk)  # Imports go first

        return chunks

    def _extract_imports(self, root_node: Any, source_code: str) -> List[str]:
        """Extract import statements."""
        imports = []
        for child in root_node.children:
            if child.type in self.import_nodes:
                import_text = self._get_node_text(child, source_code)
                imports.append(import_text.strip())
        return imports

    def _extract_function(self, node: Any, source_code: str, imports: List[str]) -> Optional[ParsedChunk]:
        """Extract a function definition."""
        content = self._get_node_text(node, source_code)
        start_line, end_line = self._get_line_numbers(node)

        # Extract function name
        name = "unknown_function"
        for child in node.children:
            if child.type == "identifier":
                name = self._get_node_text(child, source_code)
                break

        # Check for docstring
        docstring = self._extract_docstring(node, source_code)

        metadata = {
            "is_async": node.type == "async_function_definition",
            "has_docstring": docstring is not None,
            "line_count": end_line - start_line + 1,
        }

        if docstring:
            metadata["docstring"] = docstring

        return ParsedChunk(
            content=content,
            chunk_type=ChunkType.FUNCTION,
            name=name,
            start_line=start_line,
            end_line=end_line,
            start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
            end_byte=node.end_byte if hasattr(node, 'end_byte') else len(content.encode('utf-8')),
            metadata=metadata,
            language=self.language_name,
            imports=imports,
        )

    def _extract_class(self, node: Any, source_code: str, imports: List[str]) -> List[ParsedChunk]:
        """Extract a class definition and its methods."""
        chunks = []

        # Extract class name
        class_name = "unknown_class"
        for child in node.children:
            if child.type == "identifier":
                class_name = self._get_node_text(child, source_code)
                break

        # Extract class docstring
        class_docstring = self._extract_docstring(node, source_code)

        # Extract class-level chunk
        class_content = self._get_node_text(node, source_code)
        start_line, end_line = self._get_line_numbers(node)

        class_metadata = {
            "has_docstring": class_docstring is not None,
            "method_count": len([c for c in node.children if c.type in self.function_nodes]),
        }

        if class_docstring:
            class_metadata["docstring"] = class_docstring

        class_chunk = ParsedChunk(
            content=class_content,
            chunk_type=ChunkType.CLASS,
            name=class_name,
            start_line=start_line,
            end_line=end_line,
            start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
            end_byte=node.end_byte if hasattr(node, 'end_byte') else len(class_content.encode('utf-8')),
            metadata=class_metadata,
            language=self.language_name,
            parent_context=None,
            imports=imports,
        )
        chunks.append(class_chunk)

        # Extract methods separately for better granularity
        for child in node.children:
            if child.type in self.function_nodes:
                method_chunk = self._extract_function(child, source_code, imports)
                if method_chunk:
                    method_chunk.chunk_type = ChunkType.METHOD
                    method_chunk.parent_context = class_name
                    chunks.append(method_chunk)

        return chunks

    def _extract_docstring(self, node: Any, source_code: str) -> Optional[str]:
        """Extract docstring from a function or class."""
        # Look for string literal as first statement in body
        for child in node.children:
            if child.type == "block":
                for stmt in child.children:
                    if stmt.type == "expression_statement":
                        for expr in stmt.children:
                            if expr.type == "string":
                                return self._get_node_text(expr, source_code).strip('"\'')
        return None

class JavaScriptHandler(LanguageHandler):
    """Handler for JavaScript/TypeScript code."""

    def __init__(self, language_name: str = "javascript"):
        super().__init__(language_name)

    def _setup_node_types(self) -> None:
        """Set up JavaScript-specific node types."""
        self.function_nodes = {
            "function_declaration", "arrow_function", "function_expression",
            "method_definition", "generator_function_declaration"
        }
        self.class_nodes = {"class_declaration"}
        self.import_nodes = {"import_statement", "import_declaration"}
        self.comment_nodes = {"comment", "line_comment", "block_comment"}

    def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Extract chunks from JavaScript AST."""
        chunks = []

        # Extract imports
        imports = self._extract_imports(root_node, source_code)

        # Extract top-level constructs
        for child in root_node.children:
            if child.type in self.function_nodes:
                chunk = self._extract_function(child, source_code, imports)
                if chunk:
                    chunks.append(chunk)

            elif child.type in self.class_nodes:
                class_chunks = self._extract_class(child, source_code, imports)
                chunks.extend(class_chunks)

        return chunks

    def _extract_imports(self, root_node: Any, source_code: str) -> List[str]:
        """Extract import statements."""
        imports = []
        for child in root_node.children:
            if child.type in self.import_nodes:
                import_text = self._get_node_text(child, source_code)
                imports.append(import_text.strip())
        return imports

    def _extract_function(self, node: Any, source_code: str, imports: List[str]) -> Optional[ParsedChunk]:
        """Extract a function definition."""
        content = self._get_node_text(node, source_code)
        start_line, end_line = self._get_line_numbers(node)

        # Extract function name
        name = "anonymous_function"
        for child in node.children:
            if child.type == "identifier":
                name = self._get_node_text(child, source_code)
                break

        metadata = {
            "is_arrow": node.type == "arrow_function",
            "is_async": "async" in content,
            "line_count": end_line - start_line + 1,
        }

        return ParsedChunk(
            content=content,
            chunk_type=ChunkType.FUNCTION,
            name=name,
            start_line=start_line,
            end_line=end_line,
            start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
            end_byte=node.end_byte if hasattr(node, 'end_byte') else len(content.encode('utf-8')),
            metadata=metadata,
            language=self.language_name,
            imports=imports,
        )

    def _extract_class(self, node: Any, source_code: str, imports: List[str]) -> List[ParsedChunk]:
        """Extract a class definition."""
        chunks = []

        # Extract class name
        class_name = "unknown_class"
        for child in node.children:
            if child.type == "identifier":
                class_name = self._get_node_text(child, source_code)
                break

        # Extract class chunk
        class_content = self._get_node_text(node, source_code)
        start_line, end_line = self._get_line_numbers(node)

        class_chunk = ParsedChunk(
            content=class_content,
            chunk_type=ChunkType.CLASS,
            name=class_name,
            start_line=start_line,
            end_line=end_line,
            start_byte=node.start_byte if hasattr(node, 'start_byte') else 0,
            end_byte=node.end_byte if hasattr(node, 'end_byte') else len(class_content.encode('utf-8')),
            metadata={"line_count": end_line - start_line + 1},
            language=self.language_name,
            imports=imports,
        )
        chunks.append(class_chunk)

        return chunks

# Language handler registry
LANGUAGE_HANDLERS = {
    ".py": PythonHandler,
    ".js": JavaScriptHandler,
    ".ts": lambda: JavaScriptHandler("typescript"),
    ".jsx": lambda: JavaScriptHandler("jsx"),
    ".tsx": lambda: JavaScriptHandler("tsx"),
}

def get_language_handler(file_path: str) -> Optional[LanguageHandler]:
    """Get appropriate language handler for a file."""
    path = Path(file_path)
    extension = path.suffix.lower()

    handler_class = LANGUAGE_HANDLERS.get(extension)
    if handler_class:
        try:
            handler = handler_class()
            if handler.initialize_parser():
                return handler
            else:
                # Return handler anyway for fallback chunking
                return handler
        except Exception as e:
            logger.warning(f"Failed to create handler for {extension}: {e}")

    # Return generic handler for unknown extensions
    return GenericHandler(extension)

class GenericHandler(LanguageHandler):
    """Generic handler for unsupported languages."""

    def __init__(self, extension: str):
        super().__init__(f"generic{extension}")

    def _setup_node_types(self) -> None:
        """No specific node types for generic handler."""
        pass

    def _extract_chunks(self, root_node: Any, source_code: str, file_path: str) -> List[ParsedChunk]:
        """Generic chunking always falls back to line-based."""
        return self._fallback_chunking(source_code, file_path)
```
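As a rough usage sketch (the file name and source string are made up): `get_language_handler` picks a handler by extension, and since `initialize_parser` in this release never attaches an actual Tree-sitter parser, `parse_code` takes the line-based fallback path and yields `ChunkType.GENERIC` chunks.

```python
# Minimal usage sketch against the module listed above.
from mcp_code_indexer.vector_mode.chunking.language_handlers import get_language_handler

source = "def greet(name):\n    return f'Hello, {name}!'\n"

handler = get_language_handler("example.py")  # -> PythonHandler for ".py"
for chunk in handler.parse_code(source, "example.py"):
    # With no parser attached, chunks come from _fallback_chunking().
    print(chunk.chunk_type, chunk.name, chunk.start_line, chunk.end_line)
```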
mcp_code_indexer/vector_mode/config.py (new file)

@@ -0,0 +1,167 @@

```python
"""
Vector Mode Configuration.

Manages configuration settings for vector mode features including API keys,
batch sizes, similarity thresholds, and daemon settings.
"""

import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import yaml

@dataclass
class VectorConfig:
    """Configuration for vector mode operations."""

    # API Configuration
    voyage_api_key: Optional[str] = None
    turbopuffer_api_key: Optional[str] = None
    turbopuffer_region: str = "gcp-europe-west3"

    # Embedding Configuration
    embedding_model: str = "voyage-code-2"
    batch_size: int = 128
    max_tokens_per_chunk: int = 1024

    # Search Configuration
    similarity_threshold: float = 0.5
    max_search_results: int = 20
    enable_recency_boost: bool = True

    # Chunking Configuration
    max_chunk_size: int = 1500
    chunk_overlap: int = 100
    prefer_semantic_chunks: bool = True

    # File Monitoring Configuration
    watch_debounce_ms: int = 100
    ignore_patterns: list[str] = field(default_factory=lambda: [
        "*.log", "*.tmp", "*~", ".git/*", "__pycache__/*", "node_modules/*",
        "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
    ])

    # Daemon Configuration
    daemon_enabled: bool = True
    daemon_poll_interval: int = 5
    max_queue_size: int = 1000
    worker_count: int = 3

    # Security Configuration
    redact_secrets: bool = True
    redaction_patterns_file: Optional[str] = None

    @classmethod
    def from_env(cls) -> "VectorConfig":
        """Create config from environment variables."""
        return cls(
            voyage_api_key=os.getenv("VOYAGE_API_KEY"),
            turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
            turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
            embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
            batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
            max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
            similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
            max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
            enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
            max_chunk_size=int(os.getenv("VECTOR_CHUNK_SIZE", "1500")),
            chunk_overlap=int(os.getenv("VECTOR_CHUNK_OVERLAP", "100")),
            watch_debounce_ms=int(os.getenv("VECTOR_DEBOUNCE_MS", "100")),
            daemon_enabled=os.getenv("VECTOR_DAEMON_ENABLED", "true").lower() == "true",
            daemon_poll_interval=int(os.getenv("VECTOR_POLL_INTERVAL", "5")),
            max_queue_size=int(os.getenv("VECTOR_MAX_QUEUE", "1000")),
            worker_count=int(os.getenv("VECTOR_WORKERS", "3")),
            redact_secrets=os.getenv("VECTOR_REDACT_SECRETS", "true").lower() == "true",
        )

    @classmethod
    def from_file(cls, config_path: Path) -> "VectorConfig":
        """Load config from YAML file."""
        if not config_path.exists():
            return cls.from_env()

        try:
            with open(config_path, "r") as f:
                data = yaml.safe_load(f) or {}

            # Merge with environment variables (env takes precedence)
            env_config = cls.from_env()

            # Update with file values only if env variable not set
            for key, value in data.items():
                if hasattr(env_config, key):
                    env_value = getattr(env_config, key)
                    # Use file value if env value is None or default
                    if env_value is None or (key == "voyage_api_key" and env_value is None):
                        setattr(env_config, key, value)

            return env_config

        except Exception as e:
            raise ValueError(f"Failed to load config from {config_path}: {e}")

    def to_file(self, config_path: Path) -> None:
        """Save config to YAML file."""
        config_path.parent.mkdir(parents=True, exist_ok=True)

        # Don't save API keys to file for security
        data = {
            k: v for k, v in self.__dict__.items()
            if not k.endswith("_api_key")
        }

        with open(config_path, "w") as f:
            yaml.dump(data, f, default_flow_style=False, sort_keys=True)

    def validate(self) -> list[str]:
        """Validate configuration and return list of errors."""
        errors = []

        if self.daemon_enabled:
            if not self.voyage_api_key:
                errors.append("VOYAGE_API_KEY environment variable required for vector mode")
            if not self.turbopuffer_api_key:
                errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")

        # Validate TurboPuffer region
        supported_regions = [
            'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
            'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
            'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
        ]
        if self.turbopuffer_region not in supported_regions:
            errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
                          f"Supported regions: {', '.join(supported_regions)}")

        if self.batch_size <= 0:
            errors.append("batch_size must be positive")
        if self.max_tokens_per_chunk <= 0:
            errors.append("max_tokens_per_chunk must be positive")
        if not 0 <= self.similarity_threshold <= 1:
            errors.append("similarity_threshold must be between 0 and 1")
        if self.max_search_results <= 0:
            errors.append("max_search_results must be positive")
        if self.max_chunk_size <= 0:
            errors.append("max_chunk_size must be positive")
        if self.chunk_overlap < 0:
            errors.append("chunk_overlap cannot be negative")
        if self.worker_count <= 0:
            errors.append("worker_count must be positive")

        return errors

def load_vector_config(config_path: Optional[Path] = None) -> VectorConfig:
    """Load vector configuration from file or environment."""
    if config_path is None:
        from . import get_vector_config_path
        config_path = get_vector_config_path()

    config = VectorConfig.from_file(config_path)

    # Validate configuration
    errors = config.validate()
    if errors:
        raise ValueError(f"Invalid vector configuration: {'; '.join(errors)}")

    return config
```
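A minimal usage sketch of the config class as listed above (the placeholder keys and the output path are illustrative assumptions, not defaults of the package). Note that `to_file` drops every `*_api_key` field, so the written YAML never contains credentials:

```python
# Minimal usage sketch; env var values and the file path are placeholders.
import os
from pathlib import Path
from mcp_code_indexer.vector_mode.config import VectorConfig

os.environ.setdefault("VOYAGE_API_KEY", "vk-example")       # placeholder
os.environ.setdefault("TURBOPUFFER_API_KEY", "tp-example")  # placeholder

config = VectorConfig.from_env()
errors = config.validate()
if errors:
    raise SystemExit("; ".join(errors))

# API keys are stripped on save, so the resulting YAML is safe to keep on disk.
config.to_file(Path("vector_config.yaml"))
```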