mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- mcp_code_indexer/database/models.py +125 -1
- mcp_code_indexer/main.py +60 -0
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/server/mcp_server.py +3 -0
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +167 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,403 @@
+"""
+Main AST-based code chunker for vector mode.
+
+Coordinates language-specific parsing and produces optimized code chunks
+for embedding generation while preserving semantic meaning.
+"""
+
+import hashlib
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Optional, Set
+from dataclasses import dataclass
+from datetime import datetime
+
+from .language_handlers import get_language_handler, ParsedChunk
+from .chunk_optimizer import ChunkOptimizer, OptimizedChunk
+from ..security.redactor import SecretRedactor, RedactionResult
+from ...database.models import ChunkType
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class CodeChunk:
+    """
+    Represents a code chunk ready for embedding generation.
+
+    This is the final output of the chunking process, optimized and
+    ready for vector indexing.
+    """
+    content: str
+    chunk_type: ChunkType
+    name: Optional[str]
+    file_path: str
+    start_line: int
+    end_line: int
+    content_hash: str
+    language: str
+    redacted: bool = False
+    metadata: Dict[str, Any] = None
+    imports: List[str] = None
+    parent_context: Optional[str] = None
+
+    def __post_init__(self):
+        if self.metadata is None:
+            self.metadata = {}
+        if self.imports is None:
+            self.imports = []
+
+@dataclass
+class ChunkingStats:
+    """Statistics about the chunking process."""
+    files_processed: int = 0
+    total_chunks: int = 0
+    chunks_by_type: Dict[ChunkType, int] = None
+    chunks_by_language: Dict[str, int] = None
+    redacted_chunks: int = 0
+    fallback_chunks: int = 0
+    processing_time: float = 0.0
+
+    def __post_init__(self):
+        if self.chunks_by_type is None:
+            self.chunks_by_type = {}
+        if self.chunks_by_language is None:
+            self.chunks_by_language = {}
+
+class ASTChunker:
+    """
+    Main AST-based code chunker.
+
+    Orchestrates the entire chunking process from file content to
+    optimized code chunks ready for embedding generation.
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int = 1500,
+        min_chunk_size: int = 50,
+        enable_redaction: bool = True,
+        enable_optimization: bool = True,
+        redaction_confidence: float = 0.5,
+    ):
+        """
+        Initialize AST chunker.
+
+        Args:
+            max_chunk_size: Maximum characters per chunk
+            min_chunk_size: Minimum characters per chunk
+            enable_redaction: Whether to redact secrets
+            enable_optimization: Whether to optimize chunks
+            redaction_confidence: Confidence threshold for redaction
+        """
+        self.max_chunk_size = max_chunk_size
+        self.min_chunk_size = min_chunk_size
+        self.enable_redaction = enable_redaction
+        self.enable_optimization = enable_optimization
+
+        # Initialize components
+        self.redactor: Optional[SecretRedactor] = None
+        if enable_redaction:
+            self.redactor = SecretRedactor(
+                min_confidence=redaction_confidence,
+                preserve_structure=True,
+            )
+
+        self.optimizer: Optional[ChunkOptimizer] = None
+        if enable_optimization:
+            self.optimizer = ChunkOptimizer(
+                max_chunk_size=max_chunk_size,
+                min_chunk_size=min_chunk_size,
+            )
+
+        # Statistics
+        self.stats = ChunkingStats()
+
+        # Cache for performance
+        self.handler_cache: Dict[str, Any] = {}
+
+    def chunk_file(self, file_path: str, content: Optional[str] = None) -> List[CodeChunk]:
+        """
+        Chunk a single file into semantic code chunks.
+
+        Args:
+            file_path: Path to the file to chunk
+            content: Optional file content (if not provided, will read from file)
+
+        Returns:
+            List of code chunks
+        """
+        start_time = datetime.utcnow()
+
+        try:
+            # Read content if not provided
+            if content is None:
+                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+
+            # Skip empty files
+            if not content.strip():
+                logger.debug(f"Skipping empty file: {file_path}")
+                return []
+
+            # Get language handler
+            handler = self._get_language_handler(file_path)
+            if not handler:
+                logger.warning(f"No handler available for {file_path}")
+                return []
+
+            # Parse into semantic chunks
+            logger.debug(f"Parsing {file_path} with {handler.language_name} handler")
+            parsed_chunks = handler.parse_code(content, file_path)
+
+            # Convert to code chunks
+            code_chunks = []
+            for parsed_chunk in parsed_chunks:
+                code_chunk = self._convert_parsed_chunk(parsed_chunk, file_path)
+                if code_chunk:
+                    code_chunks.append(code_chunk)
+
+            # Apply redaction if enabled
+            if self.enable_redaction and self.redactor:
+                code_chunks = self._apply_redaction(code_chunks, file_path)
+
+            # Apply optimization if enabled
+            if self.enable_optimization and self.optimizer:
+                code_chunks = self._apply_optimization(code_chunks)
+
+            # Update statistics
+            processing_time = (datetime.utcnow() - start_time).total_seconds()
+            self._update_stats(code_chunks, handler.language_name, processing_time)
+
+            logger.info(f"Chunked {file_path}: {len(code_chunks)} chunks")
+            return code_chunks
+
+        except Exception as e:
+            logger.error(f"Failed to chunk file {file_path}: {e}")
+            return []
+
+    def chunk_content(
+        self,
+        content: str,
+        file_path: str,
+        language: Optional[str] = None
+    ) -> List[CodeChunk]:
+        """
+        Chunk content directly without reading from file.
+
+        Args:
+            content: Source code content
+            file_path: Virtual file path for language detection
+            language: Optional language override
+
+        Returns:
+            List of code chunks
+        """
+        return self.chunk_file(file_path, content)
+
+    def chunk_multiple_files(self, file_paths: List[str]) -> Dict[str, List[CodeChunk]]:
+        """
+        Chunk multiple files and return results grouped by file.
+
+        Args:
+            file_paths: List of file paths to chunk
+
+        Returns:
+            Dictionary mapping file paths to their chunks
+        """
+        results = {}
+
+        for file_path in file_paths:
+            try:
+                chunks = self.chunk_file(file_path)
+                results[file_path] = chunks
+            except Exception as e:
+                logger.error(f"Failed to chunk {file_path}: {e}")
+                results[file_path] = []
+
+        return results
+
+    def _get_language_handler(self, file_path: str) -> Optional[Any]:
+        """Get language handler for file, with caching."""
+        extension = Path(file_path).suffix.lower()
+
+        if extension in self.handler_cache:
+            return self.handler_cache[extension]
+
+        handler = get_language_handler(file_path)
+        self.handler_cache[extension] = handler
+        return handler
+
+    def _convert_parsed_chunk(self, parsed_chunk: ParsedChunk, file_path: str) -> Optional[CodeChunk]:
+        """Convert a parsed chunk to a code chunk."""
+        if not parsed_chunk.content.strip():
+            return None
+
+        # Generate content hash
+        content_hash = hashlib.sha256(parsed_chunk.content.encode('utf-8')).hexdigest()
+
+        # Create code chunk
+        code_chunk = CodeChunk(
+            content=parsed_chunk.content,
+            chunk_type=parsed_chunk.chunk_type,
+            name=parsed_chunk.name,
+            file_path=file_path,
+            start_line=parsed_chunk.start_line,
+            end_line=parsed_chunk.end_line,
+            content_hash=content_hash,
+            language=parsed_chunk.language,
+            metadata=parsed_chunk.metadata.copy(),
+            imports=parsed_chunk.imports.copy() if parsed_chunk.imports else [],
+            parent_context=parsed_chunk.parent_context,
+        )
+
+        return code_chunk
+
+    def _apply_redaction(self, chunks: List[CodeChunk], file_path: str) -> List[CodeChunk]:
+        """Apply secret redaction to chunks."""
+        redacted_chunks = []
+
+        for chunk in chunks:
+            try:
+                redaction_result = self.redactor.redact_content(
+                    content=chunk.content,
+                    file_path=file_path,
+                )
+
+                if redaction_result.was_redacted:
+                    # Update chunk with redacted content
+                    chunk.content = redaction_result.redacted_content
+                    chunk.redacted = True
+                    chunk.metadata["redaction_count"] = redaction_result.redaction_count
+                    chunk.metadata["redacted_patterns"] = redaction_result.patterns_matched
+
+                    # Recompute hash for redacted content
+                    chunk.content_hash = hashlib.sha256(
+                        chunk.content.encode('utf-8')
+                    ).hexdigest()
+
+                    logger.debug(f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}")
+
+                redacted_chunks.append(chunk)
+
+            except Exception as e:
+                logger.warning(f"Failed to redact chunk {chunk.name}: {e}")
+                redacted_chunks.append(chunk)
+
+        return redacted_chunks
+
+    def _apply_optimization(self, chunks: List[CodeChunk]) -> List[CodeChunk]:
+        """Apply chunk optimization."""
+        try:
+            # Convert to optimized chunks
+            optimized_chunks = []
+            for chunk in chunks:
+                opt_chunk = OptimizedChunk(
+                    content=chunk.content,
+                    chunk_type=chunk.chunk_type,
+                    name=chunk.name,
+                    start_line=chunk.start_line,
+                    end_line=chunk.end_line,
+                    metadata=chunk.metadata,
+                    language=chunk.language,
+                    imports=chunk.imports,
+                    parent_context=chunk.parent_context,
+                )
+                optimized_chunks.append(opt_chunk)
+
+            # Apply optimization
+            optimized_chunks = self.optimizer.optimize_chunks(optimized_chunks)
+
+            # Convert back to code chunks
+            result_chunks = []
+            for opt_chunk in optimized_chunks:
+                code_chunk = CodeChunk(
+                    content=opt_chunk.content,
+                    chunk_type=opt_chunk.chunk_type,
+                    name=opt_chunk.name,
+                    file_path=chunks[0].file_path if chunks else "",
+                    start_line=opt_chunk.start_line,
+                    end_line=opt_chunk.end_line,
+                    content_hash=hashlib.sha256(opt_chunk.content.encode('utf-8')).hexdigest(),
+                    language=opt_chunk.language,
+                    metadata=opt_chunk.metadata,
+                    imports=opt_chunk.imports,
+                    parent_context=opt_chunk.parent_context,
+                )
+                result_chunks.append(code_chunk)
+
+            return result_chunks
+
+        except Exception as e:
+            logger.warning(f"Chunk optimization failed: {e}")
+            return chunks
+
+    def _update_stats(self, chunks: List[CodeChunk], language: str, processing_time: float) -> None:
+        """Update chunking statistics."""
+        self.stats.files_processed += 1
+        self.stats.total_chunks += len(chunks)
+        self.stats.processing_time += processing_time
+
+        # Count by type
+        for chunk in chunks:
+            self.stats.chunks_by_type[chunk.chunk_type] = (
+                self.stats.chunks_by_type.get(chunk.chunk_type, 0) + 1
+            )
+
+            if chunk.redacted:
+                self.stats.redacted_chunks += 1
+
+            if chunk.metadata.get("fallback", False):
+                self.stats.fallback_chunks += 1
+
+        # Count by language
+        self.stats.chunks_by_language[language] = (
+            self.stats.chunks_by_language.get(language, 0) + len(chunks)
+        )
+
+    def get_stats(self) -> ChunkingStats:
+        """Get chunking statistics."""
+        return self.stats
+
+    def reset_stats(self) -> None:
+        """Reset chunking statistics."""
+        self.stats = ChunkingStats()
+
+    def get_supported_extensions(self) -> Set[str]:
+        """Get list of supported file extensions."""
+        from .language_handlers import LANGUAGE_HANDLERS
+        return set(LANGUAGE_HANDLERS.keys())
+
+    def is_supported_file(self, file_path: str) -> bool:
+        """Check if a file is supported for chunking."""
+        extension = Path(file_path).suffix.lower()
+        return extension in self.get_supported_extensions()
+
+    def estimate_chunks(self, file_path: str) -> Dict[str, Any]:
+        """Estimate number of chunks for a file without full processing."""
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+
+            # Simple estimation based on content length and average chunk size
+            content_length = len(content)
+            lines = content.count('\n') + 1
+
+            # Rough estimates
+            estimated_chunks = max(1, content_length // self.max_chunk_size)
+
+            return {
+                "file_path": file_path,
+                "content_length": content_length,
+                "line_count": lines,
+                "estimated_chunks": estimated_chunks,
+                "is_supported": self.is_supported_file(file_path),
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to estimate chunks for {file_path}: {e}")
+            return {
+                "file_path": file_path,
+                "error": str(e),
+                "estimated_chunks": 0,
+                "is_supported": False,
+            }
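For orientation, here is a minimal usage sketch of the new chunking API, based only on the signatures visible in this diff. It assumes mcp-code-indexer 4.2.0 is installed and importable under the module path listed in the RECORD above; the source file paths passed in are hypothetical examples, and actual output depends on the language handlers and redaction patterns shipped in the wheel.

# Hedged usage sketch, not from the package docs; only names visible in this
# diff are used, and the input paths are placeholders.
from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

chunker = ASTChunker(
    max_chunk_size=1500,    # defaults shown in __init__ above
    min_chunk_size=50,
    enable_redaction=True,  # secrets are redacted and hashes recomputed
    enable_optimization=True,
)

# Single file: returns [] for empty, unsupported, or unreadable files.
for chunk in chunker.chunk_file("src/example.py"):
    print(chunk.chunk_type, chunk.name, chunk.start_line, chunk.end_line,
          chunk.content_hash[:12], "redacted" if chunk.redacted else "")

# Batch mode: dict mapping each path to its chunks; per-file errors yield [].
results = chunker.chunk_multiple_files(["src/example.py", "lib/util.js"])

# Statistics accumulate across calls until reset_stats() is invoked.
stats = chunker.get_stats()
print(stats.files_processed, stats.total_chunks, stats.redacted_chunks)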