mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/chunking/ast_chunker.py (new file)
@@ -0,0 +1,403 @@
+ """
+ Main AST-based code chunker for vector mode.
+
+ Coordinates language-specific parsing and produces optimized code chunks
+ for embedding generation while preserving semantic meaning.
+ """
+
+ import hashlib
+ import logging
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional, Set
+ from dataclasses import dataclass
+ from datetime import datetime
+
+ from .language_handlers import get_language_handler, ParsedChunk
+ from .chunk_optimizer import ChunkOptimizer, OptimizedChunk
+ from ..security.redactor import SecretRedactor, RedactionResult
+ from ...database.models import ChunkType
+
+ logger = logging.getLogger(__name__)
+
+ @dataclass
+ class CodeChunk:
+     """
+     Represents a code chunk ready for embedding generation.
+
+     This is the final output of the chunking process, optimized and
+     ready for vector indexing.
+     """
+     content: str
+     chunk_type: ChunkType
+     name: Optional[str]
+     file_path: str
+     start_line: int
+     end_line: int
+     content_hash: str
+     language: str
+     redacted: bool = False
+     metadata: Dict[str, Any] = None
+     imports: List[str] = None
+     parent_context: Optional[str] = None
+
+     def __post_init__(self):
+         if self.metadata is None:
+             self.metadata = {}
+         if self.imports is None:
+             self.imports = []
+
+ @dataclass
+ class ChunkingStats:
+     """Statistics about the chunking process."""
+     files_processed: int = 0
+     total_chunks: int = 0
+     chunks_by_type: Dict[ChunkType, int] = None
+     chunks_by_language: Dict[str, int] = None
+     redacted_chunks: int = 0
+     fallback_chunks: int = 0
+     processing_time: float = 0.0
+
+     def __post_init__(self):
+         if self.chunks_by_type is None:
+             self.chunks_by_type = {}
+         if self.chunks_by_language is None:
+             self.chunks_by_language = {}
+
+ class ASTChunker:
+     """
+     Main AST-based code chunker.
+
+     Orchestrates the entire chunking process from file content to
+     optimized code chunks ready for embedding generation.
+     """
+
+     def __init__(
+         self,
+         max_chunk_size: int = 1500,
+         min_chunk_size: int = 50,
+         enable_redaction: bool = True,
+         enable_optimization: bool = True,
+         redaction_confidence: float = 0.5,
+     ):
+         """
+         Initialize AST chunker.
+
+         Args:
+             max_chunk_size: Maximum characters per chunk
+             min_chunk_size: Minimum characters per chunk
+             enable_redaction: Whether to redact secrets
+             enable_optimization: Whether to optimize chunks
+             redaction_confidence: Confidence threshold for redaction
+         """
+         self.max_chunk_size = max_chunk_size
+         self.min_chunk_size = min_chunk_size
+         self.enable_redaction = enable_redaction
+         self.enable_optimization = enable_optimization
+
+         # Initialize components
+         self.redactor: Optional[SecretRedactor] = None
+         if enable_redaction:
+             self.redactor = SecretRedactor(
+                 min_confidence=redaction_confidence,
+                 preserve_structure=True,
+             )
+
+         self.optimizer: Optional[ChunkOptimizer] = None
+         if enable_optimization:
+             self.optimizer = ChunkOptimizer(
+                 max_chunk_size=max_chunk_size,
+                 min_chunk_size=min_chunk_size,
+             )
+
+         # Statistics
+         self.stats = ChunkingStats()
+
+         # Cache for performance
+         self.handler_cache: Dict[str, Any] = {}
+
+     def chunk_file(self, file_path: str, content: Optional[str] = None) -> List[CodeChunk]:
+         """
+         Chunk a single file into semantic code chunks.
+
+         Args:
+             file_path: Path to the file to chunk
+             content: Optional file content (if not provided, will read from file)
+
+         Returns:
+             List of code chunks
+         """
+         start_time = datetime.utcnow()
+
+         try:
+             # Read content if not provided
+             if content is None:
+                 with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                     content = f.read()
+
+             # Skip empty files
+             if not content.strip():
+                 logger.debug(f"Skipping empty file: {file_path}")
+                 return []
+
+             # Get language handler
+             handler = self._get_language_handler(file_path)
+             if not handler:
+                 logger.warning(f"No handler available for {file_path}")
+                 return []
+
+             # Parse into semantic chunks
+             logger.debug(f"Parsing {file_path} with {handler.language_name} handler")
+             parsed_chunks = handler.parse_code(content, file_path)
+
+             # Convert to code chunks
+             code_chunks = []
+             for parsed_chunk in parsed_chunks:
+                 code_chunk = self._convert_parsed_chunk(parsed_chunk, file_path)
+                 if code_chunk:
+                     code_chunks.append(code_chunk)
+
+             # Apply redaction if enabled
+             if self.enable_redaction and self.redactor:
+                 code_chunks = self._apply_redaction(code_chunks, file_path)
+
+             # Apply optimization if enabled
+             if self.enable_optimization and self.optimizer:
+                 code_chunks = self._apply_optimization(code_chunks)
+
+             # Update statistics
+             processing_time = (datetime.utcnow() - start_time).total_seconds()
+             self._update_stats(code_chunks, handler.language_name, processing_time)
+
+             logger.info(f"Chunked {file_path}: {len(code_chunks)} chunks")
+             return code_chunks
+
+         except Exception as e:
+             logger.error(f"Failed to chunk file {file_path}: {e}")
+             return []
+
+     def chunk_content(
+         self,
+         content: str,
+         file_path: str,
+         language: Optional[str] = None
+     ) -> List[CodeChunk]:
+         """
+         Chunk content directly without reading from file.
+
+         Args:
+             content: Source code content
+             file_path: Virtual file path for language detection
+             language: Optional language override
+
+         Returns:
+             List of code chunks
+         """
+         return self.chunk_file(file_path, content)
+
+     def chunk_multiple_files(self, file_paths: List[str]) -> Dict[str, List[CodeChunk]]:
+         """
+         Chunk multiple files and return results grouped by file.
+
+         Args:
+             file_paths: List of file paths to chunk
+
+         Returns:
+             Dictionary mapping file paths to their chunks
+         """
+         results = {}
+
+         for file_path in file_paths:
+             try:
+                 chunks = self.chunk_file(file_path)
+                 results[file_path] = chunks
+             except Exception as e:
+                 logger.error(f"Failed to chunk {file_path}: {e}")
+                 results[file_path] = []
+
+         return results
+
+     def _get_language_handler(self, file_path: str) -> Optional[Any]:
+         """Get language handler for file, with caching."""
+         extension = Path(file_path).suffix.lower()
+
+         if extension in self.handler_cache:
+             return self.handler_cache[extension]
+
+         handler = get_language_handler(file_path)
+         self.handler_cache[extension] = handler
+         return handler
+
+     def _convert_parsed_chunk(self, parsed_chunk: ParsedChunk, file_path: str) -> Optional[CodeChunk]:
+         """Convert a parsed chunk to a code chunk."""
+         if not parsed_chunk.content.strip():
+             return None
+
+         # Generate content hash
+         content_hash = hashlib.sha256(parsed_chunk.content.encode('utf-8')).hexdigest()
+
+         # Create code chunk
+         code_chunk = CodeChunk(
+             content=parsed_chunk.content,
+             chunk_type=parsed_chunk.chunk_type,
+             name=parsed_chunk.name,
+             file_path=file_path,
+             start_line=parsed_chunk.start_line,
+             end_line=parsed_chunk.end_line,
+             content_hash=content_hash,
+             language=parsed_chunk.language,
+             metadata=parsed_chunk.metadata.copy(),
+             imports=parsed_chunk.imports.copy() if parsed_chunk.imports else [],
+             parent_context=parsed_chunk.parent_context,
+         )
+
+         return code_chunk
+
+     def _apply_redaction(self, chunks: List[CodeChunk], file_path: str) -> List[CodeChunk]:
+         """Apply secret redaction to chunks."""
+         redacted_chunks = []
+
+         for chunk in chunks:
+             try:
+                 redaction_result = self.redactor.redact_content(
+                     content=chunk.content,
+                     file_path=file_path,
+                 )
+
+                 if redaction_result.was_redacted:
+                     # Update chunk with redacted content
+                     chunk.content = redaction_result.redacted_content
+                     chunk.redacted = True
+                     chunk.metadata["redaction_count"] = redaction_result.redaction_count
+                     chunk.metadata["redacted_patterns"] = redaction_result.patterns_matched
+
+                     # Recompute hash for redacted content
+                     chunk.content_hash = hashlib.sha256(
+                         chunk.content.encode('utf-8')
+                     ).hexdigest()
+
+                     logger.debug(f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}")
+
+                 redacted_chunks.append(chunk)
+
+             except Exception as e:
+                 logger.warning(f"Failed to redact chunk {chunk.name}: {e}")
+                 redacted_chunks.append(chunk)
+
+         return redacted_chunks
+
+     def _apply_optimization(self, chunks: List[CodeChunk]) -> List[CodeChunk]:
+         """Apply chunk optimization."""
+         try:
+             # Convert to optimized chunks
+             optimized_chunks = []
+             for chunk in chunks:
+                 opt_chunk = OptimizedChunk(
+                     content=chunk.content,
+                     chunk_type=chunk.chunk_type,
+                     name=chunk.name,
+                     start_line=chunk.start_line,
+                     end_line=chunk.end_line,
+                     metadata=chunk.metadata,
+                     language=chunk.language,
+                     imports=chunk.imports,
+                     parent_context=chunk.parent_context,
+                 )
+                 optimized_chunks.append(opt_chunk)
+
+             # Apply optimization
+             optimized_chunks = self.optimizer.optimize_chunks(optimized_chunks)
+
+             # Convert back to code chunks
+             result_chunks = []
+             for opt_chunk in optimized_chunks:
+                 code_chunk = CodeChunk(
+                     content=opt_chunk.content,
+                     chunk_type=opt_chunk.chunk_type,
+                     name=opt_chunk.name,
+                     file_path=chunks[0].file_path if chunks else "",
+                     start_line=opt_chunk.start_line,
+                     end_line=opt_chunk.end_line,
+                     content_hash=hashlib.sha256(opt_chunk.content.encode('utf-8')).hexdigest(),
+                     language=opt_chunk.language,
+                     metadata=opt_chunk.metadata,
+                     imports=opt_chunk.imports,
+                     parent_context=opt_chunk.parent_context,
+                 )
+                 result_chunks.append(code_chunk)
+
+             return result_chunks
+
+         except Exception as e:
+             logger.warning(f"Chunk optimization failed: {e}")
+             return chunks
+
+     def _update_stats(self, chunks: List[CodeChunk], language: str, processing_time: float) -> None:
+         """Update chunking statistics."""
+         self.stats.files_processed += 1
+         self.stats.total_chunks += len(chunks)
+         self.stats.processing_time += processing_time
+
+         # Count by type
+         for chunk in chunks:
+             self.stats.chunks_by_type[chunk.chunk_type] = (
+                 self.stats.chunks_by_type.get(chunk.chunk_type, 0) + 1
+             )
+
+             if chunk.redacted:
+                 self.stats.redacted_chunks += 1
+
+             if chunk.metadata.get("fallback", False):
+                 self.stats.fallback_chunks += 1
+
+         # Count by language
+         self.stats.chunks_by_language[language] = (
+             self.stats.chunks_by_language.get(language, 0) + len(chunks)
+         )
+
+     def get_stats(self) -> ChunkingStats:
+         """Get chunking statistics."""
+         return self.stats
+
+     def reset_stats(self) -> None:
+         """Reset chunking statistics."""
+         self.stats = ChunkingStats()
+
+     def get_supported_extensions(self) -> Set[str]:
+         """Get list of supported file extensions."""
+         from .language_handlers import LANGUAGE_HANDLERS
+         return set(LANGUAGE_HANDLERS.keys())
+
+     def is_supported_file(self, file_path: str) -> bool:
+         """Check if a file is supported for chunking."""
+         extension = Path(file_path).suffix.lower()
+         return extension in self.get_supported_extensions()
+
+     def estimate_chunks(self, file_path: str) -> Dict[str, Any]:
+         """Estimate number of chunks for a file without full processing."""
+         try:
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 content = f.read()
+
+             # Simple estimation based on content length and average chunk size
+             content_length = len(content)
+             lines = content.count('\n') + 1
+
+             # Rough estimates
+             estimated_chunks = max(1, content_length // self.max_chunk_size)
+
+             return {
+                 "file_path": file_path,
+                 "content_length": content_length,
+                 "line_count": lines,
+                 "estimated_chunks": estimated_chunks,
+                 "is_supported": self.is_supported_file(file_path),
+             }
+
+         except Exception as e:
+             logger.warning(f"Failed to estimate chunks for {file_path}: {e}")
+             return {
+                 "file_path": file_path,
+                 "error": str(e),
+                 "estimated_chunks": 0,
+                 "is_supported": False,
+             }
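
For orientation, here is a minimal sketch of how the `ASTChunker` API added in this release might be driven. It is illustrative, not part of the package: the import path mirrors the wheel layout in the file list above, the constructor arguments and chunk fields are taken from the diff, and the sketch assumes the package is installed and that a language handler exists for the file extensions you pass in.

```python
# Illustrative sketch only; exercises the ASTChunker API shown in the diff above.
# Assumes mcp-code-indexer 4.2.0 is installed and that .py files have a handler
# registered in language_handlers (not confirmed by this diff).
from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

chunker = ASTChunker(
    max_chunk_size=1500,       # defaults from __init__ in the diff
    min_chunk_size=50,
    enable_redaction=True,     # run SecretRedactor over each chunk
    enable_optimization=True,  # merge/split chunks via ChunkOptimizer
)

# Chunk a file on disk; unsupported or unreadable files yield an empty list.
for chunk in chunker.chunk_file("src/example.py"):
    print(chunk.chunk_type, chunk.name,
          f"lines {chunk.start_line}-{chunk.end_line}",
          "(redacted)" if chunk.redacted else "")

# Or chunk in-memory content; the virtual path drives language detection.
chunks = chunker.chunk_content("def hello():\n    return 1\n", "virtual/example.py")

# Statistics accumulate across calls until reset_stats() is called.
stats = chunker.get_stats()
print(stats.files_processed, stats.total_chunks, stats.redacted_chunks)
```

Note that `is_supported_file()` and `estimate_chunks()` only inspect the file extension and raw length, so they can serve as cheap pre-filters before paying for a full `chunk_file()` pass.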