mcp-code-indexer 4.2.15-py3-none-any.whl → 4.2.17-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in one of the supported registries, and is provided for informational purposes only.
Files changed (28)
  1. mcp_code_indexer/database/database.py +334 -115
  2. mcp_code_indexer/database/database_factory.py +1 -1
  3. mcp_code_indexer/database/exceptions.py +1 -1
  4. mcp_code_indexer/database/models.py +66 -24
  5. mcp_code_indexer/database/retry_executor.py +15 -5
  6. mcp_code_indexer/file_scanner.py +107 -12
  7. mcp_code_indexer/main.py +43 -30
  8. mcp_code_indexer/server/mcp_server.py +201 -7
  9. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  10. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  11. mcp_code_indexer/vector_mode/config.py +113 -45
  12. mcp_code_indexer/vector_mode/const.py +24 -0
  13. mcp_code_indexer/vector_mode/daemon.py +860 -98
  14. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  15. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  16. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  17. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  18. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  19. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  20. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  21. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  22. mcp_code_indexer/vector_mode/types.py +46 -0
  23. mcp_code_indexer/vector_mode/utils.py +50 -0
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/METADATA +13 -10
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/RECORD +28 -21
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/WHEEL +1 -1
  27. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/entry_points.txt +0 -0
  28. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/chunking/ast_chunker.py
@@ -19,14 +19,16 @@ from ...database.models import ChunkType
 
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class CodeChunk:
     """
     Represents a code chunk ready for embedding generation.
-
+
     This is the final output of the chunking process, optimized and
     ready for vector indexing.
     """
+
     content: str
     chunk_type: ChunkType
     name: Optional[str]
@@ -39,16 +41,18 @@ class CodeChunk:
     metadata: Dict[str, Any] = None
     imports: List[str] = None
     parent_context: Optional[str] = None
-
+
     def __post_init__(self):
         if self.metadata is None:
             self.metadata = {}
         if self.imports is None:
             self.imports = []
 
+
 @dataclass
 class ChunkingStats:
     """Statistics about the chunking process."""
+
     files_processed: int = 0
     total_chunks: int = 0
     chunks_by_type: Dict[ChunkType, int] = None
@@ -56,21 +60,22 @@ class ChunkingStats:
     redacted_chunks: int = 0
     fallback_chunks: int = 0
     processing_time: float = 0.0
-
+
     def __post_init__(self):
         if self.chunks_by_type is None:
             self.chunks_by_type = {}
         if self.chunks_by_language is None:
             self.chunks_by_language = {}
 
+
 class ASTChunker:
     """
     Main AST-based code chunker.
-
+
     Orchestrates the entire chunking process from file content to
     optimized code chunks ready for embedding generation.
     """
-
+
     def __init__(
         self,
         max_chunk_size: int = 1500,
@@ -81,7 +86,7 @@ class ASTChunker:
     ):
         """
         Initialize AST chunker.
-
+
         Args:
             max_chunk_size: Maximum characters per chunk
             min_chunk_size: Minimum characters per chunk
@@ -93,7 +98,7 @@ class ASTChunker:
         self.min_chunk_size = min_chunk_size
         self.enable_redaction = enable_redaction
         self.enable_optimization = enable_optimization
-
+
         # Initialize components
         self.redactor: Optional[SecretRedactor] = None
         if enable_redaction:
@@ -101,111 +106,110 @@ class ASTChunker:
                 min_confidence=redaction_confidence,
                 preserve_structure=True,
             )
-
+
         self.optimizer: Optional[ChunkOptimizer] = None
         if enable_optimization:
             self.optimizer = ChunkOptimizer(
                 max_chunk_size=max_chunk_size,
                 min_chunk_size=min_chunk_size,
             )
-
+
         # Statistics
         self.stats = ChunkingStats()
-
+
         # Cache for performance
         self.handler_cache: Dict[str, Any] = {}
-
-    def chunk_file(self, file_path: str, content: Optional[str] = None) -> List[CodeChunk]:
+
+    def chunk_file(
+        self, file_path: str, content: Optional[str] = None
+    ) -> List[CodeChunk]:
         """
         Chunk a single file into semantic code chunks.
-
+
         Args:
             file_path: Path to the file to chunk
             content: Optional file content (if not provided, will read from file)
-
+
         Returns:
             List of code chunks
         """
         start_time = datetime.utcnow()
-
+
         try:
             # Read content if not provided
             if content is None:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                     content = f.read()
-
+
             # Skip empty files
             if not content.strip():
                 logger.debug(f"Skipping empty file: {file_path}")
                 return []
-
+
             # Get language handler
             handler = self._get_language_handler(file_path)
             if not handler:
                 logger.warning(f"No handler available for {file_path}")
                 return []
-
+
             # Parse into semantic chunks
             logger.debug(f"Parsing {file_path} with {handler.language_name} handler")
             parsed_chunks = handler.parse_code(content, file_path)
-
+
             # Convert to code chunks
             code_chunks = []
             for parsed_chunk in parsed_chunks:
                 code_chunk = self._convert_parsed_chunk(parsed_chunk, file_path)
                 if code_chunk:
                     code_chunks.append(code_chunk)
-
+
             # Apply redaction if enabled
             if self.enable_redaction and self.redactor:
                 code_chunks = self._apply_redaction(code_chunks, file_path)
-
+
             # Apply optimization if enabled
             if self.enable_optimization and self.optimizer:
                 code_chunks = self._apply_optimization(code_chunks)
-
+
             # Update statistics
             processing_time = (datetime.utcnow() - start_time).total_seconds()
             self._update_stats(code_chunks, handler.language_name, processing_time)
-
+
             logger.info(f"Chunked {file_path}: {len(code_chunks)} chunks")
             return code_chunks
-
+
         except Exception as e:
             logger.error(f"Failed to chunk file {file_path}: {e}")
             return []
-
+
     def chunk_content(
-        self,
-        content: str,
-        file_path: str,
-        language: Optional[str] = None
+        self, content: str, file_path: str, language: Optional[str] = None
     ) -> List[CodeChunk]:
         """
         Chunk content directly without reading from file.
-
+
         Args:
             content: Source code content
            file_path: Virtual file path for language detection
             language: Optional language override
-
+
         Returns:
             List of code chunks
         """
         return self.chunk_file(file_path, content)
-
+
     def chunk_multiple_files(self, file_paths: List[str]) -> Dict[str, List[CodeChunk]]:
         """
         Chunk multiple files and return results grouped by file.
-
+
         Args:
             file_paths: List of file paths to chunk
-
+
         Returns:
             Dictionary mapping file paths to their chunks
         """
         results = {}
-
+
         for file_path in file_paths:
             try:
                 chunks = self.chunk_file(file_path)
@@ -213,28 +217,30 @@ class ASTChunker:
             except Exception as e:
                 logger.error(f"Failed to chunk {file_path}: {e}")
                 results[file_path] = []
-
+
         return results
-
+
     def _get_language_handler(self, file_path: str) -> Optional[Any]:
         """Get language handler for file, with caching."""
         extension = Path(file_path).suffix.lower()
-
+
         if extension in self.handler_cache:
             return self.handler_cache[extension]
-
+
         handler = get_language_handler(file_path)
         self.handler_cache[extension] = handler
         return handler
-
-    def _convert_parsed_chunk(self, parsed_chunk: ParsedChunk, file_path: str) -> Optional[CodeChunk]:
+
+    def _convert_parsed_chunk(
+        self, parsed_chunk: ParsedChunk, file_path: str
+    ) -> Optional[CodeChunk]:
         """Convert a parsed chunk to a code chunk."""
         if not parsed_chunk.content.strip():
             return None
-
+
         # Generate content hash
-        content_hash = hashlib.sha256(parsed_chunk.content.encode('utf-8')).hexdigest()
-
+        content_hash = hashlib.sha256(parsed_chunk.content.encode("utf-8")).hexdigest()
+
         # Create code chunk
         code_chunk = CodeChunk(
             content=parsed_chunk.content,
@@ -249,42 +255,48 @@ class ASTChunker:
             imports=parsed_chunk.imports.copy() if parsed_chunk.imports else [],
             parent_context=parsed_chunk.parent_context,
         )
-
+
         return code_chunk
-
-    def _apply_redaction(self, chunks: List[CodeChunk], file_path: str) -> List[CodeChunk]:
+
+    def _apply_redaction(
+        self, chunks: List[CodeChunk], file_path: str
+    ) -> List[CodeChunk]:
         """Apply secret redaction to chunks."""
         redacted_chunks = []
-
+
         for chunk in chunks:
             try:
                 redaction_result = self.redactor.redact_content(
                     content=chunk.content,
                     file_path=file_path,
                 )
-
+
                 if redaction_result.was_redacted:
                     # Update chunk with redacted content
                     chunk.content = redaction_result.redacted_content
                     chunk.redacted = True
                     chunk.metadata["redaction_count"] = redaction_result.redaction_count
-                    chunk.metadata["redacted_patterns"] = redaction_result.patterns_matched
-
+                    chunk.metadata["redacted_patterns"] = (
+                        redaction_result.patterns_matched
+                    )
+
                     # Recompute hash for redacted content
                     chunk.content_hash = hashlib.sha256(
-                        chunk.content.encode('utf-8')
+                        chunk.content.encode("utf-8")
                     ).hexdigest()
-
-                    logger.debug(f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}")
-
+
+                    logger.debug(
+                        f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}"
+                    )
+
                 redacted_chunks.append(chunk)
-
+
             except Exception as e:
                 logger.warning(f"Failed to redact chunk {chunk.name}: {e}")
                 redacted_chunks.append(chunk)
-
+
         return redacted_chunks
-
+
     def _apply_optimization(self, chunks: List[CodeChunk]) -> List[CodeChunk]:
         """Apply chunk optimization."""
         try:
@@ -301,12 +313,13 @@ class ASTChunker:
                     language=chunk.language,
                     imports=chunk.imports,
                     parent_context=chunk.parent_context,
+                    redacted=chunk.redacted,
                 )
                 optimized_chunks.append(opt_chunk)
-
+
             # Apply optimization
             optimized_chunks = self.optimizer.optimize_chunks(optimized_chunks)
-
+
             # Convert back to code chunks
             result_chunks = []
             for opt_chunk in optimized_chunks:
@@ -317,74 +330,80 @@ class ASTChunker:
                     file_path=chunks[0].file_path if chunks else "",
                     start_line=opt_chunk.start_line,
                     end_line=opt_chunk.end_line,
-                    content_hash=hashlib.sha256(opt_chunk.content.encode('utf-8')).hexdigest(),
+                    content_hash=hashlib.sha256(
+                        opt_chunk.content.encode("utf-8")
+                    ).hexdigest(),
                     language=opt_chunk.language,
+                    redacted=opt_chunk.redacted,
                     metadata=opt_chunk.metadata,
                     imports=opt_chunk.imports,
                     parent_context=opt_chunk.parent_context,
                 )
                 result_chunks.append(code_chunk)
-
+
             return result_chunks
-
+
         except Exception as e:
             logger.warning(f"Chunk optimization failed: {e}")
             return chunks
-
-    def _update_stats(self, chunks: List[CodeChunk], language: str, processing_time: float) -> None:
+
+    def _update_stats(
+        self, chunks: List[CodeChunk], language: str, processing_time: float
+    ) -> None:
         """Update chunking statistics."""
         self.stats.files_processed += 1
         self.stats.total_chunks += len(chunks)
         self.stats.processing_time += processing_time
-
+
         # Count by type
         for chunk in chunks:
             self.stats.chunks_by_type[chunk.chunk_type] = (
                 self.stats.chunks_by_type.get(chunk.chunk_type, 0) + 1
             )
-
+
             if chunk.redacted:
                 self.stats.redacted_chunks += 1
-
+
             if chunk.metadata.get("fallback", False):
                 self.stats.fallback_chunks += 1
-
+
         # Count by language
-        self.stats.chunks_by_language[language] = (
-            self.stats.chunks_by_language.get(language, 0) + len(chunks)
-        )
-
+        self.stats.chunks_by_language[language] = self.stats.chunks_by_language.get(
+            language, 0
+        ) + len(chunks)
+
     def get_stats(self) -> ChunkingStats:
         """Get chunking statistics."""
         return self.stats
-
+
     def reset_stats(self) -> None:
         """Reset chunking statistics."""
         self.stats = ChunkingStats()
-
+
     def get_supported_extensions(self) -> Set[str]:
         """Get list of supported file extensions."""
         from .language_handlers import LANGUAGE_HANDLERS
+
         return set(LANGUAGE_HANDLERS.keys())
-
+
     def is_supported_file(self, file_path: str) -> bool:
         """Check if a file is supported for chunking."""
         extension = Path(file_path).suffix.lower()
         return extension in self.get_supported_extensions()
-
+
     def estimate_chunks(self, file_path: str) -> Dict[str, Any]:
         """Estimate number of chunks for a file without full processing."""
         try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 content = f.read()
-
+
             # Simple estimation based on content length and average chunk size
             content_length = len(content)
-            lines = content.count('\n') + 1
-
+            lines = content.count("\n") + 1
+
             # Rough estimates
            estimated_chunks = max(1, content_length // self.max_chunk_size)
-
+
            return {
                "file_path": file_path,
                "content_length": content_length,
@@ -392,7 +411,7 @@ class ASTChunker:
                 "estimated_chunks": estimated_chunks,
                 "is_supported": self.is_supported_file(file_path),
             }
-
+
         except Exception as e:
             logger.warning(f"Failed to estimate chunks for {file_path}: {e}")
             return {
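
Taken together, the ast_chunker.py hunks are almost entirely Black-style reformatting (double quotes, signatures wrapped to the line limit, normalized blank lines); the only behavioral change is that the redacted flag now survives the optimization pass. For orientation, a minimal usage sketch of the API as it appears in these hunks; the import path comes from the file list above, the constructor argument and method names from the hunks themselves, and the sample file name is hypothetical:

    # Sketch only, based on the signatures visible in this diff; not official docs.
    from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

    chunker = ASTChunker(
        max_chunk_size=1500,       # default shown in the __init__ hunk
        enable_redaction=True,     # installs a SecretRedactor over each chunk
        enable_optimization=True,  # installs a ChunkOptimizer
    )

    # chunk_file() reads the file itself when content is not passed in.
    for chunk in chunker.chunk_file("example.py"):
        print(chunk.chunk_type, chunk.name, chunk.redacted, chunk.content_hash[:12])

    # Statistics accumulate across calls until reset_stats() is called.
    stats = chunker.get_stats()
    print(stats.files_processed, stats.total_chunks, stats.redacted_chunks)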
mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py
@@ -27,6 +27,7 @@ class OptimizedChunk:
     imports: List[str] = None
     parent_context: Optional[str] = None
     optimization_applied: str = "none"
+    redacted: bool = False
 
     def __post_init__(self):
         if self.imports is None:
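
This one-line chunk_optimizer.py change closes the gap addressed in the ast_chunker.py hunks above: OptimizedChunk previously had no redacted field, so when _apply_optimization() rebuilt CodeChunk objects from the optimizer's output, a redaction flag set earlier appears to have been silently dropped (and the redacted_chunks statistic undercounted). As of 4.2.17 the flag is copied in both directions. A simplified illustration of the round-trip, using a stand-in dataclass rather than the real classes:

    # Stand-in shape; the real CodeChunk/OptimizedChunk carry many more fields.
    from dataclasses import dataclass

    @dataclass
    class OptimizedChunk:
        content: str
        redacted: bool = False  # the field added in 4.2.17

    def round_trip(was_redacted: bool) -> bool:
        # CodeChunk -> OptimizedChunk: redacted=chunk.redacted (new in this diff)
        opt = OptimizedChunk(content="...", redacted=was_redacted)
        # ... optimizer.optimize_chunks() runs in between ...
        # OptimizedChunk -> CodeChunk: redacted=opt_chunk.redacted (also new)
        return opt.redacted

    assert round_trip(True)  # before 4.2.17 the flag fell back to its default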