mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,500 @@
+"""
+Chunk optimization for vector mode.
+
+Optimizes code chunks for embedding generation by combining small chunks,
+splitting large chunks, and ensuring optimal token distribution.
+"""
+
+import logging
+from typing import List, Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+
+from ...database.models import ChunkType
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class OptimizedChunk:
+    """Represents an optimized code chunk."""
+    content: str
+    chunk_type: ChunkType
+    name: Optional[str]
+    start_line: int
+    end_line: int
+    metadata: Dict[str, Any]
+    language: str
+    imports: List[str] = None
+    parent_context: Optional[str] = None
+    optimization_applied: str = "none"
+
+    def __post_init__(self):
+        if self.imports is None:
+            self.imports = []
+
+class ChunkOptimizer:
+    """
+    Optimizes code chunks for better embedding quality.
+
+    Applies various optimization strategies including:
+    - Combining small related chunks
+    - Splitting oversized chunks
+    - Adding context from imports/parent scopes
+    - Balancing chunk sizes
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int = 1500,
+        min_chunk_size: int = 50,
+        target_chunk_size: int = 800,
+        context_window: int = 200,
+        enable_context_enrichment: bool = True,
+    ):
+        """
+        Initialize chunk optimizer.
+
+        Args:
+            max_chunk_size: Maximum characters per chunk
+            min_chunk_size: Minimum characters per chunk
+            target_chunk_size: Target size for optimal chunks
+            context_window: Characters of context to add
+            enable_context_enrichment: Whether to add context from imports/parent
+        """
+        self.max_chunk_size = max_chunk_size
+        self.min_chunk_size = min_chunk_size
+        self.target_chunk_size = target_chunk_size
+        self.context_window = context_window
+        self.enable_context_enrichment = enable_context_enrichment
+
+        # Optimization statistics
+        self.stats = {
+            "chunks_processed": 0,
+            "chunks_combined": 0,
+            "chunks_split": 0,
+            "chunks_enriched": 0,
+            "optimization_time": 0.0,
+        }
+
+    def optimize_chunks(self, chunks: List[OptimizedChunk]) -> List[OptimizedChunk]:
+        """
+        Optimize a list of code chunks.
+
+        Args:
+            chunks: List of chunks to optimize
+
+        Returns:
+            Optimized list of chunks
+        """
+        start_time = datetime.utcnow()
+
+        if not chunks:
+            return chunks
+
+        logger.debug(f"Optimizing {len(chunks)} chunks")
+
+        # Step 1: Add context enrichment
+        if self.enable_context_enrichment:
+            chunks = self._enrich_with_context(chunks)
+
+        # Step 2: Combine small chunks
+        chunks = self._combine_small_chunks(chunks)
+
+        # Step 3: Split oversized chunks
+        chunks = self._split_large_chunks(chunks)
+
+        # Step 4: Balance chunk sizes
+        chunks = self._balance_chunks(chunks)
+
+        # Update statistics
+        processing_time = (datetime.utcnow() - start_time).total_seconds()
+        self.stats["chunks_processed"] += len(chunks)
+        self.stats["optimization_time"] += processing_time
+
+        logger.debug(f"Optimization complete: {len(chunks)} chunks")
+        return chunks
+
+    def _enrich_with_context(self, chunks: List[OptimizedChunk]) -> List[OptimizedChunk]:
+        """Add context information to chunks for better embeddings."""
+        enriched_chunks = []
+
+        # Group chunks by file to share context
+        chunks_by_file = {}
+        for chunk in chunks:
+            # Use language as a proxy for file grouping
+            file_key = f"{chunk.language}_{chunk.parent_context or 'global'}"
+            if file_key not in chunks_by_file:
+                chunks_by_file[file_key] = []
+            chunks_by_file[file_key].append(chunk)
+
+        for file_chunks in chunks_by_file.values():
+            # Find import chunks to use as context
+            import_chunks = [c for c in file_chunks if c.chunk_type == ChunkType.IMPORT]
+            import_context = ""
+
+            if import_chunks:
+                import_lines = []
+                for imp_chunk in import_chunks:
+                    import_lines.extend(imp_chunk.imports)
+
+                if import_lines:
+                    import_context = "\n".join(import_lines[:10]) + "\n\n"  # Limit imports
+
+            # Enrich non-import chunks with context
+            for chunk in file_chunks:
+                if chunk.chunk_type != ChunkType.IMPORT and import_context:
+                    # Add import context at the beginning
+                    original_content = chunk.content
+                    enriched_content = import_context + original_content
+
+                    # Only enrich if it doesn't exceed max size
+                    if len(enriched_content) <= self.max_chunk_size:
+                        chunk.content = enriched_content
+                        chunk.optimization_applied = "context_enriched"
+                        chunk.metadata["context_added"] = True
+                        chunk.metadata["import_lines_added"] = len(import_context.split('\n')) - 2
+                        self.stats["chunks_enriched"] += 1
+
+                enriched_chunks.append(chunk)
+
+        return enriched_chunks
+
+    def _combine_small_chunks(self, chunks: List[OptimizedChunk]) -> List[OptimizedChunk]:
+        """Combine small chunks that are related."""
+        combined_chunks = []
+        pending_combination = []
+
+        def should_combine(chunk1: OptimizedChunk, chunk2: OptimizedChunk) -> bool:
+            """Check if two chunks should be combined."""
+            # Don't combine different types unless they're generic
+            if (chunk1.chunk_type != chunk2.chunk_type and
+                    chunk1.chunk_type != ChunkType.GENERIC and
+                    chunk2.chunk_type != ChunkType.GENERIC):
+                return False
+
+            # Don't combine if they're from different contexts
+            if chunk1.parent_context != chunk2.parent_context:
+                return False
+
+            # Don't combine if result would be too large
+            combined_size = len(chunk1.content) + len(chunk2.content) + 2  # +2 for separator
+            if combined_size > self.max_chunk_size:
+                return False
+
+            # Don't combine imports with other types
+            if (chunk1.chunk_type == ChunkType.IMPORT or
+                    chunk2.chunk_type == ChunkType.IMPORT):
+                return False
+
+            return True
+
+        def combine_chunks(chunk_list: List[OptimizedChunk]) -> OptimizedChunk:
+            """Combine a list of chunks into one."""
+            if len(chunk_list) == 1:
+                return chunk_list[0]
+
+            # Combine content
+            combined_content = "\n\n".join(chunk.content for chunk in chunk_list)
+
+            # Use properties from first chunk as base
+            base_chunk = chunk_list[0]
+
+            # Combine metadata
+            combined_metadata = base_chunk.metadata.copy()
+            combined_metadata["combined_from"] = len(chunk_list)
+            combined_metadata["original_chunks"] = [c.name for c in chunk_list if c.name]
+
+            # Combine imports
+            all_imports = []
+            for chunk in chunk_list:
+                all_imports.extend(chunk.imports)
+            unique_imports = list(dict.fromkeys(all_imports))  # Preserve order, remove dupes
+
+            return OptimizedChunk(
+                content=combined_content,
+                chunk_type=base_chunk.chunk_type,
+                name=f"combined_{len(chunk_list)}_chunks",
+                start_line=min(c.start_line for c in chunk_list),
+                end_line=max(c.end_line for c in chunk_list),
+                metadata=combined_metadata,
+                language=base_chunk.language,
+                imports=unique_imports,
+                parent_context=base_chunk.parent_context,
+                optimization_applied="combined",
+            )
+
+        for chunk in chunks:
+            # Check if chunk is small
+            if len(chunk.content) < self.min_chunk_size:
+                # Try to combine with pending chunks
+                can_combine = False
+                if pending_combination:
+                    last_chunk = pending_combination[-1]
+                    if should_combine(last_chunk, chunk):
+                        pending_combination.append(chunk)
+                        can_combine = True
+
+                if not can_combine:
+                    # Flush pending combination if any
+                    if pending_combination:
+                        combined = combine_chunks(pending_combination)
+                        combined_chunks.append(combined)
+                        self.stats["chunks_combined"] += len(pending_combination) - 1
+
+                    # Start new combination
+                    pending_combination = [chunk]
+            else:
+                # Flush pending combination
+                if pending_combination:
+                    combined = combine_chunks(pending_combination)
+                    combined_chunks.append(combined)
+                    self.stats["chunks_combined"] += len(pending_combination) - 1
+                    pending_combination = []
+
+                # Add regular chunk
+                combined_chunks.append(chunk)
+
+        # Flush any remaining pending combination
+        if pending_combination:
+            combined = combine_chunks(pending_combination)
+            combined_chunks.append(combined)
+            self.stats["chunks_combined"] += len(pending_combination) - 1
+
+        return combined_chunks
+
+    def _split_large_chunks(self, chunks: List[OptimizedChunk]) -> List[OptimizedChunk]:
+        """Split chunks that are too large."""
+        split_chunks = []
+
+        for chunk in chunks:
+            if len(chunk.content) <= self.max_chunk_size:
+                split_chunks.append(chunk)
+                continue
+
+            # Split the chunk
+            logger.debug(f"Splitting large chunk: {len(chunk.content)} chars")
+
+            # Try to split at logical boundaries
+            sub_chunks = self._split_chunk_intelligently(chunk)
+            split_chunks.extend(sub_chunks)
+
+            self.stats["chunks_split"] += len(sub_chunks) - 1
+
+        return split_chunks
+
+    def _split_chunk_intelligently(self, chunk: OptimizedChunk) -> List[OptimizedChunk]:
+        """Split a chunk at intelligent boundaries."""
+        content = chunk.content
+        max_size = self.max_chunk_size
+
+        # Try to split at natural boundaries
+        split_points = self._find_split_points(content)
+
+        if not split_points:
+            # Fallback to simple line-based splitting
+            return self._split_chunk_by_lines(chunk)
+
+        sub_chunks = []
+        start_idx = 0
+        current_line = chunk.start_line
+
+        for split_point in split_points:
+            if split_point - start_idx > max_size:
+                # This segment is too large, split it further
+                sub_content = content[start_idx:split_point]
+                sub_sub_chunks = self._split_content_by_size(
+                    sub_content, chunk, current_line, max_size
+                )
+                sub_chunks.extend(sub_sub_chunks)
+            else:
+                # Create sub-chunk
+                sub_content = content[start_idx:split_point]
+                if sub_content.strip():
+                    lines_in_chunk = sub_content.count('\n')
+                    sub_chunk = OptimizedChunk(
+                        content=sub_content,
+                        chunk_type=chunk.chunk_type,
+                        name=f"{chunk.name}_part_{len(sub_chunks) + 1}" if chunk.name else None,
+                        start_line=current_line,
+                        end_line=current_line + lines_in_chunk,
+                        metadata=chunk.metadata.copy(),
+                        language=chunk.language,
+                        imports=chunk.imports.copy(),
+                        parent_context=chunk.parent_context,
+                        optimization_applied="split",
+                    )
+                    sub_chunks.append(sub_chunk)
+                    current_line += lines_in_chunk + 1
+
+            start_idx = split_point
+
+        # Handle remaining content
+        if start_idx < len(content):
+            remaining_content = content[start_idx:]
+            if remaining_content.strip():
+                lines_in_chunk = remaining_content.count('\n')
+                sub_chunk = OptimizedChunk(
+                    content=remaining_content,
+                    chunk_type=chunk.chunk_type,
+                    name=f"{chunk.name}_part_{len(sub_chunks) + 1}" if chunk.name else None,
+                    start_line=current_line,
+                    end_line=current_line + lines_in_chunk,
+                    metadata=chunk.metadata.copy(),
+                    language=chunk.language,
+                    imports=chunk.imports.copy(),
+                    parent_context=chunk.parent_context,
+                    optimization_applied="split",
+                )
+                sub_chunks.append(sub_chunk)
+
+        return sub_chunks or [chunk]  # Return original if splitting failed
+
+    def _find_split_points(self, content: str) -> List[int]:
+        """Find intelligent split points in content."""
+        split_points = []
+        lines = content.split('\n')
+        current_pos = 0
+
+        for i, line in enumerate(lines):
+            line_with_newline = line + '\n' if i < len(lines) - 1 else line
+
+            # Look for natural boundaries
+            stripped = line.strip()
+
+            # End of function/class/block
+            if (stripped.startswith(('def ', 'class ', 'function ', 'const ', 'let ', 'var ')) or
+                    stripped.endswith(('{', '}', ';')) or
+                    not stripped):  # Empty lines
+
+                split_points.append(current_pos + len(line_with_newline))
+
+            current_pos += len(line_with_newline)
+
+        return split_points
+
+    def _split_chunk_by_lines(self, chunk: OptimizedChunk) -> List[OptimizedChunk]:
+        """Fallback: split chunk by lines when intelligent splitting fails."""
+        lines = chunk.content.split('\n')
+        sub_chunks = []
+        current_lines = []
+        current_size = 0
+        current_line_num = chunk.start_line
+        start_line_num = chunk.start_line
+
+        for line in lines:
+            line_size = len(line) + 1  # +1 for newline
+
+            if current_size + line_size > self.max_chunk_size and current_lines:
+                # Create sub-chunk
+                sub_content = '\n'.join(current_lines)
+                sub_chunk = OptimizedChunk(
+                    content=sub_content,
+                    chunk_type=chunk.chunk_type,
+                    name=f"{chunk.name}_part_{len(sub_chunks) + 1}" if chunk.name else None,
+                    start_line=start_line_num,
+                    end_line=current_line_num,
+                    metadata=chunk.metadata.copy(),
+                    language=chunk.language,
+                    imports=chunk.imports.copy(),
+                    parent_context=chunk.parent_context,
+                    optimization_applied="split_lines",
+                )
+                sub_chunks.append(sub_chunk)
+
+                # Reset for next chunk
+                current_lines = [line]
+                current_size = line_size
+                start_line_num = current_line_num + 1
+            else:
+                current_lines.append(line)
+                current_size += line_size
+
+            current_line_num += 1
+
+        # Add final chunk
+        if current_lines:
+            sub_content = '\n'.join(current_lines)
+            sub_chunk = OptimizedChunk(
+                content=sub_content,
+                chunk_type=chunk.chunk_type,
+                name=f"{chunk.name}_part_{len(sub_chunks) + 1}" if chunk.name else None,
+                start_line=start_line_num,
+                end_line=current_line_num,
+                metadata=chunk.metadata.copy(),
+                language=chunk.language,
+                imports=chunk.imports.copy(),
+                parent_context=chunk.parent_context,
+                optimization_applied="split_lines",
+            )
+            sub_chunks.append(sub_chunk)
+
+        return sub_chunks
+
+    def _split_content_by_size(
+        self,
+        content: str,
+        original_chunk: OptimizedChunk,
+        start_line: int,
+        max_size: int
+    ) -> List[OptimizedChunk]:
+        """Split content by size when other methods fail."""
+        sub_chunks = []
+        current_pos = 0
+        chunk_num = 1
+
+        while current_pos < len(content):
+            end_pos = min(current_pos + max_size, len(content))
+
+            # Try to end at a word boundary
+            if end_pos < len(content):
+                while end_pos > current_pos and content[end_pos] not in ' \n\t':
+                    end_pos -= 1
+
+                if end_pos == current_pos:  # No word boundary found
+                    end_pos = min(current_pos + max_size, len(content))
+
+            sub_content = content[current_pos:end_pos]
+            if sub_content.strip():
+                lines_in_chunk = sub_content.count('\n')
+                sub_chunk = OptimizedChunk(
+                    content=sub_content,
+                    chunk_type=original_chunk.chunk_type,
+                    name=f"{original_chunk.name}_size_part_{chunk_num}" if original_chunk.name else None,
+                    start_line=start_line,
+                    end_line=start_line + lines_in_chunk,
+                    metadata=original_chunk.metadata.copy(),
+                    language=original_chunk.language,
+                    imports=original_chunk.imports.copy(),
+                    parent_context=original_chunk.parent_context,
+                    optimization_applied="split_size",
+                )
+                sub_chunks.append(sub_chunk)
+                start_line += lines_in_chunk + 1
+                chunk_num += 1
+
+            current_pos = end_pos
+
+        return sub_chunks
+
+    def _balance_chunks(self, chunks: List[OptimizedChunk]) -> List[OptimizedChunk]:
+        """Apply final balancing to chunks."""
+        # For now, just return as-is
+        # Future enhancements could include:
+        # - Redistributing content between chunks
+        # - Merging very small chunks
+        # - Further splitting of slightly oversized chunks
+        return chunks
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get optimization statistics."""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset optimization statistics."""
+        self.stats = {
+            "chunks_processed": 0,
+            "chunks_combined": 0,
+            "chunks_split": 0,
+            "chunks_enriched": 0,
+            "optimization_time": 0.0,
+        }