mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.16__py3-none-any.whl
This diff compares the contents of two package versions as publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes exactly as they appear in the public registry.
- mcp_code_indexer/database/database.py +251 -85
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +43 -30
- mcp_code_indexer/server/mcp_server.py +191 -1
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
--- a/mcp_code_indexer/vector_mode/chunking/ast_chunker.py
+++ b/mcp_code_indexer/vector_mode/chunking/ast_chunker.py
@@ -19,14 +19,16 @@ from ...database.models import ChunkType
 
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class CodeChunk:
     """
     Represents a code chunk ready for embedding generation.
-
+
     This is the final output of the chunking process, optimized and
     ready for vector indexing.
     """
+
     content: str
     chunk_type: ChunkType
     name: Optional[str]
@@ -39,16 +41,18 @@ class CodeChunk:
     metadata: Dict[str, Any] = None
     imports: List[str] = None
     parent_context: Optional[str] = None
-
+
     def __post_init__(self):
         if self.metadata is None:
             self.metadata = {}
         if self.imports is None:
             self.imports = []
 
+
 @dataclass
 class ChunkingStats:
     """Statistics about the chunking process."""
+
     files_processed: int = 0
     total_chunks: int = 0
     chunks_by_type: Dict[ChunkType, int] = None
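
The `None`-plus-`__post_init__` pattern in the dataclasses above exists because mutable literals like `{}` and `[]` cannot safely be used as dataclass defaults. A minimal standalone illustration of the same idea using the stdlib's `field(default_factory=...)` alternative (hypothetical example, not code from this package):

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ChunkDefaults:
    # Equivalent effect to the None + __post_init__ pattern above:
    # default_factory builds a fresh dict/list for every instance.
    metadata: Dict[str, object] = field(default_factory=dict)
    imports: List[str] = field(default_factory=list)


a, b = ChunkDefaults(), ChunkDefaults()
a.imports.append("os")
assert b.imports == []  # instances do not share state
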
@@ -56,21 +60,22 @@ class ChunkingStats:
     redacted_chunks: int = 0
     fallback_chunks: int = 0
     processing_time: float = 0.0
-
+
     def __post_init__(self):
         if self.chunks_by_type is None:
             self.chunks_by_type = {}
         if self.chunks_by_language is None:
             self.chunks_by_language = {}
 
+
 class ASTChunker:
     """
     Main AST-based code chunker.
-
+
     Orchestrates the entire chunking process from file content to
     optimized code chunks ready for embedding generation.
     """
-
+
     def __init__(
         self,
         max_chunk_size: int = 1500,
@@ -81,7 +86,7 @@ class ASTChunker:
     ):
         """
         Initialize AST chunker.
-
+
         Args:
             max_chunk_size: Maximum characters per chunk
             min_chunk_size: Minimum characters per chunk
@@ -93,7 +98,7 @@ class ASTChunker:
         self.min_chunk_size = min_chunk_size
         self.enable_redaction = enable_redaction
         self.enable_optimization = enable_optimization
-
+
         # Initialize components
         self.redactor: Optional[SecretRedactor] = None
         if enable_redaction:
@@ -101,111 +106,110 @@ class ASTChunker:
                 min_confidence=redaction_confidence,
                 preserve_structure=True,
             )
-
+
         self.optimizer: Optional[ChunkOptimizer] = None
         if enable_optimization:
            self.optimizer = ChunkOptimizer(
                 max_chunk_size=max_chunk_size,
                 min_chunk_size=min_chunk_size,
             )
-
+
         # Statistics
         self.stats = ChunkingStats()
-
+
         # Cache for performance
         self.handler_cache: Dict[str, Any] = {}
-
-    def chunk_file(self, file_path: str, content: Optional[str] = None) -> List[CodeChunk]:
+
+    def chunk_file(
+        self, file_path: str, content: Optional[str] = None
+    ) -> List[CodeChunk]:
         """
         Chunk a single file into semantic code chunks.
-
+
         Args:
             file_path: Path to the file to chunk
             content: Optional file content (if not provided, will read from file)
-
+
         Returns:
             List of code chunks
         """
         start_time = datetime.utcnow()
-
+
         try:
             # Read content if not provided
             if content is None:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                     content = f.read()
-
+
             # Skip empty files
             if not content.strip():
                 logger.debug(f"Skipping empty file: {file_path}")
                 return []
-
+
             # Get language handler
             handler = self._get_language_handler(file_path)
             if not handler:
                 logger.warning(f"No handler available for {file_path}")
                 return []
-
+
             # Parse into semantic chunks
             logger.debug(f"Parsing {file_path} with {handler.language_name} handler")
             parsed_chunks = handler.parse_code(content, file_path)
-
+
             # Convert to code chunks
             code_chunks = []
             for parsed_chunk in parsed_chunks:
                 code_chunk = self._convert_parsed_chunk(parsed_chunk, file_path)
                 if code_chunk:
                     code_chunks.append(code_chunk)
-
+
             # Apply redaction if enabled
             if self.enable_redaction and self.redactor:
                 code_chunks = self._apply_redaction(code_chunks, file_path)
-
+
             # Apply optimization if enabled
             if self.enable_optimization and self.optimizer:
                 code_chunks = self._apply_optimization(code_chunks)
-
+
             # Update statistics
             processing_time = (datetime.utcnow() - start_time).total_seconds()
             self._update_stats(code_chunks, handler.language_name, processing_time)
-
+
             logger.info(f"Chunked {file_path}: {len(code_chunks)} chunks")
             return code_chunks
-
+
         except Exception as e:
             logger.error(f"Failed to chunk file {file_path}: {e}")
             return []
-
+
     def chunk_content(
-        self,
-        content: str,
-        file_path: str,
-        language: Optional[str] = None
+        self, content: str, file_path: str, language: Optional[str] = None
     ) -> List[CodeChunk]:
         """
         Chunk content directly without reading from file.
-
+
         Args:
             content: Source code content
             file_path: Virtual file path for language detection
             language: Optional language override
-
+
         Returns:
             List of code chunks
         """
         return self.chunk_file(file_path, content)
-
+
     def chunk_multiple_files(self, file_paths: List[str]) -> Dict[str, List[CodeChunk]]:
         """
         Chunk multiple files and return results grouped by file.
-
+
         Args:
             file_paths: List of file paths to chunk
-
+
         Returns:
             Dictionary mapping file paths to their chunks
         """
         results = {}
-
+
         for file_path in file_paths:
             try:
                 chunks = self.chunk_file(file_path)
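
As the hunk above shows, `chunk_content` delegates straight to `chunk_file`, and its `language` parameter is accepted but never forwarded, so handler selection still comes from the virtual path's extension. A minimal sketch of the two equivalent calls, assuming default constructor arguments (snippet and path are hypothetical):

from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

chunker = ASTChunker()
snippet = "def add(a, b):\n    return a + b\n"

# In-memory chunking; "snippet.py" only drives language detection.
chunks = chunker.chunk_content(snippet, file_path="snippet.py")

# Equivalent, since chunk_content returns self.chunk_file(file_path, content):
same = chunker.chunk_file("snippet.py", content=snippet)
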
@@ -213,28 +217,30 @@ class ASTChunker:
             except Exception as e:
                 logger.error(f"Failed to chunk {file_path}: {e}")
                 results[file_path] = []
-
+
         return results
-
+
     def _get_language_handler(self, file_path: str) -> Optional[Any]:
         """Get language handler for file, with caching."""
         extension = Path(file_path).suffix.lower()
-
+
         if extension in self.handler_cache:
             return self.handler_cache[extension]
-
+
         handler = get_language_handler(file_path)
         self.handler_cache[extension] = handler
         return handler
-
-    def _convert_parsed_chunk(self, parsed_chunk: ParsedChunk, file_path: str) -> Optional[CodeChunk]:
+
+    def _convert_parsed_chunk(
+        self, parsed_chunk: ParsedChunk, file_path: str
+    ) -> Optional[CodeChunk]:
         """Convert a parsed chunk to a code chunk."""
         if not parsed_chunk.content.strip():
             return None
-
+
         # Generate content hash
-        content_hash = hashlib.sha256(parsed_chunk.content.encode('utf-8')).hexdigest()
-
+        content_hash = hashlib.sha256(parsed_chunk.content.encode("utf-8")).hexdigest()
+
         # Create code chunk
         code_chunk = CodeChunk(
             content=parsed_chunk.content,
@@ -249,42 +255,48 @@ class ASTChunker:
             imports=parsed_chunk.imports.copy() if parsed_chunk.imports else [],
             parent_context=parsed_chunk.parent_context,
         )
-
+
         return code_chunk
-
-    def _apply_redaction(self, chunks: List[CodeChunk], file_path: str) -> List[CodeChunk]:
+
+    def _apply_redaction(
+        self, chunks: List[CodeChunk], file_path: str
+    ) -> List[CodeChunk]:
         """Apply secret redaction to chunks."""
         redacted_chunks = []
-
+
         for chunk in chunks:
             try:
                 redaction_result = self.redactor.redact_content(
                     content=chunk.content,
                     file_path=file_path,
                 )
-
+
                 if redaction_result.was_redacted:
                     # Update chunk with redacted content
                     chunk.content = redaction_result.redacted_content
                     chunk.redacted = True
                     chunk.metadata["redaction_count"] = redaction_result.redaction_count
-                    chunk.metadata["redacted_patterns"] = redaction_result.patterns_matched
-
+                    chunk.metadata["redacted_patterns"] = (
+                        redaction_result.patterns_matched
+                    )
+
                     # Recompute hash for redacted content
                     chunk.content_hash = hashlib.sha256(
-                        chunk.content.encode('utf-8')
+                        chunk.content.encode("utf-8")
                     ).hexdigest()
-
-                    logger.debug(
-                        f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}")
+
+                    logger.debug(
+                        f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}"
+                    )
+
                 redacted_chunks.append(chunk)
-
+
             except Exception as e:
                 logger.warning(f"Failed to redact chunk {chunk.name}: {e}")
                 redacted_chunks.append(chunk)
-
+
         return redacted_chunks
-
+
     def _apply_optimization(self, chunks: List[CodeChunk]) -> List[CodeChunk]:
         """Apply chunk optimization."""
         try:
@@ -301,12 +313,13 @@ class ASTChunker:
                     language=chunk.language,
                     imports=chunk.imports,
                     parent_context=chunk.parent_context,
+                    redacted=chunk.redacted,
                 )
                 optimized_chunks.append(opt_chunk)
-
+
             # Apply optimization
             optimized_chunks = self.optimizer.optimize_chunks(optimized_chunks)
-
+
             # Convert back to code chunks
             result_chunks = []
             for opt_chunk in optimized_chunks:
@@ -317,74 +330,80 @@ class ASTChunker:
                     file_path=chunks[0].file_path if chunks else "",
                     start_line=opt_chunk.start_line,
                     end_line=opt_chunk.end_line,
-                    content_hash=hashlib.sha256(opt_chunk.content.encode('utf-8')).hexdigest(),
+                    content_hash=hashlib.sha256(
+                        opt_chunk.content.encode("utf-8")
+                    ).hexdigest(),
                     language=opt_chunk.language,
+                    redacted=opt_chunk.redacted,
                     metadata=opt_chunk.metadata,
                     imports=opt_chunk.imports,
                     parent_context=opt_chunk.parent_context,
                 )
                 result_chunks.append(code_chunk)
-
+
             return result_chunks
-
+
         except Exception as e:
             logger.warning(f"Chunk optimization failed: {e}")
             return chunks
-
-    def _update_stats(self, chunks: List[CodeChunk], language: str, processing_time: float) -> None:
+
+    def _update_stats(
+        self, chunks: List[CodeChunk], language: str, processing_time: float
+    ) -> None:
         """Update chunking statistics."""
         self.stats.files_processed += 1
         self.stats.total_chunks += len(chunks)
         self.stats.processing_time += processing_time
-
+
         # Count by type
         for chunk in chunks:
             self.stats.chunks_by_type[chunk.chunk_type] = (
                 self.stats.chunks_by_type.get(chunk.chunk_type, 0) + 1
             )
-
+
             if chunk.redacted:
                 self.stats.redacted_chunks += 1
-
+
             if chunk.metadata.get("fallback", False):
                 self.stats.fallback_chunks += 1
-
+
         # Count by language
-        self.stats.chunks_by_language[language] = (
-            self.stats.chunks_by_language.get(language, 0) + len(chunks)
-        )
-
+        self.stats.chunks_by_language[language] = self.stats.chunks_by_language.get(
+            language, 0
+        ) + len(chunks)
+
     def get_stats(self) -> ChunkingStats:
         """Get chunking statistics."""
         return self.stats
-
+
     def reset_stats(self) -> None:
         """Reset chunking statistics."""
         self.stats = ChunkingStats()
-
+
     def get_supported_extensions(self) -> Set[str]:
         """Get list of supported file extensions."""
         from .language_handlers import LANGUAGE_HANDLERS
+
         return set(LANGUAGE_HANDLERS.keys())
-
+
     def is_supported_file(self, file_path: str) -> bool:
         """Check if a file is supported for chunking."""
         extension = Path(file_path).suffix.lower()
         return extension in self.get_supported_extensions()
-
+
     def estimate_chunks(self, file_path: str) -> Dict[str, Any]:
         """Estimate number of chunks for a file without full processing."""
         try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 content = f.read()
-
+
             # Simple estimation based on content length and average chunk size
             content_length = len(content)
-            lines = content.count('\n') + 1
-
+            lines = content.count("\n") + 1
+
             # Rough estimates
             estimated_chunks = max(1, content_length // self.max_chunk_size)
-
+
             return {
                 "file_path": file_path,
                 "content_length": content_length,
@@ -392,7 +411,7 @@ class ASTChunker:
                 "estimated_chunks": estimated_chunks,
                 "is_supported": self.is_supported_file(file_path),
             }
-
+
         except Exception as e:
             logger.warning(f"Failed to estimate chunks for {file_path}: {e}")
             return {
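
For orientation, a usage sketch assembled only from the signatures visible in this diff; the argument values and the constructor defaults other than max_chunk_size=1500 are assumptions, not documented API:

from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

# Sketch under assumptions from the diff above; behavior is unverified here.
chunker = ASTChunker(
    max_chunk_size=1500,
    enable_redaction=True,      # secrets are masked and hashes recomputed
    enable_optimization=True,   # chunks pass through ChunkOptimizer
)

if chunker.is_supported_file("example.py"):
    print(chunker.estimate_chunks("example.py"))  # cheap pre-flight estimate
    for chunk in chunker.chunk_file("example.py"):
        print(chunk.chunk_type, chunk.name, chunk.content_hash[:12])

stats = chunker.get_stats()
print(stats.files_processed, stats.total_chunks, stats.redacted_chunks)
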