mcp_vector_search-0.15.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mcp-vector-search has been flagged as a potentially problematic release.

Files changed (86)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/chat.py +534 -0
  6. mcp_vector_search/cli/commands/config.py +393 -0
  7. mcp_vector_search/cli/commands/demo.py +358 -0
  8. mcp_vector_search/cli/commands/index.py +762 -0
  9. mcp_vector_search/cli/commands/init.py +658 -0
  10. mcp_vector_search/cli/commands/install.py +869 -0
  11. mcp_vector_search/cli/commands/install_old.py +700 -0
  12. mcp_vector_search/cli/commands/mcp.py +1254 -0
  13. mcp_vector_search/cli/commands/reset.py +393 -0
  14. mcp_vector_search/cli/commands/search.py +796 -0
  15. mcp_vector_search/cli/commands/setup.py +1133 -0
  16. mcp_vector_search/cli/commands/status.py +584 -0
  17. mcp_vector_search/cli/commands/uninstall.py +404 -0
  18. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  19. mcp_vector_search/cli/commands/visualize/cli.py +265 -0
  20. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  21. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  22. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  23. mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
  24. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  25. mcp_vector_search/cli/commands/visualize/server.py +201 -0
  26. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  27. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  28. mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
  29. mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
  30. mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
  31. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  32. mcp_vector_search/cli/commands/watch.py +287 -0
  33. mcp_vector_search/cli/didyoumean.py +520 -0
  34. mcp_vector_search/cli/export.py +320 -0
  35. mcp_vector_search/cli/history.py +295 -0
  36. mcp_vector_search/cli/interactive.py +342 -0
  37. mcp_vector_search/cli/main.py +484 -0
  38. mcp_vector_search/cli/output.py +414 -0
  39. mcp_vector_search/cli/suggestions.py +375 -0
  40. mcp_vector_search/config/__init__.py +1 -0
  41. mcp_vector_search/config/constants.py +24 -0
  42. mcp_vector_search/config/defaults.py +200 -0
  43. mcp_vector_search/config/settings.py +146 -0
  44. mcp_vector_search/core/__init__.py +1 -0
  45. mcp_vector_search/core/auto_indexer.py +298 -0
  46. mcp_vector_search/core/config_utils.py +394 -0
  47. mcp_vector_search/core/connection_pool.py +360 -0
  48. mcp_vector_search/core/database.py +1237 -0
  49. mcp_vector_search/core/directory_index.py +318 -0
  50. mcp_vector_search/core/embeddings.py +294 -0
  51. mcp_vector_search/core/exceptions.py +89 -0
  52. mcp_vector_search/core/factory.py +318 -0
  53. mcp_vector_search/core/git_hooks.py +345 -0
  54. mcp_vector_search/core/indexer.py +1002 -0
  55. mcp_vector_search/core/llm_client.py +453 -0
  56. mcp_vector_search/core/models.py +294 -0
  57. mcp_vector_search/core/project.py +350 -0
  58. mcp_vector_search/core/scheduler.py +330 -0
  59. mcp_vector_search/core/search.py +952 -0
  60. mcp_vector_search/core/watcher.py +322 -0
  61. mcp_vector_search/mcp/__init__.py +5 -0
  62. mcp_vector_search/mcp/__main__.py +25 -0
  63. mcp_vector_search/mcp/server.py +752 -0
  64. mcp_vector_search/parsers/__init__.py +8 -0
  65. mcp_vector_search/parsers/base.py +296 -0
  66. mcp_vector_search/parsers/dart.py +605 -0
  67. mcp_vector_search/parsers/html.py +413 -0
  68. mcp_vector_search/parsers/javascript.py +643 -0
  69. mcp_vector_search/parsers/php.py +694 -0
  70. mcp_vector_search/parsers/python.py +502 -0
  71. mcp_vector_search/parsers/registry.py +223 -0
  72. mcp_vector_search/parsers/ruby.py +678 -0
  73. mcp_vector_search/parsers/text.py +186 -0
  74. mcp_vector_search/parsers/utils.py +265 -0
  75. mcp_vector_search/py.typed +1 -0
  76. mcp_vector_search/utils/__init__.py +42 -0
  77. mcp_vector_search/utils/gitignore.py +250 -0
  78. mcp_vector_search/utils/gitignore_updater.py +212 -0
  79. mcp_vector_search/utils/monorepo.py +339 -0
  80. mcp_vector_search/utils/timing.py +338 -0
  81. mcp_vector_search/utils/version.py +47 -0
  82. mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
  83. mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
  84. mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
  85. mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
  86. mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/indexer.py (new file, +1,002 lines)
"""Semantic indexer for MCP Vector Search."""

import asyncio
import json
import os
from datetime import UTC, datetime
from pathlib import Path

from loguru import logger
from packaging import version

from .. import __version__
from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
from ..config.settings import ProjectConfig
from ..parsers.registry import get_parser_registry
from ..utils.gitignore import create_gitignore_parser
from ..utils.monorepo import MonorepoDetector
from .database import VectorDatabase
from .directory_index import DirectoryIndex
from .exceptions import ParsingError
from .models import CodeChunk, IndexStats


class SemanticIndexer:
    """Semantic indexer for parsing and indexing code files."""

    def __init__(
        self,
        database: VectorDatabase,
        project_root: Path,
        file_extensions: list[str] | None = None,
        config: ProjectConfig | None = None,
        max_workers: int | None = None,
        batch_size: int = 10,
        debug: bool = False,
    ) -> None:
        """Initialize semantic indexer.

        Args:
            database: Vector database instance
            project_root: Project root directory
            file_extensions: File extensions to index (deprecated, use config)
            config: Project configuration (preferred over file_extensions)
            max_workers: Maximum number of worker threads for parallel processing
            batch_size: Number of files to process in each batch
            debug: Enable debug output for hierarchy building
        """
        self.database = database
        self.project_root = project_root

        # Store config for filtering behavior
        self.config = config

        # Handle backward compatibility: use config.file_extensions or fallback to parameter
        if config is not None:
            self.file_extensions = {ext.lower() for ext in config.file_extensions}
        elif file_extensions is not None:
            self.file_extensions = {ext.lower() for ext in file_extensions}
        else:
            raise ValueError("Either config or file_extensions must be provided")

        self.parser_registry = get_parser_registry()
        self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
        self.debug = debug

        # Safely get event loop for max_workers
        try:
            loop = asyncio.get_event_loop()
            self.max_workers = max_workers or min(4, (loop.get_debug() and 1) or 4)
        except RuntimeError:
            # No event loop in current thread
            self.max_workers = max_workers or 4

        self.batch_size = batch_size
        self._index_metadata_file = (
            project_root / ".mcp-vector-search" / "index_metadata.json"
        )

        # Add cache for indexable files to avoid repeated filesystem scans
        self._indexable_files_cache: list[Path] | None = None
        self._cache_timestamp: float = 0
        self._cache_ttl: float = 60.0  # 60 second TTL

        # Initialize gitignore parser (only if respect_gitignore is True)
        if config is None or config.respect_gitignore:
            try:
                self.gitignore_parser = create_gitignore_parser(project_root)
                logger.debug(
                    f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns"
                )
            except Exception as e:
                logger.warning(f"Failed to load gitignore patterns: {e}")
                self.gitignore_parser = None
        else:
            self.gitignore_parser = None
            logger.debug("Gitignore filtering disabled by configuration")

        # Initialize monorepo detector
        self.monorepo_detector = MonorepoDetector(project_root)
        if self.monorepo_detector.is_monorepo():
            subprojects = self.monorepo_detector.detect_subprojects()
            logger.info(f"Detected monorepo with {len(subprojects)} subprojects")
            for sp in subprojects:
                logger.debug(f" - {sp.name} ({sp.relative_path})")

        # Initialize directory index
        self.directory_index = DirectoryIndex(
            project_root / ".mcp-vector-search" / "directory_index.json"
        )
        # Load existing directory index
        self.directory_index.load()

    async def index_project(
        self,
        force_reindex: bool = False,
        show_progress: bool = True,
    ) -> int:
        """Index all files in the project.

        Args:
            force_reindex: Whether to reindex existing files
            show_progress: Whether to show progress information

        Returns:
            Number of files indexed
        """
        logger.info(f"Starting indexing of project: {self.project_root}")

        # Find all indexable files
        all_files = self._find_indexable_files()

        if not all_files:
            logger.warning("No indexable files found")
            return 0

        # Load existing metadata for incremental indexing
        metadata = self._load_index_metadata()

        # Filter files that need indexing
        if force_reindex:
            files_to_index = all_files
            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
        else:
            files_to_index = [
                f for f in all_files if self._needs_reindexing(f, metadata)
            ]
            logger.info(
                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
            )

        if not files_to_index:
            logger.info("All files are up to date")
            return 0

        # Index files in parallel batches
        indexed_count = 0
        failed_count = 0

        # Process files in batches for better memory management
        for i in range(0, len(files_to_index), self.batch_size):
            batch = files_to_index[i : i + self.batch_size]

            if show_progress:
                logger.info(
                    f"Processing batch {i // self.batch_size + 1}/{(len(files_to_index) + self.batch_size - 1) // self.batch_size} ({len(batch)} files)"
                )

            # Process batch in parallel
            batch_results = await self._process_file_batch(batch, force_reindex)

            # Count results
            for success in batch_results:
                if success:
                    indexed_count += 1
                else:
                    failed_count += 1

        # Update metadata for successfully indexed files
        if indexed_count > 0:
            for file_path in files_to_index:
                try:
                    metadata[str(file_path)] = os.path.getmtime(file_path)
                except OSError:
                    pass  # File might have been deleted during indexing

            self._save_index_metadata(metadata)

            # Rebuild directory index from successfully indexed files
            try:
                logger.debug("Rebuilding directory index...")
                # We don't have chunk counts here, but we have file modification times
                # Build a simple stats dict with file mod times for recency tracking
                chunk_stats = {}
                for file_path in files_to_index:
                    try:
                        mtime = os.path.getmtime(file_path)
                        # For now, just track modification time
                        # Chunk counts will be aggregated from the database later if needed
                        chunk_stats[str(file_path)] = {
                            "modified": mtime,
                            "chunks": 1,  # Placeholder - real count from chunks
                        }
                    except OSError:
                        pass

                self.directory_index.rebuild_from_files(
                    files_to_index, self.project_root, chunk_stats=chunk_stats
                )
                self.directory_index.save()
                dir_stats = self.directory_index.get_stats()
                logger.info(
                    f"Directory index updated: {dir_stats['total_directories']} directories, "
                    f"{dir_stats['total_files']} files"
                )
            except Exception as e:
                logger.error(f"Failed to update directory index: {e}")
                import traceback

                logger.debug(traceback.format_exc())

        logger.info(
            f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
        )

        return indexed_count

    async def _process_file_batch(
        self, file_paths: list[Path], force_reindex: bool = False
    ) -> list[bool]:
        """Process a batch of files in parallel.

        Args:
            file_paths: List of file paths to process
            force_reindex: Whether to force reindexing

        Returns:
            List of success flags for each file
        """
        # Create tasks for parallel processing
        tasks = []
        for file_path in file_paths:
            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
            tasks.append(task)

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Convert results to success flags
        success_flags = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                logger.error(f"Failed to index {file_paths[i]}: {result}")
                success_flags.append(False)
            else:
                success_flags.append(result)

        return success_flags

    def _load_index_metadata(self) -> dict[str, float]:
        """Load file modification times from metadata file.

        Returns:
            Dictionary mapping file paths to modification times
        """
        if not self._index_metadata_file.exists():
            return {}

        try:
            with open(self._index_metadata_file) as f:
                data = json.load(f)
                # Handle legacy format (just file_mtimes dict) and new format
                if "file_mtimes" in data:
                    return data["file_mtimes"]
                else:
                    # Legacy format - just return as-is
                    return data
        except Exception as e:
            logger.warning(f"Failed to load index metadata: {e}")
            return {}

    def _save_index_metadata(self, metadata: dict[str, float]) -> None:
        """Save file modification times to metadata file.

        Args:
            metadata: Dictionary mapping file paths to modification times
        """
        try:
            # Ensure directory exists
            self._index_metadata_file.parent.mkdir(parents=True, exist_ok=True)

            # New metadata format with version tracking
            data = {
                "index_version": __version__,
                "indexed_at": datetime.now(UTC).isoformat(),
                "file_mtimes": metadata,
            }

            with open(self._index_metadata_file, "w") as f:
                json.dump(data, f, indent=2)
        except Exception as e:
            logger.warning(f"Failed to save index metadata: {e}")

    def _needs_reindexing(self, file_path: Path, metadata: dict[str, float]) -> bool:
        """Check if a file needs reindexing based on modification time.

        Args:
            file_path: Path to the file
            metadata: Current metadata dictionary

        Returns:
            True if file needs reindexing
        """
        try:
            current_mtime = os.path.getmtime(file_path)
            stored_mtime = metadata.get(str(file_path), 0)
            return current_mtime > stored_mtime
        except OSError:
            # File doesn't exist or can't be accessed
            return False

    async def _index_file_safe(
        self, file_path: Path, force_reindex: bool = False
    ) -> bool:
        """Safely index a single file with error handling.

        Args:
            file_path: Path to the file to index
            force_reindex: Whether to force reindexing

        Returns:
            True if successful, False otherwise
        """
        try:
            return await self.index_file(file_path, force_reindex)
        except Exception as e:
            logger.error(f"Error indexing {file_path}: {e}")
            return False

    async def index_file(
        self,
        file_path: Path,
        force_reindex: bool = False,
    ) -> bool:
        """Index a single file.

        Args:
            file_path: Path to the file to index
            force_reindex: Whether to reindex if already indexed

        Returns:
            True if file was successfully indexed
        """
        try:
            # Check if file should be indexed
            if not self._should_index_file(file_path):
                return False

            # Always remove existing chunks when reindexing a file
            # This prevents duplicate chunks and ensures consistency
            await self.database.delete_by_file(file_path)

            # Parse file into chunks
            chunks = await self._parse_file(file_path)

            if not chunks:
                logger.debug(f"No chunks extracted from {file_path}")
                return True  # Not an error, just empty file

            # Build hierarchical relationships between chunks
            chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)

            # Debug: Check if hierarchy was built
            methods_with_parents = sum(
                1
                for c in chunks_with_hierarchy
                if c.chunk_type in ("method", "function") and c.parent_chunk_id
            )
            logger.debug(
                f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
            )

            # Add chunks to database
            await self.database.add_chunks(chunks_with_hierarchy)

            # Update metadata after successful indexing
            metadata = self._load_index_metadata()
            metadata[str(file_path)] = os.path.getmtime(file_path)
            self._save_index_metadata(metadata)

            logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to index file {file_path}: {e}")
            raise ParsingError(f"Failed to index file {file_path}: {e}") from e

    async def reindex_file(self, file_path: Path) -> bool:
        """Reindex a single file (removes existing chunks first).

        Args:
            file_path: Path to the file to reindex

        Returns:
            True if file was successfully reindexed
        """
        return await self.index_file(file_path, force_reindex=True)

    async def remove_file(self, file_path: Path) -> int:
        """Remove all chunks for a file from the index.

        Args:
            file_path: Path to the file to remove

        Returns:
            Number of chunks removed
        """
        try:
            count = await self.database.delete_by_file(file_path)
            logger.debug(f"Removed {count} chunks for {file_path}")
            return count
        except Exception as e:
            logger.error(f"Failed to remove file {file_path}: {e}")
            return 0

    def _find_indexable_files(self) -> list[Path]:
        """Find all files that should be indexed with caching.

        Returns:
            List of file paths to index
        """
        import time

        # Check cache
        current_time = time.time()
        if (
            self._indexable_files_cache is not None
            and current_time - self._cache_timestamp < self._cache_ttl
        ):
            logger.debug(
                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
            )
            return self._indexable_files_cache

        # Rebuild cache using efficient directory filtering
        logger.debug("Rebuilding indexable files cache...")
        indexable_files = self._scan_files_sync()

        self._indexable_files_cache = sorted(indexable_files)
        self._cache_timestamp = current_time
        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")

        return self._indexable_files_cache

    def _scan_files_sync(self) -> list[Path]:
        """Synchronous file scanning (runs in thread pool).

        Uses os.walk with directory filtering to avoid traversing ignored directories.

        Returns:
            List of indexable file paths
        """
        indexable_files = []

        # Use os.walk for efficient directory traversal with early filtering
        for root, dirs, files in os.walk(self.project_root):
            root_path = Path(root)

            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
            # This is much more efficient than checking every file in ignored directories
            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
            dirs[:] = [
                d
                for d in dirs
                if not self._should_ignore_path(root_path / d, is_directory=True)
            ]

            # Check each file in the current directory
            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
            for filename in files:
                file_path = root_path / filename
                if self._should_index_file(file_path, skip_file_check=True):
                    indexable_files.append(file_path)

        return indexable_files

    async def _find_indexable_files_async(self) -> list[Path]:
        """Find all files asynchronously without blocking event loop.

        Returns:
            List of file paths to index
        """
        import time
        from concurrent.futures import ThreadPoolExecutor

        # Check cache first
        current_time = time.time()
        if (
            self._indexable_files_cache is not None
            and current_time - self._cache_timestamp < self._cache_ttl
        ):
            logger.debug(
                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
            )
            return self._indexable_files_cache

        # Run filesystem scan in thread pool to avoid blocking
        logger.debug("Scanning files in background thread...")
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as executor:
            indexable_files = await loop.run_in_executor(
                executor, self._scan_files_sync
            )

        # Update cache
        self._indexable_files_cache = sorted(indexable_files)
        self._cache_timestamp = current_time
        logger.debug(f"Found {len(indexable_files)} indexable files")

        return self._indexable_files_cache

    def _should_index_file(
        self, file_path: Path, skip_file_check: bool = False
    ) -> bool:
        """Check if a file should be indexed.

        Args:
            file_path: Path to check
            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

        Returns:
            True if file should be indexed
        """
        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
        # This eliminates most files without any filesystem calls
        if file_path.suffix.lower() not in self.file_extensions:
            return False

        # PERFORMANCE: Only check is_file() if not coming from os.walk
        # os.walk already guarantees files, so we skip this expensive check
        if not skip_file_check and not file_path.is_file():
            return False

        # Check if path should be ignored
        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
        if self._should_ignore_path(file_path, is_directory=False):
            return False

        # Check file size (skip very large files)
        try:
            file_size = file_path.stat().st_size
            if file_size > 10 * 1024 * 1024:  # 10MB limit
                logger.warning(f"Skipping large file: {file_path} ({file_size} bytes)")
                return False
        except OSError:
            return False

        return True

    def _should_ignore_path(
        self, file_path: Path, is_directory: bool | None = None
    ) -> bool:
        """Check if a path should be ignored.

        Args:
            file_path: Path to check
            is_directory: Optional hint if path is a directory (avoids filesystem check)

        Returns:
            True if path should be ignored
        """
        try:
            # Get relative path from project root for checking
            relative_path = file_path.relative_to(self.project_root)

            # 1. Check dotfile filtering (if enabled in config)
            if self.config and self.config.skip_dotfiles:
                for part in relative_path.parts:
                    # Skip dotfiles unless they're in the whitelist
                    if part.startswith(".") and part not in ALLOWED_DOTFILES:
                        logger.debug(
                            f"Path ignored by dotfile filter '{part}': {file_path}"
                        )
                        return True

            # 2. Check gitignore rules if available and enabled
            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
            if self.config and self.config.respect_gitignore:
                if self.gitignore_parser and self.gitignore_parser.is_ignored(
                    file_path, is_directory=is_directory
                ):
                    logger.debug(f"Path ignored by .gitignore: {file_path}")
                    return True

            # 3. Check each part of the path against default ignore patterns
            for part in relative_path.parts:
                if part in self._ignore_patterns:
                    logger.debug(
                        f"Path ignored by default pattern '{part}': {file_path}"
                    )
                    return True

            # 4. Check if any parent directory should be ignored
            for parent in relative_path.parents:
                for part in parent.parts:
                    if part in self._ignore_patterns:
                        logger.debug(
                            f"Path ignored by parent pattern '{part}': {file_path}"
                        )
                        return True

            return False

        except ValueError:
            # Path is not relative to project root
            return True

    async def _parse_file(self, file_path: Path) -> list[CodeChunk]:
        """Parse a file into code chunks.

        Args:
            file_path: Path to the file to parse

        Returns:
            List of code chunks with subproject information
        """
        try:
            # Get appropriate parser
            parser = self.parser_registry.get_parser_for_file(file_path)

            # Parse file
            chunks = await parser.parse_file(file_path)

            # Filter out empty chunks
            valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]

            # Assign subproject information for monorepos
            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
            if subproject:
                for chunk in valid_chunks:
                    chunk.subproject_name = subproject.name
                    chunk.subproject_path = subproject.relative_path

            return valid_chunks

        except Exception as e:
            logger.error(f"Failed to parse file {file_path}: {e}")
            raise ParsingError(f"Failed to parse file {file_path}: {e}") from e

    def add_ignore_pattern(self, pattern: str) -> None:
        """Add a pattern to ignore during indexing.

        Args:
            pattern: Pattern to ignore (directory or file name)
        """
        self._ignore_patterns.add(pattern)

    def remove_ignore_pattern(self, pattern: str) -> None:
        """Remove an ignore pattern.

        Args:
            pattern: Pattern to remove
        """
        self._ignore_patterns.discard(pattern)

    def get_ignore_patterns(self) -> set[str]:
        """Get current ignore patterns.

        Returns:
            Set of ignore patterns
        """
        return self._ignore_patterns.copy()

    def get_index_version(self) -> str | None:
        """Get the version of the tool that created the current index.

        Returns:
            Version string or None if not available
        """
        if not self._index_metadata_file.exists():
            return None

        try:
            with open(self._index_metadata_file) as f:
                data = json.load(f)
                return data.get("index_version")
        except Exception as e:
            logger.warning(f"Failed to read index version: {e}")
            return None

    def needs_reindex_for_version(self) -> bool:
        """Check if reindex is needed due to version upgrade.

        Returns:
            True if reindex is needed for version compatibility
        """
        index_version = self.get_index_version()

        if not index_version:
            # No version recorded - this is either a new index or legacy format
            # Reindex to establish version tracking
            return True

        try:
            current = version.parse(__version__)
            indexed = version.parse(index_version)

            # Reindex on major or minor version change
            # Patch versions (0.5.1 -> 0.5.2) don't require reindex
            needs_reindex = (
                current.major != indexed.major or current.minor != indexed.minor
            )

            if needs_reindex:
                logger.info(
                    f"Version upgrade detected: {index_version} -> {__version__} "
                    f"(reindex recommended)"
                )

            return needs_reindex

        except Exception as e:
            logger.warning(f"Failed to compare versions: {e}")
            # If we can't parse versions, be safe and reindex
            return True

    async def get_indexing_stats(self, db_stats: IndexStats | None = None) -> dict:
        """Get statistics about the indexing process.

        Args:
            db_stats: Optional pre-fetched database stats to avoid duplicate queries

        Returns:
            Dictionary with indexing statistics

        Note:
            Uses database statistics only for performance on large projects.
            Filesystem scanning would timeout on 100K+ file projects.
            Pass db_stats parameter to avoid calling database.get_stats() twice.
        """
        try:
            # Get database stats if not provided (fast, no filesystem scan)
            if db_stats is None:
                db_stats = await self.database.get_stats()

            # Use database stats for all file counts
            # This avoids expensive filesystem scans on large projects
            return {
                "total_indexable_files": db_stats.total_files,
                "indexed_files": db_stats.total_files,
                "total_files": db_stats.total_files,  # For backward compatibility
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "file_types": db_stats.file_types,  # Include file type distribution
                "file_extensions": list(self.file_extensions),
                "ignore_patterns": list(self._ignore_patterns),
                "parser_info": self.parser_registry.get_parser_info(),
            }

        except Exception as e:
            logger.error(f"Failed to get indexing stats: {e}")
            return {
                "error": str(e),
                "total_indexable_files": 0,
                "indexed_files": 0,
                "total_files": 0,
                "total_chunks": 0,
            }

    async def get_files_to_index(
        self, force_reindex: bool = False
    ) -> tuple[list[Path], list[Path]]:
        """Get all indexable files and those that need indexing.

        Args:
            force_reindex: Whether to force reindex of all files

        Returns:
            Tuple of (all_indexable_files, files_to_index)
        """
        # Find all indexable files
        all_files = await self._find_indexable_files_async()

        if not all_files:
            return [], []

        # Load existing metadata for incremental indexing
        metadata = self._load_index_metadata()

        # Filter files that need indexing
        if force_reindex:
            files_to_index = all_files
            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
        else:
            files_to_index = [
                f for f in all_files if self._needs_reindexing(f, metadata)
            ]
            logger.info(
                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
            )

        return all_files, files_to_index

    async def index_files_with_progress(
        self,
        files_to_index: list[Path],
        force_reindex: bool = False,
    ):
        """Index files and yield progress updates for each file.

        Args:
            files_to_index: List of file paths to index
            force_reindex: Whether to force reindexing

        Yields:
            Tuple of (file_path, chunks_added, success) for each processed file
        """
        # Write version header to error log at start of indexing run
        self._write_indexing_run_header()

        metadata = self._load_index_metadata()

        # Process files in batches for better memory management
        for i in range(0, len(files_to_index), self.batch_size):
            batch = files_to_index[i : i + self.batch_size]

            # Process each file in the batch
            for file_path in batch:
                chunks_added = 0
                success = False

                try:
                    # Always remove existing chunks when reindexing
                    await self.database.delete_by_file(file_path)

                    # Parse file into chunks
                    chunks = await self._parse_file(file_path)

                    if chunks:
                        # Build hierarchical relationships
                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)

                        # Add chunks to database
                        await self.database.add_chunks(chunks_with_hierarchy)
                        chunks_added = len(chunks)
                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")

                    success = True

                    # Update metadata after successful indexing
                    metadata[str(file_path)] = os.path.getmtime(file_path)

                except Exception as e:
                    error_msg = f"Failed to index file {file_path}: {type(e).__name__}: {str(e)}"
                    logger.error(error_msg)
                    success = False

                    # Save error to error log file
                    try:
                        error_log_path = (
                            self.project_root
                            / ".mcp-vector-search"
                            / "indexing_errors.log"
                        )
                        with open(error_log_path, "a", encoding="utf-8") as f:
                            from datetime import datetime

                            timestamp = datetime.now().isoformat()
                            f.write(f"[{timestamp}] {error_msg}\n")
                    except Exception as log_err:
                        logger.debug(f"Failed to write error log: {log_err}")

                # Yield progress update
                yield (file_path, chunks_added, success)

        # Save metadata at the end
        self._save_index_metadata(metadata)

    def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
        """Build parent-child relationships between chunks.

        Logic:
        - Module chunks (chunk_type="module") have depth 0
        - Class chunks have depth 1, parent is module
        - Method chunks have depth 2, parent is class
        - Function chunks outside classes have depth 1, parent is module
        - Nested classes increment depth

        Args:
            chunks: List of code chunks to process

        Returns:
            List of chunks with hierarchy relationships established
        """
        if not chunks:
            return chunks

        # Group chunks by type and name
        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
        class_chunks = [
            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
        ]
        function_chunks = [
            c for c in chunks if c.chunk_type in ("function", "method", "constructor")
        ]

        # DEBUG: Print what we have (if debug enabled)
        if self.debug:
            import sys

            print(
                f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions",
                file=sys.stderr,
            )
            if class_chunks:
                print(
                    f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}",
                    file=sys.stderr,
                )
            if function_chunks:
                print(
                    f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}",
                    file=sys.stderr,
                )

        # Build relationships
        for func in function_chunks:
            if func.class_name:
                # Find parent class
                parent_class = next(
                    (c for c in class_chunks if c.class_name == func.class_name), None
                )
                if parent_class:
                    func.parent_chunk_id = parent_class.chunk_id
                    func.chunk_depth = parent_class.chunk_depth + 1
                    if func.chunk_id not in parent_class.child_chunk_ids:
                        parent_class.child_chunk_ids.append(func.chunk_id)
                    if self.debug:
                        import sys

                        print(
                            f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'",
                            file=sys.stderr,
                        )
                    logger.debug(
                        f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})"
                    )
            else:
                # Top-level function
                if not func.chunk_depth:
                    func.chunk_depth = 1
                # Link to module if exists
                if module_chunks and not func.parent_chunk_id:
                    func.parent_chunk_id = module_chunks[0].chunk_id
                    if func.chunk_id not in module_chunks[0].child_chunk_ids:
                        module_chunks[0].child_chunk_ids.append(func.chunk_id)

        for cls in class_chunks:
            # Classes without parent are top-level (depth 1)
            if not cls.chunk_depth:
                cls.chunk_depth = 1
            # Link to module if exists
            if module_chunks and not cls.parent_chunk_id:
                cls.parent_chunk_id = module_chunks[0].chunk_id
                if cls.chunk_id not in module_chunks[0].child_chunk_ids:
                    module_chunks[0].child_chunk_ids.append(cls.chunk_id)

        # Module chunks stay at depth 0
        for mod in module_chunks:
            if not mod.chunk_depth:
                mod.chunk_depth = 0

        # DEBUG: Print summary
        if self.debug:
            import sys

            funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
            classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
            print(
                f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n",
                file=sys.stderr,
            )

        return chunks

    def _write_indexing_run_header(self) -> None:
        """Write version and timestamp header to error log at start of indexing run."""
        try:
            error_log_path = (
                self.project_root / ".mcp-vector-search" / "indexing_errors.log"
            )
            error_log_path.parent.mkdir(parents=True, exist_ok=True)

            with open(error_log_path, "a", encoding="utf-8") as f:
                timestamp = datetime.now(UTC).isoformat()
                separator = "=" * 80
                f.write(f"\n{separator}\n")
                f.write(
                    f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n"
                )
                f.write(f"{separator}\n")
        except Exception as e:
            logger.debug(f"Failed to write indexing run header: {e}")