mcp-code-indexer 4.2.14__py3-none-any.whl → 4.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +75 -23
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/file_scanner.py CHANGED
@@ -6,10 +6,12 @@ while respecting .gitignore patterns and common ignore patterns. It enables
6
6
  efficient discovery of files that need description tracking.
7
7
  """
8
8
 
9
+ import asyncio
9
10
  import fnmatch
10
11
  import logging
12
+ import os
11
13
  from pathlib import Path
12
- from typing import Dict, Generator, List, Optional, Set, Union, Any, cast
14
+ from typing import Dict, Iterator, List, Optional, Set, Union, Any, cast
13
15
 
14
16
  try:
15
17
  from gitignore_parser import parse_gitignore
@@ -150,6 +152,13 @@ class FileScanner:
150
152
  self.project_root = Path(project_root).resolve()
151
153
  self._gitignore_cache: Dict[str, Any] = {}
152
154
  self._load_gitignore_patterns()
155
+ # Build ignore patterns set for directory pruning
156
+ self.ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
157
+
158
@property
def root_path(self) -> Path:
    """Return the scanner's root directory.

    This is a read-only alias for ``project_root``.
    """
    scanner_root = self.project_root
    return scanner_root
153
162
 
154
163
  def _load_gitignore_patterns(self) -> None:
155
164
  """Load and cache gitignore patterns from the project."""
@@ -228,6 +237,53 @@ class FileScanner:
228
237
  """Check if a file has an ignored extension."""
229
238
  return file_path.suffix.lower() in IGNORED_EXTENSIONS
230
239
 
240
def should_ignore_path(self, path: Path) -> bool:
    """
    Check if a path (file or directory) should be ignored based on patterns.

    This is used for directory pruning during walks to skip entire subtrees
    like node_modules, .git, etc.

    Each entry of ``self.ignore_patterns`` is interpreted by its shape:

    * ``name/``    - matches when the final path component equals ``name``
    * ``*suffix``  - matches when the relative path or the final component
      ends with ``suffix``
    * contains ``/`` or ``\\`` - matches when the pattern is a substring of
      the relative path string
    * anything else - matches when it equals the final component or any
      component of the project-relative path

    If no pattern matches, the gitignore rules are consulted.

    Args:
        path: Path to check (can be file or directory)

    Returns:
        True if the path should be ignored
    """
    try:
        rel_path = path.relative_to(self.project_root)
    except ValueError:
        # Path is not under the project root; match against it as given.
        rel_path = path

    path_str = str(rel_path)
    path_name = path.name
    # Only components below the project root may trigger a simple-name match.
    # Using the absolute path's components would ignore every file whenever
    # the project itself lives under a directory named like a pattern
    # (e.g. /work/build/project with "build" in the ignore set).
    rel_parts = rel_path.parts

    # Check against ignore patterns
    for pattern in self.ignore_patterns:
        if pattern.endswith("/"):
            # Directory pattern: compare the bare name with the last component
            if path_name == pattern.rstrip("/"):
                return True
        elif pattern.startswith("*"):
            # Wildcard pattern: treat everything after the '*' as a suffix
            suffix = pattern[1:]
            if path_str.endswith(suffix) or path_name.endswith(suffix):
                return True
        elif "/" in pattern or "\\" in pattern:
            # Path pattern: substring match on the relative path
            if pattern in path_str:
                return True
        elif pattern in rel_parts or path_name == pattern:
            # Simple name pattern
            return True

    # Also check gitignore
    if self._is_ignored_by_gitignore(path):
        return True

    return False
286
+
231
287
  def should_ignore_file(self, file_path: Path) -> bool:
232
288
  """
233
289
  Determine if a file should be ignored.
@@ -246,12 +302,8 @@ class FileScanner:
246
302
  if self._is_ignored_by_extension(file_path):
247
303
  return True
248
304
 
249
- # Check default patterns
250
- if self._is_ignored_by_default_patterns(file_path):
251
- return True
252
-
253
- # Check gitignore patterns
254
- if self._is_ignored_by_gitignore(file_path):
305
+ # Check path-based patterns
306
+ if self.should_ignore_path(file_path):
255
307
  return True
256
308
 
257
309
  return False
@@ -286,12 +338,27 @@ class FileScanner:
286
338
  logger.info(f"Found {len(files)} trackable files in {self.project_root}")
287
339
  return files
288
340
 
289
- def _walk_directory(self) -> Generator[Path, None, None]:
290
- """Walk through all files in the project directory."""
341
+ def _walk_directory(self) -> Iterator[Path]:
342
+ """
343
+ Walk directory using os.walk with directory pruning.
344
+
345
+ This skips ignored directories entirely rather than traversing then filtering.
346
+ Critical for performance - avoids traversing node_modules, .git, etc.
347
+ """
291
348
  try:
292
- for item in self.project_root.rglob("*"):
293
- if item.is_file():
294
- yield item
349
+ for dirpath, dirnames, filenames in os.walk(self.project_root):
350
+ current_dir = Path(dirpath)
351
+
352
+ # Prune ignored directories in-place to prevent descending into them
353
+ # Modifying dirnames in-place is the documented way to prune os.walk
354
+ dirnames[:] = [
355
+ d for d in dirnames
356
+ if not self.should_ignore_path(current_dir / d)
357
+ ]
358
+
359
+ for filename in filenames:
360
+ yield current_dir / filename
361
+
295
362
  except PermissionError as e:
296
363
  logger.warning(f"Permission denied accessing {e.filename}")
297
364
  except Exception as e:
@@ -404,3 +471,31 @@ class FileScanner:
404
471
  logger.error(f"Error getting project stats: {e}")
405
472
 
406
473
  return stats
474
+
475
async def scan_directory_async(
    self, max_files: Optional[int] = None
) -> List[Path]:
    """
    Awaitable wrapper around scan_directory.

    The synchronous scan is submitted to the running event loop's default
    executor so the loop is not blocked while the directory is walked.

    Args:
        max_files: Maximum number of files to return (None for no limit)

    Returns:
        List of file paths that should be tracked
    """
    pending_scan = asyncio.get_running_loop().run_in_executor(
        None, self.scan_directory, max_files
    )
    return await pending_scan
489
+
490
async def find_missing_files_async(self, existing_paths: Set[str]) -> List[Path]:
    """
    Awaitable wrapper around find_missing_files.

    The synchronous lookup is submitted to the running event loop's default
    executor so the loop is not blocked while it runs.

    Args:
        existing_paths: Set of relative file paths that already have descriptions

    Returns:
        List of file paths that are missing descriptions
    """
    pending_lookup = asyncio.get_running_loop().run_in_executor(
        None, self.find_missing_files, existing_paths
    )
    return await pending_lookup
mcp_code_indexer/main.py CHANGED
@@ -377,6 +377,8 @@ async def handle_runcommand(args: argparse.Namespace) -> None:
377
377
  "get_word_frequency": server._handle_get_word_frequency,
378
378
  "search_codebase_overview": server._handle_search_codebase_overview,
379
379
  "check_database_health": server._handle_check_database_health,
380
+ "enabled_vector_mode": server._handle_enabled_vector_mode,
381
+ "find_similar_code": server._handle_find_similar_code,
380
382
  }
381
383
 
382
384
  if tool_name not in tool_handlers:
@@ -1017,41 +1019,49 @@ async def main() -> None:
1017
1019
  from .vector_mode import is_vector_mode_available, check_api_keys
1018
1020
  from .vector_mode.config import load_vector_config
1019
1021
  from .vector_mode.daemon import start_vector_daemon
1020
-
1022
+
1021
1023
  # Check if vector mode is available
1022
1024
  if not is_vector_mode_available():
1023
- logger.error("Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer")
1025
+ logger.error(
1026
+ "Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer"
1027
+ )
1024
1028
  sys.exit(1)
1025
-
1029
+
1026
1030
  # Check API keys
1027
1031
  api_keys = check_api_keys()
1028
1032
  if not all(api_keys.values()):
1029
1033
  missing = [k for k, v in api_keys.items() if not v]
1030
- logger.error(f"Missing API keys for vector mode: {', '.join(missing)}")
1034
+ logger.error(
1035
+ f"Missing API keys for vector mode: {', '.join(missing)}"
1036
+ )
1031
1037
  sys.exit(1)
1032
-
1038
+
1033
1039
  # Load vector configuration
1034
- vector_config_path = Path(args.vector_config).expanduser() if args.vector_config else None
1040
+ vector_config_path = (
1041
+ Path(args.vector_config).expanduser()
1042
+ if args.vector_config
1043
+ else None
1044
+ )
1035
1045
  vector_config = load_vector_config(vector_config_path)
1036
-
1046
+
1037
1047
  logger.info(
1038
- "Vector mode enabled",
1048
+ "Vector mode enabled",
1039
1049
  extra={
1040
1050
  "structured_data": {
1041
1051
  "embedding_model": vector_config.embedding_model,
1042
1052
  "batch_size": vector_config.batch_size,
1043
1053
  "daemon_enabled": vector_config.daemon_enabled,
1044
1054
  }
1045
- }
1055
+ },
1046
1056
  )
1047
-
1057
+
1048
1058
  # Start vector daemon in background
1049
1059
  if vector_config.daemon_enabled:
1050
1060
  vector_daemon_task = asyncio.create_task(
1051
1061
  start_vector_daemon(vector_config_path, db_path, cache_dir)
1052
1062
  )
1053
1063
  logger.info("Vector daemon started")
1054
-
1064
+
1055
1065
  except Exception as e:
1056
1066
  logger.error(f"Failed to initialize vector mode: {e}")
1057
1067
  sys.exit(1)
@@ -1095,7 +1105,45 @@ async def main() -> None:
1095
1105
  if transport:
1096
1106
  transport.server = server
1097
1107
 
1098
- await server.run()
1108
+ # If vector mode is enabled, we need to handle signals properly
1109
+ # because server.run() may not respond to KeyboardInterrupt
1110
+ if args.vector and vector_daemon_task:
1111
+ # Setup signal handling for graceful shutdown
1112
+ shutdown_event = asyncio.Event()
1113
+
1114
+ def signal_handler():
1115
+ logger.info("Shutdown signal received")
1116
+ shutdown_event.set()
1117
+
1118
+ # Register signal handlers
1119
+ loop = asyncio.get_running_loop()
1120
+ for sig in [signal.SIGTERM, signal.SIGINT]:
1121
+ loop.add_signal_handler(sig, signal_handler)
1122
+
1123
+ # Run server and wait for shutdown signal
1124
+ server_task = asyncio.create_task(server.run())
1125
+ shutdown_task = asyncio.create_task(shutdown_event.wait())
1126
+
1127
+ try:
1128
+ # Wait for either server completion or shutdown signal
1129
+ done, pending = await asyncio.wait(
1130
+ [server_task, shutdown_task], return_when=asyncio.FIRST_COMPLETED
1131
+ )
1132
+
1133
+ # Cancel remaining tasks
1134
+ for task in pending:
1135
+ task.cancel()
1136
+ try:
1137
+ await task
1138
+ except asyncio.CancelledError:
1139
+ pass
1140
+
1141
+ except Exception as e:
1142
+ logger.error(f"Error during server execution: {e}")
1143
+ raise
1144
+ else:
1145
+ # Normal mode - let server handle KeyboardInterrupt naturally
1146
+ await server.run()
1099
1147
 
1100
1148
  except Exception as e:
1101
1149
  error_handler.log_error(e, context={"phase": "startup"})
@@ -1105,17 +1153,21 @@ async def main() -> None:
1105
1153
  if vector_daemon_task and not vector_daemon_task.done():
1106
1154
  logger.info("Cancelling vector daemon")
1107
1155
  vector_daemon_task.cancel()
1108
-
1156
+
1109
1157
  # Wait for vector daemon to finish
1110
1158
  if vector_daemon_task:
1111
1159
  try:
1112
1160
  await vector_daemon_task
1113
1161
  except asyncio.CancelledError:
1114
1162
  logger.info("Vector daemon cancelled successfully")
1115
-
1163
+
1116
1164
  # Clean up any remaining asyncio tasks to prevent hanging
1117
1165
  current_task = asyncio.current_task()
1118
- tasks = [task for task in asyncio.all_tasks() if not task.done() and task is not current_task]
1166
+ tasks = [
1167
+ task
1168
+ for task in asyncio.all_tasks()
1169
+ if not task.done() and task is not current_task
1170
+ ]
1119
1171
  if tasks:
1120
1172
  logger.info(f"Cancelling {len(tasks)} remaining tasks")
1121
1173
  for task in tasks:
@@ -1124,22 +1176,21 @@ async def main() -> None:
1124
1176
  # Wait for cancellation but don't wait forever
1125
1177
  try:
1126
1178
  await asyncio.wait_for(
1127
- asyncio.gather(*tasks, return_exceptions=True),
1128
- timeout=2.0
1179
+ asyncio.gather(*tasks, return_exceptions=True), timeout=2.0
1129
1180
  )
1130
1181
  except asyncio.TimeoutError:
1131
1182
  logger.warning("Some tasks did not cancel within timeout")
1132
-
1183
+
1133
1184
  # Force close any remaining connections and cleanup resources
1134
1185
  try:
1135
1186
  # Give a moment for final cleanup
1136
1187
  await asyncio.sleep(0.1)
1137
-
1188
+
1138
1189
  # Shutdown the event loop executor to stop any background threads
1139
1190
  loop = asyncio.get_running_loop()
1140
- if hasattr(loop, '_default_executor') and loop._default_executor:
1191
+ if hasattr(loop, "_default_executor") and loop._default_executor:
1141
1192
  loop._default_executor.shutdown(wait=False)
1142
-
1193
+
1143
1194
  except Exception as e:
1144
1195
  logger.warning(f"Error during final cleanup: {e}")
1145
1196
 
@@ -1163,14 +1214,15 @@ def cli_main() -> None:
1163
1214
  # Force cleanup of any remaining resources to prevent hanging
1164
1215
  import threading
1165
1216
  import time
1166
-
1217
+
1167
1218
  # Give main threads a moment to finish
1168
1219
  time.sleep(0.1)
1169
-
1220
+
1170
1221
  # Force exit if daemon threads are preventing shutdown
1171
1222
  active_threads = threading.active_count()
1172
1223
  if active_threads > 1: # More than just the main thread
1173
1224
  import os
1225
+
1174
1226
  os._exit(0)
1175
1227
 
1176
1228
 
mcp_code_indexer/server/mcp_server.py CHANGED
@@ -684,6 +684,104 @@ class MCPCodeIndexServer:
684
684
  "additionalProperties": False,
685
685
  },
686
686
  ),
687
+ types.Tool(
688
+ name="enabled_vector_mode",
689
+ description=(
690
+ "Enables or disables vector mode for a project. Vector mode "
691
+ "provides semantic search capabilities with embeddings for "
692
+ "enhanced code navigation and discovery."
693
+ ),
694
+ inputSchema={
695
+ "type": "object",
696
+ "properties": {
697
+ "projectName": {
698
+ "type": "string",
699
+ "description": "The name of the project",
700
+ },
701
+ "folderPath": {
702
+ "type": "string",
703
+ "description": (
704
+ "Absolute path to the project folder on disk"
705
+ ),
706
+ },
707
+ "enabled": {
708
+ "type": "boolean",
709
+ "description": (
710
+ "Whether to enable (true) or disable (false) vector mode"
711
+ ),
712
+ },
713
+ },
714
+ "required": ["projectName", "folderPath", "enabled"],
715
+ "additionalProperties": False,
716
+ },
717
+ ),
718
+ types.Tool(
719
+ name="find_similar_code",
720
+ description=(
721
+ "Find code similar to a given code snippet or file section using "
722
+ "vector-based semantic search. This tool uses AI embeddings to "
723
+ "understand code context and meaning, providing more intelligent "
724
+ "similarity detection than text-based matching. Requires vector "
725
+ "mode to be enabled for the project."
726
+ ),
727
+ inputSchema={
728
+ "type": "object",
729
+ "properties": {
730
+ "projectName": {
731
+ "type": "string",
732
+ "description": "The name of the project",
733
+ },
734
+ "folderPath": {
735
+ "type": "string",
736
+ "description": (
737
+ "Absolute path to the project folder on disk"
738
+ ),
739
+ },
740
+ "code_snippet": {
741
+ "type": "string",
742
+ "description": (
743
+ "Direct code snippet to search for similarities (mutually "
744
+ "exclusive with file_path)"
745
+ ),
746
+ },
747
+ "file_path": {
748
+ "type": "string",
749
+ "description": (
750
+ "Path to file containing code to analyze (mutually "
751
+ "exclusive with code_snippet)"
752
+ ),
753
+ },
754
+ "line_start": {
755
+ "type": "integer",
756
+ "description": (
757
+ "Starting line number for file section (1-indexed, "
758
+ "used with file_path)"
759
+ ),
760
+ },
761
+ "line_end": {
762
+ "type": "integer",
763
+ "description": (
764
+ "Ending line number for file section (1-indexed, "
765
+ "used with file_path)"
766
+ ),
767
+ },
768
+ "similarity_threshold": {
769
+ "type": "number",
770
+ "description": (
771
+ "Minimum similarity score (0.0-1.0, optional)"
772
+ ),
773
+ },
774
+ "max_results": {
775
+ "type": "integer",
776
+ "description": (
777
+ "Maximum number of results to return (optional)"
778
+ ),
779
+ },
780
+ },
781
+ "required": ["projectName", "folderPath"],
782
+ "additionalProperties": False,
783
+ },
784
+ ),
687
785
  ]
688
786
 
689
787
  @self.server.call_tool() # type: ignore[misc]
@@ -711,6 +809,8 @@ class MCPCodeIndexServer:
711
809
  "get_word_frequency": self._handle_get_word_frequency,
712
810
  "check_database_health": self._handle_check_database_health,
713
811
  "search_codebase_overview": self._handle_search_codebase_overview,
812
+ "enabled_vector_mode": self._handle_enabled_vector_mode,
813
+ "find_similar_code": self._handle_find_similar_code,
714
814
  }
715
815
 
716
816
  if name not in tool_handlers:
@@ -834,7 +934,9 @@ class MCPCodeIndexServer:
834
934
  )
835
935
 
836
936
  if project is None:
837
- raise RuntimeError("Project should always be set in if/else branches above")
937
+ raise RuntimeError(
938
+ "Project should always be set in if/else branches above"
939
+ )
838
940
  return project.id
839
941
 
840
942
  async def _find_matching_project(
@@ -1103,6 +1205,7 @@ class MCPCodeIndexServer:
1103
1205
  "isLarge": is_large,
1104
1206
  "recommendation": recommendation,
1105
1207
  "tokenLimit": token_limit,
1208
+ "totalTokens": total_tokens,
1106
1209
  "totalFiles": len(file_descriptions),
1107
1210
  "cleanedUpCount": cleaned_up_count,
1108
1211
  }
@@ -1481,6 +1584,93 @@ class MCPCodeIndexServer:
1481
1584
  "status_summary": self._generate_health_summary(comprehensive_diagnostics),
1482
1585
  }
1483
1586
 
1587
async def _handle_enabled_vector_mode(
    self, arguments: Dict[str, Any]
) -> Dict[str, Any]:
    """Handle enabled_vector_mode tool calls.

    Reads ``folderPath`` and ``enabled`` from ``arguments``, resolves the
    project id, and stores the requested vector-mode flag through the
    database manager for that folder.

    Returns:
        A dict with ``success``, ``project_id`` and ``vector_mode``; on
        success it also carries ``message``. If the database manager raises
        ValueError, ``success`` is False, ``error`` holds the exception text
        and ``vector_mode`` is None.
    """
    db_manager = await self.db_factory.get_database_manager(
        arguments["folderPath"]
    )
    project_id = await self._get_or_create_project_id(arguments)
    enabled = arguments["enabled"]

    try:
        await db_manager.set_project_vector_mode(project_id, enabled)
    except ValueError as exc:
        # Report the failure to the caller instead of propagating it.
        failure: Dict[str, Any] = {
            "success": False,
            "error": str(exc),
            "project_id": project_id,
            "vector_mode": None,
        }
        return failure

    outcome: Dict[str, Any] = {
        "success": True,
        "message": f"Vector mode {'enabled' if enabled else 'disabled'} for project",
        "project_id": project_id,
        "vector_mode": enabled,
    }
    return outcome
1612
+
1613
async def _handle_find_similar_code(
    self, arguments: Dict[str, Any]
) -> Dict[str, Any]:
    """Handle find_similar_code tool calls.

    Imports ``VectorModeToolsService`` at call time, creates a new instance,
    and forwards the request to its ``find_similar_code`` method.
    ``projectName`` and ``folderPath`` are required keys of ``arguments``;
    the remaining search options are passed as None when absent.

    Returns:
        The service result with ``success`` set to True. If anything raises
        (including the import or a missing required key), the error is
        logged and a dict with ``success`` False, the error text, an empty
        ``results`` list and ``total_results`` 0 is returned instead.
    """
    try:
        # Imported here rather than at module level, so an import failure is
        # reported through the error payload below.
        from mcp_code_indexer.vector_mode.services.vector_mode_tools_service import (
            VectorModeToolsService,
        )

        tools_service = VectorModeToolsService()

        project_name = arguments["projectName"]
        folder_path = arguments["folderPath"]

        request_summary = {
            "project_name": project_name,
            "has_code_snippet": "code_snippet" in arguments,
            "has_file_path": "file_path" in arguments,
        }
        logger.info(
            "Processing find_similar_code request",
            extra={"structured_data": request_summary},
        )

        # Optional search parameters default to None when not supplied.
        optional_args = {
            key: arguments.get(key)
            for key in (
                "code_snippet",
                "file_path",
                "line_start",
                "line_end",
                "similarity_threshold",
                "max_results",
            )
        }
        result = await tools_service.find_similar_code(
            project_name=project_name,
            folder_path=folder_path,
            **optional_args,
        )

        # Mark the service result as successful before handing it back.
        result["success"] = True
        return result

    except Exception as exc:
        failure_details = {
            "error": str(exc),
            "project_name": arguments.get("projectName", "unknown"),
        }
        logger.error(
            "Failed to execute find_similar_code",
            extra={"structured_data": failure_details},
            exc_info=True,
        )
        return {
            "success": False,
            "error": str(exc),
            "results": [],
            "total_results": 0,
        }
1673
+
1484
1674
  def _generate_health_summary(self, diagnostics: Dict[str, Any]) -> Dict[str, Any]:
1485
1675
  """Generate a concise health summary from comprehensive diagnostics."""
1486
1676
  if "resilience_indicators" not in diagnostics: