shotgun-sh 0.4.0.dev1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. shotgun/agents/agent_manager.py +307 -8
  2. shotgun/agents/cancellation.py +103 -0
  3. shotgun/agents/common.py +12 -0
  4. shotgun/agents/config/README.md +0 -1
  5. shotgun/agents/config/manager.py +10 -7
  6. shotgun/agents/config/models.py +5 -27
  7. shotgun/agents/config/provider.py +44 -27
  8. shotgun/agents/conversation/history/token_counting/base.py +51 -9
  9. shotgun/agents/file_read.py +176 -0
  10. shotgun/agents/messages.py +15 -3
  11. shotgun/agents/models.py +24 -1
  12. shotgun/agents/router/models.py +8 -0
  13. shotgun/agents/router/tools/delegation_tools.py +55 -1
  14. shotgun/agents/router/tools/plan_tools.py +88 -7
  15. shotgun/agents/runner.py +17 -2
  16. shotgun/agents/tools/__init__.py +8 -0
  17. shotgun/agents/tools/codebase/directory_lister.py +27 -39
  18. shotgun/agents/tools/codebase/file_read.py +26 -35
  19. shotgun/agents/tools/codebase/query_graph.py +9 -0
  20. shotgun/agents/tools/codebase/retrieve_code.py +9 -0
  21. shotgun/agents/tools/file_management.py +32 -2
  22. shotgun/agents/tools/file_read_tools/__init__.py +7 -0
  23. shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
  24. shotgun/agents/tools/markdown_tools/__init__.py +62 -0
  25. shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
  26. shotgun/agents/tools/markdown_tools/models.py +86 -0
  27. shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
  28. shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
  29. shotgun/agents/tools/markdown_tools/utils.py +453 -0
  30. shotgun/agents/tools/registry.py +44 -6
  31. shotgun/agents/tools/web_search/openai.py +42 -23
  32. shotgun/attachments/__init__.py +41 -0
  33. shotgun/attachments/errors.py +60 -0
  34. shotgun/attachments/models.py +107 -0
  35. shotgun/attachments/parser.py +257 -0
  36. shotgun/attachments/processor.py +193 -0
  37. shotgun/build_constants.py +4 -7
  38. shotgun/cli/clear.py +2 -2
  39. shotgun/cli/codebase/commands.py +181 -65
  40. shotgun/cli/compact.py +2 -2
  41. shotgun/cli/context.py +2 -2
  42. shotgun/cli/error_handler.py +2 -2
  43. shotgun/cli/run.py +90 -0
  44. shotgun/cli/spec/backup.py +2 -1
  45. shotgun/codebase/__init__.py +2 -0
  46. shotgun/codebase/benchmarks/__init__.py +35 -0
  47. shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
  48. shotgun/codebase/benchmarks/exporters.py +119 -0
  49. shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
  50. shotgun/codebase/benchmarks/formatters/base.py +34 -0
  51. shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
  52. shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
  53. shotgun/codebase/benchmarks/models.py +129 -0
  54. shotgun/codebase/core/__init__.py +4 -0
  55. shotgun/codebase/core/call_resolution.py +91 -0
  56. shotgun/codebase/core/change_detector.py +11 -6
  57. shotgun/codebase/core/errors.py +159 -0
  58. shotgun/codebase/core/extractors/__init__.py +23 -0
  59. shotgun/codebase/core/extractors/base.py +138 -0
  60. shotgun/codebase/core/extractors/factory.py +63 -0
  61. shotgun/codebase/core/extractors/go/__init__.py +7 -0
  62. shotgun/codebase/core/extractors/go/extractor.py +122 -0
  63. shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
  64. shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
  65. shotgun/codebase/core/extractors/protocol.py +109 -0
  66. shotgun/codebase/core/extractors/python/__init__.py +7 -0
  67. shotgun/codebase/core/extractors/python/extractor.py +141 -0
  68. shotgun/codebase/core/extractors/rust/__init__.py +7 -0
  69. shotgun/codebase/core/extractors/rust/extractor.py +139 -0
  70. shotgun/codebase/core/extractors/types.py +15 -0
  71. shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
  72. shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
  73. shotgun/codebase/core/gitignore.py +252 -0
  74. shotgun/codebase/core/ingestor.py +644 -354
  75. shotgun/codebase/core/kuzu_compat.py +119 -0
  76. shotgun/codebase/core/language_config.py +239 -0
  77. shotgun/codebase/core/manager.py +256 -46
  78. shotgun/codebase/core/metrics_collector.py +310 -0
  79. shotgun/codebase/core/metrics_types.py +347 -0
  80. shotgun/codebase/core/parallel_executor.py +424 -0
  81. shotgun/codebase/core/work_distributor.py +254 -0
  82. shotgun/codebase/core/worker.py +768 -0
  83. shotgun/codebase/indexing_state.py +86 -0
  84. shotgun/codebase/models.py +94 -0
  85. shotgun/codebase/service.py +13 -0
  86. shotgun/exceptions.py +9 -9
  87. shotgun/main.py +3 -16
  88. shotgun/posthog_telemetry.py +165 -24
  89. shotgun/prompts/agents/file_read.j2 +48 -0
  90. shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +19 -47
  91. shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
  92. shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
  93. shotgun/prompts/agents/partials/router_delegation_mode.j2 +21 -22
  94. shotgun/prompts/agents/plan.j2 +14 -0
  95. shotgun/prompts/agents/router.j2 +531 -258
  96. shotgun/prompts/agents/specify.j2 +14 -0
  97. shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
  98. shotgun/prompts/agents/state/system_state.j2 +13 -11
  99. shotgun/prompts/agents/tasks.j2 +14 -0
  100. shotgun/settings.py +49 -10
  101. shotgun/tui/app.py +149 -18
  102. shotgun/tui/commands/__init__.py +9 -1
  103. shotgun/tui/components/attachment_bar.py +87 -0
  104. shotgun/tui/components/prompt_input.py +25 -28
  105. shotgun/tui/components/status_bar.py +14 -7
  106. shotgun/tui/dependencies.py +3 -8
  107. shotgun/tui/protocols.py +18 -0
  108. shotgun/tui/screens/chat/chat.tcss +15 -0
  109. shotgun/tui/screens/chat/chat_screen.py +766 -235
  110. shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
  111. shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
  112. shotgun/tui/screens/chat_screen/command_providers.py +0 -10
  113. shotgun/tui/screens/chat_screen/history/chat_history.py +54 -14
  114. shotgun/tui/screens/chat_screen/history/formatters.py +22 -0
  115. shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
  116. shotgun/tui/screens/database_locked_dialog.py +219 -0
  117. shotgun/tui/screens/database_timeout_dialog.py +158 -0
  118. shotgun/tui/screens/kuzu_error_dialog.py +135 -0
  119. shotgun/tui/screens/model_picker.py +1 -3
  120. shotgun/tui/screens/models.py +11 -0
  121. shotgun/tui/state/processing_state.py +19 -0
  122. shotgun/tui/widgets/widget_coordinator.py +18 -0
  123. shotgun/utils/file_system_utils.py +4 -1
  124. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/METADATA +87 -34
  125. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/RECORD +128 -79
  126. shotgun/cli/export.py +0 -81
  127. shotgun/cli/plan.py +0 -73
  128. shotgun/cli/research.py +0 -93
  129. shotgun/cli/specify.py +0 -70
  130. shotgun/cli/tasks.py +0 -78
  131. shotgun/sentry_telemetry.py +0 -232
  132. shotgun/tui/screens/onboarding.py +0 -584
  133. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/WHEEL +0 -0
  134. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/entry_points.txt +0 -0
  135. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,80 +1,73 @@
1
1
  """Kuzu graph ingestor for building code knowledge graphs."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
6
  import hashlib
7
+ import multiprocessing
5
8
  import os
6
9
  import time
7
10
  import uuid
8
11
  from collections import defaultdict
9
- from collections.abc import Callable
12
+ from collections.abc import Callable, Generator
13
+ from contextlib import contextmanager
10
14
  from pathlib import Path
11
- from typing import Any
15
+ from typing import TYPE_CHECKING, Any
12
16
 
13
17
  import aiofiles
14
- import real_ladybug as kuzu
15
18
  from tree_sitter import Node, Parser, QueryCursor
16
19
 
17
- from shotgun.codebase.core.language_config import LANGUAGE_CONFIGS, get_language_config
20
+ from shotgun.codebase.core.call_resolution import calculate_callee_confidence
21
+ from shotgun.codebase.core.gitignore import GitignoreManager
22
+ from shotgun.codebase.core.kuzu_compat import get_kuzu
23
+ from shotgun.codebase.core.metrics_collector import MetricsCollector
24
+ from shotgun.codebase.core.metrics_types import (
25
+ FileInfo,
26
+ IndexingPhase,
27
+ ParallelExecutionResult,
28
+ )
29
+ from shotgun.codebase.core.parallel_executor import ParallelExecutor
30
+ from shotgun.codebase.core.work_distributor import WorkDistributor, get_worker_count
31
+ from shotgun.codebase.models import (
32
+ IgnoreReason,
33
+ IndexingStats,
34
+ IndexProgress,
35
+ NodeLabel,
36
+ ProgressPhase,
37
+ RelationshipType,
38
+ )
39
+ from shotgun.posthog_telemetry import track_event
40
+ from shotgun.settings import settings
41
+
42
+ if TYPE_CHECKING:
43
+ import real_ladybug as kuzu
44
+
45
+ from shotgun.codebase.core.language_config import (
46
+ LANGUAGE_CONFIGS,
47
+ get_all_ignore_directories,
48
+ get_language_config,
49
+ is_path_ignored,
50
+ should_ignore_directory,
51
+ )
18
52
  from shotgun.codebase.core.parser_loader import load_parsers
19
53
  from shotgun.logging_config import get_logger
20
54
 
21
55
  logger = get_logger(__name__)
22
56
 
57
+ # For backwards compatibility, expose IGNORE_PATTERNS from this module
58
+ IGNORE_PATTERNS = get_all_ignore_directories()
23
59
 
24
- # Directories that should never be traversed during indexing
25
- BASE_IGNORE_DIRECTORIES = {
26
- ".git",
27
- "venv",
28
- ".venv",
29
- "__pycache__",
30
- ".eggs",
31
- ".pytest_cache",
32
- ".mypy_cache",
33
- ".ruff_cache",
34
- ".claude",
35
- ".idea",
36
- ".vscode",
37
- }
38
-
39
- # Well-known build output directories to skip when determining source files
40
- BUILD_ARTIFACT_DIRECTORIES = {
41
- "node_modules",
42
- ".next",
43
- ".nuxt",
44
- ".vite",
45
- ".yarn",
46
- ".svelte-kit",
47
- ".output",
48
- ".turbo",
49
- ".parcel-cache",
50
- ".vercel",
51
- ".serverless",
52
- "build",
53
- "dist",
54
- "out",
55
- "tmp",
56
- "coverage",
57
- }
58
-
59
- # Default ignore patterns combines base directories and build artifacts
60
- IGNORE_PATTERNS = BASE_IGNORE_DIRECTORIES | BUILD_ARTIFACT_DIRECTORIES
61
-
62
- # Directory prefixes that should always be ignored
63
- IGNORED_DIRECTORY_PREFIXES = (".",)
64
-
65
-
66
- def should_ignore_directory(name: str, ignore_patterns: set[str] | None = None) -> bool:
67
- """Return True if the directory name should be ignored."""
68
- patterns = IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns
69
- if name in patterns:
70
- return True
71
- return name.startswith(IGNORED_DIRECTORY_PREFIXES)
72
-
73
-
74
- def is_path_ignored(path: Path, ignore_patterns: set[str] | None = None) -> bool:
75
- """Return True if any part of the path should be ignored."""
76
- patterns = IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns
77
- return any(should_ignore_directory(part, patterns) for part in path.parts)
60
+ # Explicit re-exports for type checkers
61
+ __all__ = [
62
+ "IGNORE_PATTERNS",
63
+ "LANGUAGE_CONFIGS",
64
+ "Ingestor",
65
+ "SimpleGraphBuilder",
66
+ "get_all_ignore_directories",
67
+ "get_language_config",
68
+ "is_path_ignored",
69
+ "should_ignore_directory",
70
+ ]
78
71
 
79
72
 
80
73
  class Ingestor:
@@ -87,6 +80,8 @@ class Ingestor:
87
80
  tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]
88
81
  ] = []
89
82
  self.batch_size = 1000
83
+ # Track seen primary keys to avoid O(n²) duplicate checking
84
+ self._seen_node_keys: set[tuple[str, str]] = set()
90
85
 
91
86
  def create_schema(self) -> None:
92
87
  """Create the graph schema in Kuzu."""
@@ -159,10 +154,13 @@ class Ingestor:
159
154
 
160
155
  def ensure_node_batch(self, label: str, properties: dict[str, Any]) -> None:
161
156
  """Add a node to the buffer for batch insertion."""
162
- # Check for duplicates based on primary key
157
+ # Check for duplicates based on primary key using O(1) set lookup
163
158
  primary_key = self._get_primary_key(label, properties)
164
- if primary_key and self._is_duplicate_node(label, primary_key):
165
- return
159
+ if primary_key:
160
+ key = (label, primary_key)
161
+ if key in self._seen_node_keys:
162
+ return
163
+ self._seen_node_keys.add(key)
166
164
 
167
165
  self.node_buffer.append((label, properties))
168
166
 
@@ -176,29 +174,26 @@ class Ingestor:
176
174
 
177
175
  def _get_primary_key_field(self, label: str) -> str | None:
178
176
  """Get the primary key field name for a node type."""
179
- if label == "Project":
177
+ if label == NodeLabel.PROJECT:
180
178
  return "name"
181
- elif label in ["Package", "Module", "Class", "Function", "Method"]:
179
+ elif label in [
180
+ NodeLabel.PACKAGE,
181
+ NodeLabel.MODULE,
182
+ NodeLabel.CLASS,
183
+ NodeLabel.FUNCTION,
184
+ NodeLabel.METHOD,
185
+ ]:
182
186
  return "qualified_name"
183
- elif label in ["Folder", "File"]:
187
+ elif label in [NodeLabel.FOLDER, NodeLabel.FILE]:
184
188
  return "path"
185
- elif label == "FileMetadata":
189
+ elif label == NodeLabel.FILE_METADATA:
186
190
  return "filepath"
187
- elif label == "ExternalPackage":
191
+ elif label == NodeLabel.EXTERNAL_PACKAGE:
188
192
  return "name"
189
- elif label == "DeletionLog":
193
+ elif label == NodeLabel.DELETION_LOG:
190
194
  return "id"
191
195
  return None
192
196
 
193
- def _is_duplicate_node(self, label: str, primary_key: str) -> bool:
194
- """Check if a node with the given primary key already exists in the buffer."""
195
- for buffered_label, buffered_props in self.node_buffer:
196
- if buffered_label == label:
197
- buffered_key = self._get_primary_key(buffered_label, buffered_props)
198
- if buffered_key == primary_key:
199
- return True
200
- return False
201
-
202
197
  def flush_nodes(
203
198
  self,
204
199
  progress_callback: Callable[[int, int], None] | None = None,
@@ -272,6 +267,7 @@ class Ingestor:
272
267
  logger.info(f" {label}: {count}")
273
268
 
274
269
  self.node_buffer.clear()
270
+ self._seen_node_keys.clear()
275
271
 
276
272
  def ensure_relationship_batch(
277
273
  self,
@@ -410,45 +406,46 @@ class Ingestor:
410
406
  ) -> str | None:
411
407
  """Determine the actual relationship table name based on source and target."""
412
408
  # Mapping of relationship types and from_labels to table names
413
- table_mapping = {
414
- "CONTAINS_PACKAGE": {
415
- "Project": "CONTAINS_PACKAGE",
416
- "Package": "CONTAINS_PACKAGE_PKG",
417
- "Folder": "CONTAINS_PACKAGE_FOLDER",
409
+ # Keys use enum values for type-safe comparisons
410
+ table_mapping: dict[str, dict[str, str]] = {
411
+ RelationshipType.CONTAINS_PACKAGE: {
412
+ NodeLabel.PROJECT: "CONTAINS_PACKAGE",
413
+ NodeLabel.PACKAGE: "CONTAINS_PACKAGE_PKG",
414
+ NodeLabel.FOLDER: "CONTAINS_PACKAGE_FOLDER",
418
415
  },
419
- "CONTAINS_FOLDER": {
420
- "Project": "CONTAINS_FOLDER",
421
- "Package": "CONTAINS_FOLDER_PKG",
422
- "Folder": "CONTAINS_FOLDER_FOLDER",
416
+ RelationshipType.CONTAINS_FOLDER: {
417
+ NodeLabel.PROJECT: "CONTAINS_FOLDER",
418
+ NodeLabel.PACKAGE: "CONTAINS_FOLDER_PKG",
419
+ NodeLabel.FOLDER: "CONTAINS_FOLDER_FOLDER",
423
420
  },
424
- "CONTAINS_FILE": {
425
- "Project": "CONTAINS_FILE",
426
- "Package": "CONTAINS_FILE_PKG",
427
- "Folder": "CONTAINS_FILE_FOLDER",
421
+ RelationshipType.CONTAINS_FILE: {
422
+ NodeLabel.PROJECT: "CONTAINS_FILE",
423
+ NodeLabel.PACKAGE: "CONTAINS_FILE_PKG",
424
+ NodeLabel.FOLDER: "CONTAINS_FILE_FOLDER",
428
425
  },
429
- "CONTAINS_MODULE": {
430
- "Project": "CONTAINS_MODULE",
431
- "Package": "CONTAINS_MODULE_PKG",
432
- "Folder": "CONTAINS_MODULE_FOLDER",
426
+ RelationshipType.CONTAINS_MODULE: {
427
+ NodeLabel.PROJECT: "CONTAINS_MODULE",
428
+ NodeLabel.PACKAGE: "CONTAINS_MODULE_PKG",
429
+ NodeLabel.FOLDER: "CONTAINS_MODULE_FOLDER",
433
430
  },
434
431
  }
435
432
 
436
433
  if rel_type in table_mapping:
437
434
  return table_mapping[rel_type].get(from_label)
438
- elif rel_type == "DEFINES":
439
- if to_label == "Function":
440
- return "DEFINES_FUNC"
435
+ elif rel_type == RelationshipType.DEFINES:
436
+ if to_label == NodeLabel.FUNCTION:
437
+ return RelationshipType.DEFINES_FUNC
441
438
  else:
442
- return "DEFINES"
443
- elif rel_type == "CALLS":
444
- if from_label == "Function" and to_label == "Function":
445
- return "CALLS"
446
- elif from_label == "Function" and to_label == "Method":
447
- return "CALLS_FM"
448
- elif from_label == "Method" and to_label == "Function":
449
- return "CALLS_MF"
450
- elif from_label == "Method" and to_label == "Method":
451
- return "CALLS_MM"
439
+ return RelationshipType.DEFINES
440
+ elif rel_type == RelationshipType.CALLS:
441
+ if from_label == NodeLabel.FUNCTION and to_label == NodeLabel.FUNCTION:
442
+ return RelationshipType.CALLS
443
+ elif from_label == NodeLabel.FUNCTION and to_label == NodeLabel.METHOD:
444
+ return RelationshipType.CALLS_FM
445
+ elif from_label == NodeLabel.METHOD and to_label == NodeLabel.FUNCTION:
446
+ return RelationshipType.CALLS_MF
447
+ elif from_label == NodeLabel.METHOD and to_label == NodeLabel.METHOD:
448
+ return RelationshipType.CALLS_MM
452
449
  elif rel_type.startswith("TRACKS_"):
453
450
  # TRACKS relationships already have the correct table name
454
451
  return rel_type
@@ -548,10 +545,10 @@ class Ingestor:
548
545
 
549
546
  # Delete each type of node tracked by this file
550
547
  for node_type, rel_type, stat_key in [
551
- ("Module", "TRACKS_Module", "modules"),
552
- ("Class", "TRACKS_Class", "classes"),
553
- ("Function", "TRACKS_Function", "functions"),
554
- ("Method", "TRACKS_Method", "methods"),
548
+ (NodeLabel.MODULE, RelationshipType.TRACKS_MODULE, "modules"),
549
+ (NodeLabel.CLASS, RelationshipType.TRACKS_CLASS, "classes"),
550
+ (NodeLabel.FUNCTION, RelationshipType.TRACKS_FUNCTION, "functions"),
551
+ (NodeLabel.METHOD, RelationshipType.TRACKS_METHOD, "methods"),
555
552
  ]:
556
553
  try:
557
554
  # First get the nodes to delete (for logging)
@@ -614,6 +611,9 @@ class SimpleGraphBuilder:
614
611
  queries: dict[str, Any],
615
612
  exclude_patterns: list[str] | None = None,
616
613
  progress_callback: Any | None = None,
614
+ respect_gitignore: bool = True,
615
+ metrics_collector: MetricsCollector | None = None,
616
+ enable_parallel: bool = True,
617
617
  ):
618
618
  self.ingestor = ingestor
619
619
  self.repo_path = repo_path
@@ -624,10 +624,26 @@ class SimpleGraphBuilder:
624
624
  if exclude_patterns:
625
625
  self.ignore_dirs = self.ignore_dirs.union(set(exclude_patterns))
626
626
  self.progress_callback = progress_callback
627
+ self.metrics_collector = metrics_collector
628
+
629
+ # Initialize gitignore support
630
+ self.respect_gitignore = respect_gitignore
631
+ self.gitignore_manager: GitignoreManager | None = None
632
+ if respect_gitignore:
633
+ self.gitignore_manager = GitignoreManager(repo_path)
634
+ if self.gitignore_manager.stats.patterns_loaded > 0:
635
+ logger.info(
636
+ f"Loaded gitignore patterns - "
637
+ f"files: {self.gitignore_manager.stats.gitignore_files_loaded}, "
638
+ f"patterns: {self.gitignore_manager.stats.patterns_loaded}"
639
+ )
627
640
 
628
641
  # Generate unique session ID for correlating timing events in PostHog
629
642
  self._index_session_id = str(uuid.uuid4())[:8]
630
643
 
644
+ # Statistics for tracking what was indexed vs skipped
645
+ self._index_stats = IndexingStats()
646
+
631
647
  # Caches
632
648
  self.structural_elements: dict[Path, str | None] = {}
633
649
  self.ast_cache: dict[Path, tuple[Node, str]] = {}
@@ -635,6 +651,153 @@ class SimpleGraphBuilder:
635
651
  self.simple_name_lookup: dict[str, set[str]] = defaultdict(set)
636
652
  self.class_inheritance: dict[str, list[str]] = {} # class_qn -> [parent_qns]
637
653
 
654
+ # Parallel execution support
655
+ self.enable_parallel = enable_parallel
656
+ self.parallel_executor: ParallelExecutor | None = None
657
+ self._parallel_mode_active = False # Track if parallel was used for this run
658
+ self._worker_count = 0
659
+ self._init_parallel_executor()
660
+
661
+ def _init_parallel_executor(self) -> None:
662
+ """Initialize parallel executor if conditions are met.
663
+
664
+ Conditions for parallel execution:
665
+ 1. enable_parallel=True (constructor parameter)
666
+ 2. settings.indexing.index_parallel is True
667
+ 3. CPU count >= 4
668
+ """
669
+ # Check settings override
670
+ if not settings.indexing.index_parallel:
671
+ logger.info("Parallel indexing disabled via SHOTGUN_INDEX_PARALLEL=false")
672
+ return
673
+
674
+ if not self.enable_parallel:
675
+ logger.debug("Parallel indexing disabled via enable_parallel=False")
676
+ return
677
+
678
+ cpu_count = multiprocessing.cpu_count()
679
+ if cpu_count < 4:
680
+ logger.info(f"Parallel indexing disabled: CPU count ({cpu_count}) < 4")
681
+ return
682
+
683
+ worker_count = get_worker_count()
684
+ self.parallel_executor = ParallelExecutor(
685
+ worker_count=worker_count,
686
+ metrics_collector=self.metrics_collector,
687
+ )
688
+ self._worker_count = worker_count
689
+ logger.info(f"Parallel indexing enabled with {worker_count} workers")
690
+
691
+ def _build_file_infos(
692
+ self, files_to_process: list[tuple[Path, str]]
693
+ ) -> list[FileInfo]:
694
+ """Convert files_to_process list to FileInfo objects for parallel execution.
695
+
696
+ Args:
697
+ files_to_process: List of (filepath, language) tuples
698
+
699
+ Returns:
700
+ List of FileInfo objects ready for WorkDistributor
701
+ """
702
+ file_infos: list[FileInfo] = []
703
+
704
+ for filepath, language in files_to_process:
705
+ relative_path = filepath.relative_to(self.repo_path)
706
+
707
+ # Compute module_qn (same logic as _process_single_file)
708
+ if filepath.name == "__init__.py":
709
+ module_qn = ".".join(
710
+ [self.project_name] + list(relative_path.parent.parts)
711
+ )
712
+ else:
713
+ module_qn = ".".join(
714
+ [self.project_name] + list(relative_path.with_suffix("").parts)
715
+ )
716
+
717
+ # Get container qualified name from structural elements
718
+ parent_rel_path = relative_path.parent
719
+ container_qn = self.structural_elements.get(parent_rel_path)
720
+
721
+ try:
722
+ file_size = filepath.stat().st_size
723
+ except OSError:
724
+ file_size = 0
725
+
726
+ file_infos.append(
727
+ FileInfo(
728
+ file_path=filepath,
729
+ relative_path=relative_path,
730
+ language=language,
731
+ module_qn=module_qn,
732
+ container_qn=container_qn,
733
+ file_size_bytes=file_size,
734
+ )
735
+ )
736
+
737
+ return file_infos
738
+
739
+ def _merge_parallel_results(self, result: ParallelExecutionResult) -> None:
740
+ """Merge parallel execution results into Ingestor buffers and local caches.
741
+
742
+ Args:
743
+ result: ParallelExecutionResult containing all parsed file data
744
+ """
745
+ # Merge nodes and direct relationships from each file
746
+ for file_result in result.results:
747
+ if not file_result.success:
748
+ logger.warning(
749
+ f"File {file_result.task.file_path} failed: {file_result.error}"
750
+ )
751
+ continue
752
+
753
+ # Add nodes to buffer
754
+ for node in file_result.nodes:
755
+ self.ingestor.ensure_node_batch(node.label, node.properties)
756
+
757
+ # Add direct relationships to buffer
758
+ for rel in file_result.relationships:
759
+ self.ingestor.ensure_relationship_batch(
760
+ rel.from_label,
761
+ rel.from_key,
762
+ rel.from_value,
763
+ rel.rel_type,
764
+ rel.to_label,
765
+ rel.to_key,
766
+ rel.to_value,
767
+ rel.properties,
768
+ )
769
+
770
+ # Add resolved relationships (calls, inheritance) from aggregation
771
+ for rel in result.resolved_relationships:
772
+ self.ingestor.ensure_relationship_batch(
773
+ rel.from_label,
774
+ rel.from_key,
775
+ rel.from_value,
776
+ rel.rel_type,
777
+ rel.to_label,
778
+ rel.to_key,
779
+ rel.to_value,
780
+ rel.properties,
781
+ )
782
+
783
+ # Merge registries into local caches
784
+ self.function_registry.update(result.function_registry)
785
+ for name, qns in result.simple_name_lookup.items():
786
+ for qn in qns:
787
+ self.simple_name_lookup[name].add(qn)
788
+
789
+ # Merge inheritance data for potential future use
790
+ for file_result in result.results:
791
+ if file_result.success:
792
+ for inh in file_result.inheritance_data:
793
+ self.class_inheritance[inh.child_class_qn] = inh.parent_simple_names
794
+
795
+ logger.info(
796
+ f"Merged parallel results: {result.successful_files} files, "
797
+ f"{len(result.function_registry)} registry entries, "
798
+ f"{len(result.resolved_relationships)} resolved relationships"
799
+ )
800
+
638
801
  def _report_progress(
639
802
  self,
640
803
  phase: str,
@@ -648,9 +811,6 @@ class SimpleGraphBuilder:
648
811
  return
649
812
 
650
813
  try:
651
- # Import here to avoid circular dependency
652
- from shotgun.codebase.models import IndexProgress, ProgressPhase
653
-
654
814
  progress = IndexProgress(
655
815
  phase=ProgressPhase(phase),
656
816
  phase_name=phase_name,
@@ -671,8 +831,6 @@ class SimpleGraphBuilder:
671
831
  extra_props: dict[str, Any] | None = None,
672
832
  ) -> None:
673
833
  """Log timing data to PostHog for analysis."""
674
- from shotgun.posthog_telemetry import track_event
675
-
676
834
  properties: dict[str, Any] = {
677
835
  "session_id": self._index_session_id,
678
836
  "phase": phase,
@@ -692,8 +850,6 @@ class SimpleGraphBuilder:
692
850
  total_relationships: int,
693
851
  ) -> None:
694
852
  """Log indexing summary event to PostHog."""
695
- from shotgun.posthog_telemetry import track_event
696
-
697
853
  track_event(
698
854
  "codebase_index_completed",
699
855
  {
@@ -705,6 +861,24 @@ class SimpleGraphBuilder:
705
861
  },
706
862
  )
707
863
 
864
+ @contextmanager
865
+ def _track_phase(
866
+ self, phase: IndexingPhase, get_items_count: Callable[[], int]
867
+ ) -> Generator[None, None, None]:
868
+ """Context manager for tracking phase metrics.
869
+
870
+ Args:
871
+ phase: The indexing phase to track
872
+ get_items_count: Callable that returns the items processed count
873
+ """
874
+ if self.metrics_collector:
875
+ self.metrics_collector.start_phase(phase)
876
+ try:
877
+ yield
878
+ finally:
879
+ if self.metrics_collector:
880
+ self.metrics_collector.end_phase(phase, get_items_count())
881
+
708
882
  async def run(self) -> None:
709
883
  """Run the three-pass graph building process."""
710
884
  logger.info(f"Building graph for project: {self.project_name}")
@@ -712,17 +886,25 @@ class SimpleGraphBuilder:
712
886
  # Pass 1: Structure
713
887
  logger.info("Pass 1: Identifying packages and folders...")
714
888
  t0 = time.time()
715
- self._identify_structure()
889
+ with self._track_phase(
890
+ IndexingPhase.STRUCTURE, lambda: self._index_stats.dirs_scanned
891
+ ):
892
+ self._identify_structure()
716
893
  t1 = time.time()
717
- self._log_timing("structure", t1 - t0, len(self.structural_elements))
894
+ self._log_timing(
895
+ IndexingPhase.STRUCTURE, t1 - t0, len(self.structural_elements)
896
+ )
718
897
 
719
898
  # Pass 2: Definitions
720
899
  logger.info("Pass 2: Processing files and extracting definitions...")
721
900
  t2 = time.time()
722
- await self._process_files()
901
+ with self._track_phase(
902
+ IndexingPhase.DEFINITIONS, lambda: self._index_stats.files_processed
903
+ ):
904
+ await self._process_files()
723
905
  t3 = time.time()
724
906
  self._log_timing(
725
- "definitions",
907
+ IndexingPhase.DEFINITIONS,
726
908
  t3 - t2,
727
909
  len(self.ast_cache),
728
910
  {"file_count": len(self.ast_cache)},
@@ -731,52 +913,71 @@ class SimpleGraphBuilder:
731
913
  # Pass 3: Relationships
732
914
  logger.info("Pass 3: Processing relationships (calls, imports)...")
733
915
  t4 = time.time()
734
- self._process_relationships()
916
+ with self._track_phase(
917
+ IndexingPhase.RELATIONSHIPS, lambda: self._index_stats.files_processed
918
+ ):
919
+ self._process_relationships()
735
920
  t5 = time.time()
736
- self._log_timing("relationships", t5 - t4, len(self.ast_cache))
921
+ self._log_timing(IndexingPhase.RELATIONSHIPS, t5 - t4, len(self.ast_cache))
737
922
 
738
923
  # Flush all pending operations
739
924
  logger.info("Flushing all data to database...")
740
- t6 = time.time()
741
925
  node_count = len(self.ingestor.node_buffer)
742
926
 
743
927
  # Create progress callback for flush_nodes
744
928
  def node_progress(current: int, total: int) -> None:
745
929
  self._report_progress(
746
- "flush_nodes", "Flushing nodes to database", current, total
930
+ IndexingPhase.FLUSH_NODES, "Flushing nodes to database", current, total
747
931
  )
748
932
 
749
- self.ingestor.flush_nodes(progress_callback=node_progress)
933
+ t6 = time.time()
934
+ with self._track_phase(IndexingPhase.FLUSH_NODES, lambda: node_count):
935
+ self.ingestor.flush_nodes(progress_callback=node_progress)
936
+ t7 = time.time()
750
937
  self._report_progress(
751
- "flush_nodes", "Flushing nodes to database", node_count, node_count, True
938
+ IndexingPhase.FLUSH_NODES,
939
+ "Flushing nodes to database",
940
+ node_count,
941
+ node_count,
942
+ True,
943
+ )
944
+ self._log_timing(
945
+ IndexingPhase.FLUSH_NODES, t7 - t6, node_count, {"node_count": node_count}
752
946
  )
753
- t7 = time.time()
754
- self._log_timing("flush_nodes", t7 - t6, node_count, {"node_count": node_count})
755
947
 
756
948
  rel_count = len(self.ingestor.relationship_buffer)
757
949
 
758
950
  # Create progress callback for flush_relationships
759
951
  def rel_progress(current: int, total: int) -> None:
760
952
  self._report_progress(
761
- "flush_relationships",
953
+ IndexingPhase.FLUSH_RELATIONSHIPS,
762
954
  "Flushing relationships to database",
763
955
  current,
764
956
  total,
765
957
  )
766
958
 
767
- self.ingestor.flush_relationships(progress_callback=rel_progress)
959
+ t8_start = time.time()
960
+ with self._track_phase(IndexingPhase.FLUSH_RELATIONSHIPS, lambda: rel_count):
961
+ self.ingestor.flush_relationships(progress_callback=rel_progress)
962
+ t8 = time.time()
768
963
  self._report_progress(
769
- "flush_relationships",
964
+ IndexingPhase.FLUSH_RELATIONSHIPS,
770
965
  "Flushing relationships to database",
771
966
  rel_count,
772
967
  rel_count,
773
968
  True,
774
969
  )
775
- t8 = time.time()
776
970
  self._log_timing(
777
- "flush_relationships", t8 - t7, rel_count, {"relationship_count": rel_count}
971
+ IndexingPhase.FLUSH_RELATIONSHIPS,
972
+ t8 - t8_start,
973
+ rel_count,
974
+ {"relationship_count": rel_count},
778
975
  )
779
976
 
977
+ # Update metrics collector with totals
978
+ if self.metrics_collector:
979
+ self.metrics_collector.set_totals(node_count, rel_count)
980
+
780
981
  # Track summary event with totals (no PII - only numeric metadata)
781
982
  total_duration = t8 - t0
782
983
  self._log_summary(
@@ -786,15 +987,79 @@ class SimpleGraphBuilder:
786
987
  total_relationships=rel_count,
787
988
  )
788
989
 
990
+ # Log final indexing statistics
991
+ logger.info("=== Indexing Statistics ===")
992
+ logger.info(f" Directories scanned: {self._index_stats.dirs_scanned}")
993
+ logger.info(
994
+ f" Directories ignored (hardcoded patterns): {self._index_stats.dirs_ignored_hardcoded}"
995
+ )
996
+ logger.info(
997
+ f" Directories ignored (gitignore): {self._index_stats.dirs_ignored_gitignore}"
998
+ )
999
+ logger.info(f" Files scanned: {self._index_stats.files_scanned}")
1000
+ logger.info(
1001
+ f" Files ignored (hardcoded patterns): {self._index_stats.files_ignored_hardcoded}"
1002
+ )
1003
+ logger.info(
1004
+ f" Files ignored (gitignore): {self._index_stats.files_ignored_gitignore}"
1005
+ )
1006
+ logger.info(
1007
+ f" Files ignored (no parser): {self._index_stats.files_ignored_no_parser}"
1008
+ )
1009
+ logger.info(f" Files processed: {self._index_stats.files_processed}")
1010
+
1011
+ # Log gitignore manager stats if available
1012
+ if self.gitignore_manager:
1013
+ logger.info(f" {self.gitignore_manager.get_stats_summary()}")
1014
+
789
1015
  logger.info("Graph building complete!")
790
1016
 
1017
+ def _should_ignore_directory(
1018
+ self, dir_path: Path, dir_name: str
1019
+ ) -> tuple[bool, IgnoreReason | None]:
1020
+ """Check if a directory should be ignored.
1021
+
1022
+ Args:
1023
+ dir_path: Full path to the directory
1024
+ dir_name: Name of the directory
1025
+
1026
+ Returns:
1027
+ Tuple of (should_ignore, reason)
1028
+ """
1029
+ # Check hardcoded patterns first (fastest)
1030
+ if should_ignore_directory(dir_name, self.ignore_dirs):
1031
+ return True, IgnoreReason.HARDCODED
1032
+
1033
+ # Check gitignore patterns
1034
+ if self.gitignore_manager:
1035
+ try:
1036
+ relative_path = dir_path.relative_to(self.repo_path)
1037
+ if self.gitignore_manager.is_directory_ignored(relative_path):
1038
+ return True, IgnoreReason.GITIGNORE
1039
+ except ValueError:
1040
+ pass
1041
+
1042
+ return False, None
1043
+
791
1044
  def _identify_structure(self) -> None:
792
1045
  """First pass: Walk directory to find packages and folders."""
793
1046
  dir_count = 0
794
1047
  for root_str, dirs, _ in os.walk(self.repo_path, topdown=True):
795
- dirs[:] = [
796
- d for d in dirs if not should_ignore_directory(d, self.ignore_dirs)
797
- ]
1048
+ # Filter directories - modifying dirs in-place affects os.walk traversal
1049
+ filtered_dirs = []
1050
+ for d in dirs:
1051
+ dir_path = Path(root_str) / d
1052
+ should_ignore, reason = self._should_ignore_directory(dir_path, d)
1053
+ if should_ignore:
1054
+ if reason == IgnoreReason.HARDCODED:
1055
+ self._index_stats.dirs_ignored_hardcoded += 1
1056
+ elif reason == IgnoreReason.GITIGNORE:
1057
+ self._index_stats.dirs_ignored_gitignore += 1
1058
+ else:
1059
+ filtered_dirs.append(d)
1060
+ self._index_stats.dirs_scanned += 1
1061
+
1062
+ dirs[:] = filtered_dirs
798
1063
  root = Path(root_str)
799
1064
  relative_root = root.relative_to(self.repo_path)
800
1065
 
@@ -831,7 +1096,7 @@ class SimpleGraphBuilder:
831
1096
  # Create package
832
1097
  package_qn = ".".join([self.project_name] + list(relative_root.parts))
833
1098
  self.ingestor.ensure_node_batch(
834
- "Package",
1099
+ NodeLabel.PACKAGE,
835
1100
  {
836
1101
  "qualified_name": package_qn,
837
1102
  "name": relative_root.name,
@@ -843,22 +1108,22 @@ class SimpleGraphBuilder:
843
1108
  if parent_container_qn:
844
1109
  # Parent is a package
845
1110
  self.ingestor.ensure_relationship_batch(
846
- "Package",
1111
+ NodeLabel.PACKAGE,
847
1112
  "qualified_name",
848
1113
  parent_container_qn,
849
- "CONTAINS_PACKAGE",
850
- "Package",
1114
+ RelationshipType.CONTAINS_PACKAGE,
1115
+ NodeLabel.PACKAGE,
851
1116
  "qualified_name",
852
1117
  package_qn,
853
1118
  )
854
1119
  else:
855
1120
  # Parent is project root
856
1121
  self.ingestor.ensure_relationship_batch(
857
- "Project",
1122
+ NodeLabel.PROJECT,
858
1123
  "name",
859
1124
  self.project_name,
860
- "CONTAINS_PACKAGE",
861
- "Package",
1125
+ RelationshipType.CONTAINS_PACKAGE,
1126
+ NodeLabel.PACKAGE,
862
1127
  "qualified_name",
863
1128
  package_qn,
864
1129
  )
@@ -867,7 +1132,7 @@ class SimpleGraphBuilder:
867
1132
  else:
868
1133
  # Create folder
869
1134
  self.ingestor.ensure_node_batch(
870
- "Folder",
1135
+ NodeLabel.FOLDER,
871
1136
  {
872
1137
  "path": str(relative_root).replace(os.sep, "/"),
873
1138
  "name": relative_root.name,
@@ -878,33 +1143,33 @@ class SimpleGraphBuilder:
878
1143
  if parent_container_qn:
879
1144
  # Parent is a package
880
1145
  self.ingestor.ensure_relationship_batch(
881
- "Package",
1146
+ NodeLabel.PACKAGE,
882
1147
  "qualified_name",
883
1148
  parent_container_qn,
884
- "CONTAINS_FOLDER",
885
- "Folder",
1149
+ RelationshipType.CONTAINS_FOLDER,
1150
+ NodeLabel.FOLDER,
886
1151
  "path",
887
1152
  str(relative_root).replace(os.sep, "/"),
888
1153
  )
889
1154
  elif parent_rel_path == Path("."):
890
1155
  # Parent is project root
891
1156
  self.ingestor.ensure_relationship_batch(
892
- "Project",
1157
+ NodeLabel.PROJECT,
893
1158
  "name",
894
1159
  self.project_name,
895
- "CONTAINS_FOLDER",
896
- "Folder",
1160
+ RelationshipType.CONTAINS_FOLDER,
1161
+ NodeLabel.FOLDER,
897
1162
  "path",
898
1163
  str(relative_root).replace(os.sep, "/"),
899
1164
  )
900
1165
  else:
901
1166
  # Parent is another folder
902
1167
  self.ingestor.ensure_relationship_batch(
903
- "Folder",
1168
+ NodeLabel.FOLDER,
904
1169
  "path",
905
1170
  str(parent_rel_path).replace(os.sep, "/"),
906
- "CONTAINS_FOLDER",
907
- "Folder",
1171
+ RelationshipType.CONTAINS_FOLDER,
1172
+ NodeLabel.FOLDER,
908
1173
  "path",
909
1174
  str(relative_root).replace(os.sep, "/"),
910
1175
  )
@@ -919,62 +1184,195 @@ class SimpleGraphBuilder:
919
1184
  phase_complete=True,
920
1185
  )
921
1186
 
1187
+ def _should_ignore_file(self, filepath: Path) -> tuple[bool, IgnoreReason | None]:
1188
+ """Check if a file should be ignored.
1189
+
1190
+ Args:
1191
+ filepath: Full path to the file
1192
+
1193
+ Returns:
1194
+ Tuple of (should_ignore, reason)
1195
+ """
1196
+ # Check hardcoded directory patterns in path
1197
+ if is_path_ignored(filepath, self.ignore_dirs):
1198
+ return True, IgnoreReason.HARDCODED
1199
+
1200
+ # Check gitignore patterns
1201
+ if self.gitignore_manager:
1202
+ try:
1203
+ relative_path = filepath.relative_to(self.repo_path)
1204
+ if self.gitignore_manager.is_ignored(relative_path):
1205
+ return True, IgnoreReason.GITIGNORE
1206
+ except ValueError:
1207
+ pass
1208
+
1209
+ return False, None
1210
+
922
1211
  async def _process_files(self) -> None:
923
1212
  """Second pass: Process files and extract definitions."""
924
- # First pass: Count total files
1213
+ # First pass: Count total files (respecting all ignore patterns)
925
1214
  total_files = 0
926
- for root_str, _, files in os.walk(self.repo_path):
1215
+ files_to_process: list[tuple[Path, str]] = []
1216
+
1217
+ for root_str, dirs, files in os.walk(self.repo_path, topdown=True):
927
1218
  root = Path(root_str)
928
1219
 
929
- # Skip ignored directories
930
- if is_path_ignored(root, self.ignore_dirs):
931
- continue
1220
+ # Filter directories in-place to prevent os.walk from descending
1221
+ filtered_dirs = []
1222
+ for d in dirs:
1223
+ dir_path = root / d
1224
+ should_ignore, _ = self._should_ignore_directory(dir_path, d)
1225
+ if not should_ignore:
1226
+ filtered_dirs.append(d)
1227
+ dirs[:] = filtered_dirs
932
1228
 
933
1229
  for filename in files:
934
1230
  filepath = root / filename
1231
+ self._index_stats.files_scanned += 1
1232
+
1233
+ # Check if file should be ignored
1234
+ should_ignore, reason = self._should_ignore_file(filepath)
1235
+ if should_ignore:
1236
+ if reason == IgnoreReason.HARDCODED:
1237
+ self._index_stats.files_ignored_hardcoded += 1
1238
+ elif reason == IgnoreReason.GITIGNORE:
1239
+ self._index_stats.files_ignored_gitignore += 1
1240
+ continue
1241
+
1242
+ # Check if this is a supported file
935
1243
  ext = filepath.suffix
936
1244
  lang_config = get_language_config(ext)
937
1245
 
938
1246
  if lang_config and lang_config.name in self.parsers:
1247
+ files_to_process.append((filepath, lang_config.name))
939
1248
  total_files += 1
1249
+ else:
1250
+ self._index_stats.files_ignored_no_parser += 1
940
1251
 
941
- # Second pass: Process files with progress reporting
942
- file_count = 0
943
- for root_str, _, files in os.walk(self.repo_path):
944
- root = Path(root_str)
1252
+ # Log what we're about to process
1253
+ logger.info(
1254
+ f"Index statistics: "
1255
+ f"scanned {self._index_stats.files_scanned} files, "
1256
+ f"processing {total_files}, "
1257
+ f"skipped {self._index_stats.files_ignored_hardcoded} (hardcoded), "
1258
+ f"{self._index_stats.files_ignored_gitignore} (gitignore), "
1259
+ f"{self._index_stats.files_ignored_no_parser} (no parser)"
1260
+ )
945
1261
 
946
- # Skip ignored directories
947
- if is_path_ignored(root, self.ignore_dirs):
948
- continue
1262
+ # Decide on parallel vs sequential execution
1263
+ # Use parallel if executor available and enough files to benefit
1264
+ if self.parallel_executor and total_files >= 10:
1265
+ await self._process_files_parallel(files_to_process, total_files)
1266
+ else:
1267
+ await self._process_files_sequential(files_to_process, total_files)
949
1268
 
950
- for filename in files:
951
- filepath = root / filename
1269
+ async def _process_files_parallel(
1270
+ self, files_to_process: list[tuple[Path, str]], total_files: int
1271
+ ) -> None:
1272
+ """Process files using parallel execution.
952
1273
 
953
- # Check if this is a supported file
954
- ext = filepath.suffix
955
- lang_config = get_language_config(ext)
1274
+ Args:
1275
+ files_to_process: List of (filepath, language) tuples
1276
+ total_files: Total number of files to process
1277
+ """
1278
+ logger.info(f"Using parallel execution with {self._worker_count} workers")
1279
+ self._parallel_mode_active = True
956
1280
 
957
- if lang_config and lang_config.name in self.parsers:
958
- await self._process_single_file(filepath, lang_config.name)
959
- file_count += 1
1281
+ try:
1282
+ # Build FileInfo objects for WorkDistributor
1283
+ file_infos = self._build_file_infos(files_to_process)
1284
+
1285
+ # Create work batches using WorkDistributor
1286
+ distributor = WorkDistributor(worker_count=self._worker_count)
1287
+ batches = distributor.create_batches(file_infos)
1288
+
1289
+ logger.info(f"Created {len(batches)} batches for {len(file_infos)} files")
960
1290
 
961
- # Report progress after each file
1291
+ # Track progress for UI
1292
+ files_completed = 0
1293
+
1294
+ def parallel_progress(completed_batches: int, total_batches: int) -> None:
1295
+ nonlocal files_completed
1296
+ # Estimate files completed based on batch progress
1297
+ if total_batches > 0:
1298
+ estimated = int((completed_batches / total_batches) * total_files)
1299
+ files_completed = estimated
962
1300
  self._report_progress(
963
1301
  "definitions",
964
- "Processing files and extracting definitions",
965
- file_count,
1302
+ f"Processing files (Parallel, {self._worker_count} workers)",
1303
+ files_completed,
966
1304
  total_files,
967
1305
  )
968
1306
 
969
- if file_count % 100 == 0:
970
- logger.info(f" Processed {file_count}/{total_files} files...")
1307
+ # Execute in parallel - run blocking executor in thread pool
1308
+ loop = asyncio.get_event_loop()
1309
+ result = await loop.run_in_executor(
1310
+ None,
1311
+ lambda: self.parallel_executor.execute(batches, parallel_progress), # type: ignore[union-attr]
1312
+ )
1313
+
1314
+ # Merge results into Ingestor buffers and local caches
1315
+ self._merge_parallel_results(result)
1316
+
1317
+ # Update stats
1318
+ self._index_stats.files_processed = result.successful_files
1319
+
1320
+ # Report phase completion
1321
+ self._report_progress(
1322
+ "definitions",
1323
+ f"Processing files (Parallel, {self._worker_count} workers)",
1324
+ result.successful_files,
1325
+ total_files,
1326
+ phase_complete=True,
1327
+ )
1328
+
1329
+ logger.info(
1330
+ f"Parallel processing complete: {result.successful_files}/{total_files} "
1331
+ f"files, {result.failed_files} failures"
1332
+ )
1333
+
1334
+ except Exception as e:
1335
+ logger.warning(
1336
+ f"Parallel execution failed: {e}. Falling back to sequential."
1337
+ )
1338
+ self._parallel_mode_active = False
1339
+ await self._process_files_sequential(files_to_process, total_files)
1340
+
1341
+ async def _process_files_sequential(
1342
+ self, files_to_process: list[tuple[Path, str]], total_files: int
1343
+ ) -> None:
1344
+ """Process files using sequential execution (original behavior).
1345
+
1346
+ Args:
1347
+ files_to_process: List of (filepath, language) tuples
1348
+ total_files: Total number of files to process
1349
+ """
1350
+ logger.info("Using sequential execution")
1351
+ self._parallel_mode_active = False
1352
+
1353
+ file_count = 0
1354
+ for filepath, language in files_to_process:
1355
+ await self._process_single_file(filepath, language)
1356
+ file_count += 1
1357
+ self._index_stats.files_processed += 1
1358
+
1359
+ # Report progress after each file
1360
+ self._report_progress(
1361
+ "definitions",
1362
+ "Processing files (Sequential)",
1363
+ file_count,
1364
+ total_files,
1365
+ )
1366
+
1367
+ if file_count % 100 == 0:
1368
+ logger.info(f" Processed {file_count}/{total_files} files...")
971
1369
 
972
1370
  logger.info(f" Total files processed: {file_count}/{total_files}")
973
1371
 
974
1372
  # Report phase completion
975
1373
  self._report_progress(
976
1374
  "definitions",
977
- "Processing files and extracting definitions",
1375
+ "Processing files (Sequential)",
978
1376
  file_count,
979
1377
  total_files,
980
1378
  phase_complete=True,
@@ -987,7 +1385,7 @@ class SimpleGraphBuilder:
987
1385
 
988
1386
  # Create File node
989
1387
  self.ingestor.ensure_node_batch(
990
- "File",
1388
+ NodeLabel.FILE,
991
1389
  {
992
1390
  "path": relative_path_str,
993
1391
  "name": filepath.name,
@@ -1000,21 +1398,21 @@ class SimpleGraphBuilder:
1000
1398
  if parent_rel_path == Path("."):
1001
1399
  # File in project root
1002
1400
  self.ingestor.ensure_relationship_batch(
1003
- "Project",
1401
+ NodeLabel.PROJECT,
1004
1402
  "name",
1005
1403
  self.project_name,
1006
- "CONTAINS_FILE",
1007
- "File",
1404
+ RelationshipType.CONTAINS_FILE,
1405
+ NodeLabel.FILE,
1008
1406
  "path",
1009
1407
  relative_path_str,
1010
1408
  )
1011
1409
  else:
1012
1410
  self.ingestor.ensure_relationship_batch(
1013
- "Folder",
1411
+ NodeLabel.FOLDER,
1014
1412
  "path",
1015
1413
  str(parent_rel_path).replace(os.sep, "/"),
1016
- "CONTAINS_FILE",
1017
- "File",
1414
+ RelationshipType.CONTAINS_FILE,
1415
+ NodeLabel.FILE,
1018
1416
  "path",
1019
1417
  relative_path_str,
1020
1418
  )
@@ -1043,7 +1441,7 @@ class SimpleGraphBuilder:
1043
1441
 
1044
1442
  current_time = int(time.time())
1045
1443
  self.ingestor.ensure_node_batch(
1046
- "Module",
1444
+ NodeLabel.MODULE,
1047
1445
  {
1048
1446
  "qualified_name": module_qn,
1049
1447
  "name": filepath.stem,
@@ -1058,33 +1456,33 @@ class SimpleGraphBuilder:
1058
1456
  if parent_container:
1059
1457
  # Parent is a package
1060
1458
  self.ingestor.ensure_relationship_batch(
1061
- "Package",
1459
+ NodeLabel.PACKAGE,
1062
1460
  "qualified_name",
1063
1461
  parent_container,
1064
- "CONTAINS_MODULE",
1065
- "Module",
1462
+ RelationshipType.CONTAINS_MODULE,
1463
+ NodeLabel.MODULE,
1066
1464
  "qualified_name",
1067
1465
  module_qn,
1068
1466
  )
1069
1467
  elif parent_rel_path == Path("."):
1070
1468
  # Parent is project root
1071
1469
  self.ingestor.ensure_relationship_batch(
1072
- "Project",
1470
+ NodeLabel.PROJECT,
1073
1471
  "name",
1074
1472
  self.project_name,
1075
- "CONTAINS_MODULE",
1076
- "Module",
1473
+ RelationshipType.CONTAINS_MODULE,
1474
+ NodeLabel.MODULE,
1077
1475
  "qualified_name",
1078
1476
  module_qn,
1079
1477
  )
1080
1478
  else:
1081
1479
  # Parent is a folder
1082
1480
  self.ingestor.ensure_relationship_batch(
1083
- "Folder",
1481
+ NodeLabel.FOLDER,
1084
1482
  "path",
1085
1483
  str(parent_rel_path).replace(os.sep, "/"),
1086
- "CONTAINS_MODULE",
1087
- "Module",
1484
+ RelationshipType.CONTAINS_MODULE,
1485
+ NodeLabel.MODULE,
1088
1486
  "qualified_name",
1089
1487
  module_qn,
1090
1488
  )
@@ -1098,7 +1496,7 @@ class SimpleGraphBuilder:
1098
1496
 
1099
1497
  # Track module
1100
1498
  self.ingestor.ensure_tracks_relationship(
1101
- relative_path_str, "Module", module_qn
1499
+ relative_path_str, NodeLabel.MODULE, module_qn
1102
1500
  )
1103
1501
 
1104
1502
  # Extract definitions
@@ -1142,7 +1540,7 @@ class SimpleGraphBuilder:
1142
1540
 
1143
1541
  current_time = int(time.time())
1144
1542
  self.ingestor.ensure_node_batch(
1145
- "Class",
1543
+ NodeLabel.CLASS,
1146
1544
  {
1147
1545
  "qualified_name": class_qn,
1148
1546
  "name": class_name,
@@ -1156,26 +1554,23 @@ class SimpleGraphBuilder:
1156
1554
  )
1157
1555
 
1158
1556
  # Create DEFINES relationship
1159
- logger.debug(
1160
- f"Creating DEFINES relationship: Module({module_qn}) -> Class({class_qn})"
1161
- )
1162
1557
  self.ingestor.ensure_relationship_batch(
1163
- "Module",
1558
+ NodeLabel.MODULE,
1164
1559
  "qualified_name",
1165
1560
  module_qn,
1166
- "DEFINES",
1167
- "Class",
1561
+ RelationshipType.DEFINES,
1562
+ NodeLabel.CLASS,
1168
1563
  "qualified_name",
1169
1564
  class_qn,
1170
1565
  )
1171
1566
 
1172
1567
  # Track class
1173
1568
  self.ingestor.ensure_tracks_relationship(
1174
- relative_path_str, "Class", class_qn
1569
+ relative_path_str, NodeLabel.CLASS, class_qn
1175
1570
  )
1176
1571
 
1177
1572
  # Register for lookup
1178
- self.function_registry[class_qn] = "Class"
1573
+ self.function_registry[class_qn] = NodeLabel.CLASS
1179
1574
  self.simple_name_lookup[class_name].add(class_qn)
1180
1575
 
1181
1576
  # Extract inheritance
@@ -1187,7 +1582,6 @@ class SimpleGraphBuilder:
1187
1582
  if "function_query" in lang_queries:
1188
1583
  cursor = QueryCursor(lang_queries["function_query"])
1189
1584
  matches = list(cursor.matches(root_node))
1190
- logger.debug(f"Found {len(matches)} function matches in {filepath}")
1191
1585
  for match in matches:
1192
1586
  func_node = None
1193
1587
  func_name = None
@@ -1201,11 +1595,6 @@ class SimpleGraphBuilder:
1201
1595
  func_name = node.text.decode("utf-8")
1202
1596
 
1203
1597
  if func_node and func_name:
1204
- # Log what we found
1205
- logger.debug(
1206
- f"Found function: {func_name} at line {func_node.start_point.row + 1}"
1207
- )
1208
-
1209
1598
  # Check if this is a method inside a class
1210
1599
  parent_class = self._find_parent_class(func_node, module_qn)
1211
1600
 
@@ -1219,7 +1608,7 @@ class SimpleGraphBuilder:
1219
1608
 
1220
1609
  current_time = int(time.time())
1221
1610
  self.ingestor.ensure_node_batch(
1222
- "Method",
1611
+ NodeLabel.METHOD,
1223
1612
  {
1224
1613
  "qualified_name": method_qn,
1225
1614
  "name": func_name,
@@ -1234,22 +1623,22 @@ class SimpleGraphBuilder:
1234
1623
 
1235
1624
  # Create DEFINES_METHOD relationship
1236
1625
  self.ingestor.ensure_relationship_batch(
1237
- "Class",
1626
+ NodeLabel.CLASS,
1238
1627
  "qualified_name",
1239
1628
  parent_class,
1240
- "DEFINES_METHOD",
1241
- "Method",
1629
+ RelationshipType.DEFINES_METHOD,
1630
+ NodeLabel.METHOD,
1242
1631
  "qualified_name",
1243
1632
  method_qn,
1244
1633
  )
1245
1634
 
1246
1635
  # Track method
1247
1636
  self.ingestor.ensure_tracks_relationship(
1248
- relative_path_str, "Method", method_qn
1637
+ relative_path_str, NodeLabel.METHOD, method_qn
1249
1638
  )
1250
1639
 
1251
1640
  # Register for lookup
1252
- self.function_registry[method_qn] = "Method"
1641
+ self.function_registry[method_qn] = NodeLabel.METHOD
1253
1642
  self.simple_name_lookup[func_name].add(method_qn)
1254
1643
  else:
1255
1644
  # This is a standalone function
@@ -1261,7 +1650,7 @@ class SimpleGraphBuilder:
1261
1650
 
1262
1651
  current_time = int(time.time())
1263
1652
  self.ingestor.ensure_node_batch(
1264
- "Function",
1653
+ NodeLabel.FUNCTION,
1265
1654
  {
1266
1655
  "qualified_name": func_qn,
1267
1656
  "name": func_name,
@@ -1276,22 +1665,22 @@ class SimpleGraphBuilder:
1276
1665
 
1277
1666
  # Create DEFINES relationship
1278
1667
  self.ingestor.ensure_relationship_batch(
1279
- "Module",
1668
+ NodeLabel.MODULE,
1280
1669
  "qualified_name",
1281
1670
  module_qn,
1282
- "DEFINES_FUNC",
1283
- "Function",
1671
+ RelationshipType.DEFINES_FUNC,
1672
+ NodeLabel.FUNCTION,
1284
1673
  "qualified_name",
1285
1674
  func_qn,
1286
1675
  )
1287
1676
 
1288
1677
  # Track function
1289
1678
  self.ingestor.ensure_tracks_relationship(
1290
- relative_path_str, "Function", func_qn
1679
+ relative_path_str, NodeLabel.FUNCTION, func_qn
1291
1680
  )
1292
1681
 
1293
1682
  # Register for lookup
1294
- self.function_registry[func_qn] = "Function"
1683
+ self.function_registry[func_qn] = NodeLabel.FUNCTION
1295
1684
  self.simple_name_lookup[func_name].add(func_qn)
1296
1685
 
1297
1686
  def _extract_decorators(self, node: Node, language: str) -> list[str]:
@@ -1407,6 +1796,25 @@ class SimpleGraphBuilder:
1407
1796
 
1408
1797
  def _process_relationships(self) -> None:
1409
1798
  """Third pass: Process function calls and imports."""
1799
+ # If parallel mode was used, relationships are already resolved
1800
+ # by ParallelExecutor during the definitions phase
1801
+ if self._parallel_mode_active:
1802
+ logger.info(
1803
+ "Skipping relationship processing "
1804
+ "(already resolved during parallel execution)"
1805
+ )
1806
+ # Report progress as complete for UI consistency
1807
+ total = len(self.function_registry)
1808
+ self._report_progress(
1809
+ "relationships",
1810
+ "Relationships resolved during parallel execution",
1811
+ total,
1812
+ total,
1813
+ phase_complete=True,
1814
+ )
1815
+ return
1816
+
1817
+ # Sequential mode - process relationships normally
1410
1818
  # Process inheritance relationships first
1411
1819
  self._process_inheritance()
1412
1820
 
@@ -1418,18 +1826,10 @@ class SimpleGraphBuilder:
1418
1826
  f"Simple name lookup has {len(self.simple_name_lookup)} unique names"
1419
1827
  )
1420
1828
 
1421
- # Log some examples from simple_name_lookup
1422
- if self.simple_name_lookup:
1423
- example_names = list(self.simple_name_lookup.keys())[:5]
1424
- for name in example_names:
1425
- logger.debug(
1426
- f" Example: '{name}' -> {list(self.simple_name_lookup[name])[:3]}"
1427
- )
1428
-
1429
1829
  file_count = 0
1430
1830
  for filepath, (root_node, language) in self.ast_cache.items():
1431
1831
  self._process_calls(filepath, root_node, language)
1432
- # NOTE: Add import processing. wtf does this mean?
1832
+ # TODO(future): Add import statement processing for IMPORTS relationships
1433
1833
 
1434
1834
  file_count += 1
1435
1835
  # Report progress after each file
@@ -1459,17 +1859,14 @@ class SimpleGraphBuilder:
1459
1859
  if parent_qn in self.function_registry:
1460
1860
  # Create INHERITS relationship
1461
1861
  self.ingestor.ensure_relationship_batch(
1462
- "Class",
1862
+ NodeLabel.CLASS,
1463
1863
  "qualified_name",
1464
1864
  child_qn,
1465
- "INHERITS",
1466
- "Class",
1865
+ RelationshipType.INHERITS,
1866
+ NodeLabel.CLASS,
1467
1867
  "qualified_name",
1468
1868
  parent_qn,
1469
1869
  )
1470
- logger.debug(
1471
- f" Created inheritance: {child_qn} INHERITS {parent_qn}"
1472
- )
1473
1870
  else:
1474
1871
  # Try to find parent by simple name lookup
1475
1872
  parent_simple_name = parent_qn.split(".")[-1]
@@ -1481,21 +1878,14 @@ class SimpleGraphBuilder:
1481
1878
  if len(possible_parents) == 1:
1482
1879
  actual_parent_qn = list(possible_parents)[0]
1483
1880
  self.ingestor.ensure_relationship_batch(
1484
- "Class",
1881
+ NodeLabel.CLASS,
1485
1882
  "qualified_name",
1486
1883
  child_qn,
1487
- "INHERITS",
1488
- "Class",
1884
+ RelationshipType.INHERITS,
1885
+ NodeLabel.CLASS,
1489
1886
  "qualified_name",
1490
1887
  actual_parent_qn,
1491
1888
  )
1492
- logger.debug(
1493
- f" Created inheritance: {child_qn} INHERITS {actual_parent_qn}"
1494
- )
1495
- else:
1496
- logger.debug(
1497
- f" Could not resolve parent class: {parent_qn} for {child_qn}"
1498
- )
1499
1889
 
1500
1890
  def _process_calls(self, filepath: Path, root_node: Node, language: str) -> None:
1501
1891
  """Process function calls in a file."""
@@ -1516,7 +1906,6 @@ class SimpleGraphBuilder:
1516
1906
  # Find all call expressions
1517
1907
  cursor = QueryCursor(lang_queries["call_query"])
1518
1908
  matches = list(cursor.matches(root_node))
1519
- logger.debug(f"Found {len(matches)} call matches in {filepath}")
1520
1909
  for match in matches:
1521
1910
  call_node = None
1522
1911
 
@@ -1555,36 +1944,23 @@ class SimpleGraphBuilder:
1555
1944
  break
1556
1945
 
1557
1946
  if not callee_name:
1558
- logger.debug(
1559
- f" Could not extract callee name from call at line {call_node.start_point[0]}"
1560
- )
1561
1947
  return
1562
1948
 
1563
- logger.debug(f" Processing call to {callee_name} (object: {object_name})")
1564
-
1565
1949
  # Find caller function
1566
1950
  caller_qn = self._find_containing_function(call_node, module_qn)
1567
1951
  if not caller_qn:
1568
- logger.debug(
1569
- f" Could not find containing function for call at line {call_node.start_point[0]}"
1570
- )
1571
1952
  return
1572
1953
 
1573
1954
  # Get all possible callees
1574
1955
  possible_callees = self.simple_name_lookup.get(callee_name, set())
1575
1956
  if not possible_callees:
1576
- logger.debug(f" No functions found with name: {callee_name}")
1577
1957
  return
1578
1958
 
1579
- logger.debug(
1580
- f" Found {len(possible_callees)} possible callees for {callee_name}"
1581
- )
1582
-
1583
1959
  # Calculate confidence scores for each possible callee
1584
1960
  scored_callees = []
1585
1961
  for possible_qn in possible_callees:
1586
- score = self._calculate_callee_confidence(
1587
- caller_qn, possible_qn, module_qn, object_name
1962
+ score = calculate_callee_confidence(
1963
+ caller_qn, possible_qn, module_qn, object_name, self.simple_name_lookup
1588
1964
  )
1589
1965
  scored_callees.append((possible_qn, score))
1590
1966
 
@@ -1610,93 +1986,6 @@ class SimpleGraphBuilder:
1610
1986
  callee_qn,
1611
1987
  )
1612
1988
 
1613
- # Log with confidence information
1614
- alternatives = len(scored_callees) - 1
1615
- logger.info(
1616
- f" Created CALLS relationship: {caller_qn} -> {callee_qn} (confidence: {confidence:.2f}, alternatives: {alternatives})"
1617
- )
1618
-
1619
- # If multiple alternatives exist with similar confidence, log them
1620
- if alternatives > 0 and confidence < 1.0:
1621
- similar_alternatives = [
1622
- qn for qn, score in scored_callees[1:4] if score >= confidence * 0.8
1623
- ] # Top 3 alternatives # Within 80% of best score
1624
- if similar_alternatives:
1625
- logger.debug(
1626
- f" Alternative matches: {', '.join(similar_alternatives)}"
1627
- )
1628
- else:
1629
- logger.warning(
1630
- f" Failed to create CALLS relationship - caller_type: {caller_type}, callee_type: {callee_type}"
1631
- )
1632
-
1633
- def _calculate_callee_confidence(
1634
- self, caller_qn: str, callee_qn: str, module_qn: str, object_name: str | None
1635
- ) -> float:
1636
- """Calculate confidence score for a potential callee match.
1637
-
1638
- Args:
1639
- caller_qn: Qualified name of the calling function
1640
- callee_qn: Qualified name of the potential callee
1641
- module_qn: Qualified name of the current module
1642
- object_name: Object name for method calls (e.g., 'obj' in obj.method())
1643
-
1644
- Returns:
1645
- Confidence score between 0.0 and 1.0
1646
- """
1647
- score = 0.0
1648
-
1649
- # 1. Module locality - functions in the same module are most likely
1650
- if callee_qn.startswith(module_qn + "."):
1651
- score += 0.5
1652
-
1653
- # Even higher if in the same class
1654
- caller_parts = caller_qn.split(".")
1655
- callee_parts = callee_qn.split(".")
1656
- if len(caller_parts) >= 3 and len(callee_parts) >= 3:
1657
- if caller_parts[:-1] == callee_parts[:-1]: # Same class
1658
- score += 0.2
1659
-
1660
- # 2. Package locality - functions in the same package hierarchy
1661
- elif "." in module_qn:
1662
- package = module_qn.rsplit(".", 1)[0]
1663
- if callee_qn.startswith(package + "."):
1664
- score += 0.3
1665
-
1666
- # 3. Object/class match for method calls
1667
- if object_name:
1668
- # Check if callee is a method of a class matching the object name
1669
- callee_parts = callee_qn.split(".")
1670
- if len(callee_parts) >= 2:
1671
- # Simple heuristic: check if class name matches object name
1672
- # (In reality, we'd need type inference for accuracy)
1673
- class_name = callee_parts[-2]
1674
- if class_name.lower() == object_name.lower():
1675
- score += 0.3
1676
- elif object_name == "self" and callee_qn.startswith(
1677
- caller_qn.rsplit(".", 1)[0]
1678
- ):
1679
- # 'self' refers to the same class
1680
- score += 0.4
1681
-
1682
- # 4. Import presence check (simplified - would need import tracking)
1683
- # For now, we'll give a small boost to standard library functions
1684
- if callee_qn.startswith(("builtins.", "typing.", "collections.")):
1685
- score += 0.1
1686
-
1687
- # 5. Name similarity for disambiguation
1688
- # If function names are unique enough, boost confidence
1689
- possible_count = len(
1690
- self.simple_name_lookup.get(callee_qn.split(".")[-1], set())
1691
- )
1692
- if possible_count == 1:
1693
- score += 0.2
1694
- elif possible_count <= 3:
1695
- score += 0.1
1696
-
1697
- # Normalize to [0, 1]
1698
- return min(score, 1.0)
1699
-
1700
1989
  def _find_containing_function(self, node: Node, module_qn: str) -> str | None:
1701
1990
  """Find the containing function/method of a node."""
1702
1991
  current = node.parent
@@ -1760,7 +2049,8 @@ class CodebaseIngestor:
1760
2049
  self.project_name = repo_path_obj.name
1761
2050
 
1762
2051
  try:
1763
- # Create database
2052
+ # Create database (lazy import kuzu for Windows compatibility)
2053
+ kuzu = get_kuzu()
1764
2054
  logger.info(f"Creating Kuzu database at: {self.db_path}")
1765
2055
  db = kuzu.Database(str(self.db_path))
1766
2056
  conn = kuzu.Connection(db)