shotgun-sh 0.4.0.dev1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. shotgun/agents/agent_manager.py +307 -8
  2. shotgun/agents/cancellation.py +103 -0
  3. shotgun/agents/common.py +12 -0
  4. shotgun/agents/config/README.md +0 -1
  5. shotgun/agents/config/manager.py +10 -7
  6. shotgun/agents/config/models.py +5 -27
  7. shotgun/agents/config/provider.py +44 -27
  8. shotgun/agents/conversation/history/token_counting/base.py +51 -9
  9. shotgun/agents/file_read.py +176 -0
  10. shotgun/agents/messages.py +15 -3
  11. shotgun/agents/models.py +24 -1
  12. shotgun/agents/router/models.py +8 -0
  13. shotgun/agents/router/tools/delegation_tools.py +55 -1
  14. shotgun/agents/router/tools/plan_tools.py +88 -7
  15. shotgun/agents/runner.py +17 -2
  16. shotgun/agents/tools/__init__.py +8 -0
  17. shotgun/agents/tools/codebase/directory_lister.py +27 -39
  18. shotgun/agents/tools/codebase/file_read.py +26 -35
  19. shotgun/agents/tools/codebase/query_graph.py +9 -0
  20. shotgun/agents/tools/codebase/retrieve_code.py +9 -0
  21. shotgun/agents/tools/file_management.py +32 -2
  22. shotgun/agents/tools/file_read_tools/__init__.py +7 -0
  23. shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
  24. shotgun/agents/tools/markdown_tools/__init__.py +62 -0
  25. shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
  26. shotgun/agents/tools/markdown_tools/models.py +86 -0
  27. shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
  28. shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
  29. shotgun/agents/tools/markdown_tools/utils.py +453 -0
  30. shotgun/agents/tools/registry.py +44 -6
  31. shotgun/agents/tools/web_search/openai.py +42 -23
  32. shotgun/attachments/__init__.py +41 -0
  33. shotgun/attachments/errors.py +60 -0
  34. shotgun/attachments/models.py +107 -0
  35. shotgun/attachments/parser.py +257 -0
  36. shotgun/attachments/processor.py +193 -0
  37. shotgun/build_constants.py +4 -7
  38. shotgun/cli/clear.py +2 -2
  39. shotgun/cli/codebase/commands.py +181 -65
  40. shotgun/cli/compact.py +2 -2
  41. shotgun/cli/context.py +2 -2
  42. shotgun/cli/error_handler.py +2 -2
  43. shotgun/cli/run.py +90 -0
  44. shotgun/cli/spec/backup.py +2 -1
  45. shotgun/codebase/__init__.py +2 -0
  46. shotgun/codebase/benchmarks/__init__.py +35 -0
  47. shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
  48. shotgun/codebase/benchmarks/exporters.py +119 -0
  49. shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
  50. shotgun/codebase/benchmarks/formatters/base.py +34 -0
  51. shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
  52. shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
  53. shotgun/codebase/benchmarks/models.py +129 -0
  54. shotgun/codebase/core/__init__.py +4 -0
  55. shotgun/codebase/core/call_resolution.py +91 -0
  56. shotgun/codebase/core/change_detector.py +11 -6
  57. shotgun/codebase/core/errors.py +159 -0
  58. shotgun/codebase/core/extractors/__init__.py +23 -0
  59. shotgun/codebase/core/extractors/base.py +138 -0
  60. shotgun/codebase/core/extractors/factory.py +63 -0
  61. shotgun/codebase/core/extractors/go/__init__.py +7 -0
  62. shotgun/codebase/core/extractors/go/extractor.py +122 -0
  63. shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
  64. shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
  65. shotgun/codebase/core/extractors/protocol.py +109 -0
  66. shotgun/codebase/core/extractors/python/__init__.py +7 -0
  67. shotgun/codebase/core/extractors/python/extractor.py +141 -0
  68. shotgun/codebase/core/extractors/rust/__init__.py +7 -0
  69. shotgun/codebase/core/extractors/rust/extractor.py +139 -0
  70. shotgun/codebase/core/extractors/types.py +15 -0
  71. shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
  72. shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
  73. shotgun/codebase/core/gitignore.py +252 -0
  74. shotgun/codebase/core/ingestor.py +644 -354
  75. shotgun/codebase/core/kuzu_compat.py +119 -0
  76. shotgun/codebase/core/language_config.py +239 -0
  77. shotgun/codebase/core/manager.py +256 -46
  78. shotgun/codebase/core/metrics_collector.py +310 -0
  79. shotgun/codebase/core/metrics_types.py +347 -0
  80. shotgun/codebase/core/parallel_executor.py +424 -0
  81. shotgun/codebase/core/work_distributor.py +254 -0
  82. shotgun/codebase/core/worker.py +768 -0
  83. shotgun/codebase/indexing_state.py +86 -0
  84. shotgun/codebase/models.py +94 -0
  85. shotgun/codebase/service.py +13 -0
  86. shotgun/exceptions.py +9 -9
  87. shotgun/main.py +3 -16
  88. shotgun/posthog_telemetry.py +165 -24
  89. shotgun/prompts/agents/file_read.j2 +48 -0
  90. shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +19 -47
  91. shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
  92. shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
  93. shotgun/prompts/agents/partials/router_delegation_mode.j2 +21 -22
  94. shotgun/prompts/agents/plan.j2 +14 -0
  95. shotgun/prompts/agents/router.j2 +531 -258
  96. shotgun/prompts/agents/specify.j2 +14 -0
  97. shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
  98. shotgun/prompts/agents/state/system_state.j2 +13 -11
  99. shotgun/prompts/agents/tasks.j2 +14 -0
  100. shotgun/settings.py +49 -10
  101. shotgun/tui/app.py +149 -18
  102. shotgun/tui/commands/__init__.py +9 -1
  103. shotgun/tui/components/attachment_bar.py +87 -0
  104. shotgun/tui/components/prompt_input.py +25 -28
  105. shotgun/tui/components/status_bar.py +14 -7
  106. shotgun/tui/dependencies.py +3 -8
  107. shotgun/tui/protocols.py +18 -0
  108. shotgun/tui/screens/chat/chat.tcss +15 -0
  109. shotgun/tui/screens/chat/chat_screen.py +766 -235
  110. shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
  111. shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
  112. shotgun/tui/screens/chat_screen/command_providers.py +0 -10
  113. shotgun/tui/screens/chat_screen/history/chat_history.py +54 -14
  114. shotgun/tui/screens/chat_screen/history/formatters.py +22 -0
  115. shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
  116. shotgun/tui/screens/database_locked_dialog.py +219 -0
  117. shotgun/tui/screens/database_timeout_dialog.py +158 -0
  118. shotgun/tui/screens/kuzu_error_dialog.py +135 -0
  119. shotgun/tui/screens/model_picker.py +1 -3
  120. shotgun/tui/screens/models.py +11 -0
  121. shotgun/tui/state/processing_state.py +19 -0
  122. shotgun/tui/widgets/widget_coordinator.py +18 -0
  123. shotgun/utils/file_system_utils.py +4 -1
  124. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/METADATA +87 -34
  125. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/RECORD +128 -79
  126. shotgun/cli/export.py +0 -81
  127. shotgun/cli/plan.py +0 -73
  128. shotgun/cli/research.py +0 -93
  129. shotgun/cli/specify.py +0 -70
  130. shotgun/cli/tasks.py +0 -78
  131. shotgun/sentry_telemetry.py +0 -232
  132. shotgun/tui/screens/onboarding.py +0 -584
  133. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/WHEEL +0 -0
  134. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/entry_points.txt +0 -0
  135. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/licenses/LICENSE +0 -0
shotgun/codebase/core/metrics_collector.py
@@ -0,0 +1,310 @@
+ """Thread-safe metrics collector for indexing operations.
+
+ This module provides the MetricsCollector class for tracking performance
+ metrics during codebase indexing, including phase timing, memory usage,
+ and optional per-file and per-worker metrics.
+ """
+
+ import csv
+ import threading
+ import time
+ import uuid
+ from pathlib import Path
+
+ import psutil
+
+ from shotgun.codebase.core.metrics_types import (
+     FileParseMetrics,
+     IndexingMetrics,
+     IndexingPhase,
+     PhaseMetrics,
+     WorkerMetrics,
+ )
+
+
+ class MetricsCollector:
+     """Thread-safe metrics collector for indexing operations.
+
+     Collects performance metrics at multiple granularities:
+     - Phase-level: timing and throughput for each indexing phase
+     - Worker-level: per-worker statistics (optional, for parallel execution)
+     - File-level: per-file parsing metrics (optional)
+
+     All collection methods are thread-safe using a lock.
+     """
+
+     def __init__(
+         self,
+         codebase_name: str,
+         collect_file_metrics: bool = True,
+         collect_worker_metrics: bool = True,
+     ) -> None:
+         """Initialize the metrics collector.
+
+         Args:
+             codebase_name: Name of the codebase being indexed
+             collect_file_metrics: Whether to collect per-file metrics
+             collect_worker_metrics: Whether to collect per-worker metrics
+         """
+         self._lock = threading.Lock()
+         self._session_id = str(uuid.uuid4())
+         self._codebase_name = codebase_name
+         self._collect_file_metrics = collect_file_metrics
+         self._collect_worker_metrics = collect_worker_metrics
+
+         # Phase tracking
+         self._phase_starts: dict[str, tuple[float, float]] = {}  # (time, memory)
+         self._phase_metrics: dict[str, PhaseMetrics] = {}
+
+         # File metrics (optional)
+         self._file_metrics: list[FileParseMetrics] = []
+
+         # Worker metrics (optional)
+         self._worker_metrics: dict[int, WorkerMetrics] = {}
+
+         # Session timing
+         self._session_start = time.perf_counter()
+         self._session_start_timestamp = time.time()
+
+         # Aggregates (set during flush phases)
+         self._total_nodes = 0
+         self._total_relationships = 0
+         self._total_files = 0
+
+     def _get_memory_mb(self) -> float:
+         """Get current RSS memory in MB (cross-platform).
+
+         Returns:
+             Current memory usage in megabytes, or 0.0 if unavailable.
+         """
+         try:
+             process = psutil.Process()
+             rss_bytes: int = process.memory_info().rss
+             return float(rss_bytes) / 1024 / 1024
+         except Exception:
+             return 0.0
+
+     def start_phase(self, phase: IndexingPhase | str) -> None:
+         """Mark the start of a processing phase.
+
+         Args:
+             phase: The indexing phase (use IndexingPhase enum)
+         """
+         phase_name = str(phase)
+         with self._lock:
+             start_time = time.perf_counter()
+             start_memory = self._get_memory_mb()
+             self._phase_starts[phase_name] = (start_time, start_memory)
+
+     def end_phase(self, phase: IndexingPhase | str, items_processed: int) -> None:
+         """Mark the end of a processing phase.
+
+         Args:
+             phase: The indexing phase (use IndexingPhase enum)
+             items_processed: Number of items processed in this phase
+         """
+         phase_name = str(phase)
+         end_time = time.perf_counter()
+         end_memory = self._get_memory_mb()
+
+         with self._lock:
+             if phase_name not in self._phase_starts:
+                 return
+
+             start_time, start_memory = self._phase_starts[phase_name]
+             duration = end_time - start_time
+             peak_memory = max(start_memory, end_memory)
+
+             # Calculate throughput (avoid division by zero)
+             throughput = items_processed / duration if duration > 0 else 0.0
+
+             # Create phase metrics
+             self._phase_metrics[phase_name] = PhaseMetrics(
+                 phase_name=phase_name,
+                 start_time=self._session_start_timestamp
+                 + (start_time - self._session_start),
+                 end_time=self._session_start_timestamp
+                 + (end_time - self._session_start),
+                 duration_seconds=duration,
+                 items_processed=items_processed,
+                 throughput=throughput,
+                 memory_mb=peak_memory,
+                 worker_count=None,
+                 worker_metrics=None,
+             )
+
+             # Track files for definitions phase
+             if phase_name == IndexingPhase.DEFINITIONS:
+                 self._total_files = items_processed
+
+     def record_file_parse(self, metrics: FileParseMetrics) -> None:
+         """Record metrics for a single file parse.
+
+         Args:
+             metrics: File parsing metrics
+         """
+         if not self._collect_file_metrics:
+             return
+
+         with self._lock:
+             self._file_metrics.append(metrics)
+
+     def record_worker_metrics(self, worker_id: int, metrics: WorkerMetrics) -> None:
+         """Record metrics for a worker.
+
+         Args:
+             worker_id: Unique worker identifier
+             metrics: Worker performance metrics
+         """
+         if not self._collect_worker_metrics:
+             return
+
+         with self._lock:
+             self._worker_metrics[worker_id] = metrics
+
+     def set_totals(self, nodes: int, relationships: int) -> None:
+         """Set the total node and relationship counts.
+
+         Args:
+             nodes: Total number of nodes created
+             relationships: Total number of relationships created
+         """
+         with self._lock:
+             self._total_nodes = nodes
+             self._total_relationships = relationships
+
+     def get_metrics(self) -> IndexingMetrics:
+         """Get complete metrics for the indexing session.
+
+         Returns:
+             Complete indexing metrics including all phases and aggregates.
+         """
+         with self._lock:
+             total_duration = time.perf_counter() - self._session_start
+
+             # Calculate average throughput from definitions phase
+             avg_throughput = 0.0
+             definitions_key = str(IndexingPhase.DEFINITIONS)
+             if definitions_key in self._phase_metrics:
+                 avg_throughput = self._phase_metrics[definitions_key].throughput
+
+             # Get peak memory across all phases
+             peak_memory = max(
+                 (pm.memory_mb for pm in self._phase_metrics.values()),
+                 default=self._get_memory_mb(),
+             )
+
+             # Calculate parallelism efficiency if worker metrics available
+             parallelism_efficiency = None
+             if self._worker_metrics:
+                 worker_count = len(self._worker_metrics)
+                 if worker_count > 1:
+                     # Efficiency = actual speedup / ideal speedup
+                     # For now, use balanced work distribution as proxy
+                     files_per_worker = [
+                         w.files_processed for w in self._worker_metrics.values()
+                     ]
+                     if files_per_worker:
+                         avg_files = sum(files_per_worker) / len(files_per_worker)
+                         max_files = max(files_per_worker)
+                         if max_files > 0:
+                             parallelism_efficiency = avg_files / max_files
+
+             return IndexingMetrics(
+                 session_id=self._session_id,
+                 codebase_name=self._codebase_name,
+                 total_duration_seconds=total_duration,
+                 phase_metrics=dict(self._phase_metrics),
+                 file_metrics=list(self._file_metrics),
+                 total_files=self._total_files,
+                 total_nodes=self._total_nodes,
+                 total_relationships=self._total_relationships,
+                 avg_throughput=avg_throughput,
+                 peak_memory_mb=peak_memory,
+                 parallelism_efficiency=parallelism_efficiency,
+             )
+
+     def export_json(self, path: Path) -> None:
+         """Export metrics to JSON file.
+
+         Args:
+             path: Path to write JSON file
+         """
+         metrics = self.get_metrics()
+         path.write_text(metrics.model_dump_json(indent=2))
+
+     def export_csv(self, path: Path) -> None:
+         """Export metrics to CSV file.
+
+         Exports phase metrics and optionally file metrics as separate sections.
+
+         Args:
+             path: Path to write CSV file
+         """
+         metrics = self.get_metrics()
+
+         with path.open("w", newline="") as f:
+             writer = csv.writer(f)
+
+             # Header section
+             writer.writerow(["# Indexing Metrics"])
+             writer.writerow(["Session ID", metrics.session_id])
+             writer.writerow(["Codebase", metrics.codebase_name])
+             writer.writerow(
+                 ["Total Duration (s)", f"{metrics.total_duration_seconds:.2f}"]
+             )
+             writer.writerow(["Total Files", metrics.total_files])
+             writer.writerow(["Total Nodes", metrics.total_nodes])
+             writer.writerow(["Total Relationships", metrics.total_relationships])
+             writer.writerow(["Peak Memory (MB)", f"{metrics.peak_memory_mb:.1f}"])
+             writer.writerow([])
+
+             # Phase metrics
+             writer.writerow(["# Phase Metrics"])
+             writer.writerow(
+                 [
+                     "Phase",
+                     "Duration (s)",
+                     "Items",
+                     "Throughput (items/s)",
+                     "Memory (MB)",
+                 ]
+             )
+             for phase in metrics.phase_metrics.values():
+                 writer.writerow(
+                     [
+                         phase.phase_name,
+                         f"{phase.duration_seconds:.3f}",
+                         phase.items_processed,
+                         f"{phase.throughput:.1f}",
+                         f"{phase.memory_mb:.1f}",
+                     ]
+                 )
+             writer.writerow([])
+
+             # File metrics (if collected)
+             if metrics.file_metrics:
+                 writer.writerow(["# File Metrics"])
+                 writer.writerow(
+                     [
+                         "File",
+                         "Language",
+                         "Size (bytes)",
+                         "Parse Time (ms)",
+                         "AST Nodes",
+                         "Definitions",
+                         "Relationships",
+                     ]
+                 )
+                 for fm in metrics.file_metrics:
+                     writer.writerow(
+                         [
+                             fm.file_path,
+                             fm.language,
+                             fm.file_size_bytes,
+                             f"{fm.parse_time_ms:.2f}",
+                             fm.ast_nodes,
+                             fm.definitions_extracted,
+                             fm.relationships_found,
+                         ]
+                     )
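The new collector is driven phase-by-phase by the ingestion pipeline. A minimal usage sketch, assuming only the API added above; the codebase name, item counts, and output paths are hypothetical, and the real wiring sits elsewhere in this diff (the ingestor and parallel executor changes):

from pathlib import Path

from shotgun.codebase.core.metrics_collector import MetricsCollector
from shotgun.codebase.core.metrics_types import IndexingPhase

# Hypothetical driver loop; the real ingestor calls these hooks internally.
collector = MetricsCollector(codebase_name="example-repo")

collector.start_phase(IndexingPhase.DEFINITIONS)
# ... parse files here, optionally calling collector.record_file_parse(...) per file ...
collector.end_phase(IndexingPhase.DEFINITIONS, items_processed=120)

collector.set_totals(nodes=4500, relationships=9800)

metrics = collector.get_metrics()
print(f"Indexed {metrics.total_files} files in {metrics.total_duration_seconds:.1f}s")

# Optional exports for offline analysis
collector.export_json(Path("indexing_metrics.json"))
collector.export_csv(Path("indexing_metrics.csv"))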
shotgun/codebase/core/metrics_types.py
@@ -0,0 +1,347 @@
+ """Type definitions for indexing metrics collection.
+
+ These models define the data structures for tracking performance metrics
+ during codebase indexing operations, as well as work distribution types
+ for parallel file parsing.
+ """
+
+ from __future__ import annotations
+
+ from enum import StrEnum
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+ from shotgun.codebase.models import NodeLabel, RelationshipType
+
+ __all__ = [
+     "DistributionStats",
+     "FileInfo",
+     "FileParseMetrics",
+     "FileParseResult",
+     "FileParseTask",
+     "IndexingMetrics",
+     "IndexingPhase",
+     "InheritanceData",
+     "NodeData",
+     "NodeLabel",
+     "ParallelExecutionResult",
+     "PhaseMetrics",
+     "RawCallData",
+     "RelationshipData",
+     "RelationshipType",
+     "WorkBatch",
+     "WorkerMetrics",
+ ]
+
+
+ class IndexingPhase(StrEnum):
+     """Phase names for indexing operations."""
+
+     STRUCTURE = "structure"
+     DEFINITIONS = "definitions"
+     RELATIONSHIPS = "relationships"
+     FLUSH_NODES = "flush_nodes"
+     FLUSH_RELATIONSHIPS = "flush_relationships"
+
+
+ class PhaseMetrics(BaseModel):
+     """Metrics for a single execution phase."""
+
+     phase_name: str = Field(..., description="Name of the phase")
+     start_time: float = Field(..., description="Unix timestamp when phase started")
+     end_time: float = Field(..., description="Unix timestamp when phase ended")
+     duration_seconds: float = Field(..., description="Total duration in seconds")
+     items_processed: int = Field(..., description="Number of items processed")
+     throughput: float = Field(..., description="Items per second")
+     memory_mb: float = Field(..., description="Peak memory usage in MB")
+
+     # Worker-specific metrics (for parallel phases)
+     worker_count: int | None = Field(None, description="Number of parallel workers")
+     worker_metrics: dict[int, WorkerMetrics] | None = Field(
+         None, description="Per-worker performance metrics"
+     )
+
+
+ class WorkerMetrics(BaseModel):
+     """Metrics for a single worker process."""
+
+     worker_id: int = Field(..., description="Unique worker identifier")
+     files_processed: int = Field(..., description="Files processed by this worker")
+     nodes_created: int = Field(..., description="Nodes created by this worker")
+     relationships_created: int = Field(..., description="Relationships created")
+     duration_seconds: float = Field(..., description="Total processing time")
+     throughput: float = Field(..., description="Files per second")
+     peak_memory_mb: float = Field(..., description="Peak memory usage")
+     idle_time_seconds: float = Field(..., description="Time spent waiting for work")
+     error_count: int = Field(default=0, description="Number of errors encountered")
+
+
+ class FileParseMetrics(BaseModel):
+     """Detailed metrics for parsing a single file."""
+
+     file_path: str = Field(..., description="Relative path to file")
+     language: str = Field(..., description="Programming language")
+     file_size_bytes: int = Field(..., description="File size in bytes")
+     parse_time_ms: float = Field(..., description="Time to parse file")
+     ast_nodes: int = Field(..., description="Number of AST nodes")
+     definitions_extracted: int = Field(
+         ..., description="Classes, functions, methods found"
+     )
+     relationships_found: int = Field(..., description="Calls, imports found")
+     worker_id: int | None = Field(None, description="Worker that processed this file")
+
+
+ class IndexingMetrics(BaseModel):
+     """Complete metrics for the entire indexing operation."""
+
+     session_id: str = Field(..., description="Unique session identifier")
+     codebase_name: str = Field(..., description="Name of indexed codebase")
+     total_duration_seconds: float = Field(..., description="End-to-end duration")
+
+     # Phase-level metrics
+     phase_metrics: dict[str, PhaseMetrics] = Field(
+         default_factory=dict, description="Metrics for each indexing phase"
+     )
+
+     # File-level metrics
+     file_metrics: list[FileParseMetrics] = Field(
+         default_factory=list, description="Per-file parsing metrics"
+     )
+
+     # Aggregate statistics
+     total_files: int = Field(..., description="Total files processed")
+     total_nodes: int = Field(..., description="Total nodes created")
+     total_relationships: int = Field(..., description="Total relationships created")
+
+     # Performance metrics
+     avg_throughput: float = Field(..., description="Average files per second")
+     peak_memory_mb: float = Field(..., description="Peak memory usage")
+     parallelism_efficiency: float | None = Field(
+         None, description="Efficiency factor (0.0-1.0) of parallelization"
+     )
+
+
+ # =============================================================================
+ # Work Distribution Types
+ # =============================================================================
+
+
+ class FileInfo(BaseModel):
+     """Information about a file for work distribution.
+
+     Used by WorkDistributor to calculate balanced work assignments
+     based on file size.
+     """
+
+     file_path: Path = Field(..., description="Absolute path to file")
+     relative_path: Path = Field(..., description="Path relative to repo root")
+     language: str = Field(..., description="Programming language")
+     module_qn: str = Field(..., description="Qualified name for the module")
+     container_qn: str | None = Field(
+         None, description="Parent package/folder qualified name"
+     )
+     file_size_bytes: int = Field(..., description="File size in bytes for balancing")
+
+     model_config = {"arbitrary_types_allowed": True}
+
+
+ class FileParseTask(BaseModel):
+     """A task representing a file to be parsed by a worker.
+
+     This is the serializable unit of work sent to worker processes.
+     """
+
+     file_path: Path = Field(..., description="Absolute path to file")
+     relative_path: Path = Field(..., description="Path relative to repo root")
+     language: str = Field(..., description="Programming language")
+     module_qn: str = Field(..., description="Qualified name for the module")
+     container_qn: str | None = Field(
+         None, description="Parent package/folder qualified name"
+     )
+
+     model_config = {"arbitrary_types_allowed": True}
+
+
+ class WorkBatch(BaseModel):
+     """A batch of file parse tasks for distribution to a worker.
+
+     Batches group multiple tasks together to reduce queue overhead
+     when distributing work across processes.
+     """
+
+     batch_id: int = Field(..., description="Unique batch identifier")
+     tasks: list[FileParseTask] = Field(..., description="Tasks in this batch")
+     estimated_duration_seconds: float | None = Field(
+         None, description="Estimated processing time"
+     )
+
+
+ class DistributionStats(BaseModel):
+     """Statistics about work distribution across workers.
+
+     Provides insight into how files are balanced across workers
+     for debugging and verification.
+     """
+
+     total_files: int = Field(..., description="Total number of files")
+     total_bytes: int = Field(..., description="Total size in bytes")
+     worker_count: int = Field(..., description="Number of workers")
+     batch_size: int = Field(..., description="Files per batch")
+     files_per_worker: list[int] = Field(
+         ..., description="Number of files assigned to each worker"
+     )
+     bytes_per_worker: list[int] = Field(
+         ..., description="Total bytes assigned to each worker"
+     )
+
+
+ # =============================================================================
+ # Parallel Execution Types
+ # =============================================================================
+
+
+ class NodeData(BaseModel):
+     """Data for creating a graph node.
+
+     Used by workers to return extracted node information without
+     direct database access. Use NodeLabel enum values for the label field.
+     """
+
+     label: str = Field(..., description="Node type from NodeLabel enum")
+     properties: dict[str, Any] = Field(..., description="Node properties")
+
+
+ class RelationshipData(BaseModel):
+     """Data for creating a graph relationship.
+
+     Used by workers to return extracted relationship information
+     without direct database access. Use NodeLabel enum for label fields
+     and RelationshipType enum for rel_type.
+     """
+
+     from_label: str = Field(..., description="Source node type from NodeLabel enum")
+     from_key: str = Field(..., description="Source node primary key field")
+     from_value: Any = Field(..., description="Source node primary key value")
+     rel_type: str = Field(
+         ..., description="Relationship type from RelationshipType enum"
+     )
+     to_label: str = Field(..., description="Target node type from NodeLabel enum")
+     to_key: str = Field(..., description="Target node primary key field")
+     to_value: Any = Field(..., description="Target node primary key value")
+     properties: dict[str, Any] | None = Field(
+         None, description="Relationship properties"
+     )
+
+
+ class RawCallData(BaseModel):
+     """Raw call information extracted by worker (unresolved).
+
+     Call relationships cannot be fully resolved in workers because
+     they require the complete function_registry and simple_name_lookup
+     which are built by aggregating data from all workers.
+     """
+
+     caller_qn: str = Field(..., description="Qualified name of caller function/method")
+     callee_name: str = Field(..., description="Simple name of called function")
+     object_name: str | None = Field(
+         None, description="Object the method is called on (if method call)"
+     )
+     line_number: int = Field(..., description="Line number of the call")
+     module_qn: str = Field(..., description="Module qualified name for context")
+
+
+ class InheritanceData(BaseModel):
+     """Raw inheritance information extracted by worker.
+
+     Inheritance relationships require resolution against the global
+     registry to find the actual parent class qualified names.
+     """
+
+     child_class_qn: str = Field(..., description="Qualified name of child class")
+     parent_simple_names: list[str] = Field(
+         ..., description="Simple names of parent classes (need resolution)"
+     )
+
+
+ class FileParseResult(BaseModel):
+     """Result of parsing a single file.
+
+     Contains all data extracted by a worker from a single file,
+     including nodes, relationships, and deferred relationship data
+     that requires post-aggregation resolution.
+     """
+
+     task: FileParseTask = Field(..., description="Original task")
+     success: bool = Field(..., description="Whether parsing succeeded")
+     error: str | None = Field(None, description="Error message if failed")
+
+     # Extracted nodes and direct relationships
+     nodes: list[NodeData] = Field(
+         default_factory=list, description="Nodes extracted from file"
+     )
+     relationships: list[RelationshipData] = Field(
+         default_factory=list, description="Direct relationships extracted"
+     )
+
+     # Registry data for aggregation
+     function_registry_entries: dict[str, str] = Field(
+         default_factory=dict,
+         description="Map of qualified_name -> type (Class/Function/Method)",
+     )
+     simple_name_entries: dict[str, list[str]] = Field(
+         default_factory=dict,
+         description="Map of simple_name -> list of qualified_names",
+     )
+
+     # Deferred relationship data (requires post-aggregation resolution)
+     raw_calls: list[RawCallData] = Field(
+         default_factory=list, description="Unresolved call data"
+     )
+     inheritance_data: list[InheritanceData] = Field(
+         default_factory=list, description="Unresolved inheritance data"
+     )
+
+     # File metadata
+     file_hash: str = Field(default="", description="SHA256 hash of file content")
+     mtime: int = Field(default=0, description="File modification time")
+
+     # Metrics
+     metrics: FileParseMetrics | None = Field(
+         None, description="Parsing metrics for this file"
+     )
+
+     model_config = {"arbitrary_types_allowed": True}
+
+
+ class ParallelExecutionResult(BaseModel):
+     """Complete results from parallel execution.
+
+     Aggregates results from all workers including resolved relationships
+     and merged registries.
+     """
+
+     results: list[FileParseResult] = Field(
+         default_factory=list, description="Results from all files"
+     )
+     resolved_relationships: list[RelationshipData] = Field(
+         default_factory=list, description="Relationships resolved post-aggregation"
+     )
+     function_registry: dict[str, str] = Field(
+         default_factory=dict, description="Merged function registry from all workers"
+     )
+     simple_name_lookup: dict[str, list[str]] = Field(
+         default_factory=dict, description="Merged simple name lookup from all workers"
+     )
+
+     # Metrics
+     total_files: int = Field(default=0, description="Total files processed")
+     successful_files: int = Field(default=0, description="Files successfully parsed")
+     failed_files: int = Field(default=0, description="Files that failed to parse")
+     total_duration_seconds: float = Field(
+         default=0.0, description="Total execution duration"
+     )
+     worker_metrics: dict[int, WorkerMetrics] = Field(
+         default_factory=dict, description="Per-worker metrics"
+     )
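These models are plain pydantic types, so a worker process can return everything it extracted without touching the database, and the parent process aggregates the results. A minimal sketch of constructing one per-file result follows; all paths, names, and numbers are purely illustrative, and in the real code the node label strings come from the NodeLabel enum while extraction itself lives in the worker and extractor modules added in this release:

from pathlib import Path

from shotgun.codebase.core.metrics_types import (
    FileParseMetrics,
    FileParseResult,
    FileParseTask,
    NodeData,
    RawCallData,
)

# Hypothetical unit of work handed to a worker process.
task = FileParseTask(
    file_path=Path("/repo/pkg/util.py"),
    relative_path=Path("pkg/util.py"),
    language="python",
    module_qn="pkg.util",
    container_qn="pkg",
)

# The worker returns extracted data; calls stay unresolved until aggregation.
result = FileParseResult(
    task=task,
    success=True,
    nodes=[NodeData(label="Function", properties={"qualified_name": "pkg.util.helper"})],
    function_registry_entries={"pkg.util.helper": "Function"},
    simple_name_entries={"helper": ["pkg.util.helper"]},
    raw_calls=[
        RawCallData(
            caller_qn="pkg.util.helper",
            callee_name="format",
            object_name=None,
            line_number=12,
            module_qn="pkg.util",
        )
    ],
    metrics=FileParseMetrics(
        file_path="pkg/util.py",
        language="python",
        file_size_bytes=1024,
        parse_time_ms=3.2,
        ast_nodes=87,
        definitions_extracted=1,
        relationships_found=1,
        worker_id=0,
    ),
)

Per-file results like this are then merged into a ParallelExecutionResult, whose combined function_registry and simple_name_lookup drive the deferred resolution of raw_calls and inheritance_data into concrete relationships.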