shotgun-sh 0.4.0.dev1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- shotgun/agents/agent_manager.py +307 -8
- shotgun/agents/cancellation.py +103 -0
- shotgun/agents/common.py +12 -0
- shotgun/agents/config/README.md +0 -1
- shotgun/agents/config/manager.py +10 -7
- shotgun/agents/config/models.py +5 -27
- shotgun/agents/config/provider.py +44 -27
- shotgun/agents/conversation/history/token_counting/base.py +51 -9
- shotgun/agents/file_read.py +176 -0
- shotgun/agents/messages.py +15 -3
- shotgun/agents/models.py +24 -1
- shotgun/agents/router/models.py +8 -0
- shotgun/agents/router/tools/delegation_tools.py +55 -1
- shotgun/agents/router/tools/plan_tools.py +88 -7
- shotgun/agents/runner.py +17 -2
- shotgun/agents/tools/__init__.py +8 -0
- shotgun/agents/tools/codebase/directory_lister.py +27 -39
- shotgun/agents/tools/codebase/file_read.py +26 -35
- shotgun/agents/tools/codebase/query_graph.py +9 -0
- shotgun/agents/tools/codebase/retrieve_code.py +9 -0
- shotgun/agents/tools/file_management.py +32 -2
- shotgun/agents/tools/file_read_tools/__init__.py +7 -0
- shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
- shotgun/agents/tools/markdown_tools/__init__.py +62 -0
- shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
- shotgun/agents/tools/markdown_tools/models.py +86 -0
- shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
- shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
- shotgun/agents/tools/markdown_tools/utils.py +453 -0
- shotgun/agents/tools/registry.py +44 -6
- shotgun/agents/tools/web_search/openai.py +42 -23
- shotgun/attachments/__init__.py +41 -0
- shotgun/attachments/errors.py +60 -0
- shotgun/attachments/models.py +107 -0
- shotgun/attachments/parser.py +257 -0
- shotgun/attachments/processor.py +193 -0
- shotgun/build_constants.py +4 -7
- shotgun/cli/clear.py +2 -2
- shotgun/cli/codebase/commands.py +181 -65
- shotgun/cli/compact.py +2 -2
- shotgun/cli/context.py +2 -2
- shotgun/cli/error_handler.py +2 -2
- shotgun/cli/run.py +90 -0
- shotgun/cli/spec/backup.py +2 -1
- shotgun/codebase/__init__.py +2 -0
- shotgun/codebase/benchmarks/__init__.py +35 -0
- shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
- shotgun/codebase/benchmarks/exporters.py +119 -0
- shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
- shotgun/codebase/benchmarks/formatters/base.py +34 -0
- shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
- shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
- shotgun/codebase/benchmarks/models.py +129 -0
- shotgun/codebase/core/__init__.py +4 -0
- shotgun/codebase/core/call_resolution.py +91 -0
- shotgun/codebase/core/change_detector.py +11 -6
- shotgun/codebase/core/errors.py +159 -0
- shotgun/codebase/core/extractors/__init__.py +23 -0
- shotgun/codebase/core/extractors/base.py +138 -0
- shotgun/codebase/core/extractors/factory.py +63 -0
- shotgun/codebase/core/extractors/go/__init__.py +7 -0
- shotgun/codebase/core/extractors/go/extractor.py +122 -0
- shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
- shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
- shotgun/codebase/core/extractors/protocol.py +109 -0
- shotgun/codebase/core/extractors/python/__init__.py +7 -0
- shotgun/codebase/core/extractors/python/extractor.py +141 -0
- shotgun/codebase/core/extractors/rust/__init__.py +7 -0
- shotgun/codebase/core/extractors/rust/extractor.py +139 -0
- shotgun/codebase/core/extractors/types.py +15 -0
- shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
- shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
- shotgun/codebase/core/gitignore.py +252 -0
- shotgun/codebase/core/ingestor.py +644 -354
- shotgun/codebase/core/kuzu_compat.py +119 -0
- shotgun/codebase/core/language_config.py +239 -0
- shotgun/codebase/core/manager.py +256 -46
- shotgun/codebase/core/metrics_collector.py +310 -0
- shotgun/codebase/core/metrics_types.py +347 -0
- shotgun/codebase/core/parallel_executor.py +424 -0
- shotgun/codebase/core/work_distributor.py +254 -0
- shotgun/codebase/core/worker.py +768 -0
- shotgun/codebase/indexing_state.py +86 -0
- shotgun/codebase/models.py +94 -0
- shotgun/codebase/service.py +13 -0
- shotgun/exceptions.py +9 -9
- shotgun/main.py +3 -16
- shotgun/posthog_telemetry.py +165 -24
- shotgun/prompts/agents/file_read.j2 +48 -0
- shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +19 -47
- shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
- shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
- shotgun/prompts/agents/partials/router_delegation_mode.j2 +21 -22
- shotgun/prompts/agents/plan.j2 +14 -0
- shotgun/prompts/agents/router.j2 +531 -258
- shotgun/prompts/agents/specify.j2 +14 -0
- shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
- shotgun/prompts/agents/state/system_state.j2 +13 -11
- shotgun/prompts/agents/tasks.j2 +14 -0
- shotgun/settings.py +49 -10
- shotgun/tui/app.py +149 -18
- shotgun/tui/commands/__init__.py +9 -1
- shotgun/tui/components/attachment_bar.py +87 -0
- shotgun/tui/components/prompt_input.py +25 -28
- shotgun/tui/components/status_bar.py +14 -7
- shotgun/tui/dependencies.py +3 -8
- shotgun/tui/protocols.py +18 -0
- shotgun/tui/screens/chat/chat.tcss +15 -0
- shotgun/tui/screens/chat/chat_screen.py +766 -235
- shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
- shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
- shotgun/tui/screens/chat_screen/command_providers.py +0 -10
- shotgun/tui/screens/chat_screen/history/chat_history.py +54 -14
- shotgun/tui/screens/chat_screen/history/formatters.py +22 -0
- shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
- shotgun/tui/screens/database_locked_dialog.py +219 -0
- shotgun/tui/screens/database_timeout_dialog.py +158 -0
- shotgun/tui/screens/kuzu_error_dialog.py +135 -0
- shotgun/tui/screens/model_picker.py +1 -3
- shotgun/tui/screens/models.py +11 -0
- shotgun/tui/state/processing_state.py +19 -0
- shotgun/tui/widgets/widget_coordinator.py +18 -0
- shotgun/utils/file_system_utils.py +4 -1
- {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/METADATA +87 -34
- {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/RECORD +128 -79
- shotgun/cli/export.py +0 -81
- shotgun/cli/plan.py +0 -73
- shotgun/cli/research.py +0 -93
- shotgun/cli/specify.py +0 -70
- shotgun/cli/tasks.py +0 -78
- shotgun/sentry_telemetry.py +0 -232
- shotgun/tui/screens/onboarding.py +0 -584
- {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/WHEEL +0 -0
- {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/entry_points.txt +0 -0
- {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/licenses/LICENSE +0 -0
shotgun/codebase/core/metrics_collector.py (new file)
@@ -0,0 +1,310 @@

"""Thread-safe metrics collector for indexing operations.

This module provides the MetricsCollector class for tracking performance
metrics during codebase indexing, including phase timing, memory usage,
and optional per-file and per-worker metrics.
"""

import csv
import threading
import time
import uuid
from pathlib import Path

import psutil

from shotgun.codebase.core.metrics_types import (
    FileParseMetrics,
    IndexingMetrics,
    IndexingPhase,
    PhaseMetrics,
    WorkerMetrics,
)


class MetricsCollector:
    """Thread-safe metrics collector for indexing operations.

    Collects performance metrics at multiple granularities:
    - Phase-level: timing and throughput for each indexing phase
    - Worker-level: per-worker statistics (optional, for parallel execution)
    - File-level: per-file parsing metrics (optional)

    All collection methods are thread-safe using a lock.
    """

    def __init__(
        self,
        codebase_name: str,
        collect_file_metrics: bool = True,
        collect_worker_metrics: bool = True,
    ) -> None:
        """Initialize the metrics collector.

        Args:
            codebase_name: Name of the codebase being indexed
            collect_file_metrics: Whether to collect per-file metrics
            collect_worker_metrics: Whether to collect per-worker metrics
        """
        self._lock = threading.Lock()
        self._session_id = str(uuid.uuid4())
        self._codebase_name = codebase_name
        self._collect_file_metrics = collect_file_metrics
        self._collect_worker_metrics = collect_worker_metrics

        # Phase tracking
        self._phase_starts: dict[str, tuple[float, float]] = {}  # (time, memory)
        self._phase_metrics: dict[str, PhaseMetrics] = {}

        # File metrics (optional)
        self._file_metrics: list[FileParseMetrics] = []

        # Worker metrics (optional)
        self._worker_metrics: dict[int, WorkerMetrics] = {}

        # Session timing
        self._session_start = time.perf_counter()
        self._session_start_timestamp = time.time()

        # Aggregates (set during flush phases)
        self._total_nodes = 0
        self._total_relationships = 0
        self._total_files = 0

    def _get_memory_mb(self) -> float:
        """Get current RSS memory in MB (cross-platform).

        Returns:
            Current memory usage in megabytes, or 0.0 if unavailable.
        """
        try:
            process = psutil.Process()
            rss_bytes: int = process.memory_info().rss
            return float(rss_bytes) / 1024 / 1024
        except Exception:
            return 0.0

    def start_phase(self, phase: IndexingPhase | str) -> None:
        """Mark the start of a processing phase.

        Args:
            phase: The indexing phase (use IndexingPhase enum)
        """
        phase_name = str(phase)
        with self._lock:
            start_time = time.perf_counter()
            start_memory = self._get_memory_mb()
            self._phase_starts[phase_name] = (start_time, start_memory)

    def end_phase(self, phase: IndexingPhase | str, items_processed: int) -> None:
        """Mark the end of a processing phase.

        Args:
            phase: The indexing phase (use IndexingPhase enum)
            items_processed: Number of items processed in this phase
        """
        phase_name = str(phase)
        end_time = time.perf_counter()
        end_memory = self._get_memory_mb()

        with self._lock:
            if phase_name not in self._phase_starts:
                return

            start_time, start_memory = self._phase_starts[phase_name]
            duration = end_time - start_time
            peak_memory = max(start_memory, end_memory)

            # Calculate throughput (avoid division by zero)
            throughput = items_processed / duration if duration > 0 else 0.0

            # Create phase metrics
            self._phase_metrics[phase_name] = PhaseMetrics(
                phase_name=phase_name,
                start_time=self._session_start_timestamp
                + (start_time - self._session_start),
                end_time=self._session_start_timestamp
                + (end_time - self._session_start),
                duration_seconds=duration,
                items_processed=items_processed,
                throughput=throughput,
                memory_mb=peak_memory,
                worker_count=None,
                worker_metrics=None,
            )

            # Track files for definitions phase
            if phase_name == IndexingPhase.DEFINITIONS:
                self._total_files = items_processed

    def record_file_parse(self, metrics: FileParseMetrics) -> None:
        """Record metrics for a single file parse.

        Args:
            metrics: File parsing metrics
        """
        if not self._collect_file_metrics:
            return

        with self._lock:
            self._file_metrics.append(metrics)

    def record_worker_metrics(self, worker_id: int, metrics: WorkerMetrics) -> None:
        """Record metrics for a worker.

        Args:
            worker_id: Unique worker identifier
            metrics: Worker performance metrics
        """
        if not self._collect_worker_metrics:
            return

        with self._lock:
            self._worker_metrics[worker_id] = metrics

    def set_totals(self, nodes: int, relationships: int) -> None:
        """Set the total node and relationship counts.

        Args:
            nodes: Total number of nodes created
            relationships: Total number of relationships created
        """
        with self._lock:
            self._total_nodes = nodes
            self._total_relationships = relationships

    def get_metrics(self) -> IndexingMetrics:
        """Get complete metrics for the indexing session.

        Returns:
            Complete indexing metrics including all phases and aggregates.
        """
        with self._lock:
            total_duration = time.perf_counter() - self._session_start

            # Calculate average throughput from definitions phase
            avg_throughput = 0.0
            definitions_key = str(IndexingPhase.DEFINITIONS)
            if definitions_key in self._phase_metrics:
                avg_throughput = self._phase_metrics[definitions_key].throughput

            # Get peak memory across all phases
            peak_memory = max(
                (pm.memory_mb for pm in self._phase_metrics.values()),
                default=self._get_memory_mb(),
            )

            # Calculate parallelism efficiency if worker metrics available
            parallelism_efficiency = None
            if self._worker_metrics:
                worker_count = len(self._worker_metrics)
                if worker_count > 1:
                    # Efficiency = actual speedup / ideal speedup
                    # For now, use balanced work distribution as proxy
                    files_per_worker = [
                        w.files_processed for w in self._worker_metrics.values()
                    ]
                    if files_per_worker:
                        avg_files = sum(files_per_worker) / len(files_per_worker)
                        max_files = max(files_per_worker)
                        if max_files > 0:
                            parallelism_efficiency = avg_files / max_files

            return IndexingMetrics(
                session_id=self._session_id,
                codebase_name=self._codebase_name,
                total_duration_seconds=total_duration,
                phase_metrics=dict(self._phase_metrics),
                file_metrics=list(self._file_metrics),
                total_files=self._total_files,
                total_nodes=self._total_nodes,
                total_relationships=self._total_relationships,
                avg_throughput=avg_throughput,
                peak_memory_mb=peak_memory,
                parallelism_efficiency=parallelism_efficiency,
            )

    def export_json(self, path: Path) -> None:
        """Export metrics to JSON file.

        Args:
            path: Path to write JSON file
        """
        metrics = self.get_metrics()
        path.write_text(metrics.model_dump_json(indent=2))

    def export_csv(self, path: Path) -> None:
        """Export metrics to CSV file.

        Exports phase metrics and optionally file metrics as separate sections.

        Args:
            path: Path to write CSV file
        """
        metrics = self.get_metrics()

        with path.open("w", newline="") as f:
            writer = csv.writer(f)

            # Header section
            writer.writerow(["# Indexing Metrics"])
            writer.writerow(["Session ID", metrics.session_id])
            writer.writerow(["Codebase", metrics.codebase_name])
            writer.writerow(
                ["Total Duration (s)", f"{metrics.total_duration_seconds:.2f}"]
            )
            writer.writerow(["Total Files", metrics.total_files])
            writer.writerow(["Total Nodes", metrics.total_nodes])
            writer.writerow(["Total Relationships", metrics.total_relationships])
            writer.writerow(["Peak Memory (MB)", f"{metrics.peak_memory_mb:.1f}"])
            writer.writerow([])

            # Phase metrics
            writer.writerow(["# Phase Metrics"])
            writer.writerow(
                [
                    "Phase",
                    "Duration (s)",
                    "Items",
                    "Throughput (items/s)",
                    "Memory (MB)",
                ]
            )
            for phase in metrics.phase_metrics.values():
                writer.writerow(
                    [
                        phase.phase_name,
                        f"{phase.duration_seconds:.3f}",
                        phase.items_processed,
                        f"{phase.throughput:.1f}",
                        f"{phase.memory_mb:.1f}",
                    ]
                )
            writer.writerow([])

            # File metrics (if collected)
            if metrics.file_metrics:
                writer.writerow(["# File Metrics"])
                writer.writerow(
                    [
                        "File",
                        "Language",
                        "Size (bytes)",
                        "Parse Time (ms)",
                        "AST Nodes",
                        "Definitions",
                        "Relationships",
                    ]
                )
                for fm in metrics.file_metrics:
                    writer.writerow(
                        [
                            fm.file_path,
                            fm.language,
                            fm.file_size_bytes,
                            f"{fm.parse_time_ms:.2f}",
                            fm.ast_nodes,
                            fm.definitions_extracted,
                            fm.relationships_found,
                        ]
                    )
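As a reading aid only (not part of the packaged files), here is a minimal sketch of how the collector above might be driven during an indexing run; the codebase name, file path, and counts are illustrative placeholders, not values taken from the package.

```python
# Illustrative sketch: drive MetricsCollector through one phase and export.
# The concrete names and numbers here are hypothetical.
from pathlib import Path

from shotgun.codebase.core.metrics_collector import MetricsCollector
from shotgun.codebase.core.metrics_types import FileParseMetrics, IndexingPhase

collector = MetricsCollector(codebase_name="example-repo")

# start_phase records time/memory; end_phase computes duration and throughput.
collector.start_phase(IndexingPhase.DEFINITIONS)
collector.record_file_parse(
    FileParseMetrics(
        file_path="src/app.py",
        language="python",
        file_size_bytes=2048,
        parse_time_ms=3.5,
        ast_nodes=120,
        definitions_extracted=4,
        relationships_found=7,
    )
)
collector.end_phase(IndexingPhase.DEFINITIONS, items_processed=1)

collector.set_totals(nodes=5, relationships=7)
metrics = collector.get_metrics()
print(metrics.avg_throughput, metrics.peak_memory_mb)
collector.export_json(Path("indexing_metrics.json"))
```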
shotgun/codebase/core/metrics_types.py (new file)
@@ -0,0 +1,347 @@

"""Type definitions for indexing metrics collection.

These models define the data structures for tracking performance metrics
during codebase indexing operations, as well as work distribution types
for parallel file parsing.
"""

from __future__ import annotations

from enum import StrEnum
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field

from shotgun.codebase.models import NodeLabel, RelationshipType

__all__ = [
    "DistributionStats",
    "FileInfo",
    "FileParseMetrics",
    "FileParseResult",
    "FileParseTask",
    "IndexingMetrics",
    "IndexingPhase",
    "InheritanceData",
    "NodeData",
    "NodeLabel",
    "ParallelExecutionResult",
    "PhaseMetrics",
    "RawCallData",
    "RelationshipData",
    "RelationshipType",
    "WorkBatch",
    "WorkerMetrics",
]


class IndexingPhase(StrEnum):
    """Phase names for indexing operations."""

    STRUCTURE = "structure"
    DEFINITIONS = "definitions"
    RELATIONSHIPS = "relationships"
    FLUSH_NODES = "flush_nodes"
    FLUSH_RELATIONSHIPS = "flush_relationships"


class PhaseMetrics(BaseModel):
    """Metrics for a single execution phase."""

    phase_name: str = Field(..., description="Name of the phase")
    start_time: float = Field(..., description="Unix timestamp when phase started")
    end_time: float = Field(..., description="Unix timestamp when phase ended")
    duration_seconds: float = Field(..., description="Total duration in seconds")
    items_processed: int = Field(..., description="Number of items processed")
    throughput: float = Field(..., description="Items per second")
    memory_mb: float = Field(..., description="Peak memory usage in MB")

    # Worker-specific metrics (for parallel phases)
    worker_count: int | None = Field(None, description="Number of parallel workers")
    worker_metrics: dict[int, WorkerMetrics] | None = Field(
        None, description="Per-worker performance metrics"
    )


class WorkerMetrics(BaseModel):
    """Metrics for a single worker process."""

    worker_id: int = Field(..., description="Unique worker identifier")
    files_processed: int = Field(..., description="Files processed by this worker")
    nodes_created: int = Field(..., description="Nodes created by this worker")
    relationships_created: int = Field(..., description="Relationships created")
    duration_seconds: float = Field(..., description="Total processing time")
    throughput: float = Field(..., description="Files per second")
    peak_memory_mb: float = Field(..., description="Peak memory usage")
    idle_time_seconds: float = Field(..., description="Time spent waiting for work")
    error_count: int = Field(default=0, description="Number of errors encountered")


class FileParseMetrics(BaseModel):
    """Detailed metrics for parsing a single file."""

    file_path: str = Field(..., description="Relative path to file")
    language: str = Field(..., description="Programming language")
    file_size_bytes: int = Field(..., description="File size in bytes")
    parse_time_ms: float = Field(..., description="Time to parse file")
    ast_nodes: int = Field(..., description="Number of AST nodes")
    definitions_extracted: int = Field(
        ..., description="Classes, functions, methods found"
    )
    relationships_found: int = Field(..., description="Calls, imports found")
    worker_id: int | None = Field(None, description="Worker that processed this file")


class IndexingMetrics(BaseModel):
    """Complete metrics for the entire indexing operation."""

    session_id: str = Field(..., description="Unique session identifier")
    codebase_name: str = Field(..., description="Name of indexed codebase")
    total_duration_seconds: float = Field(..., description="End-to-end duration")

    # Phase-level metrics
    phase_metrics: dict[str, PhaseMetrics] = Field(
        default_factory=dict, description="Metrics for each indexing phase"
    )

    # File-level metrics
    file_metrics: list[FileParseMetrics] = Field(
        default_factory=list, description="Per-file parsing metrics"
    )

    # Aggregate statistics
    total_files: int = Field(..., description="Total files processed")
    total_nodes: int = Field(..., description="Total nodes created")
    total_relationships: int = Field(..., description="Total relationships created")

    # Performance metrics
    avg_throughput: float = Field(..., description="Average files per second")
    peak_memory_mb: float = Field(..., description="Peak memory usage")
    parallelism_efficiency: float | None = Field(
        None, description="Efficiency factor (0.0-1.0) of parallelization"
    )


# =============================================================================
# Work Distribution Types
# =============================================================================


class FileInfo(BaseModel):
    """Information about a file for work distribution.

    Used by WorkDistributor to calculate balanced work assignments
    based on file size.
    """

    file_path: Path = Field(..., description="Absolute path to file")
    relative_path: Path = Field(..., description="Path relative to repo root")
    language: str = Field(..., description="Programming language")
    module_qn: str = Field(..., description="Qualified name for the module")
    container_qn: str | None = Field(
        None, description="Parent package/folder qualified name"
    )
    file_size_bytes: int = Field(..., description="File size in bytes for balancing")

    model_config = {"arbitrary_types_allowed": True}


class FileParseTask(BaseModel):
    """A task representing a file to be parsed by a worker.

    This is the serializable unit of work sent to worker processes.
    """

    file_path: Path = Field(..., description="Absolute path to file")
    relative_path: Path = Field(..., description="Path relative to repo root")
    language: str = Field(..., description="Programming language")
    module_qn: str = Field(..., description="Qualified name for the module")
    container_qn: str | None = Field(
        None, description="Parent package/folder qualified name"
    )

    model_config = {"arbitrary_types_allowed": True}


class WorkBatch(BaseModel):
    """A batch of file parse tasks for distribution to a worker.

    Batches group multiple tasks together to reduce queue overhead
    when distributing work across processes.
    """

    batch_id: int = Field(..., description="Unique batch identifier")
    tasks: list[FileParseTask] = Field(..., description="Tasks in this batch")
    estimated_duration_seconds: float | None = Field(
        None, description="Estimated processing time"
    )


class DistributionStats(BaseModel):
    """Statistics about work distribution across workers.

    Provides insight into how files are balanced across workers
    for debugging and verification.
    """

    total_files: int = Field(..., description="Total number of files")
    total_bytes: int = Field(..., description="Total size in bytes")
    worker_count: int = Field(..., description="Number of workers")
    batch_size: int = Field(..., description="Files per batch")
    files_per_worker: list[int] = Field(
        ..., description="Number of files assigned to each worker"
    )
    bytes_per_worker: list[int] = Field(
        ..., description="Total bytes assigned to each worker"
    )


# =============================================================================
# Parallel Execution Types
# =============================================================================


class NodeData(BaseModel):
    """Data for creating a graph node.

    Used by workers to return extracted node information without
    direct database access. Use NodeLabel enum values for the label field.
    """

    label: str = Field(..., description="Node type from NodeLabel enum")
    properties: dict[str, Any] = Field(..., description="Node properties")


class RelationshipData(BaseModel):
    """Data for creating a graph relationship.

    Used by workers to return extracted relationship information
    without direct database access. Use NodeLabel enum for label fields
    and RelationshipType enum for rel_type.
    """

    from_label: str = Field(..., description="Source node type from NodeLabel enum")
    from_key: str = Field(..., description="Source node primary key field")
    from_value: Any = Field(..., description="Source node primary key value")
    rel_type: str = Field(
        ..., description="Relationship type from RelationshipType enum"
    )
    to_label: str = Field(..., description="Target node type from NodeLabel enum")
    to_key: str = Field(..., description="Target node primary key field")
    to_value: Any = Field(..., description="Target node primary key value")
    properties: dict[str, Any] | None = Field(
        None, description="Relationship properties"
    )


class RawCallData(BaseModel):
    """Raw call information extracted by worker (unresolved).

    Call relationships cannot be fully resolved in workers because
    they require the complete function_registry and simple_name_lookup
    which are built by aggregating data from all workers.
    """

    caller_qn: str = Field(..., description="Qualified name of caller function/method")
    callee_name: str = Field(..., description="Simple name of called function")
    object_name: str | None = Field(
        None, description="Object the method is called on (if method call)"
    )
    line_number: int = Field(..., description="Line number of the call")
    module_qn: str = Field(..., description="Module qualified name for context")


class InheritanceData(BaseModel):
    """Raw inheritance information extracted by worker.

    Inheritance relationships require resolution against the global
    registry to find the actual parent class qualified names.
    """

    child_class_qn: str = Field(..., description="Qualified name of child class")
    parent_simple_names: list[str] = Field(
        ..., description="Simple names of parent classes (need resolution)"
    )


class FileParseResult(BaseModel):
    """Result of parsing a single file.

    Contains all data extracted by a worker from a single file,
    including nodes, relationships, and deferred relationship data
    that requires post-aggregation resolution.
    """

    task: FileParseTask = Field(..., description="Original task")
    success: bool = Field(..., description="Whether parsing succeeded")
    error: str | None = Field(None, description="Error message if failed")

    # Extracted nodes and direct relationships
    nodes: list[NodeData] = Field(
        default_factory=list, description="Nodes extracted from file"
    )
    relationships: list[RelationshipData] = Field(
        default_factory=list, description="Direct relationships extracted"
    )

    # Registry data for aggregation
    function_registry_entries: dict[str, str] = Field(
        default_factory=dict,
        description="Map of qualified_name -> type (Class/Function/Method)",
    )
    simple_name_entries: dict[str, list[str]] = Field(
        default_factory=dict,
        description="Map of simple_name -> list of qualified_names",
    )

    # Deferred relationship data (requires post-aggregation resolution)
    raw_calls: list[RawCallData] = Field(
        default_factory=list, description="Unresolved call data"
    )
    inheritance_data: list[InheritanceData] = Field(
        default_factory=list, description="Unresolved inheritance data"
    )

    # File metadata
    file_hash: str = Field(default="", description="SHA256 hash of file content")
    mtime: int = Field(default=0, description="File modification time")

    # Metrics
    metrics: FileParseMetrics | None = Field(
        None, description="Parsing metrics for this file"
    )

    model_config = {"arbitrary_types_allowed": True}


class ParallelExecutionResult(BaseModel):
    """Complete results from parallel execution.

    Aggregates results from all workers including resolved relationships
    and merged registries.
    """

    results: list[FileParseResult] = Field(
        default_factory=list, description="Results from all files"
    )
    resolved_relationships: list[RelationshipData] = Field(
        default_factory=list, description="Relationships resolved post-aggregation"
    )
    function_registry: dict[str, str] = Field(
        default_factory=dict, description="Merged function registry from all workers"
    )
    simple_name_lookup: dict[str, list[str]] = Field(
        default_factory=dict, description="Merged simple name lookup from all workers"
    )

    # Metrics
    total_files: int = Field(default=0, description="Total files processed")
    successful_files: int = Field(default=0, description="Files successfully parsed")
    failed_files: int = Field(default=0, description="Files that failed to parse")
    total_duration_seconds: float = Field(
        default=0.0, description="Total execution duration"
    )
    worker_metrics: dict[int, WorkerMetrics] = Field(
        default_factory=dict, description="Per-worker metrics"
    )