shotgun-sh 0.4.0.dev1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. shotgun/agents/agent_manager.py +307 -8
  2. shotgun/agents/cancellation.py +103 -0
  3. shotgun/agents/common.py +12 -0
  4. shotgun/agents/config/README.md +0 -1
  5. shotgun/agents/config/manager.py +10 -7
  6. shotgun/agents/config/models.py +5 -27
  7. shotgun/agents/config/provider.py +44 -27
  8. shotgun/agents/conversation/history/token_counting/base.py +51 -9
  9. shotgun/agents/file_read.py +176 -0
  10. shotgun/agents/messages.py +15 -3
  11. shotgun/agents/models.py +24 -1
  12. shotgun/agents/router/models.py +8 -0
  13. shotgun/agents/router/tools/delegation_tools.py +55 -1
  14. shotgun/agents/router/tools/plan_tools.py +88 -7
  15. shotgun/agents/runner.py +17 -2
  16. shotgun/agents/tools/__init__.py +8 -0
  17. shotgun/agents/tools/codebase/directory_lister.py +27 -39
  18. shotgun/agents/tools/codebase/file_read.py +26 -35
  19. shotgun/agents/tools/codebase/query_graph.py +9 -0
  20. shotgun/agents/tools/codebase/retrieve_code.py +9 -0
  21. shotgun/agents/tools/file_management.py +32 -2
  22. shotgun/agents/tools/file_read_tools/__init__.py +7 -0
  23. shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
  24. shotgun/agents/tools/markdown_tools/__init__.py +62 -0
  25. shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
  26. shotgun/agents/tools/markdown_tools/models.py +86 -0
  27. shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
  28. shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
  29. shotgun/agents/tools/markdown_tools/utils.py +453 -0
  30. shotgun/agents/tools/registry.py +44 -6
  31. shotgun/agents/tools/web_search/openai.py +42 -23
  32. shotgun/attachments/__init__.py +41 -0
  33. shotgun/attachments/errors.py +60 -0
  34. shotgun/attachments/models.py +107 -0
  35. shotgun/attachments/parser.py +257 -0
  36. shotgun/attachments/processor.py +193 -0
  37. shotgun/build_constants.py +4 -7
  38. shotgun/cli/clear.py +2 -2
  39. shotgun/cli/codebase/commands.py +181 -65
  40. shotgun/cli/compact.py +2 -2
  41. shotgun/cli/context.py +2 -2
  42. shotgun/cli/error_handler.py +2 -2
  43. shotgun/cli/run.py +90 -0
  44. shotgun/cli/spec/backup.py +2 -1
  45. shotgun/codebase/__init__.py +2 -0
  46. shotgun/codebase/benchmarks/__init__.py +35 -0
  47. shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
  48. shotgun/codebase/benchmarks/exporters.py +119 -0
  49. shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
  50. shotgun/codebase/benchmarks/formatters/base.py +34 -0
  51. shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
  52. shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
  53. shotgun/codebase/benchmarks/models.py +129 -0
  54. shotgun/codebase/core/__init__.py +4 -0
  55. shotgun/codebase/core/call_resolution.py +91 -0
  56. shotgun/codebase/core/change_detector.py +11 -6
  57. shotgun/codebase/core/errors.py +159 -0
  58. shotgun/codebase/core/extractors/__init__.py +23 -0
  59. shotgun/codebase/core/extractors/base.py +138 -0
  60. shotgun/codebase/core/extractors/factory.py +63 -0
  61. shotgun/codebase/core/extractors/go/__init__.py +7 -0
  62. shotgun/codebase/core/extractors/go/extractor.py +122 -0
  63. shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
  64. shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
  65. shotgun/codebase/core/extractors/protocol.py +109 -0
  66. shotgun/codebase/core/extractors/python/__init__.py +7 -0
  67. shotgun/codebase/core/extractors/python/extractor.py +141 -0
  68. shotgun/codebase/core/extractors/rust/__init__.py +7 -0
  69. shotgun/codebase/core/extractors/rust/extractor.py +139 -0
  70. shotgun/codebase/core/extractors/types.py +15 -0
  71. shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
  72. shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
  73. shotgun/codebase/core/gitignore.py +252 -0
  74. shotgun/codebase/core/ingestor.py +644 -354
  75. shotgun/codebase/core/kuzu_compat.py +119 -0
  76. shotgun/codebase/core/language_config.py +239 -0
  77. shotgun/codebase/core/manager.py +256 -46
  78. shotgun/codebase/core/metrics_collector.py +310 -0
  79. shotgun/codebase/core/metrics_types.py +347 -0
  80. shotgun/codebase/core/parallel_executor.py +424 -0
  81. shotgun/codebase/core/work_distributor.py +254 -0
  82. shotgun/codebase/core/worker.py +768 -0
  83. shotgun/codebase/indexing_state.py +86 -0
  84. shotgun/codebase/models.py +94 -0
  85. shotgun/codebase/service.py +13 -0
  86. shotgun/exceptions.py +9 -9
  87. shotgun/main.py +3 -16
  88. shotgun/posthog_telemetry.py +165 -24
  89. shotgun/prompts/agents/file_read.j2 +48 -0
  90. shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +19 -47
  91. shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
  92. shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
  93. shotgun/prompts/agents/partials/router_delegation_mode.j2 +21 -22
  94. shotgun/prompts/agents/plan.j2 +14 -0
  95. shotgun/prompts/agents/router.j2 +531 -258
  96. shotgun/prompts/agents/specify.j2 +14 -0
  97. shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
  98. shotgun/prompts/agents/state/system_state.j2 +13 -11
  99. shotgun/prompts/agents/tasks.j2 +14 -0
  100. shotgun/settings.py +49 -10
  101. shotgun/tui/app.py +149 -18
  102. shotgun/tui/commands/__init__.py +9 -1
  103. shotgun/tui/components/attachment_bar.py +87 -0
  104. shotgun/tui/components/prompt_input.py +25 -28
  105. shotgun/tui/components/status_bar.py +14 -7
  106. shotgun/tui/dependencies.py +3 -8
  107. shotgun/tui/protocols.py +18 -0
  108. shotgun/tui/screens/chat/chat.tcss +15 -0
  109. shotgun/tui/screens/chat/chat_screen.py +766 -235
  110. shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
  111. shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
  112. shotgun/tui/screens/chat_screen/command_providers.py +0 -10
  113. shotgun/tui/screens/chat_screen/history/chat_history.py +54 -14
  114. shotgun/tui/screens/chat_screen/history/formatters.py +22 -0
  115. shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
  116. shotgun/tui/screens/database_locked_dialog.py +219 -0
  117. shotgun/tui/screens/database_timeout_dialog.py +158 -0
  118. shotgun/tui/screens/kuzu_error_dialog.py +135 -0
  119. shotgun/tui/screens/model_picker.py +1 -3
  120. shotgun/tui/screens/models.py +11 -0
  121. shotgun/tui/state/processing_state.py +19 -0
  122. shotgun/tui/widgets/widget_coordinator.py +18 -0
  123. shotgun/utils/file_system_utils.py +4 -1
  124. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/METADATA +87 -34
  125. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/RECORD +128 -79
  126. shotgun/cli/export.py +0 -81
  127. shotgun/cli/plan.py +0 -73
  128. shotgun/cli/research.py +0 -93
  129. shotgun/cli/specify.py +0 -70
  130. shotgun/cli/tasks.py +0 -78
  131. shotgun/sentry_telemetry.py +0 -232
  132. shotgun/tui/screens/onboarding.py +0 -584
  133. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/WHEEL +0 -0
  134. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/entry_points.txt +0 -0
  135. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,424 @@
1
+ """Parallel execution framework for file parsing.
2
+
3
+ This module provides the ParallelExecutor class for distributing
4
+ file parsing work across multiple threads using ThreadPoolExecutor.
5
+
6
+ Note: We use threads instead of processes because multiprocessing has
7
+ file descriptor inheritance issues when running from TUI environments
8
+ (Textual opens FDs that cause "bad value(s) in fds_to_keep" errors).
9
+ Threads avoid this issue entirely and still provide concurrency benefits
10
+ for I/O-bound operations like file reading.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import time
16
+ from collections import defaultdict
17
+ from collections.abc import Callable
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from typing import TYPE_CHECKING
20
+
21
+ from shotgun.codebase.core.call_resolution import calculate_callee_confidence
22
+ from shotgun.codebase.core.metrics_types import (
23
+ FileParseResult,
24
+ InheritanceData,
25
+ NodeLabel,
26
+ ParallelExecutionResult,
27
+ RawCallData,
28
+ RelationshipData,
29
+ RelationshipType,
30
+ WorkBatch,
31
+ WorkerMetrics,
32
+ )
33
+ from shotgun.codebase.core.work_distributor import get_worker_count
34
+ from shotgun.codebase.core.worker import process_batch
35
+ from shotgun.logging_config import get_logger
36
+
37
+ if TYPE_CHECKING:
38
+ from shotgun.codebase.core.metrics_collector import MetricsCollector
39
+
40
+ logger = get_logger(__name__)
41
+
42
+ # Default timeout for batch processing (5 minutes)
43
+ DEFAULT_BATCH_TIMEOUT_SECONDS = 300.0
44
+
45
+
46
class ParallelExecutor:
    """Executes file parsing concurrently across multiple threads.

    This class orchestrates concurrent file parsing using ThreadPoolExecutor,
    aggregates results from all workers, and resolves deferred relationships
    (CALLS / INHERITS) that require knowledge of the complete function
    registry built from every parsed file.

    Note: Uses threads instead of processes to avoid file descriptor
    inheritance issues when running from TUI environments.

    Attributes:
        worker_count: Number of worker threads to use
        batch_timeout: Timeout in seconds for each batch
        metrics_collector: Optional collector for recording metrics
    """

    def __init__(
        self,
        worker_count: int | None = None,
        batch_timeout_seconds: float = DEFAULT_BATCH_TIMEOUT_SECONDS,
        metrics_collector: MetricsCollector | None = None,
    ) -> None:
        """Initialize the parallel executor.

        Args:
            worker_count: Number of workers. If None, uses get_worker_count().
            batch_timeout_seconds: Timeout for batch processing.
            metrics_collector: Optional collector for recording metrics.
        """
        self.worker_count = (
            worker_count if worker_count is not None else get_worker_count()
        )
        self.batch_timeout = batch_timeout_seconds
        self.metrics_collector = metrics_collector

        logger.debug(
            f"ParallelExecutor initialized: {self.worker_count} workers, "
            f"{self.batch_timeout}s batch timeout"
        )

    def execute(
        self,
        batches: list[WorkBatch],
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> ParallelExecutionResult:
        """Execute batches in parallel and aggregate results.

        Args:
            batches: List of work batches to process
            progress_callback: Optional callback(completed, total) for progress

        Returns:
            ParallelExecutionResult with all results and resolved relationships
        """
        if not batches:
            logger.debug("No batches to process")
            return ParallelExecutionResult()

        start_time = time.perf_counter()
        total_batches = len(batches)
        all_results: list[FileParseResult] = []
        worker_stats: dict[int, dict[str, int | float]] = defaultdict(
            lambda: {
                "files_processed": 0,
                "nodes_created": 0,
                "relationships_created": 0,
                "duration_seconds": 0.0,
                "error_count": 0,
            }
        )

        logger.info(
            f"Starting threaded execution: {total_batches} batches, "
            f"{self.worker_count} threads"
        )

        # Execute batches using threads (avoids multiprocessing fd issues)
        completed = 0
        with ThreadPoolExecutor(max_workers=self.worker_count) as executor:
            # Submit all batches with worker_id based on submission order.
            # worker_id only attributes stats; it does not pin a batch to a
            # specific thread.
            submissions = []
            for i, batch in enumerate(batches):
                worker_id = i % self.worker_count
                future = executor.submit(process_batch, batch, worker_id)
                submissions.append((future, batch, worker_id))

            # Collect results in submission order.  BUGFIX: the previous
            # implementation iterated as_completed(futures) and then called
            # future.result(timeout=...).  Futures yielded by as_completed()
            # are already finished, so result() returned immediately — the
            # configured batch timeout could never fire and the TimeoutError
            # branch was dead code.  Calling result(timeout=...) on each
            # still-pending future makes the timeout actually enforceable.
            # A timed-out batch's thread keeps running (threads cannot be
            # cancelled); its eventual result is discarded, and executor
            # shutdown still waits for it on context-manager exit.
            for future, batch, worker_id in submissions:
                try:
                    batch_results = future.result(timeout=self.batch_timeout)
                    all_results.extend(batch_results)

                    # Update worker stats
                    for result in batch_results:
                        worker_stats[worker_id]["files_processed"] += 1
                        worker_stats[worker_id]["nodes_created"] += len(result.nodes)
                        worker_stats[worker_id]["relationships_created"] += len(
                            result.relationships
                        )
                        if not result.success:
                            worker_stats[worker_id]["error_count"] += 1

                except TimeoutError:
                    # Mark every task in the timed-out batch as failed so the
                    # caller sees a complete per-file accounting.
                    logger.warning(
                        f"Batch {batch.batch_id} timed out after {self.batch_timeout}s"
                    )
                    for task in batch.tasks:
                        all_results.append(
                            FileParseResult(
                                task=task,
                                success=False,
                                error=f"Timeout after {self.batch_timeout}s",
                            )
                        )
                    worker_stats[worker_id]["error_count"] += 1

                except Exception as e:
                    # A worker raised: record the whole batch as failed rather
                    # than aborting the run.
                    logger.error(f"Batch {batch.batch_id} failed: {e}")
                    for task in batch.tasks:
                        all_results.append(
                            FileParseResult(
                                task=task,
                                success=False,
                                error=str(e),
                            )
                        )
                    worker_stats[worker_id]["error_count"] += 1

                completed += 1
                if progress_callback:
                    progress_callback(completed, total_batches)

        total_duration = time.perf_counter() - start_time
        logger.info(f"Parallel execution completed in {total_duration:.2f}s")

        # Aggregate registries from all results
        function_registry, simple_name_lookup = self._aggregate_registries(all_results)

        logger.info(
            f"Aggregated registry: {len(function_registry)} entries, "
            f"{len(simple_name_lookup)} unique names"
        )

        # Resolve deferred relationships
        resolved_relationships = self._resolve_all_relationships(
            all_results, function_registry, simple_name_lookup
        )

        logger.info(f"Resolved {len(resolved_relationships)} deferred relationships")

        # Calculate final stats
        successful_files = sum(1 for r in all_results if r.success)
        failed_files = sum(1 for r in all_results if not r.success)

        # Build worker metrics.  Per-worker wall time is not tracked
        # individually; approximate it as an even share of the total duration.
        worker_metrics = {}
        for worker_id, stats in worker_stats.items():
            files = int(stats["files_processed"])
            nodes = int(stats["nodes_created"])
            rels = int(stats["relationships_created"])
            errors = int(stats["error_count"])
            duration = total_duration / max(1, self.worker_count)  # Estimate per worker
            worker_metrics[worker_id] = WorkerMetrics(
                worker_id=worker_id,
                files_processed=files,
                nodes_created=nodes,
                relationships_created=rels,
                duration_seconds=duration,
                throughput=files / duration if duration > 0 else 0,
                peak_memory_mb=0.0,  # Would need per-thread memory tracking
                idle_time_seconds=0.0,  # Would need more detailed tracking
                error_count=errors,
            )

        return ParallelExecutionResult(
            results=all_results,
            resolved_relationships=resolved_relationships,
            function_registry=function_registry,
            simple_name_lookup=simple_name_lookup,
            total_files=len(all_results),
            successful_files=successful_files,
            failed_files=failed_files,
            total_duration_seconds=total_duration,
            worker_metrics=worker_metrics,
        )

    def _aggregate_registries(
        self,
        results: list[FileParseResult],
    ) -> tuple[dict[str, str], dict[str, list[str]]]:
        """Merge function_registry and simple_name_lookup from all workers.

        Args:
            results: Results from all workers

        Returns:
            Tuple of (function_registry, simple_name_lookup)
        """
        function_registry: dict[str, str] = {}
        simple_name_lookup: dict[str, list[str]] = defaultdict(list)

        for result in results:
            if not result.success:
                continue

            # Merge function registry
            function_registry.update(result.function_registry_entries)

            # Merge simple name lookup, de-duplicating qualified names while
            # preserving first-seen order.
            for name, qns in result.simple_name_entries.items():
                for qn in qns:
                    if qn not in simple_name_lookup[name]:
                        simple_name_lookup[name].append(qn)

        return function_registry, dict(simple_name_lookup)

    def _resolve_all_relationships(
        self,
        results: list[FileParseResult],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve all deferred relationships.

        Args:
            results: Results containing raw call and inheritance data
            function_registry: Merged registry from all workers
            simple_name_lookup: Merged name lookup from all workers

        Returns:
            List of resolved RelationshipData
        """
        resolved: list[RelationshipData] = []

        # Collect all raw data from successful parses only
        all_raw_calls: list[RawCallData] = []
        all_inheritance: list[InheritanceData] = []

        for result in results:
            if result.success:
                all_raw_calls.extend(result.raw_calls)
                all_inheritance.extend(result.inheritance_data)

        # Resolve call relationships
        call_rels = self._resolve_call_relationships(
            all_raw_calls, function_registry, simple_name_lookup
        )
        resolved.extend(call_rels)

        # Resolve inheritance relationships
        inheritance_rels = self._resolve_inheritance_relationships(
            all_inheritance, function_registry, simple_name_lookup
        )
        resolved.extend(inheritance_rels)

        return resolved

    def _resolve_call_relationships(
        self,
        raw_calls: list[RawCallData],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve raw calls to CALLS relationships.

        Args:
            raw_calls: List of unresolved call data
            function_registry: Complete function registry
            simple_name_lookup: Complete name lookup

        Returns:
            List of resolved CALLS relationships
        """
        resolved: list[RelationshipData] = []

        for call in raw_calls:
            # Get all possible callees; unknown names are silently skipped
            possible_callees = simple_name_lookup.get(call.callee_name, [])
            if not possible_callees:
                continue

            # Calculate confidence scores and pick best match
            scored_callees = []
            for possible_qn in possible_callees:
                score = calculate_callee_confidence(
                    caller_qn=call.caller_qn,
                    callee_qn=possible_qn,
                    module_qn=call.module_qn,
                    object_name=call.object_name,
                    simple_name_lookup=simple_name_lookup,
                )
                scored_callees.append((possible_qn, score))

            # Sort by confidence and use highest match
            scored_callees.sort(key=lambda x: x[1], reverse=True)
            callee_qn, _confidence = scored_callees[0]

            # Get node labels from registry; only emit the edge when both
            # endpoints are known entities.
            caller_type = function_registry.get(call.caller_qn)
            callee_type = function_registry.get(callee_qn)

            if caller_type and callee_type:
                resolved.append(
                    RelationshipData(
                        from_label=caller_type,
                        from_key="qualified_name",
                        from_value=call.caller_qn,
                        rel_type=RelationshipType.CALLS,
                        to_label=callee_type,
                        to_key="qualified_name",
                        to_value=callee_qn,
                    )
                )

        return resolved

    def _resolve_inheritance_relationships(
        self,
        inheritance_data: list[InheritanceData],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve raw inheritance to INHERITS relationships.

        Args:
            inheritance_data: List of unresolved inheritance data
            function_registry: Complete function registry
            simple_name_lookup: Complete name lookup

        Returns:
            List of resolved INHERITS relationships
        """
        resolved: list[RelationshipData] = []

        for data in inheritance_data:
            child_qn = data.child_class_qn

            for parent_name in data.parent_simple_names:
                # Check if parent exists directly in registry
                if parent_name in function_registry:
                    resolved.append(
                        RelationshipData(
                            from_label=NodeLabel.CLASS,
                            from_key="qualified_name",
                            from_value=child_qn,
                            rel_type=RelationshipType.INHERITS,
                            to_label=NodeLabel.CLASS,
                            to_key="qualified_name",
                            to_value=parent_name,
                        )
                    )
                else:
                    # Try simple name lookup on the last path component
                    parent_simple = parent_name.split(".")[-1]
                    possible_parents = simple_name_lookup.get(parent_simple, [])

                    # Filter to only classes
                    class_parents = [
                        p
                        for p in possible_parents
                        if function_registry.get(p) == NodeLabel.CLASS
                    ]

                    # Only emit an edge on an unambiguous match; ambiguous
                    # parents are dropped rather than guessed.
                    if len(class_parents) == 1:
                        resolved.append(
                            RelationshipData(
                                from_label=NodeLabel.CLASS,
                                from_key="qualified_name",
                                from_value=child_qn,
                                rel_type=RelationshipType.INHERITS,
                                to_label=NodeLabel.CLASS,
                                to_key="qualified_name",
                                to_value=class_parents[0],
                            )
                        )

        return resolved
@@ -0,0 +1,254 @@
1
+ """Work distribution system for parallel file parsing.
2
+
3
+ This module provides infrastructure for partitioning file parsing tasks
4
+ across workers with size-balanced distribution for optimal load balancing.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import multiprocessing
10
+
11
+ from shotgun.codebase.core.metrics_types import (
12
+ DistributionStats,
13
+ FileInfo,
14
+ FileParseTask,
15
+ WorkBatch,
16
+ )
17
+ from shotgun.logging_config import get_logger
18
+ from shotgun.settings import settings
19
+
20
+ logger = get_logger(__name__)
21
+
22
+ # Default values
23
+ DEFAULT_BATCH_SIZE = 20
24
+
25
+ # Re-export types for convenience
26
+ __all__ = [
27
+ "DEFAULT_BATCH_SIZE",
28
+ "DistributionStats",
29
+ "FileInfo",
30
+ "FileParseTask",
31
+ "WorkBatch",
32
+ "WorkDistributor",
33
+ "get_batch_size",
34
+ "get_worker_count",
35
+ ]
36
+
37
+
38
def get_worker_count() -> int:
    """Determine optimal worker count for parallel execution.

    Uses settings override if set, otherwise uses adaptive
    defaults based on CPU count:
    - For 4+ cores: max(2, cpu_count - 2)
    - For 1-3 cores: max(1, cpu_count - 1)

    Returns:
        Number of workers to use for parallel execution.
    """
    # Explicit configuration wins over the adaptive heuristic.
    override = settings.indexing.index_workers
    if override is not None:
        chosen = max(1, override)
        logger.debug(f"Worker count from SHOTGUN_INDEX_WORKERS: {chosen}")
        return chosen

    # Leave headroom for the rest of the process: two spare cores on larger
    # machines, one spare core on small ones.
    cores = multiprocessing.cpu_count()
    chosen = max(2, cores - 2) if cores >= 4 else max(1, cores - 1)
    logger.debug(f"Worker count (adaptive): {chosen} (CPU count: {cores})")
    return chosen
63
+
64
+
65
def get_batch_size() -> int:
    """Get the batch size for grouping file parsing tasks.

    Checks settings for override, otherwise returns the default of 20 files per batch.

    Returns:
        Number of files to include in each work batch.
    """
    override = settings.indexing.index_batch_size
    if override is None:
        return DEFAULT_BATCH_SIZE

    # Clamp configured values to at least one file per batch.
    size = max(1, override)
    logger.debug(f"Batch size from SHOTGUN_INDEX_BATCH_SIZE: {size}")
    return size
79
+
80
+
81
class WorkDistributor:
    """Distributes file parsing work across workers using size-balanced partitioning.

    Uses a bin-packing algorithm to ensure even work distribution:
    1. Sort files by size (descending)
    2. Assign each file to worker with least total work
    3. Group into batches for reduced queue overhead

    This approach ensures large files don't bottleneck single workers
    and workers finish at approximately the same time.
    """

    def __init__(
        self, worker_count: int | None = None, batch_size: int | None = None
    ) -> None:
        """Initialize the work distributor.

        Args:
            worker_count: Number of workers. If None, uses get_worker_count().
            batch_size: Files per batch. If None, uses get_batch_size().
        """
        if worker_count is None:
            worker_count = get_worker_count()
        if batch_size is None:
            batch_size = get_batch_size()

        # Clamp both knobs to at least 1 regardless of where they came from.
        self.worker_count = max(1, worker_count)
        self.batch_size = max(1, batch_size)

        logger.debug(
            f"WorkDistributor initialized: {self.worker_count} workers, "
            f"batch size {self.batch_size}"
        )

    def _distribute_files(
        self, files: list[FileInfo]
    ) -> list[tuple[int, list[FileInfo]]]:
        """Distribute files across workers using size-balanced bin-packing.

        Args:
            files: List of files to distribute.

        Returns:
            List of (total_bytes, file_list) tuples, one per worker.
        """
        # One (accumulated_bytes, assigned_files) bucket per worker.
        buckets: list[tuple[int, list[FileInfo]]] = [
            (0, []) for _ in range(self.worker_count)
        ]

        # Greedy bin-packing: walk files largest-first and always drop the
        # next file into the currently lightest bucket.
        for info in sorted(files, key=lambda f: f.file_size_bytes, reverse=True):
            lightest = min(range(self.worker_count), key=lambda idx: buckets[idx][0])
            accumulated, assigned = buckets[lightest]
            assigned.append(info)
            buckets[lightest] = (accumulated + info.file_size_bytes, assigned)

        return buckets

    def create_batches(self, files: list[FileInfo]) -> list[WorkBatch]:
        """Partition files into balanced batches for parallel processing.

        Uses size-balanced bin-packing to ensure even work distribution:
        1. Sort files by size (descending)
        2. Assign each file to worker with least total work
        3. Group into batches for reduced queue overhead

        Args:
            files: List of files to distribute across workers.

        Returns:
            List of WorkBatch objects containing FileParseTask items,
            balanced across workers and grouped into batches.
        """
        if not files:
            logger.debug("create_batches called with empty file list")
            return []

        logger.debug(
            f"Distributing {len(files)} files across {self.worker_count} workers"
        )

        buckets = self._distribute_files(files)

        # Log distribution statistics per worker bucket.
        for worker_id, (total_bytes, worker_files) in enumerate(buckets):
            logger.debug(
                f"Worker {worker_id}: {len(worker_files)} files, "
                f"{total_bytes / 1024:.1f} KB total"
            )

        # Slice each worker's assignment into fixed-size batches with
        # globally unique, sequential batch ids.
        batches: list[WorkBatch] = []
        next_id = 0
        for _, worker_files in buckets:
            for start in range(0, len(worker_files), self.batch_size):
                chunk = worker_files[start : start + self.batch_size]
                if not chunk:
                    continue
                batches.append(
                    WorkBatch(
                        batch_id=next_id,
                        tasks=[self._file_to_task(f) for f in chunk],
                        estimated_duration_seconds=None,
                    )
                )
                next_id += 1

        logger.debug(f"Created {len(batches)} batches from {len(files)} files")
        return batches

    def _file_to_task(self, file_info: FileInfo) -> FileParseTask:
        """Convert FileInfo to FileParseTask for worker consumption.

        Args:
            file_info: File information with size data.

        Returns:
            FileParseTask suitable for sending to worker processes.
        """
        return FileParseTask(
            file_path=file_info.file_path,
            relative_path=file_info.relative_path,
            language=file_info.language,
            module_qn=file_info.module_qn,
            container_qn=file_info.container_qn,
        )

    def get_distribution_stats(self, files: list[FileInfo]) -> DistributionStats:
        """Get statistics about how files would be distributed.

        Useful for debugging and verification without creating actual batches.

        Args:
            files: List of files to analyze.

        Returns:
            DistributionStats with distribution information.
        """
        if not files:
            # Degenerate case: report empty per-worker columns.
            return DistributionStats(
                total_files=0,
                total_bytes=0,
                worker_count=self.worker_count,
                batch_size=self.batch_size,
                files_per_worker=[0] * self.worker_count,
                bytes_per_worker=[0] * self.worker_count,
            )

        # Reuse the real distribution logic so stats match actual batching.
        buckets = self._distribute_files(files)

        return DistributionStats(
            total_files=len(files),
            total_bytes=sum(f.file_size_bytes for f in files),
            worker_count=self.worker_count,
            batch_size=self.batch_size,
            files_per_worker=[len(assigned) for _, assigned in buckets],
            bytes_per_worker=[accumulated for accumulated, _ in buckets],
        )