shotgun-sh 0.4.0.dev1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. shotgun/agents/agent_manager.py +307 -8
  2. shotgun/agents/cancellation.py +103 -0
  3. shotgun/agents/common.py +12 -0
  4. shotgun/agents/config/README.md +0 -1
  5. shotgun/agents/config/manager.py +10 -7
  6. shotgun/agents/config/models.py +5 -27
  7. shotgun/agents/config/provider.py +44 -27
  8. shotgun/agents/conversation/history/token_counting/base.py +51 -9
  9. shotgun/agents/file_read.py +176 -0
  10. shotgun/agents/messages.py +15 -3
  11. shotgun/agents/models.py +24 -1
  12. shotgun/agents/router/models.py +8 -0
  13. shotgun/agents/router/tools/delegation_tools.py +55 -1
  14. shotgun/agents/router/tools/plan_tools.py +88 -7
  15. shotgun/agents/runner.py +17 -2
  16. shotgun/agents/tools/__init__.py +8 -0
  17. shotgun/agents/tools/codebase/directory_lister.py +27 -39
  18. shotgun/agents/tools/codebase/file_read.py +26 -35
  19. shotgun/agents/tools/codebase/query_graph.py +9 -0
  20. shotgun/agents/tools/codebase/retrieve_code.py +9 -0
  21. shotgun/agents/tools/file_management.py +32 -2
  22. shotgun/agents/tools/file_read_tools/__init__.py +7 -0
  23. shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
  24. shotgun/agents/tools/markdown_tools/__init__.py +62 -0
  25. shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
  26. shotgun/agents/tools/markdown_tools/models.py +86 -0
  27. shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
  28. shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
  29. shotgun/agents/tools/markdown_tools/utils.py +453 -0
  30. shotgun/agents/tools/registry.py +44 -6
  31. shotgun/agents/tools/web_search/openai.py +42 -23
  32. shotgun/attachments/__init__.py +41 -0
  33. shotgun/attachments/errors.py +60 -0
  34. shotgun/attachments/models.py +107 -0
  35. shotgun/attachments/parser.py +257 -0
  36. shotgun/attachments/processor.py +193 -0
  37. shotgun/build_constants.py +4 -7
  38. shotgun/cli/clear.py +2 -2
  39. shotgun/cli/codebase/commands.py +181 -65
  40. shotgun/cli/compact.py +2 -2
  41. shotgun/cli/context.py +2 -2
  42. shotgun/cli/error_handler.py +2 -2
  43. shotgun/cli/run.py +90 -0
  44. shotgun/cli/spec/backup.py +2 -1
  45. shotgun/codebase/__init__.py +2 -0
  46. shotgun/codebase/benchmarks/__init__.py +35 -0
  47. shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
  48. shotgun/codebase/benchmarks/exporters.py +119 -0
  49. shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
  50. shotgun/codebase/benchmarks/formatters/base.py +34 -0
  51. shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
  52. shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
  53. shotgun/codebase/benchmarks/models.py +129 -0
  54. shotgun/codebase/core/__init__.py +4 -0
  55. shotgun/codebase/core/call_resolution.py +91 -0
  56. shotgun/codebase/core/change_detector.py +11 -6
  57. shotgun/codebase/core/errors.py +159 -0
  58. shotgun/codebase/core/extractors/__init__.py +23 -0
  59. shotgun/codebase/core/extractors/base.py +138 -0
  60. shotgun/codebase/core/extractors/factory.py +63 -0
  61. shotgun/codebase/core/extractors/go/__init__.py +7 -0
  62. shotgun/codebase/core/extractors/go/extractor.py +122 -0
  63. shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
  64. shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
  65. shotgun/codebase/core/extractors/protocol.py +109 -0
  66. shotgun/codebase/core/extractors/python/__init__.py +7 -0
  67. shotgun/codebase/core/extractors/python/extractor.py +141 -0
  68. shotgun/codebase/core/extractors/rust/__init__.py +7 -0
  69. shotgun/codebase/core/extractors/rust/extractor.py +139 -0
  70. shotgun/codebase/core/extractors/types.py +15 -0
  71. shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
  72. shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
  73. shotgun/codebase/core/gitignore.py +252 -0
  74. shotgun/codebase/core/ingestor.py +644 -354
  75. shotgun/codebase/core/kuzu_compat.py +119 -0
  76. shotgun/codebase/core/language_config.py +239 -0
  77. shotgun/codebase/core/manager.py +256 -46
  78. shotgun/codebase/core/metrics_collector.py +310 -0
  79. shotgun/codebase/core/metrics_types.py +347 -0
  80. shotgun/codebase/core/parallel_executor.py +424 -0
  81. shotgun/codebase/core/work_distributor.py +254 -0
  82. shotgun/codebase/core/worker.py +768 -0
  83. shotgun/codebase/indexing_state.py +86 -0
  84. shotgun/codebase/models.py +94 -0
  85. shotgun/codebase/service.py +13 -0
  86. shotgun/exceptions.py +9 -9
  87. shotgun/main.py +3 -16
  88. shotgun/posthog_telemetry.py +165 -24
  89. shotgun/prompts/agents/file_read.j2 +48 -0
  90. shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +19 -47
  91. shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
  92. shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
  93. shotgun/prompts/agents/partials/router_delegation_mode.j2 +21 -22
  94. shotgun/prompts/agents/plan.j2 +14 -0
  95. shotgun/prompts/agents/router.j2 +531 -258
  96. shotgun/prompts/agents/specify.j2 +14 -0
  97. shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
  98. shotgun/prompts/agents/state/system_state.j2 +13 -11
  99. shotgun/prompts/agents/tasks.j2 +14 -0
  100. shotgun/settings.py +49 -10
  101. shotgun/tui/app.py +149 -18
  102. shotgun/tui/commands/__init__.py +9 -1
  103. shotgun/tui/components/attachment_bar.py +87 -0
  104. shotgun/tui/components/prompt_input.py +25 -28
  105. shotgun/tui/components/status_bar.py +14 -7
  106. shotgun/tui/dependencies.py +3 -8
  107. shotgun/tui/protocols.py +18 -0
  108. shotgun/tui/screens/chat/chat.tcss +15 -0
  109. shotgun/tui/screens/chat/chat_screen.py +766 -235
  110. shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
  111. shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
  112. shotgun/tui/screens/chat_screen/command_providers.py +0 -10
  113. shotgun/tui/screens/chat_screen/history/chat_history.py +54 -14
  114. shotgun/tui/screens/chat_screen/history/formatters.py +22 -0
  115. shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
  116. shotgun/tui/screens/database_locked_dialog.py +219 -0
  117. shotgun/tui/screens/database_timeout_dialog.py +158 -0
  118. shotgun/tui/screens/kuzu_error_dialog.py +135 -0
  119. shotgun/tui/screens/model_picker.py +1 -3
  120. shotgun/tui/screens/models.py +11 -0
  121. shotgun/tui/state/processing_state.py +19 -0
  122. shotgun/tui/widgets/widget_coordinator.py +18 -0
  123. shotgun/utils/file_system_utils.py +4 -1
  124. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/METADATA +87 -34
  125. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/RECORD +128 -79
  126. shotgun/cli/export.py +0 -81
  127. shotgun/cli/plan.py +0 -73
  128. shotgun/cli/research.py +0 -93
  129. shotgun/cli/specify.py +0 -70
  130. shotgun/cli/tasks.py +0 -78
  131. shotgun/sentry_telemetry.py +0 -232
  132. shotgun/tui/screens/onboarding.py +0 -584
  133. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/WHEEL +0 -0
  134. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/entry_points.txt +0 -0
  135. {shotgun_sh-0.4.0.dev1.dist-info → shotgun_sh-0.6.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,424 @@
1
+ """Parallel execution framework for file parsing.
2
+
3
+ This module provides the ParallelExecutor class for distributing
4
+ file parsing work across multiple threads using ThreadPoolExecutor.
5
+
6
+ Note: We use threads instead of processes because multiprocessing has
7
+ file descriptor inheritance issues when running from TUI environments
8
+ (Textual opens FDs that cause "bad value(s) in fds_to_keep" errors).
9
+ Threads avoid this issue entirely and still provide concurrency benefits
10
+ for I/O-bound operations like file reading.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import time
16
+ from collections import defaultdict
17
+ from collections.abc import Callable
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from typing import TYPE_CHECKING
20
+
21
+ from shotgun.codebase.core.call_resolution import calculate_callee_confidence
22
+ from shotgun.codebase.core.metrics_types import (
23
+ FileParseResult,
24
+ InheritanceData,
25
+ NodeLabel,
26
+ ParallelExecutionResult,
27
+ RawCallData,
28
+ RelationshipData,
29
+ RelationshipType,
30
+ WorkBatch,
31
+ WorkerMetrics,
32
+ )
33
+ from shotgun.codebase.core.work_distributor import get_worker_count
34
+ from shotgun.codebase.core.worker import process_batch
35
+ from shotgun.logging_config import get_logger
36
+
37
+ if TYPE_CHECKING:
38
+ from shotgun.codebase.core.metrics_collector import MetricsCollector
39
+
40
+ logger = get_logger(__name__)
41
+
42
+ # Default timeout for batch processing (5 minutes)
43
+ DEFAULT_BATCH_TIMEOUT_SECONDS = 300.0
44
+
45
+
46
class ParallelExecutor:
    """Executes file parsing concurrently across multiple threads.

    This class orchestrates concurrent file parsing using ThreadPoolExecutor,
    aggregates results from all workers, and resolves deferred relationships
    (CALLS / INHERITS) that require knowledge of the complete function
    registry built from every parsed file.

    Note: Uses threads instead of processes to avoid file descriptor
    inheritance issues when running from TUI environments.

    Attributes:
        worker_count: Number of worker threads to use
        batch_timeout: Timeout in seconds for each batch
        metrics_collector: Optional collector for recording metrics
    """

    def __init__(
        self,
        worker_count: int | None = None,
        batch_timeout_seconds: float = DEFAULT_BATCH_TIMEOUT_SECONDS,
        metrics_collector: MetricsCollector | None = None,
    ) -> None:
        """Initialize the parallel executor.

        Args:
            worker_count: Number of workers. If None, uses get_worker_count().
            batch_timeout_seconds: Timeout for batch processing.
            metrics_collector: Optional collector for recording metrics.
        """
        self.worker_count = (
            worker_count if worker_count is not None else get_worker_count()
        )
        self.batch_timeout = batch_timeout_seconds
        self.metrics_collector = metrics_collector

        logger.debug(
            f"ParallelExecutor initialized: {self.worker_count} workers, "
            f"{self.batch_timeout}s batch timeout"
        )

    def execute(
        self,
        batches: list[WorkBatch],
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> ParallelExecutionResult:
        """Execute batches in parallel and aggregate results.

        Args:
            batches: List of work batches to process
            progress_callback: Optional callback(completed, total) for progress

        Returns:
            ParallelExecutionResult with all results and resolved relationships
        """
        if not batches:
            logger.debug("No batches to process")
            return ParallelExecutionResult()

        start_time = time.perf_counter()
        total_batches = len(batches)
        all_results: list[FileParseResult] = []
        worker_stats: dict[int, dict[str, int | float]] = defaultdict(
            lambda: {
                "files_processed": 0,
                "nodes_created": 0,
                "relationships_created": 0,
                "duration_seconds": 0.0,
                "error_count": 0,
            }
        )

        logger.info(
            f"Starting threaded execution: {total_batches} batches, "
            f"{self.worker_count} threads"
        )

        # Execute batches using threads (avoids multiprocessing fd issues)
        completed = 0
        with ThreadPoolExecutor(max_workers=self.worker_count) as executor:
            # Submit all batches with worker_id based on submission order.
            # worker_id only attributes stats; it does not pin a batch to a
            # specific thread.
            submissions = []
            for i, batch in enumerate(batches):
                worker_id = i % self.worker_count
                future = executor.submit(process_batch, batch, worker_id)
                submissions.append((future, batch, worker_id))

            # Collect results in submission order.  BUGFIX: the previous
            # implementation iterated as_completed(futures) and then called
            # future.result(timeout=...).  Futures yielded by as_completed()
            # are already finished, so result() returned immediately — the
            # configured batch timeout could never fire and the TimeoutError
            # branch was dead code.  Calling result(timeout=...) on each
            # still-pending future makes the timeout actually enforceable.
            # A timed-out batch's thread keeps running (threads cannot be
            # cancelled); its eventual result is discarded, and executor
            # shutdown still waits for it on context-manager exit.
            for future, batch, worker_id in submissions:
                try:
                    batch_results = future.result(timeout=self.batch_timeout)
                    all_results.extend(batch_results)

                    # Update worker stats
                    for result in batch_results:
                        worker_stats[worker_id]["files_processed"] += 1
                        worker_stats[worker_id]["nodes_created"] += len(result.nodes)
                        worker_stats[worker_id]["relationships_created"] += len(
                            result.relationships
                        )
                        if not result.success:
                            worker_stats[worker_id]["error_count"] += 1

                except TimeoutError:
                    # Mark every task in the timed-out batch as failed so the
                    # caller sees a complete per-file accounting.
                    logger.warning(
                        f"Batch {batch.batch_id} timed out after {self.batch_timeout}s"
                    )
                    for task in batch.tasks:
                        all_results.append(
                            FileParseResult(
                                task=task,
                                success=False,
                                error=f"Timeout after {self.batch_timeout}s",
                            )
                        )
                    worker_stats[worker_id]["error_count"] += 1

                except Exception as e:
                    # A worker raised: record the whole batch as failed rather
                    # than aborting the run.
                    logger.error(f"Batch {batch.batch_id} failed: {e}")
                    for task in batch.tasks:
                        all_results.append(
                            FileParseResult(
                                task=task,
                                success=False,
                                error=str(e),
                            )
                        )
                    worker_stats[worker_id]["error_count"] += 1

                completed += 1
                if progress_callback:
                    progress_callback(completed, total_batches)

        total_duration = time.perf_counter() - start_time
        logger.info(f"Parallel execution completed in {total_duration:.2f}s")

        # Aggregate registries from all results
        function_registry, simple_name_lookup = self._aggregate_registries(all_results)

        logger.info(
            f"Aggregated registry: {len(function_registry)} entries, "
            f"{len(simple_name_lookup)} unique names"
        )

        # Resolve deferred relationships
        resolved_relationships = self._resolve_all_relationships(
            all_results, function_registry, simple_name_lookup
        )

        logger.info(f"Resolved {len(resolved_relationships)} deferred relationships")

        # Calculate final stats
        successful_files = sum(1 for r in all_results if r.success)
        failed_files = sum(1 for r in all_results if not r.success)

        # Build worker metrics.  Per-worker wall time is not tracked
        # individually; approximate it as an even share of the total duration.
        worker_metrics = {}
        for worker_id, stats in worker_stats.items():
            files = int(stats["files_processed"])
            nodes = int(stats["nodes_created"])
            rels = int(stats["relationships_created"])
            errors = int(stats["error_count"])
            duration = total_duration / max(1, self.worker_count)  # Estimate per worker
            worker_metrics[worker_id] = WorkerMetrics(
                worker_id=worker_id,
                files_processed=files,
                nodes_created=nodes,
                relationships_created=rels,
                duration_seconds=duration,
                throughput=files / duration if duration > 0 else 0,
                peak_memory_mb=0.0,  # Would need per-thread memory tracking
                idle_time_seconds=0.0,  # Would need more detailed tracking
                error_count=errors,
            )

        return ParallelExecutionResult(
            results=all_results,
            resolved_relationships=resolved_relationships,
            function_registry=function_registry,
            simple_name_lookup=simple_name_lookup,
            total_files=len(all_results),
            successful_files=successful_files,
            failed_files=failed_files,
            total_duration_seconds=total_duration,
            worker_metrics=worker_metrics,
        )

    def _aggregate_registries(
        self,
        results: list[FileParseResult],
    ) -> tuple[dict[str, str], dict[str, list[str]]]:
        """Merge function_registry and simple_name_lookup from all workers.

        Args:
            results: Results from all workers

        Returns:
            Tuple of (function_registry, simple_name_lookup)
        """
        function_registry: dict[str, str] = {}
        simple_name_lookup: dict[str, list[str]] = defaultdict(list)

        for result in results:
            if not result.success:
                continue

            # Merge function registry
            function_registry.update(result.function_registry_entries)

            # Merge simple name lookup, de-duplicating qualified names while
            # preserving first-seen order.
            for name, qns in result.simple_name_entries.items():
                for qn in qns:
                    if qn not in simple_name_lookup[name]:
                        simple_name_lookup[name].append(qn)

        return function_registry, dict(simple_name_lookup)

    def _resolve_all_relationships(
        self,
        results: list[FileParseResult],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve all deferred relationships.

        Args:
            results: Results containing raw call and inheritance data
            function_registry: Merged registry from all workers
            simple_name_lookup: Merged name lookup from all workers

        Returns:
            List of resolved RelationshipData
        """
        resolved: list[RelationshipData] = []

        # Collect all raw data from successful parses only
        all_raw_calls: list[RawCallData] = []
        all_inheritance: list[InheritanceData] = []

        for result in results:
            if result.success:
                all_raw_calls.extend(result.raw_calls)
                all_inheritance.extend(result.inheritance_data)

        # Resolve call relationships
        call_rels = self._resolve_call_relationships(
            all_raw_calls, function_registry, simple_name_lookup
        )
        resolved.extend(call_rels)

        # Resolve inheritance relationships
        inheritance_rels = self._resolve_inheritance_relationships(
            all_inheritance, function_registry, simple_name_lookup
        )
        resolved.extend(inheritance_rels)

        return resolved

    def _resolve_call_relationships(
        self,
        raw_calls: list[RawCallData],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve raw calls to CALLS relationships.

        Args:
            raw_calls: List of unresolved call data
            function_registry: Complete function registry
            simple_name_lookup: Complete name lookup

        Returns:
            List of resolved CALLS relationships
        """
        resolved: list[RelationshipData] = []

        for call in raw_calls:
            # Get all possible callees; unknown names are silently skipped
            possible_callees = simple_name_lookup.get(call.callee_name, [])
            if not possible_callees:
                continue

            # Calculate confidence scores and pick best match
            scored_callees = []
            for possible_qn in possible_callees:
                score = calculate_callee_confidence(
                    caller_qn=call.caller_qn,
                    callee_qn=possible_qn,
                    module_qn=call.module_qn,
                    object_name=call.object_name,
                    simple_name_lookup=simple_name_lookup,
                )
                scored_callees.append((possible_qn, score))

            # Sort by confidence and use highest match
            scored_callees.sort(key=lambda x: x[1], reverse=True)
            callee_qn, _confidence = scored_callees[0]

            # Get node labels from registry; only emit the edge when both
            # endpoints are known entities.
            caller_type = function_registry.get(call.caller_qn)
            callee_type = function_registry.get(callee_qn)

            if caller_type and callee_type:
                resolved.append(
                    RelationshipData(
                        from_label=caller_type,
                        from_key="qualified_name",
                        from_value=call.caller_qn,
                        rel_type=RelationshipType.CALLS,
                        to_label=callee_type,
                        to_key="qualified_name",
                        to_value=callee_qn,
                    )
                )

        return resolved

    def _resolve_inheritance_relationships(
        self,
        inheritance_data: list[InheritanceData],
        function_registry: dict[str, str],
        simple_name_lookup: dict[str, list[str]],
    ) -> list[RelationshipData]:
        """Resolve raw inheritance to INHERITS relationships.

        Args:
            inheritance_data: List of unresolved inheritance data
            function_registry: Complete function registry
            simple_name_lookup: Complete name lookup

        Returns:
            List of resolved INHERITS relationships
        """
        resolved: list[RelationshipData] = []

        for data in inheritance_data:
            child_qn = data.child_class_qn

            for parent_name in data.parent_simple_names:
                # Check if parent exists directly in registry
                if parent_name in function_registry:
                    resolved.append(
                        RelationshipData(
                            from_label=NodeLabel.CLASS,
                            from_key="qualified_name",
                            from_value=child_qn,
                            rel_type=RelationshipType.INHERITS,
                            to_label=NodeLabel.CLASS,
                            to_key="qualified_name",
                            to_value=parent_name,
                        )
                    )
                else:
                    # Try simple name lookup on the last path component
                    parent_simple = parent_name.split(".")[-1]
                    possible_parents = simple_name_lookup.get(parent_simple, [])

                    # Filter to only classes
                    class_parents = [
                        p
                        for p in possible_parents
                        if function_registry.get(p) == NodeLabel.CLASS
                    ]

                    # Only emit an edge on an unambiguous match; ambiguous
                    # parents are dropped rather than guessed.
                    if len(class_parents) == 1:
                        resolved.append(
                            RelationshipData(
                                from_label=NodeLabel.CLASS,
                                from_key="qualified_name",
                                from_value=child_qn,
                                rel_type=RelationshipType.INHERITS,
                                to_label=NodeLabel.CLASS,
                                to_key="qualified_name",
                                to_value=class_parents[0],
                            )
                        )

        return resolved
@@ -0,0 +1,254 @@
1
+ """Work distribution system for parallel file parsing.
2
+
3
+ This module provides infrastructure for partitioning file parsing tasks
4
+ across workers with size-balanced distribution for optimal load balancing.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import multiprocessing
10
+
11
+ from shotgun.codebase.core.metrics_types import (
12
+ DistributionStats,
13
+ FileInfo,
14
+ FileParseTask,
15
+ WorkBatch,
16
+ )
17
+ from shotgun.logging_config import get_logger
18
+ from shotgun.settings import settings
19
+
20
+ logger = get_logger(__name__)
21
+
22
+ # Default values
23
+ DEFAULT_BATCH_SIZE = 20
24
+
25
+ # Re-export types for convenience
26
+ __all__ = [
27
+ "DEFAULT_BATCH_SIZE",
28
+ "DistributionStats",
29
+ "FileInfo",
30
+ "FileParseTask",
31
+ "WorkBatch",
32
+ "WorkDistributor",
33
+ "get_batch_size",
34
+ "get_worker_count",
35
+ ]
36
+
37
+
38
def get_worker_count() -> int:
    """Determine optimal worker count for parallel execution.

    Uses settings override if set, otherwise uses adaptive
    defaults based on CPU count:
    - For 4+ cores: max(2, cpu_count - 2)
    - For 1-3 cores: max(1, cpu_count - 1)

    Returns:
        Number of workers to use for parallel execution.
    """
    # Explicit configuration wins over the adaptive heuristic.
    override = settings.indexing.index_workers
    if override is not None:
        chosen = max(1, override)
        logger.debug(f"Worker count from SHOTGUN_INDEX_WORKERS: {chosen}")
        return chosen

    # Leave headroom for the rest of the process: two spare cores on larger
    # machines, one spare core on small ones.
    cores = multiprocessing.cpu_count()
    chosen = max(2, cores - 2) if cores >= 4 else max(1, cores - 1)
    logger.debug(f"Worker count (adaptive): {chosen} (CPU count: {cores})")
    return chosen
63
+
64
+
65
def get_batch_size() -> int:
    """Get the batch size for grouping file parsing tasks.

    Checks settings for override, otherwise returns the default of 20 files per batch.

    Returns:
        Number of files to include in each work batch.
    """
    override = settings.indexing.index_batch_size
    if override is None:
        return DEFAULT_BATCH_SIZE

    # Clamp configured values to at least one file per batch.
    size = max(1, override)
    logger.debug(f"Batch size from SHOTGUN_INDEX_BATCH_SIZE: {size}")
    return size
79
+
80
+
81
class WorkDistributor:
    """Distributes file parsing work across workers using size-balanced partitioning.

    Uses a bin-packing algorithm to ensure even work distribution:
    1. Sort files by size (descending)
    2. Assign each file to worker with least total work
    3. Group into batches for reduced queue overhead

    This approach ensures large files don't bottleneck single workers
    and workers finish at approximately the same time.
    """

    def __init__(
        self, worker_count: int | None = None, batch_size: int | None = None
    ) -> None:
        """Initialize the work distributor.

        Args:
            worker_count: Number of workers. If None, uses get_worker_count().
            batch_size: Files per batch. If None, uses get_batch_size().
        """
        if worker_count is None:
            worker_count = get_worker_count()
        if batch_size is None:
            batch_size = get_batch_size()

        # Clamp both knobs to at least 1 regardless of where they came from.
        self.worker_count = max(1, worker_count)
        self.batch_size = max(1, batch_size)

        logger.debug(
            f"WorkDistributor initialized: {self.worker_count} workers, "
            f"batch size {self.batch_size}"
        )

    def _distribute_files(
        self, files: list[FileInfo]
    ) -> list[tuple[int, list[FileInfo]]]:
        """Distribute files across workers using size-balanced bin-packing.

        Args:
            files: List of files to distribute.

        Returns:
            List of (total_bytes, file_list) tuples, one per worker.
        """
        # One (accumulated_bytes, assigned_files) bucket per worker.
        buckets: list[tuple[int, list[FileInfo]]] = [
            (0, []) for _ in range(self.worker_count)
        ]

        # Greedy bin-packing: walk files largest-first and always drop the
        # next file into the currently lightest bucket.
        for info in sorted(files, key=lambda f: f.file_size_bytes, reverse=True):
            lightest = min(range(self.worker_count), key=lambda idx: buckets[idx][0])
            accumulated, assigned = buckets[lightest]
            assigned.append(info)
            buckets[lightest] = (accumulated + info.file_size_bytes, assigned)

        return buckets

    def create_batches(self, files: list[FileInfo]) -> list[WorkBatch]:
        """Partition files into balanced batches for parallel processing.

        Uses size-balanced bin-packing to ensure even work distribution:
        1. Sort files by size (descending)
        2. Assign each file to worker with least total work
        3. Group into batches for reduced queue overhead

        Args:
            files: List of files to distribute across workers.

        Returns:
            List of WorkBatch objects containing FileParseTask items,
            balanced across workers and grouped into batches.
        """
        if not files:
            logger.debug("create_batches called with empty file list")
            return []

        logger.debug(
            f"Distributing {len(files)} files across {self.worker_count} workers"
        )

        buckets = self._distribute_files(files)

        # Log distribution statistics per worker bucket.
        for worker_id, (total_bytes, worker_files) in enumerate(buckets):
            logger.debug(
                f"Worker {worker_id}: {len(worker_files)} files, "
                f"{total_bytes / 1024:.1f} KB total"
            )

        # Slice each worker's assignment into fixed-size batches with
        # globally unique, sequential batch ids.
        batches: list[WorkBatch] = []
        next_id = 0
        for _, worker_files in buckets:
            for start in range(0, len(worker_files), self.batch_size):
                chunk = worker_files[start : start + self.batch_size]
                if not chunk:
                    continue
                batches.append(
                    WorkBatch(
                        batch_id=next_id,
                        tasks=[self._file_to_task(f) for f in chunk],
                        estimated_duration_seconds=None,
                    )
                )
                next_id += 1

        logger.debug(f"Created {len(batches)} batches from {len(files)} files")
        return batches

    def _file_to_task(self, file_info: FileInfo) -> FileParseTask:
        """Convert FileInfo to FileParseTask for worker consumption.

        Args:
            file_info: File information with size data.

        Returns:
            FileParseTask suitable for sending to worker processes.
        """
        return FileParseTask(
            file_path=file_info.file_path,
            relative_path=file_info.relative_path,
            language=file_info.language,
            module_qn=file_info.module_qn,
            container_qn=file_info.container_qn,
        )

    def get_distribution_stats(self, files: list[FileInfo]) -> DistributionStats:
        """Get statistics about how files would be distributed.

        Useful for debugging and verification without creating actual batches.

        Args:
            files: List of files to analyze.

        Returns:
            DistributionStats with distribution information.
        """
        if not files:
            # Degenerate case: report empty per-worker columns.
            return DistributionStats(
                total_files=0,
                total_bytes=0,
                worker_count=self.worker_count,
                batch_size=self.batch_size,
                files_per_worker=[0] * self.worker_count,
                bytes_per_worker=[0] * self.worker_count,
            )

        # Reuse the real distribution logic so stats match actual batching.
        buckets = self._distribute_files(files)

        return DistributionStats(
            total_files=len(files),
            total_bytes=sum(f.file_size_bytes for f in files),
            worker_count=self.worker_count,
            batch_size=self.batch_size,
            files_per_worker=[len(assigned) for _, assigned in buckets],
            bytes_per_worker=[accumulated for accumulated, _ in buckets],
        )