caption-flow 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ import uuid
  from dataclasses import dataclass, asdict
  from datetime import datetime
  from pathlib import Path
- from typing import Dict, Set, Optional, Any, List, Deque
+ from typing import Dict, Set, Optional, Any, List, Deque, Tuple
  from collections import deque, defaultdict
  import threading
  from queue import Queue, Empty
@@ -97,27 +97,9 @@ class ChunkManager:
  self.lock = threading.Lock()
  self.tracker = tracker # Reference to chunk tracker

- def create_chunks_from_shard(
- self, shard_url: str, shard_name: str, total_items: int
- ) -> List[ShardChunk]:
- """Create chunks from a shard."""
- chunks = []
-
- for start_idx in range(0, total_items, self.chunk_size):
- chunk = ShardChunk.create(
- shard_url=shard_url,
- shard_name=shard_name,
- start_index=start_idx,
- chunk_size=min(self.chunk_size, total_items - start_idx),
- )
-
- with self.lock:
- self.chunks[chunk.chunk_id] = chunk
- self.pending_chunks.append(chunk.chunk_id)
-
- chunks.append(chunk)
-
- return chunks
+ # NEW: Track assigned ranges to prevent double allocation
+ # Format: {chunk_id: {(start, end): worker_id}}
+ self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

  def get_chunks_for_worker(
  self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
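
The heart of the 0.2.0 → 0.2.2 change is this new assigned_ranges map: for each chunk it records which (start, end) range is currently held by which worker, and every hand-out, completion, failure, and disconnect path updates it under the manager's lock. A minimal standalone sketch of that bookkeeping, using illustrative names rather than the package's own API, looks like this:

    import threading
    from collections import defaultdict
    from typing import Dict, Tuple

    class RangeLedger:
        """Toy version of the assigned_ranges bookkeeping introduced in this release."""

        def __init__(self) -> None:
            self.lock = threading.Lock()
            # {chunk_id: {(start, end): worker_id}}
            self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

        def claim(self, chunk_id: str, start: int, end: int, worker_id: str) -> bool:
            """Record the range for worker_id unless another worker already holds it."""
            with self.lock:
                owner = self.assigned_ranges[chunk_id].get((start, end))
                if owner is not None and owner != worker_id:
                    return False  # refused: this would double-allocate the range
                self.assigned_ranges[chunk_id][(start, end)] = worker_id
                return True

        def release_worker(self, chunk_id: str, worker_id: str) -> int:
            """Drop every range in chunk_id held by worker_id; return how many were dropped."""
            with self.lock:
                held = [k for k, w in self.assigned_ranges[chunk_id].items() if w == worker_id]
                for key in held:
                    del self.assigned_ranges[chunk_id][key]
                return len(held)

    ledger = RangeLedger()
    assert ledger.claim("data-0000:chunk_0", 0, 999, "worker-a")
    assert not ledger.claim("data-0000:chunk_0", 0, 999, "worker-b")  # second worker is refused
    assert ledger.release_worker("data-0000:chunk_0", "worker-a") == 1

The real ChunkManager keys the outer dict by chunk_id in exactly this shape and clears entries in complete_chunk, fail_chunk, release_worker_chunks, and the new mark_ranges_processed, as the hunks below show.
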
@@ -127,7 +109,6 @@ class ChunkManager:

  with self.lock:
  # FIRST PRIORITY: Check if this worker already has assigned chunks
- # Workers should complete their current chunks before getting new ones
  if worker_id in self.assigned_chunks:
  existing_chunk_ids = list(self.assigned_chunks[worker_id])
  for chunk_id in existing_chunk_ids:
@@ -142,12 +123,29 @@ class ChunkManager:
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info and chunk_info["unprocessed_ranges"]:
- assigned.append(
- {
- "chunk": chunk,
- "unprocessed_ranges": chunk_info["unprocessed_ranges"],
- }
- )
+ # Filter out ranges that are assigned to other workers
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker != worker_id:
+ # Skip this range - it's assigned to another worker
+ logger.warning(
+ f"Skipping range {start}-{end} in chunk {chunk_id} "
+ f"(assigned to {assigned_worker}, not {worker_id})"
+ )
+ continue
+ # else: this worker already owns this range, include it
+ clean_ranges.append((start, end))
+
+ if clean_ranges:
+ assigned.append(
+ {
+ "chunk": chunk,
+ "unprocessed_ranges": clean_ranges,
+ }
+ )
  else:
  # No tracker, assume chunk needs processing
  assigned.append(
@@ -158,7 +156,6 @@ class ChunkManager:
  )

  # SECOND PRIORITY: Get new pending chunks
- # Only if worker doesn't have enough chunks already
  while len(assigned) < count and self.pending_chunks:
  chunk_id = self.pending_chunks.popleft()
  chunk = self.chunks.get(chunk_id)
@@ -166,7 +163,7 @@ class ChunkManager:
  if not chunk:
  continue

- # Verify chunk is truly pending (defensive check)
+ # Verify chunk is truly pending
  if chunk.status != "pending" or chunk.assigned_to is not None:
  logger.warning(
  f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
@@ -179,15 +176,48 @@ class ChunkManager:
  chunk.assigned_at = datetime.utcnow()
  self.assigned_chunks[worker_id].add(chunk_id)

- # Get unprocessed ranges
+ # Get unprocessed ranges and filter out any that are somehow already assigned
  unprocessed_ranges = [(0, chunk.chunk_size - 1)] # Default
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info:
- unprocessed_ranges = chunk_info["unprocessed_ranges"]
+ # Filter out any ranges that are already assigned (shouldn't happen for new chunks)
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key not in self.assigned_ranges[chunk_id]:
+ clean_ranges.append((start, end))
+ else:
+ logger.error(
+ f"Range {start}-{end} in newly assigned chunk {chunk_id} "
+ f"is already assigned to {self.assigned_ranges[chunk_id][range_key]}!"
+ )
+ unprocessed_ranges = clean_ranges if clean_ranges else []
+
  tracker.mark_assigned(chunk_id, worker_id)

- assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+ if unprocessed_ranges:
+ assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+ # Track assigned ranges and verify no double allocation
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ for start, end in info["unprocessed_ranges"]:
+ range_key = (start, end)
+
+ # Check if this range is already assigned
+ if range_key in self.assigned_ranges[chunk_id]:
+ existing_worker = self.assigned_ranges[chunk_id][range_key]
+ if existing_worker != worker_id:
+ # This should never happen - raise assertion
+ raise AssertionError(
+ f"CRITICAL: Attempting to assign range {start}-{end} in chunk {chunk_id} "
+ f"to worker {worker_id}, but it's already assigned to {existing_worker}! "
+ f"This would cause duplicate processing."
+ )
+
+ # Track this assignment
+ self.assigned_ranges[chunk_id][range_key] = worker_id

  # Log what we're assigning
  if assigned:
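
Taken together, the two filtering passes above implement one rule: a range is handed to a worker only if it is unassigned or already owned by that worker, and anything that slips through still trips the AssertionError before it can be given out twice. A self-contained distillation of that rule (the helper name is ours, not the package's):

    from typing import Dict, List, Tuple

    def filter_ranges(
        unprocessed: List[Tuple[int, int]],
        assigned: Dict[Tuple[int, int], str],
        worker_id: str,
    ) -> List[Tuple[int, int]]:
        """Keep only the ranges that are free or already owned by worker_id."""
        clean = []
        for start, end in unprocessed:
            owner = assigned.get((start, end))
            if owner is None or owner == worker_id:
                clean.append((start, end))
        return clean

    # Worker B asks for a chunk whose first range is still held by worker A.
    assigned = {(0, 499): "worker-a"}
    print(filter_ranges([(0, 499), (500, 999)], assigned, "worker-b"))  # [(500, 999)]
    print(filter_ranges([(0, 499), (500, 999)], assigned, "worker-a"))  # both ranges come back
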
@@ -199,6 +229,12 @@ class ChunkManager:
  )
  logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")

+ # Detailed range logging for debugging
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ ranges_str = ", ".join([f"{s}-{e}" for s, e in info["unprocessed_ranges"]])
+ logger.debug(f" Chunk {chunk_id} ranges: {ranges_str}")
+
  return assigned

  def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
@@ -210,6 +246,16 @@ class ChunkManager:
  chunk.status = "completed"
  chunk.completed_at = datetime.utcnow()
  self.assigned_chunks[worker_id].discard(chunk_id)
+
+ # Clear assigned ranges for this chunk
+ if chunk_id in self.assigned_ranges:
+ # Log what ranges we're clearing
+ ranges_to_clear = list(self.assigned_ranges[chunk_id].keys())
+ logger.debug(
+ f"Clearing {len(ranges_to_clear)} assigned ranges for completed chunk {chunk_id}"
+ )
+ del self.assigned_ranges[chunk_id]
+
  return True
  return False

@@ -224,6 +270,20 @@ class ChunkManager:
  chunk.assigned_at = None
  self.assigned_chunks[worker_id].discard(chunk_id)
  self.pending_chunks.append(chunk_id)
+
+ # Clear assigned ranges for this chunk/worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[chunk_id].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared {len(ranges_to_clear)} assigned ranges for failed chunk {chunk_id}"
+ )
+
  return True
  return False

@@ -240,18 +300,62 @@ class ChunkManager:
  chunk.assigned_at = None
  self.pending_chunks.append(chunk_id)

+ # Clear assigned ranges for this worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[
+ chunk_id
+ ].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+
+ if ranges_to_clear:
+ logger.info(
+ f"Released {len(ranges_to_clear)} ranges from chunk {chunk_id} "
+ f"previously assigned to disconnected worker {worker_id}"
+ )
+
  if worker_id in self.assigned_chunks:
  del self.assigned_chunks[worker_id]

+ def mark_ranges_processed(
+ self, chunk_id: str, processed_ranges: List[Tuple[int, int]], worker_id: str
+ ):
+ """Remove ranges from assignment tracking once they're processed."""
+ with self.lock:
+ if chunk_id in self.assigned_ranges:
+ for start, end in processed_ranges:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker == worker_id:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared assignment of range {start}-{end} in chunk {chunk_id} "
+ f"after processing by {worker_id}"
+ )
+ else:
+ logger.warning(
+ f"Worker {worker_id} claims to have processed range {start}-{end} "
+ f"in chunk {chunk_id}, but it was assigned to {assigned_worker}"
+ )
+
  def get_stats(self) -> Dict[str, int]:
  """Get chunk statistics."""
  with self.lock:
+ # Count total assigned ranges
+ total_assigned_ranges = sum(len(ranges) for ranges in self.assigned_ranges.values())
+
  stats = {
  "total": len(self.chunks),
  "pending": len(self.pending_chunks),
  "assigned": sum(len(chunks) for chunks in self.assigned_chunks.values()),
  "completed": sum(1 for c in self.chunks.values() if c.status == "completed"),
  "failed": sum(1 for c in self.chunks.values() if c.status == "failed"),
+ "assigned_ranges": total_assigned_ranges,
  }
  return stats

@@ -363,6 +467,7 @@ class Orchestrator:
  self.ssl_context = self._setup_ssl()

  # Statistics
+ self.is_generating_stats = False
  self.stats = {
  "total_chunks": 0,
  "completed_chunks": 0,
@@ -490,13 +595,15 @@ class Orchestrator:
  with self.chunk_manager.lock:
  for chunk_state in shard_info["chunks"]:
  if chunk_state.status in ["pending", "failed", "assigned"]:
- # ChunkState already has shard_url stored
+ # For assigned chunks, reset them to pending since workers don't exist
  chunk = ShardChunk(
  chunk_id=chunk_state.chunk_id,
  shard_url=chunk_state.shard_url,
  shard_name=chunk_state.shard_name,
  start_index=chunk_state.start_index,
  chunk_size=chunk_state.chunk_size,
+ status="pending", # Reset to pending
+ assigned_to=None, # Clear assignment
  )
  self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
  self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
@@ -1409,28 +1516,36 @@ class Orchestrator:
  finally:
  del self.data_workers[worker_id]

- async def _handle_monitor(self, websocket: WebSocketServerProtocol):
- """Handle monitor connection."""
- self.monitors.add(websocket)
- logger.info("Monitor connected")
-
+ async def _send_leaderboard_to_monitor(self, websocket: WebSocketServerProtocol):
+ """Send leaderboard data to a specific monitor."""
+ total_start = time.time()
  try:
- # Send initial stats
- await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
-
- # Send chunk stats
- chunk_stats = self.chunk_manager.get_stats()
- await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+ if websocket not in self.monitors:
+ return

- # Send contributor leaderboard with active worker counts
+ # Get contributors asynchronously
+ contributors_start = time.time()
  contributors = await self.storage.get_top_contributors(10)
+ logger.debug(
+ f"Contributors retrieved in {(time.time() - contributors_start)*1000:.1f}ms"
+ )

- # Enhance contributor data with active worker counts
- enhanced_contributors = []
- worker_counts = (
- self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ # Get worker counts in thread pool
+ worker_counts_start = time.time()
+ loop = asyncio.get_event_loop()
+ worker_counts = await loop.run_in_executor(
+ None,
+ lambda: (
+ self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ ),
+ )
+ logger.debug(
+ f"Worker counts retrieved in {(time.time() - worker_counts_start)*1000:.1f}ms"
  )

+ # Build enhanced contributors list
+ build_start = time.time()
+ enhanced_contributors = []
  for contributor in contributors:
  contrib_dict = {
  "contributor_id": contributor.contributor_id,
@@ -1442,40 +1557,157 @@ class Orchestrator:
  ),
  }
  enhanced_contributors.append(contrib_dict)
+ logger.debug(f"Enhanced contributors built in {(time.time() - build_start)*1000:.1f}ms")

- await websocket.send(
- safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+ # Cache for future monitors
+ self._cached_leaderboard = enhanced_contributors
+
+ # Send if still connected
+ if websocket in self.monitors:
+ send_start = time.time()
+ await websocket.send(
+ safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+ )
+ logger.debug(
+ f"Leaderboard sent to monitor in {(time.time() - send_start)*1000:.1f}ms"
+ )
+
+ logger.debug(
+ f"Leaderboard send to monitor completed in {(time.time() - total_start)*1000:.1f}ms"
  )

- # Keep connection alive
- async for _ in websocket:
- pass
+ except websockets.exceptions.ConnectionClosed:
+ logger.debug("Monitor disconnected during leaderboard send")
+ except Exception as e:
+ logger.error(f"Error sending leaderboard to monitor: {e}")
+
+ async def _send_initial_monitor_data(self, websocket: WebSocketServerProtocol):
+ """Send initial data to monitor in a separate task to avoid blocking."""
+ total_start = time.time()
+ try:
+ # Check if websocket is still in monitors set
+ if websocket not in self.monitors:
+ logger.debug("Monitor disconnected before initial data send")
+ return
+
+ # Send current stats (already in memory)
+ stats_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
+ logger.debug(f"Monitor stats sent in {(time.time() - stats_start)*1000:.1f}ms")
+
+ # Get chunk stats asynchronously
+ chunk_stats_start = time.time()
+ loop = asyncio.get_event_loop()
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+ logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")
+
+ if websocket not in self.monitors:
+ return
+
+ chunk_send_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+ logger.debug(f"Chunk stats sent in {(time.time() - chunk_send_start)*1000:.1f}ms")
+
+ # For leaderboard, check if we have a cached version first
+ if hasattr(self, "_cached_leaderboard") and self._cached_leaderboard:
+ # Use cached leaderboard if available
+ cache_send_start = time.time()
+ await websocket.send(
+ safe_json_dumps({"type": "leaderboard", "data": self._cached_leaderboard})
+ )
+ logger.debug(
+ f"Cached leaderboard sent in {(time.time() - cache_send_start)*1000:.1f}ms"
+ )
+ else:
+ # Schedule leaderboard update separately
+ leaderboard_task_start = time.time()
+ asyncio.create_task(self._send_leaderboard_to_monitor(websocket))
+ logger.debug(
+ f"Leaderboard task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+ )
+
+ logger.debug(
+ f"Monitor initial data send completed in {(time.time() - total_start)*1000:.1f}ms"
+ )
+
+ except websockets.exceptions.ConnectionClosed:
+ logger.debug("Monitor disconnected during initial data send")
+ except Exception as e:
+ logger.error(f"Error sending initial monitor data: {e}")
+
+ async def _handle_monitor(self, websocket: WebSocketServerProtocol):
+ """Handle monitor connection - truly non-blocking version."""
+ monitor_start = time.time()
+ self.monitors.add(websocket)
+ logger.info(f"Monitor connected (total monitors: {len(self.monitors)})")
+
+ try:
+ # Send welcome message immediately
+ welcome_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "welcome", "role": "monitor"}))
+ logger.debug(f"Monitor welcome sent in {(time.time() - welcome_start)*1000:.1f}ms")
+
+ # Schedule initial data send as a separate task to avoid blocking
+ task_create_start = time.time()
+ asyncio.create_task(self._send_initial_monitor_data(websocket))
+ logger.debug(
+ f"Monitor initial data task created in {(time.time() - task_create_start)*1000:.1f}ms"
+ )
+
+ # Just keep the connection alive - no blocking work here
+ try:
+ async for message in websocket:
+ # Handle any incoming messages from monitor if needed
+ # For now, just ignore them
+ pass
+ except websockets.exceptions.ConnectionClosed:
+ pass # Normal disconnection

  except websockets.exceptions.ConnectionClosed:
  logger.info("Monitor disconnected")
+ except Exception as e:
+ logger.error(f"Error in monitor handler: {e}")
  finally:
  self.monitors.discard(websocket)
+ logger.debug(f"Monitor handler completed in {(time.time() - monitor_start)*1000:.1f}ms")

  async def _broadcast_stats(self):
- """Broadcast statistics to all monitors - enhanced for multi-stage."""
+ """Broadcast statistics to all monitors - truly non-blocking version."""
  if not self.monitors:
  return
+ if self.is_generating_stats:
+ return # Already generating stats, skip this call
+ self.is_generating_stats = True
+ total_start = time.time()

- # Get storage stats
+ # Prepare all the data first
+ data_prep_start = time.time()
+ loop = asyncio.get_event_loop()
+
+ # Get storage stats (already async)
+ storage_stats_start = time.time()
  storage_stats = await self.storage.get_storage_stats()
+ logger.debug(f"Storage stats retrieved in {(time.time() - storage_stats_start)*1000:.1f}ms")
+
+ caption_stats_start = time.time()
  caption_stats = await self.storage.get_caption_stats()
+ logger.debug(f"Caption stats retrieved in {(time.time() - caption_stats_start)*1000:.1f}ms")

- # Include chunk stats
- chunk_stats = self.chunk_manager.get_stats()
- self.stats.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+ # Get chunk stats in thread pool
+ chunk_stats_start = time.time()
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+ logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")

- # Merge storage stats
- self.stats.update(storage_stats)
- self.stats["field_breakdown"] = caption_stats.get("field_stats", {})
- self.stats["output_fields_list"] = caption_stats.get("output_fields", [])
+ # Build stats dict
+ build_stats_start = time.time()
+ stats_update = self.stats.copy()
+ stats_update.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+ stats_update.update(storage_stats)
+ stats_update["field_breakdown"] = caption_stats.get("field_stats", {})
+ stats_update["output_fields_list"] = caption_stats.get("output_fields", [])

  # Add rate information
- self.stats.update(
+ stats_update.update(
  {
  "current_rate": self.rate_tracker["current_rate"],
  "average_rate": self.rate_tracker["average_rate"],
@@ -1483,41 +1715,106 @@ class Orchestrator:
  }
  )

- # Add vLLM info - now includes stage count
- self.stats["vllm_model"] = self.vllm_config.get("model", "unknown")
- self.stats["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)
+ # Add vLLM info
+ stats_update["vllm_model"] = self.vllm_config.get("model", "unknown")
+ stats_update["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)

- # NEW: Add stage information
+ # Add stage information
  stages = self.vllm_config.get("stages", [])
  if stages:
- self.stats["stage_count"] = len(stages)
- self.stats["stage_names"] = [s.get("name", "unnamed") for s in stages]
+ stats_update["stage_count"] = len(stages)
+ stats_update["stage_names"] = [s.get("name", "unnamed") for s in stages]
  else:
- self.stats["stage_count"] = 1 # Backward compatibility
- self.stats["stage_names"] = ["default"]
+ stats_update["stage_count"] = 1
+ stats_update["stage_names"] = ["default"]

+ # Get field stats
+ field_stats_start = time.time()
  field_stats = await self.storage.get_output_field_stats()
- self.stats["output_fields"] = field_stats
+ stats_update["output_fields"] = field_stats
+ logger.debug(f"Field stats retrieved in {(time.time() - field_stats_start)*1000:.1f}ms")

- message = safe_json_dumps({"type": "stats", "data": self.stats})
+ # Update our internal stats
+ self.stats = stats_update
+ logger.debug(f"Stats prepared in {(time.time() - build_stats_start)*1000:.1f}ms")

- # Send to all monitors
- disconnected = set()
- _monitors = self.monitors.copy()
- for monitor in _monitors:
+ logger.debug(f"Total data preparation took {(time.time() - data_prep_start)*1000:.1f}ms")
+
+ # Create message once
+ message_create_start = time.time()
+ stats_message = safe_json_dumps({"type": "stats", "data": self.stats})
+ logger.debug(f"Stats message created in {(time.time() - message_create_start)*1000:.1f}ms")
+
+ # Send to all monitors asynchronously in parallel
+ send_start = time.time()
+
+ async def send_to_monitor(monitor):
  try:
- await monitor.send(message)
+ await monitor.send(stats_message)
  except websockets.exceptions.ConnectionClosed:
- disconnected.add(monitor)
+ return monitor # Return for removal
+ except Exception as e:
+ logger.debug(f"Error sending stats to monitor: {e}")
+ return monitor # Return for removal
+ return None
+
+ # Send to all monitors in parallel
+ monitors_copy = self.monitors.copy()
+ results = await asyncio.gather(
+ *[send_to_monitor(m) for m in monitors_copy], return_exceptions=True
+ )
+
+ # Remove disconnected monitors
+ disconnected = {
+ m
+ for m, r in zip(monitors_copy, results)
+ if r is not None and not isinstance(r, Exception)
+ }
+ self.monitors -= disconnected
+
+ logger.debug(
+ f"Stats sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+ )
+
+ # Send leaderboard update in a separate task to avoid blocking
+ leaderboard_task_start = time.time()
+ asyncio.create_task(self._broadcast_leaderboard())
+ self.is_generating_stats = False
+ logger.debug(
+ f"Leaderboard broadcast task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+ )
+ logger.debug(f"Stats broadcast completed in {(time.time() - total_start)*1000:.1f}ms")

- # send updated leaderboard
+ async def _broadcast_leaderboard(self):
+ """Send leaderboard updates to monitors - separate from stats to avoid blocking."""
+ if not self.monitors:
+ return
+
+ total_start = time.time()
  try:
+ # Get contributors
+ contributors_start = time.time()
  contributors = await self.storage.get_top_contributors(10)
- enhanced_contributors = []
- worker_counts = (
- self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ logger.debug(
+ f"Contributors retrieved for broadcast in {(time.time() - contributors_start)*1000:.1f}ms"
+ )
+
+ # Get worker counts
+ worker_counts_start = time.time()
+ loop = asyncio.get_event_loop()
+ worker_counts = await loop.run_in_executor(
+ None,
+ lambda: (
+ self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ ),
+ )
+ logger.debug(
+ f"Worker counts retrieved for broadcast in {(time.time() - worker_counts_start)*1000:.1f}ms"
  )

+ # Build enhanced contributors list
+ build_start = time.time()
+ enhanced_contributors = []
  for contributor in contributors:
  contrib_dict = {
  "contributor_id": contributor.contributor_id,
@@ -1529,26 +1826,64 @@ class Orchestrator:
  ),
  }
  enhanced_contributors.append(contrib_dict)
+ logger.debug(
+ f"Enhanced contributors built for broadcast in {(time.time() - build_start)*1000:.1f}ms"
+ )

+ # Cache it
+ self._cached_leaderboard = enhanced_contributors
+
+ # Create message once
+ message_create_start = time.time()
  leaderboard_message = safe_json_dumps(
  {"type": "leaderboard", "data": enhanced_contributors}
  )
+ logger.debug(
+ f"Leaderboard message created in {(time.time() - message_create_start)*1000:.1f}ms"
+ )
+
+ # Send to all monitors in parallel
+ send_start = time.time()

- # Send to all monitors
- disconnected = set()
- for monitor in self.monitors.copy():
+ async def send_leaderboard(monitor):
  try:
  await monitor.send(leaderboard_message)
- except websockets.exceptions.ConnectionClosed:
- disconnected.add(monitor)
+ except:
+ return monitor # Mark for removal
+ return None
+
+ monitors_copy = self.monitors.copy()
+ results = await asyncio.gather(
+ *[send_leaderboard(m) for m in monitors_copy], return_exceptions=True
+ )

+ # Remove disconnected
+ disconnected = {
+ m
+ for m, r in zip(monitors_copy, results)
+ if r is not None and not isinstance(r, Exception)
+ }
  self.monitors -= disconnected

- except Exception as e:
- logger.error(f"Error sending leaderboard update: {e}")
+ logger.debug(
+ f"Leaderboard sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+ )
+ logger.debug(
+ f"Leaderboard broadcast completed in {(time.time() - total_start)*1000:.1f}ms"
+ )

- # Clean up disconnected monitors
- self.monitors -= disconnected
+ except Exception as e:
+ logger.error(f"Error broadcasting leaderboard: {e}")
+
+ def _get_queue_stats(self) -> Dict[str, int]:
+ """Get queue statistics - synchronous helper for thread pool."""
+ with self.chunk_manager.lock:
+ return {
+ "pending_chunks": len(self.chunk_manager.pending_chunks),
+ "assigned_chunks": sum(
+ len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
+ ),
+ }

  async def _flush_processed_items(self):
  """Flush batched processed items to chunk tracker."""
@@ -1582,21 +1917,37 @@ class Orchestrator:
  # Don't forget the last range
  ranges.append((start, end))

- # Mark ranges as processed (mark_items_processed expects absolute indices)
+ # Mark ranges as processed
  for start_idx, end_idx in ranges:
  self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

+ with self.chunk_manager.lock:
+ if chunk_id in self.chunk_manager.assigned_ranges:
+ for start_idx, end_idx in ranges:
+ # Clear any assignments in this range
+ to_remove = []
+ for range_start, range_end in self.chunk_manager.assigned_ranges[
+ chunk_id
+ ]:
+ if range_start >= start_idx and range_end <= end_idx:
+ to_remove.append((range_start, range_end))
+
+ for range_key in to_remove:
+ del self.chunk_manager.assigned_ranges[chunk_id][range_key]
+
  # Clear pending items
  self.pending_processed_items.clear()
  self.last_item_batch_flush = time.time()

  def get_workers_by_user_stats(self) -> Dict[str, Any]:
- """Get statistics about workers grouped by user/token."""
+ """Get statistics about workers grouped by user/token - thread-safe version."""
  if not hasattr(self, "workers_by_user"):
  return {}

+ # Create a copy to avoid issues with concurrent modification
  stats = {}
- for user, worker_ids in self.workers_by_user.items():
+ workers_snapshot = dict(self.workers_by_user)
+ for user, worker_ids in workers_snapshot.items():
  stats[user] = {"worker_count": len(worker_ids), "worker_ids": list(worker_ids)}
  return stats

@@ -1621,21 +1972,63 @@ class Orchestrator:
  async def _heartbeat_loop(self):
  """Send periodic heartbeats to maintain connections."""
  while True:
- await asyncio.sleep(30)
+ try:
+ await asyncio.sleep(30)

- # Ping workers
- disconnected = []
- for worker_id, ws in self.workers.items():
- try:
- await ws.ping()
- except:
- disconnected.append(worker_id)
+ # Create a copy of worker items to avoid modification during iteration
+ worker_items = list(self.workers.items())
+ disconnected = []
+
+ for worker_id, ws in worker_items:
+ try:
+ # Check if worker still exists before pinging
+ if worker_id not in self.workers:
+ continue
+
+ # Send ping with timeout
+ pong_waiter = await ws.ping()
+ try:
+ await asyncio.wait_for(pong_waiter, timeout=10)
+ except asyncio.TimeoutError:
+ logger.warning(f"Worker {worker_id} failed to respond to ping")
+ disconnected.append(worker_id)
+ except websockets.exceptions.ConnectionClosed:
+ logger.info(f"Worker {worker_id} connection already closed")
+ disconnected.append(worker_id)
+ except Exception as e:
+ logger.error(f"Error pinging worker {worker_id}: {e}")
+ disconnected.append(worker_id)
+
+ # Clean up disconnected workers
+ for worker_id in disconnected:
+ if worker_id in self.workers:
+ logger.info(f"Removing unresponsive worker {worker_id}")
+ del self.workers[worker_id]
+ self.chunk_manager.release_worker_chunks(worker_id)
+
+ # Update stats
+ self.stats["connected_workers"] = len(self.workers)

- # Clean up disconnected workers
- for worker_id in disconnected:
- if worker_id in self.workers:
- del self.workers[worker_id]
- self.chunk_manager.release_worker_chunks(worker_id)
+ # Also clean up from workers_by_user if it exists
+ if hasattr(self, "workers_by_user"):
+ worker_user = (
+ worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+ )
+ if worker_user in self.workers_by_user:
+ self.workers_by_user[worker_user].discard(worker_id)
+ if not self.workers_by_user[worker_user]:
+ del self.workers_by_user[worker_user]
+
+ # Notify monitors
+ await self._broadcast_stats()
+ await self._send_activity(
+ f"Worker {worker_id} removed due to heartbeat timeout"
+ )
+
+ except Exception as e:
+ logger.error(f"Error in heartbeat loop: {e}", exc_info=True)
+ # Continue the loop even if there's an error
+ await asyncio.sleep(5)

  async def _checkpoint_loop(self):
  """Periodically checkpoint storage."""
@@ -1663,7 +2056,10 @@ class Orchestrator:
  )

  async def _stats_update_loop(self):
- """Periodically update and broadcast stats."""
+ """Periodically update and broadcast stats - non-blocking version."""
+ # Get the event loop for running blocking operations
+ loop = asyncio.get_event_loop()
+
  # Track session start values
  storage_stats = await self.storage.get_storage_stats()
  session_start_outputs = storage_stats["total_captions"] # This now counts ALL outputs
@@ -1675,8 +2071,8 @@ class Orchestrator:
  while True:
  await asyncio.sleep(10)

- # Update chunk stats
- chunk_stats = self.chunk_manager.get_stats()
+ # Update chunk stats in thread pool to avoid blocking
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
  storage_stats = await self.storage.get_storage_stats()
  current_total_outputs = storage_stats["total_captions"] # ALL outputs
  if self.chunk_tracker:
@@ -1690,12 +2086,9 @@ class Orchestrator:
  self.stats["total_outputs"] = current_total_outputs
  self.stats["total_captions"] = current_total_outputs # Keep for backward compatibility

- # Add queue information
- with self.chunk_manager.lock:
- self.stats["pending_chunks"] = len(self.chunk_manager.pending_chunks)
- self.stats["assigned_chunks"] = sum(
- len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
- )
+ # Get queue stats in thread pool to avoid blocking
+ queue_stats = await loop.run_in_executor(None, self._get_queue_stats)
+ self.stats.update(queue_stats)

  # Calculate if we need more chunks
  worker_count = self.stats.get("connected_workers", 0)
@@ -1754,15 +2147,15 @@ class Orchestrator:
  last_known_total = current_total_outputs

  # Log rate information when workers are connected
- if (
- worker_count > 0 and self.rate_tracker["current_rate"] >= 0
- ): # Only log non-negative rates
- logger.info(
- f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
- f"(avg: {self.rate_tracker['average_rate']:.1f}, "
- f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
- f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
- )
+ # if (
+ # worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+ # ): # Only log non-negative rates
+ # logger.info(
+ # f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
+ # f"(avg: {self.rate_tracker['average_rate']:.1f}, "
+ # f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
+ # f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
+ # )

  await self._broadcast_stats()