caption-flow 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,15 +21,15 @@ from collections import deque, defaultdict
  import threading
  from queue import Queue, Empty

+ from .workers import data
  import websockets
  from websockets.server import WebSocketServerProtocol

  from .storage import StorageManager
  from .models import Caption, Contributor
  from .utils.auth import AuthManager
- from .utils.dataset_loader import DatasetLoader, ShardTracker
+ from .utils import DatasetLoader, ShardTracker, ChunkTracker
  from .utils.json_utils import safe_dict, safe_json_dumps, to_json_dict
- from .utils.chunk_tracker import ChunkTracker

  logger = logging.getLogger(__name__)

@@ -48,6 +48,43 @@ class ShardChunk:
      assigned_at: Optional[datetime] = None
      completed_at: Optional[datetime] = None

+     @classmethod
+     def create(
+         cls, shard_url: str, shard_name: str, start_index: int, chunk_size: int
+     ) -> "ShardChunk":
+         """Factory method to create a chunk with consistent ID."""
+         # Always use consistent format: dataset_chunk_startindex
+         if shard_url.startswith("hf_dataset:"):
+             # Extract dataset path
+             parts = shard_url.split(":")
+             dataset_path = parts[1] if len(parts) > 1 else "unknown"
+             chunk_id = f"{dataset_path.replace('/', '_')}_chunk_{start_index}"
+         else:
+             # WebDataset format
+             chunk_id = f"{shard_name}_chunk_{start_index}"
+
+         return cls(
+             chunk_id=chunk_id,
+             shard_url=shard_url,
+             shard_name=shard_name,
+             start_index=start_index,
+             chunk_size=chunk_size,
+         )
+
+     def belongs_to_shard(self, shard_identifier: str) -> bool:
+         """Check if this chunk belongs to a given shard."""
+         return self.shard_name == shard_identifier
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dict for JSON serialization (for workers)."""
+         return {
+             "chunk_id": self.chunk_id,
+             "shard_url": self.shard_url,
+             "shard_name": self.shard_name,
+             "start_index": self.start_index,
+             "chunk_size": self.chunk_size,
+         }
+

  class ChunkManager:
      """Manages shard chunk creation and assignment."""
@@ -67,9 +104,7 @@ class ChunkManager:
          chunks = []

          for start_idx in range(0, total_items, self.chunk_size):
-             chunk_id = f"{shard_name}_chunk_{start_idx}"
-             chunk = ShardChunk(
-                 chunk_id=chunk_id,
+             chunk = ShardChunk.create(
                  shard_url=shard_url,
                  shard_name=shard_name,
                  start_index=start_idx,
@@ -77,8 +112,8 @@
              )

              with self.lock:
-                 self.chunks[chunk_id] = chunk
-                 self.pending_chunks.append(chunk_id)
+                 self.chunks[chunk.chunk_id] = chunk
+                 self.pending_chunks.append(chunk.chunk_id)

              chunks.append(chunk)

@@ -86,24 +121,84 @@

      def get_chunks_for_worker(
          self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
-     ) -> List[ShardChunk]:
-         """Get available chunks for a worker."""
+     ) -> List[Dict[str, Any]]:
+         """Get available chunks with unprocessed items for a worker."""
          assigned = []

          with self.lock:
+             # FIRST PRIORITY: Check if this worker already has assigned chunks
+             # Workers should complete their current chunks before getting new ones
+             if worker_id in self.assigned_chunks:
+                 existing_chunk_ids = list(self.assigned_chunks[worker_id])
+                 for chunk_id in existing_chunk_ids:
+                     if len(assigned) >= count:
+                         break
+
+                     chunk = self.chunks.get(chunk_id)
+                     if not chunk:
+                         continue
+
+                     # Check if chunk still has unprocessed items
+                     if tracker:
+                         chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
+                         if chunk_info and chunk_info["unprocessed_ranges"]:
+                             assigned.append(
+                                 {
+                                     "chunk": chunk,
+                                     "unprocessed_ranges": chunk_info["unprocessed_ranges"],
+                                 }
+                             )
+                     else:
+                         # No tracker, assume chunk needs processing
+                         assigned.append(
+                             {
+                                 "chunk": chunk,
+                                 "unprocessed_ranges": [(0, chunk.chunk_size - 1)],
+                             }
+                         )
+
+             # SECOND PRIORITY: Get new pending chunks
+             # Only if worker doesn't have enough chunks already
              while len(assigned) < count and self.pending_chunks:
                  chunk_id = self.pending_chunks.popleft()
-                 chunk = self.chunks[chunk_id]
+                 chunk = self.chunks.get(chunk_id)
+
+                 if not chunk:
+                     continue

+                 # Verify chunk is truly pending (defensive check)
+                 if chunk.status != "pending" or chunk.assigned_to is not None:
+                     logger.warning(
+                         f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
+                     )
+                     continue
+
+                 # Assign to this worker
                  chunk.assigned_to = worker_id
                  chunk.status = "assigned"
                  chunk.assigned_at = datetime.utcnow()
-
                  self.assigned_chunks[worker_id].add(chunk_id)
-                 assigned.append(chunk)
+
+                 # Get unprocessed ranges
+                 unprocessed_ranges = [(0, chunk.chunk_size - 1)]  # Default
                  if tracker:
+                     chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
+                     if chunk_info:
+                         unprocessed_ranges = chunk_info["unprocessed_ranges"]
                      tracker.mark_assigned(chunk_id, worker_id)

+                 assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+         # Log what we're assigning
+         if assigned:
+             chunk_summary = ", ".join(
+                 [
+                     f"{info['chunk'].chunk_id}[{len(info['unprocessed_ranges'])} ranges]"
+                     for info in assigned
+                 ]
+             )
+             logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")
+
          return assigned

      def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
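
Each entry returned by `get_chunks_for_worker` now pairs a chunk with the ranges still needing work, and the message handler later in this diff flattens that via `to_dict()` into the `shard_assignment` payload. A hedged sketch of what a worker receives — the values are illustrative, and whether ranges are chunk-relative or absolute depends on what the tracker stores (the in-code default shown above is chunk-relative):

    {
        "type": "shard_assignment",
        "chunks": [
            {
                "chunk_id": "shard-00042_chunk_2000",
                "shard_url": "https://example.com/data/shard-00042.tar",
                "shard_name": "shard-00042",
                "start_index": 2000,
                "chunk_size": 1000,
                "unprocessed_ranges": [[0, 499], [750, 999]],
            }
        ],
    }
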
@@ -173,6 +268,27 @@ class Orchestrator:
          self.dataset_config = config.get("dataset", {})
          self.dataset_path = self.dataset_config.get("path")
          self.dataset_type = self.dataset_config.get("type", "huggingface")
+         self.dataset_split = self.dataset_config.get("split", "train")  # Add split configuration
+         self.dataset_image_column = self.dataset_config.get(
+             "image_column", "image"
+         )  # Add image column config
+
+         # Dataset components
+         self.dataset_loader = None
+         self.shard_tracker = None
+         self.chunk_tracker = None
+
+         if self.dataset_path:
+             self.dataset_loader = DatasetLoader(
+                 self.dataset_path,
+                 self.dataset_type,
+                 self.dataset_split,
+                 self.dataset_image_column,
+             )
+             checkpoint_dir = Path(config.get("storage", {}).get("checkpoint_dir", "./checkpoints"))
+             checkpoint_dir.mkdir(parents=True, exist_ok=True)
+             self.shard_tracker = ShardTracker(checkpoint_dir / "shards.json")
+             self.chunk_tracker = ChunkTracker(checkpoint_dir / "chunks.json")

          # vLLM configuration to distribute to workers
          self.vllm_config = config.get(
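
With the new `split` and `image_column` keys, a dataset section exercising all four fields might look like the following Python-dict sketch (the path is hypothetical; the defaults shown are the ones in the code above):

    config = {
        "dataset": {
            "path": "org/my-dataset",
            "type": "huggingface",      # default
            "split": "train",           # default
            "image_column": "image",    # default
        },
        "storage": {"checkpoint_dir": "./checkpoints"},
    }
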
@@ -233,6 +349,11 @@

          # Initialize chunk manager with reference to chunk tracker
          self.chunk_manager = ChunkManager(self.chunk_size, self.chunk_tracker)
+         self.pending_processed_items = defaultdict(list)  # chunk_id -> list of indices
+         self.item_batch_lock = threading.Lock()
+         self.last_item_batch_flush = time.time()
+         self.item_batch_interval = 5  # Flush every 5 seconds
+         self.item_batch_size = 100  # Or every 100 items

          # Track connections
          self.workers: Dict[str, WebSocketServerProtocol] = {}
@@ -242,17 +363,15 @@
          self.ssl_context = self._setup_ssl()

          # Statistics
+         self.is_generating_stats = False
          self.stats = {
              "total_chunks": 0,
              "completed_chunks": 0,
              "failed_chunks": 0,
-             "total_captions": 0,
              "connected_workers": 0,
              "total_shards": 0,
              "completed_shards": 0,
              "current_shard": None,
-             "buffer_size": 0,
-             "total_written": 0,
              "last_checkpoint": None,
          }

@@ -266,7 +385,7 @@
              "expected_rate": 0.0,
          }

-         # Data sample queue for VLLMWorkers
+         # Data sample queue for CaptionWorker
          self.data_sample_queue = asyncio.Queue(maxsize=1000)
          self.data_workers: Dict[str, WebSocketServerProtocol] = {}

@@ -310,10 +429,23 @@
          # Mark state as not restored until we process checkpoints
          self.state_restored.clear()

+         # Get dataset info to check format
+         dataset_info = self.dataset_loader.get_dataset_info()
+         dataset_format = dataset_info.get("dataset_format", "unknown")
+         logger.info(f"Dataset format: {dataset_format}")
+
          # Get all shards
          self.all_shards = self.dataset_loader.get_shard_list()
          self.stats["total_shards"] = len(self.all_shards)

+         # For HuggingFace datasets, we might need to dynamically create more shards
+         if dataset_format == "huggingface_datasets":
+             self._is_hf_dataset = True
+             self._hf_chunk_size = 10000  # Items per virtual shard
+             self._next_hf_shard_index = len(self.all_shards)  # For creating new virtual shards
+         else:
+             self._is_hf_dataset = False
+
          # Get shard status from ChunkTracker
          shards_summary = self.chunk_tracker.get_shards_summary() if self.chunk_tracker else {}
          completed_shards = {
@@ -336,7 +468,10 @@

          # Filter out shards that already have chunks created
          remaining_shards = [
-             shard for shard in remaining_shards if Path(shard).stem not in shards_with_chunks
+             shard
+             for shard in remaining_shards
+             if (shard if shard.startswith("hf_dataset:") else Path(shard).stem)
+             not in shards_with_chunks
          ]

          self.stats["completed_shards"] = len(completed_shards)
@@ -356,25 +491,18 @@
          with self.chunk_manager.lock:
              for chunk_state in shard_info["chunks"]:
                  if chunk_state.status in ["pending", "failed", "assigned"]:
-                     # Find shard URL
-                     shard_url = None
-                     for url in self.all_shards:
-                         if Path(url).stem == shard_name:
-                             shard_url = url
-                             break
-
-                     if shard_url:
-                         chunk = ShardChunk(
-                             chunk_id=chunk_state.chunk_id,
-                             shard_url=shard_url,
-                             shard_name=chunk_state.shard_name,
-                             start_index=chunk_state.start_index,
-                             chunk_size=chunk_state.chunk_size,
-                         )
-                         self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
-                         self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
-                         requeued_chunks_by_shard[shard_name].append(chunk_state.chunk_id)
-                         initial_pending += 1
+                     # ChunkState already has shard_url stored
+                     chunk = ShardChunk(
+                         chunk_id=chunk_state.chunk_id,
+                         shard_url=chunk_state.shard_url,
+                         shard_name=chunk_state.shard_name,
+                         start_index=chunk_state.start_index,
+                         chunk_size=chunk_state.chunk_size,
+                     )
+                     self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
+                     self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
+                     requeued_chunks_by_shard[shard_name].append(chunk_state.chunk_id)
+                     initial_pending += 1

          logger.info(f"Re-queued {initial_pending} existing pending chunks")
          for shard_name, chunk_ids in requeued_chunks_by_shard.items():
@@ -426,7 +554,13 @@
              if current_shard_url is None or current_shard_index >= current_shard_items:
                  try:
                      current_shard_url = next(shard_iter)
-                     current_shard_name = Path(current_shard_url).stem
+
+                     # Extract shard name based on type
+                     if current_shard_url.startswith("hf_dataset:"):
+                         current_shard_name = current_shard_url  # Use full ID for virtual shards
+                     else:
+                         current_shard_name = Path(current_shard_url).stem
+
                      self.stats["current_shard"] = current_shard_name

                      # Skip if we already have chunks from this shard
@@ -439,16 +573,74 @@

                      # Count items in new shard
                      logger.info(f"Loading new shard {current_shard_name}")
-                     current_shard_items = sum(
-                         1 for _ in self.dataset_loader.iterate_shard(current_shard_url)
-                     )
+
+                     # For virtual HF dataset shards, use the chunk size directly
+                     if current_shard_url.startswith("hf_dataset:"):
+                         current_shard_items = self.dataset_loader.count_shard_items(
+                             current_shard_url
+                         )
+                         logger.info(
+                             f"Virtual shard {current_shard_name} has {current_shard_items} items"
+                         )
+                     else:
+                         # For WebDataset, actually count items
+                         current_shard_items = sum(
+                             1 for _ in self.dataset_loader.iterate_shard(current_shard_url)
+                         )
+                         logger.info(
+                             f"Shard {current_shard_name} has {current_shard_items} items"
+                         )
+
                      current_shard_index = 0
-                     logger.info(f"Shard {current_shard_name} has {current_shard_items} items")

                  except StopIteration:
-                     # No more shards
+                     # No more shards in the iterator
+                     if self._is_hf_dataset:
+                         # Before creating new virtual shards, check if we have pending chunks
+                         with self.chunk_manager.lock:
+                             pending_count = len(self.chunk_manager.pending_chunks)
+
+                         if pending_count > 0:
+                             # Don't create new shards if we have pending chunks
+                             logger.debug(
+                                 f"Have {pending_count} pending chunks, not creating new virtual shards yet"
+                             )
+                             current_shard_url = None
+                             time.sleep(2)
+                             continue
+
+                         # For HF datasets, we can create more virtual shards on demand
+                         logger.info(
+                             "Creating additional virtual shards for HuggingFace dataset"
+                         )
+
+                         # Create 10 more virtual shards
+                         new_shards = []
+                         for i in range(10):
+                             shard_id = f"hf_dataset:{self.dataset_path}:chunk:{self._next_hf_shard_index * self._hf_chunk_size}"
+                             new_shards.append(shard_id)
+                             self._next_hf_shard_index += 1
+
+                         # Add to all_shards and create new iterator
+                         self.all_shards.extend(new_shards)
+                         self.stats["total_shards"] = len(self.all_shards)
+
+                         # Filter for unprocessed shards
+                         remaining_new_shards = [
+                             s
+                             for s in new_shards
+                             if s not in shards_summary and s not in completed_shards
+                         ]
+
+                         if remaining_new_shards:
+                             shard_iter = iter(remaining_new_shards)
+                             logger.info(f"Added {len(remaining_new_shards)} new virtual shards")
+                             continue
+
+                     # No more shards to process
                      logger.info("No more shards to process")
                      break
+
                  except Exception as e:
                      logger.error(f"Error loading shard {current_shard_name}: {e}")
                      current_shard_url = None
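
Virtual shards are plain strings with a fixed layout, and the same string is re-parsed when chunks are created in the next hunk. A small sketch of the round trip, with a hypothetical dataset path:

    shard_id = "hf_dataset:org/my-dataset:chunk:30000"

    parts = shard_id.split(":")        # ["hf_dataset", "org/my-dataset", "chunk", "30000"]
    assert parts[2] == "chunk"
    shard_base_index = int(parts[3])   # 30000: absolute index of the shard's first item

    # A chunk beginning 250 items into this shard starts at absolute index:
    absolute_start_index = shard_base_index + 250   # 30250
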
@@ -456,25 +648,40 @@

              # Create a chunk from current shard
              if current_shard_url and current_shard_index < current_shard_items:
-                 chunk_id = f"{current_shard_name}_chunk_{current_shard_index}"
-                 chunk_size = min(self.chunk_size, current_shard_items - current_shard_index)
+                 # Calculate the absolute dataset index for this chunk
+                 if current_shard_url.startswith("hf_dataset:"):
+                     # Parse the virtual shard URL to get the base start index
+                     parts = current_shard_url.split(":")
+                     if len(parts) >= 4 and parts[2] == "chunk":
+                         shard_base_index = int(parts[3])
+                     else:
+                         shard_base_index = 0
+
+                     # The absolute start index for this chunk in the dataset
+                     absolute_start_index = shard_base_index + current_shard_index
+                 else:
+                     # For WebDataset, current_shard_index is already absolute
+                     absolute_start_index = current_shard_index
+
+                 # Create chunk with absolute index
+                 chunk = ShardChunk.create(
+                     shard_url=current_shard_url,
+                     shard_name=current_shard_name,
+                     start_index=absolute_start_index,
+                     chunk_size=min(self.chunk_size, current_shard_items - current_shard_index),
+                 )

-                 # Add to ChunkTracker
+                 # Add to ChunkTracker with all required fields
                  if self.chunk_tracker and self.chunk_tracker.add_chunk(
-                     chunk_id, current_shard_name, current_shard_index, chunk_size
+                     chunk.chunk_id,
+                     chunk.shard_name,
+                     chunk.shard_url,
+                     chunk.start_index,
+                     chunk.chunk_size,
                  ):
-                     # Create chunk
-                     chunk = ShardChunk(
-                         chunk_id=chunk_id,
-                         shard_url=current_shard_url,
-                         shard_name=current_shard_name,
-                         start_index=current_shard_index,
-                         chunk_size=chunk_size,
-                     )
-
                      with self.chunk_manager.lock:
-                         self.chunk_manager.chunks[chunk_id] = chunk
-                         self.chunk_manager.pending_chunks.append(chunk_id)
+                         self.chunk_manager.chunks[chunk.chunk_id] = chunk
+                         self.chunk_manager.pending_chunks.append(chunk.chunk_id)

                      chunks_created += 1
                      self.stats["total_chunks"] += 1
@@ -484,10 +691,14 @@
              if chunks_created > 0:
                  logger.info(f"Created {chunks_created} chunks on demand")

-             # If we couldn't create any chunks and there are no more shards, we're done
+             # If we couldn't create any chunks and there are no more shards, check if it's HF dataset
              if chunks_created == 0 and current_shard_url is None:
-                 logger.info("All shards processed, chunk creation complete")
-                 break
+                 if self._is_hf_dataset:
+                     # We can always create more virtual shards for HF datasets
+                     logger.debug("Will create more virtual shards on next iteration")
+                 else:
+                     logger.info("All shards processed, chunk creation complete")
+                     break

              # Brief pause to avoid spinning
              time.sleep(1)
@@ -558,7 +769,9 @@
              elif auth_ticket.role == "admin":
                  await self._handle_admin(websocket, auth_ticket)
              else:
-                 await websocket.send(safe_json_dumps({"error": f"Unknown role: {auth_ticket.role}"}))
+                 await websocket.send(
+                     safe_json_dumps({"error": f"Unknown role: {auth_ticket.role}"})
+                 )

          except Exception as e:
              logger.error(f"Connection error: {e}")
@@ -604,81 +817,118 @@
          requires_worker_restart = False

          try:
+             # Extract orchestrator section if present
+             if "orchestrator" in new_config:
+                 # Config has orchestrator wrapper, extract it
+                 orchestrator_config = new_config["orchestrator"]
+             else:
+                 # Config is already at orchestrator level
+                 orchestrator_config = new_config
+
+             # Helper function for deep comparison
+             def deep_equal(a, b):
+                 """Deep comparison of two values including nested dicts and lists."""
+                 if type(a) != type(b):
+                     return False
+                 if isinstance(a, dict):
+                     if set(a.keys()) != set(b.keys()):
+                         return False
+                     return all(deep_equal(a[k], b[k]) for k in a.keys())
+                 elif isinstance(a, (list, tuple)):
+                     if len(a) != len(b):
+                         return False
+                     return all(deep_equal(x, y) for x, y in zip(a, b))
+                 else:
+                     return a == b
+
              # Update vLLM configuration
-             if "vllm" in new_config:
+             if "vllm" in orchestrator_config:
                  old_vllm = self.vllm_config.copy()
+                 new_vllm = orchestrator_config["vllm"]

-                 # Check each field for actual changes
-                 vllm_changed = False
-                 for key, value in new_config["vllm"].items():
-                     if self.vllm_config.get(key) != value:
-                         self.vllm_config[key] = value
-                         vllm_changed = True
+                 # Check if vLLM config actually changed using deep comparison
+                 vllm_changed = not deep_equal(old_vllm, new_vllm)

                  if vllm_changed:
+                     # Update the vLLM config
+                     self.vllm_config = new_vllm.copy()
                      updated_sections.append("vllm")

                      # Check if critical changes require worker restart
                      if (
-                         old_vllm.get("model") != self.vllm_config.get("model")
+                         old_vllm.get("model") != new_vllm.get("model")
                          or old_vllm.get("gpu_memory_utilization")
-                         != self.vllm_config.get("gpu_memory_utilization")
+                         != new_vllm.get("gpu_memory_utilization")
                          or old_vllm.get("tensor_parallel_size")
-                         != self.vllm_config.get("tensor_parallel_size")
+                         != new_vllm.get("tensor_parallel_size")
+                         or old_vllm.get("dtype") != new_vllm.get("dtype")
+                         or old_vllm.get("max_model_len") != new_vllm.get("max_model_len")
                      ):
                          requires_worker_restart = True
                          warnings.append(
                              "Critical vLLM changes detected - workers will be disconnected to reload"
                          )
+                         logger.info(
+                             f"Model change: {old_vllm.get('model')} -> {new_vllm.get('model')}"
+                         )

              # Update dataset configuration
-             if "dataset" in new_config:
-                 dataset_changed = False
-                 for key, value in new_config["dataset"].items():
-                     if self.dataset_config.get(key) != value:
-                         self.dataset_config[key] = value
-                         dataset_changed = True
+             if "dataset" in orchestrator_config:
+                 old_dataset = self.dataset_config.copy()
+                 new_dataset = orchestrator_config["dataset"]
+
+                 dataset_changed = not deep_equal(old_dataset, new_dataset)

                  if dataset_changed:
+                     self.dataset_config = new_dataset.copy()
                      self.dataset_path = self.dataset_config.get("path")
                      self.dataset_type = self.dataset_config.get("type", "huggingface")
                      updated_sections.append("dataset")
                      warnings.append("Dataset changes will apply to new chunks only")

              # Update chunk settings
-             if "chunk_size" in new_config and self.chunk_size != new_config["chunk_size"]:
-                 self.chunk_size = new_config["chunk_size"]
+             if (
+                 "chunk_size" in orchestrator_config
+                 and self.chunk_size != orchestrator_config["chunk_size"]
+             ):
+                 self.chunk_size = orchestrator_config["chunk_size"]
                  self.chunk_manager.chunk_size = self.chunk_size
                  updated_sections.append("chunk_size")

              if (
-                 "chunks_per_request" in new_config
-                 and self.chunks_per_request != new_config["chunks_per_request"]
+                 "chunks_per_request" in orchestrator_config
+                 and self.chunks_per_request != orchestrator_config["chunks_per_request"]
              ):
-                 self.chunks_per_request = new_config["chunks_per_request"]
+                 self.chunks_per_request = orchestrator_config["chunks_per_request"]
                  updated_sections.append("chunks_per_request")

-             # Recreate auth manager
-             self.auth = AuthManager(config=new_config)
+             # Update auth configuration
+             if "auth" in orchestrator_config:
+                 try:
+                     self.auth = AuthManager({"auth": orchestrator_config["auth"]})
+                     updated_sections.append("auth")
+                 except Exception as e:
+                     logger.error(f"Failed to update AuthManager: {e}")
+                     warnings.append(f"Auth update failed: {e}")

              # Update buffer settings
              if (
-                 "chunk_buffer_multiplier" in new_config
-                 and self.chunk_buffer_multiplier != new_config["chunk_buffer_multiplier"]
+                 "chunk_buffer_multiplier" in orchestrator_config
+                 and self.chunk_buffer_multiplier != orchestrator_config["chunk_buffer_multiplier"]
              ):
-                 self.chunk_buffer_multiplier = new_config["chunk_buffer_multiplier"]
+                 self.chunk_buffer_multiplier = orchestrator_config["chunk_buffer_multiplier"]
                  updated_sections.append("chunk_buffer_multiplier")

              if (
-                 "min_chunk_buffer" in new_config
-                 and self.min_chunk_buffer != new_config["min_chunk_buffer"]
+                 "min_chunk_buffer" in orchestrator_config
+                 and self.min_chunk_buffer != orchestrator_config["min_chunk_buffer"]
              ):
-                 self.min_chunk_buffer = new_config["min_chunk_buffer"]
+                 self.min_chunk_buffer = orchestrator_config["min_chunk_buffer"]
                  updated_sections.append("min_chunk_buffer")

              # Update storage settings
-             if "storage" in new_config:
-                 storage_config = new_config["storage"]
+             if "storage" in orchestrator_config:
+                 storage_config = orchestrator_config["storage"]
                  storage_changed = False

                  if (
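
`deep_equal` replaces the old shallow per-key comparison, so nested structures such as stage lists under the vllm section are compared by value before deciding whether anything changed. A quick illustration with hypothetical config fragments:

    old = {"model": "m", "stages": [{"name": "caption", "prompts": ["a", "b"]}]}
    new = {"model": "m", "stages": [{"name": "caption", "prompts": ["a", "b"]}]}
    assert deep_equal(old, new)        # equal by value: no reload is triggered

    new["stages"][0]["prompts"].append("c")
    assert not deep_equal(old, new)    # nested change detected
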
@@ -701,21 +951,6 @@
                  if storage_changed:
                      updated_sections.append("storage")

-             # Update data worker storage config
-             if "data_worker_storage" in new_config:
-                 current_dw_storage = self.config.get("data_worker_storage", {})
-                 if current_dw_storage != new_config["data_worker_storage"]:
-                     self.config["data_worker_storage"] = new_config["data_worker_storage"]
-                     updated_sections.append("data_worker_storage")
-                     warnings.append("Data worker storage config will apply to new connections only")
-
-             # Update backpressure threshold
-             if "backpressure_threshold" in new_config:
-                 current_threshold = getattr(self, "backpressure_threshold", 800)
-                 if current_threshold != new_config["backpressure_threshold"]:
-                     self.backpressure_threshold = new_config["backpressure_threshold"]
-                     updated_sections.append("backpressure_threshold")
-
              # Check if any changes were made
              if not updated_sections:
                  await websocket.send(
@@ -729,29 +964,49 @@
                  logger.info("Configuration reload requested but no changes detected")
                  return

-             # Update the main config for any other fields
-             self.config.update(new_config)
+             # Update the main config
+             if "orchestrator" in new_config:
+                 self.config["orchestrator"] = orchestrator_config
+             else:
+                 self.config.update(orchestrator_config)

              # Handle worker restart if needed
              if requires_worker_restart:
                  logger.info("Disconnecting all workers for configuration reload...")

-                 # Disconnect all workers
-                 worker_ids = list(self.workers.keys())
-                 for worker_id in worker_ids:
+                 # Send reload message to workers first
+                 reload_msg = safe_json_dumps(
+                     {
+                         "type": "reload_vllm",
+                         "vllm_config": self.vllm_config,
+                     }
+                 )
+
+                 # Create a list of worker items to avoid modifying dict during iteration
+                 worker_items = list(self.workers.items())
+                 disconnected = []
+
+                 for worker_id, ws in worker_items:
                      try:
-                         await self.workers[worker_id].close(
-                             code=1012, reason="Configuration reload"
-                         )
+                         await ws.send(reload_msg)
+                         # Give worker time to process before disconnect
+                         await asyncio.sleep(0.5)
+                         await ws.close(code=1012, reason="Configuration reload")
+                         disconnected.append(worker_id)
                      except:
-                         pass
+                         disconnected.append(worker_id)  # Still mark as disconnected if error
+
+                 # Now safely clear workers dict
+                 for worker_id in disconnected:
+                     if worker_id in self.workers:
+                         del self.workers[worker_id]

                  warnings.append(
-                     f"Disconnected {len(worker_ids)} workers - they will reconnect with new config"
+                     f"Sent reload message to {len(disconnected)} workers - they will reconnect with new config"
                  )
              else:
-                 # Just notify workers about config changes
-                 reload_msg = safe_json_dumps(
+                 # Just notify workers about config changes without disconnecting
+                 config_update_msg = safe_json_dumps(
                      {
                          "type": "config_update",
                          "vllm_config": self.vllm_config if "vllm" in updated_sections else None,
@@ -761,15 +1016,21 @@
                      }
                  )

+                 # Create a list of worker items to avoid modifying dict during iteration
+                 worker_items = list(self.workers.items())
                  disconnected = []
-                 for worker_id, ws in self.workers.items():
+
+                 for worker_id, ws in worker_items:
                      try:
-                         await ws.send(reload_msg)
+                         await ws.send(config_update_msg)
+                         logger.info(f"Sent config update to worker {worker_id}")
                      except:
                          disconnected.append(worker_id)

+                 # Now safely remove disconnected workers
                  for worker_id in disconnected:
-                     del self.workers[worker_id]
+                     if worker_id in self.workers:
+                         del self.workers[worker_id]

                  # Send success response
                  await websocket.send(
@@ -788,34 +1049,58 @@

          except Exception as e:
              logger.error(f"Configuration reload failed: {e}")
+             import traceback
+
+             logger.error(traceback.format_exc())
              await websocket.send(safe_json_dumps({"type": "reload_failed", "error": str(e)}))

      async def _handle_worker(self, websocket: WebSocketServerProtocol, auth_ticket):
          """Handle worker connection lifecycle."""
-         worker_id = getattr(auth_ticket, "name", str(uuid.uuid4()))
+         # Generate unique worker ID even if using same token
+         base_name = getattr(auth_ticket, "name", "worker")
+         worker_id = f"{base_name}_{str(uuid.uuid4())[:8]}"  # Add unique suffix
+
+         # Track the original token/user for accounting
+         worker_user = base_name  # Keep track of which user/token this worker belongs to
+
          self.workers[worker_id] = websocket
          self.stats["connected_workers"] = len(self.workers)

-         # Register contributor
-         contributor = Contributor(
-             contributor_id=worker_id, name=worker_id, total_captions=0, trust_level=1
-         )
-         await self.storage.save_contributor(contributor)
+         # Optionally track workers by user/token
+         if not hasattr(self, "workers_by_user"):
+             self.workers_by_user = defaultdict(set)
+         self.workers_by_user[worker_user].add(worker_id)
+
+         # Register contributor with the base name (for aggregating stats per user)
+         contributor = await self.storage.get_contributor(worker_user)
+         if not contributor:
+             contributor = Contributor(
+                 contributor_id=worker_user,
+                 name=worker_user,
+                 total_captions=0,
+                 trust_level=1,
+             )
+             await self.storage.save_contributor(contributor)

-         logger.info(f"Worker {worker_id} connected")
+         logger.info(f"Worker {worker_id} (user: {worker_user}) connected")
          await self._broadcast_stats()
-         await self._send_activity(f"Worker {worker_id} connected")
+         await self._send_activity(f"Worker {worker_id} (user: {worker_user}) connected")

          try:
              # Send welcome message with dataset configuration
              welcome_message = {
                  "type": "welcome",
                  "worker_id": worker_id,
+                 "user_id": worker_user,
                  "dataset_config": {
                      "dataset_path": self.dataset_path,
                      "dataset_type": self.dataset_type,
-                     "path": self.dataset_path,  # For compatibility
-                     "type": self.dataset_type,  # For compatibility
+                     "dataset_split": self.dataset_split,
+                     "dataset_image_column": self.dataset_image_column,
+                     "path": self.dataset_path,
+                     "type": self.dataset_type,
+                     "split": self.dataset_split,
+                     "image_column": self.dataset_image_column,
                  },
                  "vllm_config": self.vllm_config,
              }
@@ -826,21 +1111,29 @@
                  await self._process_worker_message(worker_id, data)

          except websockets.exceptions.ConnectionClosed:
-             logger.info(f"Worker {worker_id} disconnected")
+             logger.info(f"Worker {worker_id} (user: {worker_user}) disconnected")
          finally:
-             del self.workers[worker_id]
+             if worker_id in self.workers:
+                 del self.workers[worker_id]
+
+             # Clean up user tracking
+             if hasattr(self, "workers_by_user") and worker_user in self.workers_by_user:
+                 self.workers_by_user[worker_user].discard(worker_id)
+                 if not self.workers_by_user[worker_user]:
+                     del self.workers_by_user[worker_user]
+
              self.stats["connected_workers"] = len(self.workers)
-             # Release chunks in both managers
+
+             # Release chunks
              self.chunk_manager.release_worker_chunks(worker_id)
              if self.chunk_tracker:
-                 # Mark released chunks as pending in tracker
                  released_chunks = self.chunk_tracker.release_worker_chunks(worker_id)
                  logger.info(
                      f"Released {len(released_chunks) if released_chunks is not None else 0} chunks from worker {worker_id}"
                  )

              await self._broadcast_stats()
-             await self._send_activity(f"Worker {worker_id} disconnected")
+             await self._send_activity(f"Worker {worker_id} (user: {worker_user}) disconnected")

      async def _process_worker_message(self, worker_id: str, data: Dict):
          """Process message from worker."""
@@ -856,28 +1149,26 @@
              return

          count = data.get("count", self.chunks_per_request)
-         chunks = self.chunk_manager.get_chunks_for_worker(worker_id, count, self.chunk_tracker)
+         chunk_infos = self.chunk_manager.get_chunks_for_worker(
+             worker_id, count, self.chunk_tracker
+         )

-         if chunks:
-             # Only send the fields that worker expects
-             chunk_data = []
-             for chunk in chunks:
-                 chunk_data.append(
-                     {
-                         "chunk_id": chunk.chunk_id,
-                         "shard_url": chunk.shard_url,
-                         "shard_name": chunk.shard_name,
-                         "start_index": chunk.start_index,
-                         "chunk_size": chunk.chunk_size,
-                     }
-                 )
+         if chunk_infos:
+             # Send chunks with unprocessed ranges
+             chunks_data = []
+             for info in chunk_infos:
+                 chunk_dict = info["chunk"].to_dict()
+                 chunk_dict["unprocessed_ranges"] = info["unprocessed_ranges"]
+                 chunks_data.append(chunk_dict)

              await self.workers[worker_id].send(
-                 safe_json_dumps({"type": "shard_assignment", "chunks": chunk_data})
+                 safe_json_dumps({"type": "shard_assignment", "chunks": chunks_data})
+             )
+
+             chunk_ids = [c["chunk_id"] for c in chunks_data]
+             logger.info(
+                 f"Assigned {len(chunks_data)} chunks to worker {worker_id}: {chunk_ids}"
              )
-             chunk_ids = [c["chunk_id"] for c in chunk_data]
-             logger.info(f"Assigned {len(chunks)} chunks to worker {worker_id}: {chunk_ids}")
-             await self._send_activity(f"Assigned {len(chunks)} chunks to {worker_id}")
          else:
              await self.workers[worker_id].send(safe_json_dumps({"type": "no_chunks"}))

@@ -907,7 +1198,7 @@
          elif msg_type == "submit_captions":
              await self._handle_captions_submission(worker_id, data)
          elif msg_type == "request_job":
-             # VLLMWorker requesting a job from data samples
+             # CaptionWorker requesting a job from data samples
              try:
                  job = await asyncio.wait_for(self.data_sample_queue.get(), timeout=5)
                  await self.workers[worker_id].send(
@@ -921,76 +1212,132 @@
              logger.debug(f"Heartbeat from {worker_id}: {data}")

      async def _handle_captions_submission(self, worker_id: str, data: Dict):
-         """Process multiple captions submission from worker."""
+         """Process caption submission from worker - now handles multi-stage outputs."""
          chunk_id = data.get("chunk_id")
          item_key = data["item_key"]
-         captions_list = data["captions"]

-         logger.debug(
-             f"Received {len(captions_list)} captions for item {item_key} from worker {worker_id}"
-         )
+         item_index = data.get("item_index")  # Worker should send this
+         if item_index is None:
+             # Try to extract from item_key (format: dataset_XXXXXXXX)
+             try:
+                 item_index = int(item_key.split("_")[-1])
+             except:
+                 logger.warning(f"Could not extract item index from key: {item_key}")

-         # Create a SINGLE caption record with ALL captions as a list
+         # Extract user from worker_id (format: "username_uuid")
+         worker_user = worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+
+         # Handle both old format (captions list) and new format (outputs dict)
+         if "outputs" in data:
+             # New multi-stage format
+             outputs = data["outputs"]
+             captions_list = outputs.get("captions", [])
+             total_outputs = sum(len(v) for v in outputs.values())
+
+             logger.debug(
+                 f"Received multi-stage outputs for item {item_key} from worker {worker_id}: "
+                 f"{total_outputs} outputs across {len(outputs)} fields"
+             )
+         else:
+             # Old format - single captions list
+             captions_list = data["captions"]
+             outputs = {"captions": captions_list}
+             total_outputs = len(captions_list)
+
+             logger.debug(
+                 f"Received {len(captions_list)} captions for item {item_key} from worker {worker_id}"
+             )
+
+         # Create caption record with multi-stage outputs
          caption = Caption(
-             job_id=f"{chunk_id}_{item_key}",  # Single ID for the item
+             job_id=f"{chunk_id}_{item_key}",
              dataset=data.get("dataset"),
              shard=data.get("shard"),
              item_key=item_key,
-             captions=captions_list,  # Store ALL captions as a list
-             contributor_id=worker_id,
+             captions=captions_list,
+             outputs=outputs,
+             contributor_id=worker_user,
              timestamp=datetime.utcnow(),
-             quality_scores=None,  # Could be a list of scores matching captions
+             quality_scores=None,
              # Image metadata
              image_width=data.get("image_width"),
              image_height=data.get("image_height"),
              image_format=data.get("image_format"),
              file_size=data.get("file_size"),
              # Processing metadata
-             caption_count=len(captions_list),
+             caption_count=total_outputs,
              processing_time_ms=data.get("processing_time_ms"),
              chunk_id=chunk_id,
+             metadata=data.get("metadata", {}),
          )

-         # Add to central storage buffer as a single entry
+         # Add to central storage buffer
          await self.storage.save_caption(caption)

-         # Update statistics
-         self.stats["total_captions"] += len(captions_list)
-         self.stats["buffer_size"] = len(self.storage.caption_buffer)
+         # Handle item tracking with fixed deadlock
+         should_flush = False
+         if chunk_id and item_index is not None and self.chunk_tracker:
+             with self.item_batch_lock:
+                 self.pending_processed_items[chunk_id].append(item_index)

-         # Update contributor stats
-         contributor = await self.storage.get_contributor(worker_id)
+                 # Check if we should flush
+                 total_pending = sum(
+                     len(indices) for indices in self.pending_processed_items.values()
+                 )
+                 time_since_flush = time.time() - self.last_item_batch_flush
+
+                 if (
+                     total_pending >= self.item_batch_size
+                     or time_since_flush >= self.item_batch_interval
+                 ):
+                     should_flush = True
+
+         if should_flush:
+             await self._flush_processed_items()
+
+         # Update contributor stats (use user, not worker)
+         contributor = await self.storage.get_contributor(worker_user)
          if contributor:
-             contributor.total_captions += len(captions_list)
+             contributor.total_captions += total_outputs
              await self.storage.save_contributor(contributor)

          # Broadcast updated stats
          await self._broadcast_stats()

          # Log progress periodically
-         if self.stats["total_captions"] % 100 == 0:
-             logger.info(f"Collected {self.stats['total_captions']} captions centrally")
+         total_outputs = self.stats.get("total_outputs", 0)
+         if total_outputs > 0 and total_outputs % 100 == 0:
+             if (
+                 not hasattr(self, "_last_logged_outputs")
+                 or self._last_logged_outputs != total_outputs
+             ):
+                 logger.info(f"Collected {total_outputs} outputs centrally")
+                 self._last_logged_outputs = total_outputs

      async def _check_shard_completion(self, chunk_id: str):
          """Check if a shard is complete after chunk completion."""
-         # Extract shard name from chunk_id
-         shard_name = chunk_id.rsplit("_chunk_", 1)[0]
+         # Get the chunk
+         chunk = self.chunk_manager.chunks.get(chunk_id)
+         if not chunk:
+             return
+
+         shard_name = chunk.shard_name

-         # Check if all chunks for this shard are complete
-         chunk_stats = self.chunk_manager.get_stats()
+         # Find all chunks for this shard
          shard_chunks = [
-             cid
-             for cid, chunk in self.chunk_manager.chunks.items()
-             if chunk.shard_name == shard_name
+             cid for cid, c in self.chunk_manager.chunks.items() if c.belongs_to_shard(shard_name)
          ]

+         # Check if all are completed
          completed_chunks = [
              cid for cid in shard_chunks if self.chunk_manager.chunks[cid].status == "completed"
          ]

-         if len(completed_chunks) == len(shard_chunks):
+         if len(completed_chunks) == len(shard_chunks) and len(shard_chunks) > 0:
              logger.info(f"Shard {shard_name} complete!")
-             self.shard_tracker.mark_complete(shard_name)
+             # Don't mark virtual shards as complete in ShardTracker
+             if not shard_name.startswith("hf_dataset:"):
+                 self.shard_tracker.mark_complete(shard_name)
              self.stats["completed_shards"] += 1
              await self._send_activity(f"Shard {shard_name} completed!")

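
A submission may now carry a dict of output lists keyed by field name instead of the single captions list. A hedged sketch of both accepted payload shapes (field names beyond "captions" are illustrative):

    # New multi-stage format
    data = {
        "chunk_id": "shard-00042_chunk_2000",
        "item_key": "mydataset_00002137",
        "item_index": 2137,
        "outputs": {
            "captions": ["a red bicycle leaning against a brick wall"],
            "tags": ["bicycle", "wall", "outdoor"],
        },
    }
    total_outputs = sum(len(v) for v in data["outputs"].values())  # 4

    # Old single-list format, still accepted
    data = {"chunk_id": "shard-00042_chunk_2000", "item_key": "mydataset_00002137",
            "captions": ["a red bicycle leaning against a brick wall"]}
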
@@ -1063,47 +1410,198 @@
          finally:
              del self.data_workers[worker_id]

-     async def _handle_monitor(self, websocket: WebSocketServerProtocol):
-         """Handle monitor connection."""
-         self.monitors.add(websocket)
-         logger.info("Monitor connected")
+     async def _send_leaderboard_to_monitor(self, websocket: WebSocketServerProtocol):
+         """Send leaderboard data to a specific monitor."""
+         total_start = time.time()
+         try:
+             if websocket not in self.monitors:
+                 return
+
+             # Get contributors asynchronously
+             contributors_start = time.time()
+             contributors = await self.storage.get_top_contributors(10)
+             logger.debug(
+                 f"Contributors retrieved in {(time.time() - contributors_start)*1000:.1f}ms"
+             )
+
+             # Get worker counts in thread pool
+             worker_counts_start = time.time()
+             loop = asyncio.get_event_loop()
+             worker_counts = await loop.run_in_executor(
+                 None,
+                 lambda: (
+                     self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+                 ),
+             )
+             logger.debug(
+                 f"Worker counts retrieved in {(time.time() - worker_counts_start)*1000:.1f}ms"
+             )

+             # Build enhanced contributors list
+             build_start = time.time()
+             enhanced_contributors = []
+             for contributor in contributors:
+                 contrib_dict = {
+                     "contributor_id": contributor.contributor_id,
+                     "name": contributor.name,
+                     "total_captions": contributor.total_captions,
+                     "trust_level": contributor.trust_level,
+                     "active_workers": len(
+                         worker_counts.get(contributor.contributor_id, {}).get("worker_ids", [])
+                     ),
+                 }
+                 enhanced_contributors.append(contrib_dict)
+             logger.debug(f"Enhanced contributors built in {(time.time() - build_start)*1000:.1f}ms")
+
+             # Cache for future monitors
+             self._cached_leaderboard = enhanced_contributors
+
+             # Send if still connected
+             if websocket in self.monitors:
+                 send_start = time.time()
+                 await websocket.send(
+                     safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+                 )
+                 logger.debug(
+                     f"Leaderboard sent to monitor in {(time.time() - send_start)*1000:.1f}ms"
+                 )
+
+             logger.debug(
+                 f"Leaderboard send to monitor completed in {(time.time() - total_start)*1000:.1f}ms"
+             )
+
+         except websockets.exceptions.ConnectionClosed:
+             logger.debug("Monitor disconnected during leaderboard send")
+         except Exception as e:
+             logger.error(f"Error sending leaderboard to monitor: {e}")
+
+     async def _send_initial_monitor_data(self, websocket: WebSocketServerProtocol):
+         """Send initial data to monitor in a separate task to avoid blocking."""
+         total_start = time.time()
          try:
-             # Send initial stats
+             # Check if websocket is still in monitors set
+             if websocket not in self.monitors:
+                 logger.debug("Monitor disconnected before initial data send")
+                 return
+
+             # Send current stats (already in memory)
+             stats_start = time.time()
              await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
+             logger.debug(f"Monitor stats sent in {(time.time() - stats_start)*1000:.1f}ms")
+
+             # Get chunk stats asynchronously
+             chunk_stats_start = time.time()
+             loop = asyncio.get_event_loop()
+             chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+             logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")
+
+             if websocket not in self.monitors:
+                 return

-             # Send chunk stats
-             chunk_stats = self.chunk_manager.get_stats()
+             chunk_send_start = time.time()
              await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+             logger.debug(f"Chunk stats sent in {(time.time() - chunk_send_start)*1000:.1f}ms")

-             # Send contributor leaderboard
-             contributors = await self.storage.get_top_contributors(10)
-             await websocket.send(
-                 safe_json_dumps(
-                     {"type": "leaderboard", "data": [safe_dict(c) for c in contributors]}
+             # For leaderboard, check if we have a cached version first
+             if hasattr(self, "_cached_leaderboard") and self._cached_leaderboard:
+                 # Use cached leaderboard if available
+                 cache_send_start = time.time()
+                 await websocket.send(
+                     safe_json_dumps({"type": "leaderboard", "data": self._cached_leaderboard})
                  )
+                 logger.debug(
+                     f"Cached leaderboard sent in {(time.time() - cache_send_start)*1000:.1f}ms"
+                 )
+             else:
+                 # Schedule leaderboard update separately
+                 leaderboard_task_start = time.time()
+                 asyncio.create_task(self._send_leaderboard_to_monitor(websocket))
+                 logger.debug(
+                     f"Leaderboard task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+                 )
+
+             logger.debug(
+                 f"Monitor initial data send completed in {(time.time() - total_start)*1000:.1f}ms"
              )

-             # Keep connection alive
-             async for _ in websocket:
-                 pass
+         except websockets.exceptions.ConnectionClosed:
+             logger.debug("Monitor disconnected during initial data send")
+         except Exception as e:
+             logger.error(f"Error sending initial monitor data: {e}")
+
+     async def _handle_monitor(self, websocket: WebSocketServerProtocol):
+         """Handle monitor connection - truly non-blocking version."""
+         monitor_start = time.time()
+         self.monitors.add(websocket)
+         logger.info(f"Monitor connected (total monitors: {len(self.monitors)})")
+
+         try:
+             # Send welcome message immediately
+             welcome_start = time.time()
+             await websocket.send(safe_json_dumps({"type": "welcome", "role": "monitor"}))
+             logger.debug(f"Monitor welcome sent in {(time.time() - welcome_start)*1000:.1f}ms")
+
+             # Schedule initial data send as a separate task to avoid blocking
+             task_create_start = time.time()
+             asyncio.create_task(self._send_initial_monitor_data(websocket))
+             logger.debug(
+                 f"Monitor initial data task created in {(time.time() - task_create_start)*1000:.1f}ms"
+             )
+
+             # Just keep the connection alive - no blocking work here
+             try:
+                 async for message in websocket:
+                     # Handle any incoming messages from monitor if needed
+                     # For now, just ignore them
+                     pass
+             except websockets.exceptions.ConnectionClosed:
+                 pass  # Normal disconnection

          except websockets.exceptions.ConnectionClosed:
              logger.info("Monitor disconnected")
+         except Exception as e:
+             logger.error(f"Error in monitor handler: {e}")
          finally:
              self.monitors.discard(websocket)
+             logger.debug(f"Monitor handler completed in {(time.time() - monitor_start)*1000:.1f}ms")

      async def _broadcast_stats(self):
-         """Broadcast statistics to all monitors."""
+         """Broadcast statistics to all monitors - truly non-blocking version."""
          if not self.monitors:
              return
-
-         # Include chunk stats
-         chunk_stats = self.chunk_manager.get_stats()
-         self.stats.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+         if self.is_generating_stats:
+             return  # Already generating stats, skip this call
+         self.is_generating_stats = True
+         total_start = time.time()
+
+         # Prepare all the data first
+         data_prep_start = time.time()
+         loop = asyncio.get_event_loop()
+
+         # Get storage stats (already async)
+         storage_stats_start = time.time()
+         storage_stats = await self.storage.get_storage_stats()
+         logger.debug(f"Storage stats retrieved in {(time.time() - storage_stats_start)*1000:.1f}ms")
+
+         caption_stats_start = time.time()
+         caption_stats = await self.storage.get_caption_stats()
+         logger.debug(f"Caption stats retrieved in {(time.time() - caption_stats_start)*1000:.1f}ms")
+
+         # Get chunk stats in thread pool
+         chunk_stats_start = time.time()
+         chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+         logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")
+
+         # Build stats dict
+         build_stats_start = time.time()
+         stats_update = self.stats.copy()
+         stats_update.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+         stats_update.update(storage_stats)
+         stats_update["field_breakdown"] = caption_stats.get("field_stats", {})
+         stats_update["output_fields_list"] = caption_stats.get("output_fields", [])

          # Add rate information
-         self.stats.update(
+         stats_update.update(
              {
                  "current_rate": self.rate_tracker["current_rate"],
                  "average_rate": self.rate_tracker["average_rate"],
@@ -1112,22 +1610,227 @@ class Orchestrator:
1112
1610
  )
1113
1611
 
1114
1612
  # Add vLLM info
1115
- self.stats["vllm_model"] = self.vllm_config.get("model", "unknown")
1116
- self.stats["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)
1117
-
1118
- message = safe_json_dumps({"type": "stats", "data": self.stats})
1119
-
1120
- # Send to all monitors
1121
- disconnected = set()
1122
- for monitor in self.monitors:
1613
+ stats_update["vllm_model"] = self.vllm_config.get("model", "unknown")
1614
+ stats_update["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)
1615
+
1616
+ # Add stage information
1617
+ stages = self.vllm_config.get("stages", [])
1618
+ if stages:
1619
+ stats_update["stage_count"] = len(stages)
1620
+ stats_update["stage_names"] = [s.get("name", "unnamed") for s in stages]
1621
+ else:
1622
+ stats_update["stage_count"] = 1
1623
+ stats_update["stage_names"] = ["default"]
1624
+
1625
+ # Get field stats
1626
+ field_stats_start = time.time()
1627
+ field_stats = await self.storage.get_output_field_stats()
1628
+ stats_update["output_fields"] = field_stats
1629
+ logger.debug(f"Field stats retrieved in {(time.time() - field_stats_start)*1000:.1f}ms")
1630
+
1631
+ # Update our internal stats
1632
+ self.stats = stats_update
1633
+ logger.debug(f"Stats prepared in {(time.time() - build_stats_start)*1000:.1f}ms")
1634
+
1635
+ logger.debug(f"Total data preparation took {(time.time() - data_prep_start)*1000:.1f}ms")
1636
+
1637
+ # Create message once
1638
+ message_create_start = time.time()
1639
+ stats_message = safe_json_dumps({"type": "stats", "data": self.stats})
1640
+ logger.debug(f"Stats message created in {(time.time() - message_create_start)*1000:.1f}ms")
1641
+
1642
+ # Send to all monitors asynchronously in parallel
1643
+ send_start = time.time()
1644
+
1645
+ async def send_to_monitor(monitor):
1123
1646
  try:
1124
- await monitor.send(message)
1647
+ await monitor.send(stats_message)
1125
1648
  except websockets.exceptions.ConnectionClosed:
1126
- disconnected.add(monitor)
1649
+ return monitor # Return for removal
1650
+ except Exception as e:
1651
+ logger.debug(f"Error sending stats to monitor: {e}")
1652
+ return monitor # Return for removal
1653
+ return None
1654
+
1655
+ # Send to all monitors in parallel
1656
+ monitors_copy = self.monitors.copy()
1657
+ results = await asyncio.gather(
1658
+ *[send_to_monitor(m) for m in monitors_copy], return_exceptions=True
1659
+ )
1127
1660
 
1128
- # Clean up disconnected monitors
1661
+ # Remove disconnected monitors
1662
+ disconnected = {
1663
+ m
1664
+ for m, r in zip(monitors_copy, results)
1665
+ if r is not None and not isinstance(r, Exception)
1666
+ }
1129
1667
  self.monitors -= disconnected
1130
1668
 
1669
+ logger.debug(
1670
+ f"Stats sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
1671
+ )
1672
+
1673
+ # Send leaderboard update in a separate task to avoid blocking
1674
+ leaderboard_task_start = time.time()
1675
+ asyncio.create_task(self._broadcast_leaderboard())
1676
+ self.is_generating_stats = False
1677
+ logger.debug(
1678
+ f"Leaderboard broadcast task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
1679
+ )
1680
+ logger.debug(f"Stats broadcast completed in {(time.time() - total_start)*1000:.1f}ms")
1681
+
1682
+    async def _broadcast_leaderboard(self):
+        """Send leaderboard updates to monitors - separate from stats to avoid blocking."""
+        if not self.monitors:
+            return
+
+        total_start = time.time()
+        try:
+            # Get contributors
+            contributors_start = time.time()
+            contributors = await self.storage.get_top_contributors(10)
+            logger.debug(
+                f"Contributors retrieved for broadcast in {(time.time() - contributors_start)*1000:.1f}ms"
+            )
+
+            # Get worker counts
+            worker_counts_start = time.time()
+            loop = asyncio.get_event_loop()
+            worker_counts = await loop.run_in_executor(
+                None,
+                lambda: (
+                    self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+                ),
+            )
+            logger.debug(
+                f"Worker counts retrieved for broadcast in {(time.time() - worker_counts_start)*1000:.1f}ms"
+            )
+
+            # Build enhanced contributors list
+            build_start = time.time()
+            enhanced_contributors = []
+            for contributor in contributors:
+                contrib_dict = {
+                    "contributor_id": contributor.contributor_id,
+                    "name": contributor.name,
+                    "total_captions": contributor.total_captions,
+                    "trust_level": contributor.trust_level,
+                    "active_workers": len(
+                        worker_counts.get(contributor.contributor_id, {}).get("worker_ids", [])
+                    ),
+                }
+                enhanced_contributors.append(contrib_dict)
+            logger.debug(
+                f"Enhanced contributors built for broadcast in {(time.time() - build_start)*1000:.1f}ms"
+            )
+
+            # Cache it
+            self._cached_leaderboard = enhanced_contributors
+
+            # Create message once
+            message_create_start = time.time()
+            leaderboard_message = safe_json_dumps(
+                {"type": "leaderboard", "data": enhanced_contributors}
+            )
+            logger.debug(
+                f"Leaderboard message created in {(time.time() - message_create_start)*1000:.1f}ms"
+            )
+
+            # Send to all monitors in parallel
+            send_start = time.time()
+
+            async def send_leaderboard(monitor):
+                try:
+                    await monitor.send(leaderboard_message)
+                except:
+                    return monitor  # Mark for removal
+                return None
+
+            monitors_copy = self.monitors.copy()
+            results = await asyncio.gather(
+                *[send_leaderboard(m) for m in monitors_copy], return_exceptions=True
+            )
+
+            # Remove disconnected
+            disconnected = {
+                m
+                for m, r in zip(monitors_copy, results)
+                if r is not None and not isinstance(r, Exception)
+            }
+            self.monitors -= disconnected
+
+            logger.debug(
+                f"Leaderboard sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+            )
+            logger.debug(
+                f"Leaderboard broadcast completed in {(time.time() - total_start)*1000:.1f}ms"
+            )
+
+        except Exception as e:
+            logger.error(f"Error broadcasting leaderboard: {e}")
+
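`_broadcast_leaderboard` keeps the event loop responsive by pushing the lock-holding worker-count lookup into the default thread pool. A minimal sketch of that `run_in_executor` pattern, with a hypothetical `slow_sync_call` standing in for the lock-protected read:

    import asyncio
    import time

    def slow_sync_call() -> dict:
        """Hypothetical lock-holding read that can block for a while."""
        time.sleep(0.1)
        return {"workers": 3}

    async def main() -> None:
        loop = asyncio.get_running_loop()
        # Runs in the default ThreadPoolExecutor; the event loop keeps
        # serving websocket traffic while the blocking call is in flight.
        result = await loop.run_in_executor(None, slow_sync_call)
        print(result)

    asyncio.run(main())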
+    def _get_queue_stats(self) -> Dict[str, int]:
+        """Get queue statistics - synchronous helper for thread pool."""
+        with self.chunk_manager.lock:
+            return {
+                "pending_chunks": len(self.chunk_manager.pending_chunks),
+                "assigned_chunks": sum(
+                    len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
+                ),
+            }
+
+    async def _flush_processed_items(self):
+        """Flush batched processed items to chunk tracker."""
+        with self.item_batch_lock:
+            if not self.pending_processed_items:
+                return
+
+            for chunk_id, indices in self.pending_processed_items.items():
+                if not indices:
+                    continue
+
+                # Indices here are ABSOLUTE dataset indices
+                # Sort indices
+                indices.sort()
+
+                # Group consecutive indices into ranges
+                ranges = []
+                start = indices[0]
+                end = indices[0]
+
+                for i in range(1, len(indices)):
+                    if indices[i] == end + 1:
+                        # Consecutive, extend range
+                        end = indices[i]
+                    else:
+                        # Gap found, save current range and start new one
+                        ranges.append((start, end))
+                        start = indices[i]
+                        end = indices[i]
+
+                # Don't forget the last range
+                ranges.append((start, end))
+
+                # Mark ranges as processed (mark_items_processed expects absolute indices)
+                for start_idx, end_idx in ranges:
+                    self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
+
+            # Clear pending items
+            self.pending_processed_items.clear()
+            self.last_item_batch_flush = time.time()
+
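The grouping loop above is the heart of the flush: a sorted list of absolute dataset indices collapses into inclusive `(start, end)` ranges, so the chunk tracker records a handful of ranges instead of thousands of single items. The same logic, extracted as a pure function for illustration:

    from typing import List, Tuple

    def group_consecutive(indices: List[int]) -> List[Tuple[int, int]]:
        """Collapse sorted indices into inclusive (start, end) ranges."""
        if not indices:
            return []
        ranges = []
        start = end = indices[0]
        for idx in indices[1:]:
            if idx == end + 1:
                end = idx          # consecutive: extend the current run
            else:
                ranges.append((start, end))
                start = end = idx  # gap: open a new run
        ranges.append((start, end))
        return ranges

    assert group_consecutive([3, 4, 5, 9, 10, 42]) == [(3, 5), (9, 10), (42, 42)]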
+    def get_workers_by_user_stats(self) -> Dict[str, Any]:
+        """Get statistics about workers grouped by user/token - thread-safe version."""
+        if not hasattr(self, "workers_by_user"):
+            return {}
+
+        # Create a copy to avoid issues with concurrent modification
+        stats = {}
+        workers_snapshot = dict(self.workers_by_user)
+        for user, worker_ids in workers_snapshot.items():
+            stats[user] = {"worker_count": len(worker_ids), "worker_ids": list(worker_ids)}
+        return stats
+
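The `dict(...)` snapshot matters because other tasks (the heartbeat loop, disconnect handlers) can mutate `workers_by_user` while this method iterates; iterating the live mapping during such a mutation raises `RuntimeError: dictionary changed size during iteration`. A toy demonstration of why the copy is safe:

    workers_by_user = {"alice": {"alice_0"}, "bob": {"bob_0"}}

    # Iterating a snapshot stays safe even if the original is mutated mid-loop.
    for user, ids in dict(workers_by_user).items():
        if user == "alice":
            del workers_by_user["bob"]  # simulates a concurrent disconnect
    assert sorted(workers_by_user) == ["alice"]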
     async def _send_activity(self, activity: str):
         """Send activity update to monitors."""
         if not self.monitors:
@@ -1149,21 +1852,63 @@ class Orchestrator:
     async def _heartbeat_loop(self):
         """Send periodic heartbeats to maintain connections."""
         while True:
-            await asyncio.sleep(30)
+            try:
+                await asyncio.sleep(30)
 
-            # Ping workers
-            disconnected = []
-            for worker_id, ws in self.workers.items():
-                try:
-                    await ws.ping()
-                except:
-                    disconnected.append(worker_id)
+                # Create a copy of worker items to avoid modification during iteration
+                worker_items = list(self.workers.items())
+                disconnected = []
 
-            # Clean up disconnected workers
-            for worker_id in disconnected:
-                if worker_id in self.workers:
-                    del self.workers[worker_id]
-                    self.chunk_manager.release_worker_chunks(worker_id)
+                for worker_id, ws in worker_items:
+                    try:
+                        # Check if worker still exists before pinging
+                        if worker_id not in self.workers:
+                            continue
+
+                        # Send ping with timeout
+                        pong_waiter = await ws.ping()
+                        try:
+                            await asyncio.wait_for(pong_waiter, timeout=10)
+                        except asyncio.TimeoutError:
+                            logger.warning(f"Worker {worker_id} failed to respond to ping")
+                            disconnected.append(worker_id)
+                    except websockets.exceptions.ConnectionClosed:
+                        logger.info(f"Worker {worker_id} connection already closed")
+                        disconnected.append(worker_id)
+                    except Exception as e:
+                        logger.error(f"Error pinging worker {worker_id}: {e}")
+                        disconnected.append(worker_id)
+
+                # Clean up disconnected workers
+                for worker_id in disconnected:
+                    if worker_id in self.workers:
+                        logger.info(f"Removing unresponsive worker {worker_id}")
+                        del self.workers[worker_id]
+                        self.chunk_manager.release_worker_chunks(worker_id)
+
+                        # Update stats
+                        self.stats["connected_workers"] = len(self.workers)
+
+                        # Also clean up from workers_by_user if it exists
+                        if hasattr(self, "workers_by_user"):
+                            worker_user = (
+                                worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+                            )
+                            if worker_user in self.workers_by_user:
+                                self.workers_by_user[worker_user].discard(worker_id)
+                                if not self.workers_by_user[worker_user]:
+                                    del self.workers_by_user[worker_user]
+
+                        # Notify monitors
+                        await self._broadcast_stats()
+                        await self._send_activity(
+                            f"Worker {worker_id} removed due to heartbeat timeout"
+                        )
+
+            except Exception as e:
+                logger.error(f"Error in heartbeat loop: {e}", exc_info=True)
+                # Continue the loop even if there's an error
+                await asyncio.sleep(5)
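The rewritten loop replaces the old fire-and-forget `ws.ping()` with the two-step handshake the `websockets` library provides: awaiting `ping()` returns a waiter that resolves when the matching pong arrives, and `asyncio.wait_for` bounds how long the orchestrator will wait. A condensed sketch of that liveness check, assuming the connection objects come from `websockets`:

    import asyncio
    import websockets

    async def check_liveness(workers: dict, timeout: float = 10) -> list:
        """Return ids of workers that failed the ping/pong round trip."""
        dead = []
        for worker_id, ws in list(workers.items()):  # copy: callers may mutate workers
            try:
                pong_waiter = await ws.ping()        # queues a ping frame
                await asyncio.wait_for(pong_waiter, timeout=timeout)
            except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed):
                dead.append(worker_id)
        return dead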
 
     async def _checkpoint_loop(self):
         """Periodically checkpoint storage."""
@@ -1172,42 +1917,58 @@ class Orchestrator:
         while True:
             await asyncio.sleep(60)
 
+            # Get current caption count from storage
+            storage_stats = await self.storage.get_storage_stats()
+            total_captions = storage_stats["total_captions"]
+
             # Force checkpoint at regular intervals
-            if self.stats["total_captions"] > 0 and self.stats["total_captions"] % interval == 0:
-                logger.info(f"Triggering checkpoint at {self.stats['total_captions']} captions")
+            if total_captions > 0 and total_captions % interval == 0:
+                logger.info(f"Triggering checkpoint at {total_captions} captions")
                 await self.storage.checkpoint()
 
                 # Update stats
                 self.stats["last_checkpoint"] = datetime.utcnow().isoformat()
-                self.stats["total_written"] = self.storage.total_captions_written
-                self.stats["buffer_size"] = len(self.storage.caption_buffer)
+                # No need to update total_written or buffer_size - they come from storage
 
                 await self._broadcast_stats()
                 logger.info(
-                    f"Checkpoint complete. Total written to disk: {self.stats['total_written']}"
+                    f"Checkpoint complete. Total written to disk: {storage_stats['total_written']}"
                 )
 
     async def _stats_update_loop(self):
-        """Periodically update and broadcast stats."""
+        """Periodically update and broadcast stats - non-blocking version."""
+        # Get the event loop for running blocking operations
+        loop = asyncio.get_event_loop()
+
         # Track session start values
-        session_start_captions = self.stats["total_captions"]
+        storage_stats = await self.storage.get_storage_stats()
+        session_start_outputs = storage_stats["total_captions"]  # This now counts ALL outputs
         session_start_time = time.time()
 
+        # Track the last known total to detect flushes
+        last_known_total = session_start_outputs
+
         while True:
             await asyncio.sleep(10)
 
-            # Update chunk stats
-            chunk_stats = self.chunk_manager.get_stats()
+            # Update chunk stats in thread pool to avoid blocking
+            chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+            storage_stats = await self.storage.get_storage_stats()
+            current_total_outputs = storage_stats["total_captions"]  # ALL outputs
+            if self.chunk_tracker:
+                await self._flush_processed_items()
+
             self.stats["total_chunks"] = chunk_stats["total"]
             self.stats["completed_chunks"] = chunk_stats["completed"]
             self.stats["failed_chunks"] = chunk_stats["failed"]
 
-            # Add queue information
-            with self.chunk_manager.lock:
-                self.stats["pending_chunks"] = len(self.chunk_manager.pending_chunks)
-                self.stats["assigned_chunks"] = sum(
-                    len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
-                )
+            # Update total outputs stat (rename from total_captions for clarity)
+            self.stats["total_outputs"] = current_total_outputs
+            self.stats["total_captions"] = current_total_outputs  # Keep for backward compatibility
+
+            # Get queue stats in thread pool to avoid blocking
+            queue_stats = await loop.run_in_executor(None, self._get_queue_stats)
+            self.stats.update(queue_stats)
 
             # Calculate if we need more chunks
             worker_count = self.stats.get("connected_workers", 0)
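The checkpoint trigger in `_checkpoint_loop` is a plain modulo test on the storage-reported total, so it fires only when a 60-second poll happens to observe an exact multiple of the interval. A tiny sketch of the condition, with the interval assumed to be 1000:

    interval = 1000  # assumed checkpoint interval

    def should_checkpoint(total_captions: int) -> bool:
        return total_captions > 0 and total_captions % interval == 0

    assert should_checkpoint(2000) is True
    assert should_checkpoint(2001) is False  # not an exact multiple, no checkpoint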
@@ -1220,33 +1981,57 @@ class Orchestrator:
             elapsed_since_update = current_time - self.rate_tracker["last_update_time"]
 
             if elapsed_since_update > 0:
-                # Calculate current rate (captions per minute)
-                caption_diff = (
-                    self.stats["total_captions"] - self.rate_tracker["last_caption_count"]
-                )
-                self.rate_tracker["current_rate"] = (caption_diff / elapsed_since_update) * 60
+                # FIX: Handle the case where duplicates were skipped during save.
+                # If the current total is less than the last known one, duplicates were
+                # skipped on flush; don't count that as negative progress.
+                if current_total_outputs < last_known_total:
+                    logger.debug(
+                        f"Detected duplicate skip during save: {last_known_total} -> {current_total_outputs}"
+                    )
+                    # Don't calculate a negative rate, just update the baseline
+                    self.rate_tracker["last_caption_count"] = current_total_outputs
+                    self.rate_tracker["current_rate"] = 0.0  # Set to 0 during flush
+                else:
+                    # Normal rate calculation
+                    output_diff = current_total_outputs - self.rate_tracker["last_caption_count"]
+                    self.rate_tracker["current_rate"] = (output_diff / elapsed_since_update) * 60
+                    self.rate_tracker["last_caption_count"] = current_total_outputs
 
             # Calculate average rate since THIS SESSION started
             session_elapsed = current_time - session_start_time
             if session_elapsed > 0:
-                session_captions = self.stats["total_captions"] - session_start_captions
-                self.rate_tracker["average_rate"] = (session_captions / session_elapsed) * 60
+                # Always use the difference from session start for the average
+                session_outputs = current_total_outputs - session_start_outputs
+                self.rate_tracker["average_rate"] = (session_outputs / session_elapsed) * 60
 
-            # Calculate expected rate based on workers
-            # Assume each worker processes batch_size images every ~2 seconds with 3 captions each
+            # Calculate expected rate based on workers and stages
             batch_size = self.vllm_config.get("batch_size", 8)
-            num_prompts = len(self.vllm_config.get("inference_prompts", ["", "", ""]))
+
+            # Count total prompts across all stages
+            total_prompts = 0
+            stages = self.vllm_config.get("stages", [])
+            if stages:
+                for stage in stages:
+                    total_prompts += len(stage.get("prompts", []))
+            else:
+                # Backward compatibility
+                total_prompts = len(self.vllm_config.get("inference_prompts", ["", "", ""]))
+
             images_per_minute = 30  # Rough estimate: 30 images/min per worker
-            self.rate_tracker["expected_rate"] = worker_count * images_per_minute * num_prompts
+            self.rate_tracker["expected_rate"] = (
+                worker_count * images_per_minute * total_prompts
+            )
 
             # Update trackers
             self.rate_tracker["last_update_time"] = current_time
-            self.rate_tracker["last_caption_count"] = self.stats["total_captions"]
+            last_known_total = current_total_outputs
 
             # Log rate information when workers are connected
-            if worker_count > 0:
+            if (
+                worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+            ):  # Only log non-negative rates
                 logger.info(
-                    f"Rate: {self.rate_tracker['current_rate']:.1f} captions/min "
+                    f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
                     f"(avg: {self.rate_tracker['average_rate']:.1f}, "
                     f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
                     f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
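The rate tracker's arithmetic reduces to an output delta over elapsed seconds, scaled to a per-minute figure, with the baseline reset and the rate clamped to zero whenever the stored total moves backwards after a de-duplicating flush. A condensed sketch of that update:

    def update_rate(tracker: dict, current_total: int, elapsed: float) -> float:
        """Per-minute rate; a backwards-moving total resets the baseline."""
        if current_total < tracker["last_count"]:
            tracker["current_rate"] = 0.0  # duplicates were skipped on save
        else:
            diff = current_total - tracker["last_count"]
            tracker["current_rate"] = (diff / elapsed) * 60
        tracker["last_count"] = current_total
        return tracker["current_rate"]

    tracker = {"last_count": 100, "current_rate": 0.0}
    assert update_rate(tracker, 150, 30.0) == 100.0  # 50 outputs in 30 s
    assert update_rate(tracker, 140, 30.0) == 0.0    # total fell: flush, not regress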
@@ -1256,16 +2041,16 @@ class Orchestrator:
 
     async def _restore_state(self):
         """Restore state from storage on startup."""
-        # Update statistics
-        self.stats["total_captions"] = await self.storage.count_captions()
-
-        logger.info(f"Restored state: {self.stats['total_captions']} captions")
+        total_captions = await self.storage.count_captions()
+        logger.info(f"Restored state: {total_captions} captions")
 
     async def shutdown(self):
         """Graceful shutdown."""
         logger.info("Shutting down orchestrator...")
 
         # Stop chunk creation
+        if self.chunk_tracker:
+            await self._flush_processed_items()
         self.stop_chunk_creation.set()
         if self.chunk_creation_thread:
             self.chunk_creation_thread.join(timeout=5)
@@ -1287,7 +2072,7 @@ class Orchestrator:
 
         # Save chunk state
         if self.chunk_tracker:
-            self.chunk_tracker.save_checkpoint()
+            self.chunk_tracker.save()
 
         # Final checkpoint
         logger.info(f"Final flush: {len(self.storage.caption_buffer)} captions in buffer")