caption-flow 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,15 +21,15 @@ from collections import deque, defaultdict
 import threading
 from queue import Queue, Empty
 
+from .workers import data
 import websockets
 from websockets.server import WebSocketServerProtocol
 
 from .storage import StorageManager
 from .models import Caption, Contributor
 from .utils.auth import AuthManager
-from .utils.dataset_loader import DatasetLoader, ShardTracker
+from .utils import DatasetLoader, ShardTracker, ChunkTracker
 from .utils.json_utils import safe_dict, safe_json_dumps, to_json_dict
-from .utils.chunk_tracker import ChunkTracker
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +48,43 @@ class ShardChunk:
     assigned_at: Optional[datetime] = None
     completed_at: Optional[datetime] = None
 
+    @classmethod
+    def create(
+        cls, shard_url: str, shard_name: str, start_index: int, chunk_size: int
+    ) -> "ShardChunk":
+        """Factory method to create a chunk with consistent ID."""
+        # Always use consistent format: dataset_chunk_startindex
+        if shard_url.startswith("hf_dataset:"):
+            # Extract dataset path
+            parts = shard_url.split(":")
+            dataset_path = parts[1] if len(parts) > 1 else "unknown"
+            chunk_id = f"{dataset_path.replace('/', '_')}_chunk_{start_index}"
+        else:
+            # WebDataset format
+            chunk_id = f"{shard_name}_chunk_{start_index}"
+
+        return cls(
+            chunk_id=chunk_id,
+            shard_url=shard_url,
+            shard_name=shard_name,
+            start_index=start_index,
+            chunk_size=chunk_size,
+        )
+
+    def belongs_to_shard(self, shard_identifier: str) -> bool:
+        """Check if this chunk belongs to a given shard."""
+        return self.shard_name == shard_identifier
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict for JSON serialization (for workers)."""
+        return {
+            "chunk_id": self.chunk_id,
+            "shard_url": self.shard_url,
+            "shard_name": self.shard_name,
+            "start_index": self.start_index,
+            "chunk_size": self.chunk_size,
+        }
+
 
 class ChunkManager:
     """Manages shard chunk creation and assignment."""
@@ -67,9 +104,7 @@ class ChunkManager:
         chunks = []
 
         for start_idx in range(0, total_items, self.chunk_size):
-            chunk_id = f"{shard_name}_chunk_{start_idx}"
-            chunk = ShardChunk(
-                chunk_id=chunk_id,
+            chunk = ShardChunk.create(
                 shard_url=shard_url,
                 shard_name=shard_name,
                 start_index=start_idx,
@@ -77,8 +112,8 @@ class ChunkManager:
             )
 
             with self.lock:
-                self.chunks[chunk_id] = chunk
-                self.pending_chunks.append(chunk_id)
+                self.chunks[chunk.chunk_id] = chunk
+                self.pending_chunks.append(chunk.chunk_id)
 
             chunks.append(chunk)
 
@@ -86,24 +121,84 @@ class ChunkManager:
 
     def get_chunks_for_worker(
         self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
-    ) -> List[ShardChunk]:
-        """Get available chunks for a worker."""
+    ) -> List[Dict[str, Any]]:
+        """Get available chunks with unprocessed items for a worker."""
         assigned = []
 
         with self.lock:
+            # FIRST PRIORITY: Check if this worker already has assigned chunks
+            # Workers should complete their current chunks before getting new ones
+            if worker_id in self.assigned_chunks:
+                existing_chunk_ids = list(self.assigned_chunks[worker_id])
+                for chunk_id in existing_chunk_ids:
+                    if len(assigned) >= count:
+                        break
+
+                    chunk = self.chunks.get(chunk_id)
+                    if not chunk:
+                        continue
+
+                    # Check if chunk still has unprocessed items
+                    if tracker:
+                        chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
+                        if chunk_info and chunk_info["unprocessed_ranges"]:
+                            assigned.append(
+                                {
+                                    "chunk": chunk,
+                                    "unprocessed_ranges": chunk_info["unprocessed_ranges"],
+                                }
+                            )
+                    else:
+                        # No tracker, assume chunk needs processing
+                        assigned.append(
+                            {
+                                "chunk": chunk,
+                                "unprocessed_ranges": [(0, chunk.chunk_size - 1)],
+                            }
+                        )
+
+            # SECOND PRIORITY: Get new pending chunks
+            # Only if worker doesn't have enough chunks already
             while len(assigned) < count and self.pending_chunks:
                 chunk_id = self.pending_chunks.popleft()
-                chunk = self.chunks[chunk_id]
+                chunk = self.chunks.get(chunk_id)
+
+                if not chunk:
+                    continue
 
+                # Verify chunk is truly pending (defensive check)
+                if chunk.status != "pending" or chunk.assigned_to is not None:
+                    logger.warning(
+                        f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
+                    )
+                    continue
+
+                # Assign to this worker
                 chunk.assigned_to = worker_id
                 chunk.status = "assigned"
                 chunk.assigned_at = datetime.utcnow()
-
                 self.assigned_chunks[worker_id].add(chunk_id)
-                assigned.append(chunk)
+
+                # Get unprocessed ranges
+                unprocessed_ranges = [(0, chunk.chunk_size - 1)]  # Default
                 if tracker:
+                    chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
+                    if chunk_info:
+                        unprocessed_ranges = chunk_info["unprocessed_ranges"]
                     tracker.mark_assigned(chunk_id, worker_id)
 
+                assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+        # Log what we're assigning
+        if assigned:
+            chunk_summary = ", ".join(
+                [
+                    f"{info['chunk'].chunk_id}[{len(info['unprocessed_ranges'])} ranges]"
+                    for info in assigned
+                ]
+            )
+            logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")
+
         return assigned
 
     def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
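
Note on the hunk above: the return type changes from a list of `ShardChunk` objects to a list of dicts pairing each chunk with its unprocessed index ranges. A minimal sketch of consuming the new payload (`process_items` is a hypothetical helper, not from the package):

    infos = chunk_manager.get_chunks_for_worker("worker-1", count=2, tracker=chunk_tracker)
    for info in infos:
        chunk = info["chunk"]  # ShardChunk instance
        for start, end in info["unprocessed_ranges"]:  # inclusive (start, end) pairs
            process_items(chunk, start, end)
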
@@ -173,6 +268,27 @@ class Orchestrator:
         self.dataset_config = config.get("dataset", {})
         self.dataset_path = self.dataset_config.get("path")
         self.dataset_type = self.dataset_config.get("type", "huggingface")
+        self.dataset_split = self.dataset_config.get("split", "train")  # Add split configuration
+        self.dataset_image_column = self.dataset_config.get(
+            "image_column", "image"
+        )  # Add image column config
+
+        # Dataset components
+        self.dataset_loader = None
+        self.shard_tracker = None
+        self.chunk_tracker = None
+
+        if self.dataset_path:
+            self.dataset_loader = DatasetLoader(
+                self.dataset_path,
+                self.dataset_type,
+                self.dataset_split,
+                self.dataset_image_column,
+            )
+            checkpoint_dir = Path(config.get("storage", {}).get("checkpoint_dir", "./checkpoints"))
+            checkpoint_dir.mkdir(parents=True, exist_ok=True)
+            self.shard_tracker = ShardTracker(checkpoint_dir / "shards.json")
+            self.chunk_tracker = ChunkTracker(checkpoint_dir / "chunks.json")
 
         # vLLM configuration to distribute to workers
         self.vllm_config = config.get(
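
Note on the hunk above: the orchestrator now reads two extra dataset keys and builds its loader and trackers in `__init__`. A hypothetical config fragment exercising those keys (key names match the `.get()` calls above; values illustrative):

    config = {
        "dataset": {
            "path": "org/dataset",     # enables DatasetLoader when present
            "type": "huggingface",     # default
            "split": "train",          # new in 0.2.0, default "train"
            "image_column": "image",   # new in 0.2.0, default "image"
        },
        "storage": {"checkpoint_dir": "./checkpoints"},
    }
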
@@ -233,6 +349,11 @@ class Orchestrator:
 
         # Initialize chunk manager with reference to chunk tracker
         self.chunk_manager = ChunkManager(self.chunk_size, self.chunk_tracker)
+        self.pending_processed_items = defaultdict(list)  # chunk_id -> list of indices
+        self.item_batch_lock = threading.Lock()
+        self.last_item_batch_flush = time.time()
+        self.item_batch_interval = 5  # Flush every 5 seconds
+        self.item_batch_size = 100  # Or every 100 items
 
         # Track connections
         self.workers: Dict[str, WebSocketServerProtocol] = {}
@@ -246,13 +367,10 @@ class Orchestrator:
             "total_chunks": 0,
             "completed_chunks": 0,
             "failed_chunks": 0,
-            "total_captions": 0,
             "connected_workers": 0,
             "total_shards": 0,
             "completed_shards": 0,
             "current_shard": None,
-            "buffer_size": 0,
-            "total_written": 0,
             "last_checkpoint": None,
         }
 
@@ -266,7 +384,7 @@ class Orchestrator:
             "expected_rate": 0.0,
         }
 
-        # Data sample queue for VLLMWorkers
+        # Data sample queue for CaptionWorker
         self.data_sample_queue = asyncio.Queue(maxsize=1000)
         self.data_workers: Dict[str, WebSocketServerProtocol] = {}
 
@@ -310,10 +428,23 @@ class Orchestrator:
         # Mark state as not restored until we process checkpoints
         self.state_restored.clear()
 
+        # Get dataset info to check format
+        dataset_info = self.dataset_loader.get_dataset_info()
+        dataset_format = dataset_info.get("dataset_format", "unknown")
+        logger.info(f"Dataset format: {dataset_format}")
+
         # Get all shards
         self.all_shards = self.dataset_loader.get_shard_list()
         self.stats["total_shards"] = len(self.all_shards)
 
+        # For HuggingFace datasets, we might need to dynamically create more shards
+        if dataset_format == "huggingface_datasets":
+            self._is_hf_dataset = True
+            self._hf_chunk_size = 10000  # Items per virtual shard
+            self._next_hf_shard_index = len(self.all_shards)  # For creating new virtual shards
+        else:
+            self._is_hf_dataset = False
+
         # Get shard status from ChunkTracker
         shards_summary = self.chunk_tracker.get_shards_summary() if self.chunk_tracker else {}
         completed_shards = {
@@ -336,7 +467,10 @@ class Orchestrator:
 
         # Filter out shards that already have chunks created
         remaining_shards = [
-            shard for shard in remaining_shards if Path(shard).stem not in shards_with_chunks
+            shard
+            for shard in remaining_shards
+            if (shard if shard.startswith("hf_dataset:") else Path(shard).stem)
+            not in shards_with_chunks
         ]
 
         self.stats["completed_shards"] = len(completed_shards)
@@ -356,25 +490,18 @@ class Orchestrator:
             with self.chunk_manager.lock:
                 for chunk_state in shard_info["chunks"]:
                     if chunk_state.status in ["pending", "failed", "assigned"]:
-                        # Find shard URL
-                        shard_url = None
-                        for url in self.all_shards:
-                            if Path(url).stem == shard_name:
-                                shard_url = url
-                                break
-
-                        if shard_url:
-                            chunk = ShardChunk(
-                                chunk_id=chunk_state.chunk_id,
-                                shard_url=shard_url,
-                                shard_name=chunk_state.shard_name,
-                                start_index=chunk_state.start_index,
-                                chunk_size=chunk_state.chunk_size,
-                            )
-                            self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
-                            self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
-                            requeued_chunks_by_shard[shard_name].append(chunk_state.chunk_id)
-                            initial_pending += 1
+                        # ChunkState already has shard_url stored
+                        chunk = ShardChunk(
+                            chunk_id=chunk_state.chunk_id,
+                            shard_url=chunk_state.shard_url,
+                            shard_name=chunk_state.shard_name,
+                            start_index=chunk_state.start_index,
+                            chunk_size=chunk_state.chunk_size,
+                        )
+                        self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
+                        self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
+                        requeued_chunks_by_shard[shard_name].append(chunk_state.chunk_id)
+                        initial_pending += 1
 
         logger.info(f"Re-queued {initial_pending} existing pending chunks")
         for shard_name, chunk_ids in requeued_chunks_by_shard.items():
@@ -426,7 +553,13 @@ class Orchestrator:
             if current_shard_url is None or current_shard_index >= current_shard_items:
                 try:
                     current_shard_url = next(shard_iter)
-                    current_shard_name = Path(current_shard_url).stem
+
+                    # Extract shard name based on type
+                    if current_shard_url.startswith("hf_dataset:"):
+                        current_shard_name = current_shard_url  # Use full ID for virtual shards
+                    else:
+                        current_shard_name = Path(current_shard_url).stem
+
                     self.stats["current_shard"] = current_shard_name
 
                     # Skip if we already have chunks from this shard
@@ -439,16 +572,74 @@ class Orchestrator:
 
                     # Count items in new shard
                     logger.info(f"Loading new shard {current_shard_name}")
-                    current_shard_items = sum(
-                        1 for _ in self.dataset_loader.iterate_shard(current_shard_url)
-                    )
+
+                    # For virtual HF dataset shards, use the chunk size directly
+                    if current_shard_url.startswith("hf_dataset:"):
+                        current_shard_items = self.dataset_loader.count_shard_items(
+                            current_shard_url
+                        )
+                        logger.info(
+                            f"Virtual shard {current_shard_name} has {current_shard_items} items"
+                        )
+                    else:
+                        # For WebDataset, actually count items
+                        current_shard_items = sum(
+                            1 for _ in self.dataset_loader.iterate_shard(current_shard_url)
+                        )
+                        logger.info(
+                            f"Shard {current_shard_name} has {current_shard_items} items"
+                        )
+
                     current_shard_index = 0
-                    logger.info(f"Shard {current_shard_name} has {current_shard_items} items")
 
                 except StopIteration:
-                    # No more shards
+                    # No more shards in the iterator
+                    if self._is_hf_dataset:
+                        # Before creating new virtual shards, check if we have pending chunks
+                        with self.chunk_manager.lock:
+                            pending_count = len(self.chunk_manager.pending_chunks)
+
+                        if pending_count > 0:
+                            # Don't create new shards if we have pending chunks
+                            logger.debug(
+                                f"Have {pending_count} pending chunks, not creating new virtual shards yet"
+                            )
+                            current_shard_url = None
+                            time.sleep(2)
+                            continue
+
+                        # For HF datasets, we can create more virtual shards on demand
+                        logger.info(
+                            "Creating additional virtual shards for HuggingFace dataset"
+                        )
+
+                        # Create 10 more virtual shards
+                        new_shards = []
+                        for i in range(10):
+                            shard_id = f"hf_dataset:{self.dataset_path}:chunk:{self._next_hf_shard_index * self._hf_chunk_size}"
+                            new_shards.append(shard_id)
+                            self._next_hf_shard_index += 1
+
+                        # Add to all_shards and create new iterator
+                        self.all_shards.extend(new_shards)
+                        self.stats["total_shards"] = len(self.all_shards)
+
+                        # Filter for unprocessed shards
+                        remaining_new_shards = [
+                            s
+                            for s in new_shards
+                            if s not in shards_summary and s not in completed_shards
+                        ]
+
+                        if remaining_new_shards:
+                            shard_iter = iter(remaining_new_shards)
+                            logger.info(f"Added {len(remaining_new_shards)} new virtual shards")
+                            continue
+
+                    # No more shards to process
                     logger.info("No more shards to process")
                     break
+
                 except Exception as e:
                     logger.error(f"Error loading shard {current_shard_name}: {e}")
                     current_shard_url = None
@@ -456,25 +647,40 @@ class Orchestrator:
 
             # Create a chunk from current shard
             if current_shard_url and current_shard_index < current_shard_items:
-                chunk_id = f"{current_shard_name}_chunk_{current_shard_index}"
-                chunk_size = min(self.chunk_size, current_shard_items - current_shard_index)
+                # Calculate the absolute dataset index for this chunk
+                if current_shard_url.startswith("hf_dataset:"):
+                    # Parse the virtual shard URL to get the base start index
+                    parts = current_shard_url.split(":")
+                    if len(parts) >= 4 and parts[2] == "chunk":
+                        shard_base_index = int(parts[3])
+                    else:
+                        shard_base_index = 0
+
+                    # The absolute start index for this chunk in the dataset
+                    absolute_start_index = shard_base_index + current_shard_index
+                else:
+                    # For WebDataset, current_shard_index is already absolute
+                    absolute_start_index = current_shard_index
+
+                # Create chunk with absolute index
+                chunk = ShardChunk.create(
+                    shard_url=current_shard_url,
+                    shard_name=current_shard_name,
+                    start_index=absolute_start_index,
+                    chunk_size=min(self.chunk_size, current_shard_items - current_shard_index),
+                )
 
-                # Add to ChunkTracker
+                # Add to ChunkTracker with all required fields
                 if self.chunk_tracker and self.chunk_tracker.add_chunk(
-                    chunk_id, current_shard_name, current_shard_index, chunk_size
+                    chunk.chunk_id,
+                    chunk.shard_name,
+                    chunk.shard_url,
+                    chunk.start_index,
+                    chunk.chunk_size,
                 ):
-                    # Create chunk
-                    chunk = ShardChunk(
-                        chunk_id=chunk_id,
-                        shard_url=current_shard_url,
-                        shard_name=current_shard_name,
-                        start_index=current_shard_index,
-                        chunk_size=chunk_size,
-                    )
-
                     with self.chunk_manager.lock:
-                        self.chunk_manager.chunks[chunk_id] = chunk
-                        self.chunk_manager.pending_chunks.append(chunk_id)
+                        self.chunk_manager.chunks[chunk.chunk_id] = chunk
+                        self.chunk_manager.pending_chunks.append(chunk.chunk_id)
 
                     chunks_created += 1
                     self.stats["total_chunks"] += 1
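
Note on the hunk above: a virtual shard ID has the shape `hf_dataset:<path>:chunk:<base_index>`, and a chunk's absolute start index is the shard's base index plus the position within the shard. A small sketch (dataset path hypothetical):

    shard_url = "hf_dataset:org/dataset:chunk:20000"
    parts = shard_url.split(":")      # ["hf_dataset", "org/dataset", "chunk", "20000"]
    shard_base_index = int(parts[3])  # 20000
    current_shard_index = 500
    absolute_start_index = shard_base_index + current_shard_index  # 20500
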
@@ -484,10 +690,14 @@ class Orchestrator:
             if chunks_created > 0:
                 logger.info(f"Created {chunks_created} chunks on demand")
 
-            # If we couldn't create any chunks and there are no more shards, we're done
+            # If we couldn't create any chunks and there are no more shards, check if it's HF dataset
             if chunks_created == 0 and current_shard_url is None:
-                logger.info("All shards processed, chunk creation complete")
-                break
+                if self._is_hf_dataset:
+                    # We can always create more virtual shards for HF datasets
+                    logger.debug("Will create more virtual shards on next iteration")
+                else:
+                    logger.info("All shards processed, chunk creation complete")
+                    break
 
             # Brief pause to avoid spinning
             time.sleep(1)
@@ -558,7 +768,9 @@ class Orchestrator:
             elif auth_ticket.role == "admin":
                 await self._handle_admin(websocket, auth_ticket)
             else:
-                await websocket.send(safe_json_dumps({"error": f"Unknown role: {auth_ticket.role}"}))
+                await websocket.send(
+                    safe_json_dumps({"error": f"Unknown role: {auth_ticket.role}"})
+                )
 
         except Exception as e:
             logger.error(f"Connection error: {e}")
@@ -604,81 +816,118 @@ class Orchestrator:
         requires_worker_restart = False
 
         try:
+            # Extract orchestrator section if present
+            if "orchestrator" in new_config:
+                # Config has orchestrator wrapper, extract it
+                orchestrator_config = new_config["orchestrator"]
+            else:
+                # Config is already at orchestrator level
+                orchestrator_config = new_config
+
+            # Helper function for deep comparison
+            def deep_equal(a, b):
+                """Deep comparison of two values including nested dicts and lists."""
+                if type(a) != type(b):
+                    return False
+                if isinstance(a, dict):
+                    if set(a.keys()) != set(b.keys()):
+                        return False
+                    return all(deep_equal(a[k], b[k]) for k in a.keys())
+                elif isinstance(a, (list, tuple)):
+                    if len(a) != len(b):
+                        return False
+                    return all(deep_equal(x, y) for x, y in zip(a, b))
+                else:
+                    return a == b
+
             # Update vLLM configuration
-            if "vllm" in new_config:
+            if "vllm" in orchestrator_config:
                 old_vllm = self.vllm_config.copy()
+                new_vllm = orchestrator_config["vllm"]
 
-                # Check each field for actual changes
-                vllm_changed = False
-                for key, value in new_config["vllm"].items():
-                    if self.vllm_config.get(key) != value:
-                        self.vllm_config[key] = value
-                        vllm_changed = True
+                # Check if vLLM config actually changed using deep comparison
+                vllm_changed = not deep_equal(old_vllm, new_vllm)
 
                 if vllm_changed:
+                    # Update the vLLM config
+                    self.vllm_config = new_vllm.copy()
                     updated_sections.append("vllm")
 
                     # Check if critical changes require worker restart
                     if (
-                        old_vllm.get("model") != self.vllm_config.get("model")
+                        old_vllm.get("model") != new_vllm.get("model")
                         or old_vllm.get("gpu_memory_utilization")
-                        != self.vllm_config.get("gpu_memory_utilization")
+                        != new_vllm.get("gpu_memory_utilization")
                         or old_vllm.get("tensor_parallel_size")
-                        != self.vllm_config.get("tensor_parallel_size")
+                        != new_vllm.get("tensor_parallel_size")
+                        or old_vllm.get("dtype") != new_vllm.get("dtype")
+                        or old_vllm.get("max_model_len") != new_vllm.get("max_model_len")
                     ):
                         requires_worker_restart = True
                         warnings.append(
                             "Critical vLLM changes detected - workers will be disconnected to reload"
                         )
+                        logger.info(
+                            f"Model change: {old_vllm.get('model')} -> {new_vllm.get('model')}"
+                        )
 
             # Update dataset configuration
-            if "dataset" in new_config:
-                dataset_changed = False
-                for key, value in new_config["dataset"].items():
-                    if self.dataset_config.get(key) != value:
-                        self.dataset_config[key] = value
-                        dataset_changed = True
+            if "dataset" in orchestrator_config:
+                old_dataset = self.dataset_config.copy()
+                new_dataset = orchestrator_config["dataset"]
+
+                dataset_changed = not deep_equal(old_dataset, new_dataset)
 
                 if dataset_changed:
+                    self.dataset_config = new_dataset.copy()
                     self.dataset_path = self.dataset_config.get("path")
                     self.dataset_type = self.dataset_config.get("type", "huggingface")
                     updated_sections.append("dataset")
                     warnings.append("Dataset changes will apply to new chunks only")
 
             # Update chunk settings
-            if "chunk_size" in new_config and self.chunk_size != new_config["chunk_size"]:
-                self.chunk_size = new_config["chunk_size"]
+            if (
+                "chunk_size" in orchestrator_config
+                and self.chunk_size != orchestrator_config["chunk_size"]
+            ):
+                self.chunk_size = orchestrator_config["chunk_size"]
                 self.chunk_manager.chunk_size = self.chunk_size
                 updated_sections.append("chunk_size")
 
             if (
-                "chunks_per_request" in new_config
-                and self.chunks_per_request != new_config["chunks_per_request"]
+                "chunks_per_request" in orchestrator_config
+                and self.chunks_per_request != orchestrator_config["chunks_per_request"]
             ):
-                self.chunks_per_request = new_config["chunks_per_request"]
+                self.chunks_per_request = orchestrator_config["chunks_per_request"]
                 updated_sections.append("chunks_per_request")
 
-            # Recreate auth manager
-            self.auth = AuthManager(config=new_config)
+            # Update auth configuration
+            if "auth" in orchestrator_config:
+                try:
+                    self.auth = AuthManager({"auth": orchestrator_config["auth"]})
+                    updated_sections.append("auth")
+                except Exception as e:
+                    logger.error(f"Failed to update AuthManager: {e}")
+                    warnings.append(f"Auth update failed: {e}")
 
             # Update buffer settings
             if (
-                "chunk_buffer_multiplier" in new_config
-                and self.chunk_buffer_multiplier != new_config["chunk_buffer_multiplier"]
+                "chunk_buffer_multiplier" in orchestrator_config
+                and self.chunk_buffer_multiplier != orchestrator_config["chunk_buffer_multiplier"]
             ):
-                self.chunk_buffer_multiplier = new_config["chunk_buffer_multiplier"]
+                self.chunk_buffer_multiplier = orchestrator_config["chunk_buffer_multiplier"]
                 updated_sections.append("chunk_buffer_multiplier")
 
             if (
-                "min_chunk_buffer" in new_config
-                and self.min_chunk_buffer != new_config["min_chunk_buffer"]
+                "min_chunk_buffer" in orchestrator_config
+                and self.min_chunk_buffer != orchestrator_config["min_chunk_buffer"]
             ):
-                self.min_chunk_buffer = new_config["min_chunk_buffer"]
+                self.min_chunk_buffer = orchestrator_config["min_chunk_buffer"]
                 updated_sections.append("min_chunk_buffer")
 
             # Update storage settings
-            if "storage" in new_config:
-                storage_config = new_config["storage"]
+            if "storage" in orchestrator_config:
+                storage_config = orchestrator_config["storage"]
                 storage_changed = False
 
                 if (
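
Note on the `deep_equal` helper above: it differs from plain `==` mainly by requiring matching types at every level of nesting. A small sketch of the semantics, assuming the helper exactly as defined in the hunk:

    deep_equal({"a": [1, 2]}, {"a": [1, 2]})  # True
    deep_equal({"a": (1, 2)}, {"a": [1, 2]})  # False: tuple vs list
    deep_equal(1, 1.0)                        # False (int vs float), although 1 == 1.0
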
@@ -701,21 +950,6 @@ class Orchestrator:
                 if storage_changed:
                     updated_sections.append("storage")
 
-            # Update data worker storage config
-            if "data_worker_storage" in new_config:
-                current_dw_storage = self.config.get("data_worker_storage", {})
-                if current_dw_storage != new_config["data_worker_storage"]:
-                    self.config["data_worker_storage"] = new_config["data_worker_storage"]
-                    updated_sections.append("data_worker_storage")
-                    warnings.append("Data worker storage config will apply to new connections only")
-
-            # Update backpressure threshold
-            if "backpressure_threshold" in new_config:
-                current_threshold = getattr(self, "backpressure_threshold", 800)
-                if current_threshold != new_config["backpressure_threshold"]:
-                    self.backpressure_threshold = new_config["backpressure_threshold"]
-                    updated_sections.append("backpressure_threshold")
-
             # Check if any changes were made
             if not updated_sections:
                 await websocket.send(
@@ -729,29 +963,49 @@ class Orchestrator:
                 logger.info("Configuration reload requested but no changes detected")
                 return
 
-            # Update the main config for any other fields
-            self.config.update(new_config)
+            # Update the main config
+            if "orchestrator" in new_config:
+                self.config["orchestrator"] = orchestrator_config
+            else:
+                self.config.update(orchestrator_config)
 
             # Handle worker restart if needed
             if requires_worker_restart:
                 logger.info("Disconnecting all workers for configuration reload...")
 
-                # Disconnect all workers
-                worker_ids = list(self.workers.keys())
-                for worker_id in worker_ids:
+                # Send reload message to workers first
+                reload_msg = safe_json_dumps(
+                    {
+                        "type": "reload_vllm",
+                        "vllm_config": self.vllm_config,
+                    }
+                )
+
+                # Create a list of worker items to avoid modifying dict during iteration
+                worker_items = list(self.workers.items())
+                disconnected = []
+
+                for worker_id, ws in worker_items:
                     try:
-                        await self.workers[worker_id].close(
-                            code=1012, reason="Configuration reload"
-                        )
+                        await ws.send(reload_msg)
+                        # Give worker time to process before disconnect
+                        await asyncio.sleep(0.5)
+                        await ws.close(code=1012, reason="Configuration reload")
+                        disconnected.append(worker_id)
                     except:
-                        pass
+                        disconnected.append(worker_id)  # Still mark as disconnected if error
+
+                # Now safely clear workers dict
+                for worker_id in disconnected:
+                    if worker_id in self.workers:
+                        del self.workers[worker_id]
 
                 warnings.append(
-                    f"Disconnected {len(worker_ids)} workers - they will reconnect with new config"
+                    f"Sent reload message to {len(disconnected)} workers - they will reconnect with new config"
                 )
             else:
-                # Just notify workers about config changes
-                reload_msg = safe_json_dumps(
+                # Just notify workers about config changes without disconnecting
+                config_update_msg = safe_json_dumps(
                     {
                         "type": "config_update",
                         "vllm_config": self.vllm_config if "vllm" in updated_sections else None,
761
1015
  }
762
1016
  )
763
1017
 
1018
+ # Create a list of worker items to avoid modifying dict during iteration
1019
+ worker_items = list(self.workers.items())
764
1020
  disconnected = []
765
- for worker_id, ws in self.workers.items():
1021
+
1022
+ for worker_id, ws in worker_items:
766
1023
  try:
767
- await ws.send(reload_msg)
1024
+ await ws.send(config_update_msg)
1025
+ logger.info(f"Sent config update to worker {worker_id}")
768
1026
  except:
769
1027
  disconnected.append(worker_id)
770
1028
 
1029
+ # Now safely remove disconnected workers
771
1030
  for worker_id in disconnected:
772
- del self.workers[worker_id]
1031
+ if worker_id in self.workers:
1032
+ del self.workers[worker_id]
773
1033
 
774
1034
  # Send success response
775
1035
  await websocket.send(
@@ -788,34 +1048,58 @@ class Orchestrator:
 
         except Exception as e:
             logger.error(f"Configuration reload failed: {e}")
+            import traceback
+
+            logger.error(traceback.format_exc())
             await websocket.send(safe_json_dumps({"type": "reload_failed", "error": str(e)}))
 
     async def _handle_worker(self, websocket: WebSocketServerProtocol, auth_ticket):
         """Handle worker connection lifecycle."""
-        worker_id = getattr(auth_ticket, "name", str(uuid.uuid4()))
+        # Generate unique worker ID even if using same token
+        base_name = getattr(auth_ticket, "name", "worker")
+        worker_id = f"{base_name}_{str(uuid.uuid4())[:8]}"  # Add unique suffix
+
+        # Track the original token/user for accounting
+        worker_user = base_name  # Keep track of which user/token this worker belongs to
+
         self.workers[worker_id] = websocket
         self.stats["connected_workers"] = len(self.workers)
 
-        # Register contributor
-        contributor = Contributor(
-            contributor_id=worker_id, name=worker_id, total_captions=0, trust_level=1
-        )
-        await self.storage.save_contributor(contributor)
+        # Optionally track workers by user/token
+        if not hasattr(self, "workers_by_user"):
+            self.workers_by_user = defaultdict(set)
+        self.workers_by_user[worker_user].add(worker_id)
+
+        # Register contributor with the base name (for aggregating stats per user)
+        contributor = await self.storage.get_contributor(worker_user)
+        if not contributor:
+            contributor = Contributor(
+                contributor_id=worker_user,
+                name=worker_user,
+                total_captions=0,
+                trust_level=1,
+            )
+            await self.storage.save_contributor(contributor)
 
-        logger.info(f"Worker {worker_id} connected")
+        logger.info(f"Worker {worker_id} (user: {worker_user}) connected")
         await self._broadcast_stats()
-        await self._send_activity(f"Worker {worker_id} connected")
+        await self._send_activity(f"Worker {worker_id} (user: {worker_user}) connected")
 
         try:
             # Send welcome message with dataset configuration
             welcome_message = {
                 "type": "welcome",
                 "worker_id": worker_id,
+                "user_id": worker_user,
                 "dataset_config": {
                     "dataset_path": self.dataset_path,
                     "dataset_type": self.dataset_type,
-                    "path": self.dataset_path,  # For compatibility
-                    "type": self.dataset_type,  # For compatibility
+                    "dataset_split": self.dataset_split,
+                    "dataset_image_column": self.dataset_image_column,
+                    "path": self.dataset_path,
+                    "type": self.dataset_type,
+                    "split": self.dataset_split,
+                    "image_column": self.dataset_image_column,
                 },
                 "vllm_config": self.vllm_config,
             }
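
Note on the hunk above: worker IDs are now `<user>_<8-char-uuid>`, and the welcome payload carries the dataset config under both the `dataset_*` and bare key spellings. A hypothetical example of what a worker receives (values illustrative):

    welcome_message = {
        "type": "welcome",
        "worker_id": "alice_1a2b3c4d",
        "user_id": "alice",
        "dataset_config": {
            "dataset_path": "org/dataset", "dataset_type": "huggingface",
            "dataset_split": "train", "dataset_image_column": "image",
            "path": "org/dataset", "type": "huggingface",
            "split": "train", "image_column": "image",
        },
        "vllm_config": {"model": "some/vision-language-model", "batch_size": 8},
    }
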
@@ -826,21 +1110,29 @@ class Orchestrator:
                     await self._process_worker_message(worker_id, data)
 
         except websockets.exceptions.ConnectionClosed:
-            logger.info(f"Worker {worker_id} disconnected")
+            logger.info(f"Worker {worker_id} (user: {worker_user}) disconnected")
         finally:
-            del self.workers[worker_id]
+            if worker_id in self.workers:
+                del self.workers[worker_id]
+
+            # Clean up user tracking
+            if hasattr(self, "workers_by_user") and worker_user in self.workers_by_user:
+                self.workers_by_user[worker_user].discard(worker_id)
+                if not self.workers_by_user[worker_user]:
+                    del self.workers_by_user[worker_user]
+
             self.stats["connected_workers"] = len(self.workers)
-            # Release chunks in both managers
+
+            # Release chunks
             self.chunk_manager.release_worker_chunks(worker_id)
             if self.chunk_tracker:
-                # Mark released chunks as pending in tracker
                 released_chunks = self.chunk_tracker.release_worker_chunks(worker_id)
                 logger.info(
                     f"Released {len(released_chunks) if released_chunks is not None else 0} chunks from worker {worker_id}"
                 )
 
             await self._broadcast_stats()
-            await self._send_activity(f"Worker {worker_id} disconnected")
+            await self._send_activity(f"Worker {worker_id} (user: {worker_user}) disconnected")
 
     async def _process_worker_message(self, worker_id: str, data: Dict):
         """Process message from worker."""
@@ -856,28 +1148,26 @@ class Orchestrator:
             return
 
         count = data.get("count", self.chunks_per_request)
-        chunks = self.chunk_manager.get_chunks_for_worker(worker_id, count, self.chunk_tracker)
+        chunk_infos = self.chunk_manager.get_chunks_for_worker(
+            worker_id, count, self.chunk_tracker
+        )
 
-        if chunks:
-            # Only send the fields that worker expects
-            chunk_data = []
-            for chunk in chunks:
-                chunk_data.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "shard_url": chunk.shard_url,
-                        "shard_name": chunk.shard_name,
-                        "start_index": chunk.start_index,
-                        "chunk_size": chunk.chunk_size,
-                    }
-                )
+        if chunk_infos:
+            # Send chunks with unprocessed ranges
+            chunks_data = []
+            for info in chunk_infos:
+                chunk_dict = info["chunk"].to_dict()
+                chunk_dict["unprocessed_ranges"] = info["unprocessed_ranges"]
+                chunks_data.append(chunk_dict)
 
             await self.workers[worker_id].send(
-                safe_json_dumps({"type": "shard_assignment", "chunks": chunk_data})
+                safe_json_dumps({"type": "shard_assignment", "chunks": chunks_data})
+            )
+
+            chunk_ids = [c["chunk_id"] for c in chunks_data]
+            logger.info(
+                f"Assigned {len(chunks_data)} chunks to worker {worker_id}: {chunk_ids}"
             )
-            chunk_ids = [c["chunk_id"] for c in chunk_data]
-            logger.info(f"Assigned {len(chunks)} chunks to worker {worker_id}: {chunk_ids}")
-            await self._send_activity(f"Assigned {len(chunks)} chunks to {worker_id}")
         else:
             await self.workers[worker_id].send(safe_json_dumps({"type": "no_chunks"}))
 
@@ -907,7 +1197,7 @@ class Orchestrator:
         elif msg_type == "submit_captions":
             await self._handle_captions_submission(worker_id, data)
         elif msg_type == "request_job":
-            # VLLMWorker requesting a job from data samples
+            # CaptionWorker requesting a job from data samples
             try:
                 job = await asyncio.wait_for(self.data_sample_queue.get(), timeout=5)
                 await self.workers[worker_id].send(
921
1211
  logger.debug(f"Heartbeat from {worker_id}: {data}")
922
1212
 
923
1213
  async def _handle_captions_submission(self, worker_id: str, data: Dict):
924
- """Process multiple captions submission from worker."""
1214
+ """Process caption submission from worker - now handles multi-stage outputs."""
925
1215
  chunk_id = data.get("chunk_id")
926
1216
  item_key = data["item_key"]
927
- captions_list = data["captions"]
928
1217
 
929
- logger.debug(
930
- f"Received {len(captions_list)} captions for item {item_key} from worker {worker_id}"
931
- )
1218
+ item_index = data.get("item_index") # Worker should send this
1219
+ if item_index is None:
1220
+ # Try to extract from item_key (format: dataset_XXXXXXXX)
1221
+ try:
1222
+ item_index = int(item_key.split("_")[-1])
1223
+ except:
1224
+ logger.warning(f"Could not extract item index from key: {item_key}")
932
1225
 
933
- # Create a SINGLE caption record with ALL captions as a list
1226
+ # Extract user from worker_id (format: "username_uuid")
1227
+ worker_user = worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
1228
+
1229
+ # Handle both old format (captions list) and new format (outputs dict)
1230
+ if "outputs" in data:
1231
+ # New multi-stage format
1232
+ outputs = data["outputs"]
1233
+ captions_list = outputs.get("captions", [])
1234
+ total_outputs = sum(len(v) for v in outputs.values())
1235
+
1236
+ logger.debug(
1237
+ f"Received multi-stage outputs for item {item_key} from worker {worker_id}: "
1238
+ f"{total_outputs} outputs across {len(outputs)} fields"
1239
+ )
1240
+ else:
1241
+ # Old format - single captions list
1242
+ captions_list = data["captions"]
1243
+ outputs = {"captions": captions_list}
1244
+ total_outputs = len(captions_list)
1245
+
1246
+ logger.debug(
1247
+ f"Received {len(captions_list)} captions for item {item_key} from worker {worker_id}"
1248
+ )
1249
+
1250
+ # Create caption record with multi-stage outputs
934
1251
  caption = Caption(
935
- job_id=f"{chunk_id}_{item_key}", # Single ID for the item
1252
+ job_id=f"{chunk_id}_{item_key}",
936
1253
  dataset=data.get("dataset"),
937
1254
  shard=data.get("shard"),
938
1255
  item_key=item_key,
939
- captions=captions_list, # Store ALL captions as a list
940
- contributor_id=worker_id,
1256
+ captions=captions_list,
1257
+ outputs=outputs,
1258
+ contributor_id=worker_user,
941
1259
  timestamp=datetime.utcnow(),
942
- quality_scores=None, # Could be a list of scores matching captions
1260
+ quality_scores=None,
943
1261
  # Image metadata
944
1262
  image_width=data.get("image_width"),
945
1263
  image_height=data.get("image_height"),
946
1264
  image_format=data.get("image_format"),
947
1265
  file_size=data.get("file_size"),
948
1266
  # Processing metadata
949
- caption_count=len(captions_list),
1267
+ caption_count=total_outputs,
950
1268
  processing_time_ms=data.get("processing_time_ms"),
951
1269
  chunk_id=chunk_id,
1270
+ metadata=data.get("metadata", {}),
952
1271
  )
953
1272
 
954
- # Add to central storage buffer as a single entry
1273
+ # Add to central storage buffer
955
1274
  await self.storage.save_caption(caption)
956
1275
 
957
- # Update statistics
958
- self.stats["total_captions"] += len(captions_list)
959
- self.stats["buffer_size"] = len(self.storage.caption_buffer)
1276
+ # Handle item tracking with fixed deadlock
1277
+ should_flush = False
1278
+ if chunk_id and item_index is not None and self.chunk_tracker:
1279
+ with self.item_batch_lock:
1280
+ self.pending_processed_items[chunk_id].append(item_index)
960
1281
 
961
- # Update contributor stats
962
- contributor = await self.storage.get_contributor(worker_id)
1282
+ # Check if we should flush
1283
+ total_pending = sum(
1284
+ len(indices) for indices in self.pending_processed_items.values()
1285
+ )
1286
+ time_since_flush = time.time() - self.last_item_batch_flush
1287
+
1288
+ if (
1289
+ total_pending >= self.item_batch_size
1290
+ or time_since_flush >= self.item_batch_interval
1291
+ ):
1292
+ should_flush = True
1293
+
1294
+ if should_flush:
1295
+ await self._flush_processed_items()
1296
+
1297
+ # Update contributor stats (use user, not worker)
1298
+ contributor = await self.storage.get_contributor(worker_user)
963
1299
  if contributor:
964
- contributor.total_captions += len(captions_list)
1300
+ contributor.total_captions += total_outputs
965
1301
  await self.storage.save_contributor(contributor)
966
1302
 
967
1303
  # Broadcast updated stats
968
1304
  await self._broadcast_stats()
969
1305
 
970
1306
  # Log progress periodically
971
- if self.stats["total_captions"] % 100 == 0:
972
- logger.info(f"Collected {self.stats['total_captions']} captions centrally")
1307
+ total_outputs = self.stats.get("total_outputs", 0)
1308
+ if total_outputs > 0 and total_outputs % 100 == 0:
1309
+ if (
1310
+ not hasattr(self, "_last_logged_outputs")
1311
+ or self._last_logged_outputs != total_outputs
1312
+ ):
1313
+ logger.info(f"Collected {total_outputs} outputs centrally")
1314
+ self._last_logged_outputs = total_outputs
973
1315
 
974
1316
  async def _check_shard_completion(self, chunk_id: str):
975
1317
  """Check if a shard is complete after chunk completion."""
976
- # Extract shard name from chunk_id
977
- shard_name = chunk_id.rsplit("_chunk_", 1)[0]
1318
+ # Get the chunk
1319
+ chunk = self.chunk_manager.chunks.get(chunk_id)
1320
+ if not chunk:
1321
+ return
978
1322
 
979
- # Check if all chunks for this shard are complete
980
- chunk_stats = self.chunk_manager.get_stats()
1323
+ shard_name = chunk.shard_name
1324
+
1325
+ # Find all chunks for this shard
981
1326
  shard_chunks = [
982
- cid
983
- for cid, chunk in self.chunk_manager.chunks.items()
984
- if chunk.shard_name == shard_name
1327
+ cid for cid, c in self.chunk_manager.chunks.items() if c.belongs_to_shard(shard_name)
985
1328
  ]
986
1329
 
1330
+ # Check if all are completed
987
1331
  completed_chunks = [
988
1332
  cid for cid in shard_chunks if self.chunk_manager.chunks[cid].status == "completed"
989
1333
  ]
990
1334
 
991
- if len(completed_chunks) == len(shard_chunks):
1335
+ if len(completed_chunks) == len(shard_chunks) and len(shard_chunks) > 0:
992
1336
  logger.info(f"Shard {shard_name} complete!")
993
- self.shard_tracker.mark_complete(shard_name)
1337
+ # Don't mark virtual shards as complete in ShardTracker
1338
+ if not shard_name.startswith("hf_dataset:"):
1339
+ self.shard_tracker.mark_complete(shard_name)
994
1340
  self.stats["completed_shards"] += 1
995
1341
  await self._send_activity(f"Shard {shard_name} completed!")
996
1342
 
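Note on the hunk above: a hypothetical `submit_captions` payload in the new multi-stage format (field names follow the code; values illustrative):

    data = {
        "chunk_id": "shard-00042_chunk_2000",
        "item_key": "dataset_00002042",
        "item_index": 2042,
        "outputs": {
            "captions": ["a short caption", "a longer caption"],
            "tags": ["outdoor", "daytime"],
        },
        "processing_time_ms": 850,
    }
    # total_outputs == 4 (two captions + two tags); the contributor becomes
    # "alice" when worker_id is "alice_1a2b3c4d" (worker_id.rsplit("_", 1)[0]).
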
@@ -1076,12 +1422,29 @@ class Orchestrator:
         chunk_stats = self.chunk_manager.get_stats()
         await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
 
-        # Send contributor leaderboard
+        # Send contributor leaderboard with active worker counts
         contributors = await self.storage.get_top_contributors(10)
+
+        # Enhance contributor data with active worker counts
+        enhanced_contributors = []
+        worker_counts = (
+            self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+        )
+
+        for contributor in contributors:
+            contrib_dict = {
+                "contributor_id": contributor.contributor_id,
+                "name": contributor.name,
+                "total_captions": contributor.total_captions,
+                "trust_level": contributor.trust_level,
+                "active_workers": len(
+                    worker_counts.get(contributor.contributor_id, {}).get("worker_ids", [])
+                ),
+            }
+            enhanced_contributors.append(contrib_dict)
+
         await websocket.send(
-            safe_json_dumps(
-                {"type": "leaderboard", "data": [safe_dict(c) for c in contributors]}
-            )
+            safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
         )
 
         # Keep connection alive
@@ -1094,14 +1457,23 @@ class Orchestrator:
             self.monitors.discard(websocket)
 
     async def _broadcast_stats(self):
-        """Broadcast statistics to all monitors."""
+        """Broadcast statistics to all monitors - enhanced for multi-stage."""
         if not self.monitors:
             return
 
+        # Get storage stats
+        storage_stats = await self.storage.get_storage_stats()
+        caption_stats = await self.storage.get_caption_stats()
+
         # Include chunk stats
         chunk_stats = self.chunk_manager.get_stats()
         self.stats.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
 
+        # Merge storage stats
+        self.stats.update(storage_stats)
+        self.stats["field_breakdown"] = caption_stats.get("field_stats", {})
+        self.stats["output_fields_list"] = caption_stats.get("output_fields", [])
+
         # Add rate information
         self.stats.update(
             {
@@ -1111,23 +1483,123 @@ class Orchestrator:
             }
         )
 
-        # Add vLLM info
+        # Add vLLM info - now includes stage count
         self.stats["vllm_model"] = self.vllm_config.get("model", "unknown")
         self.stats["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)
 
+        # NEW: Add stage information
+        stages = self.vllm_config.get("stages", [])
+        if stages:
+            self.stats["stage_count"] = len(stages)
+            self.stats["stage_names"] = [s.get("name", "unnamed") for s in stages]
+        else:
+            self.stats["stage_count"] = 1  # Backward compatibility
+            self.stats["stage_names"] = ["default"]
+
+        field_stats = await self.storage.get_output_field_stats()
+        self.stats["output_fields"] = field_stats
+
         message = safe_json_dumps({"type": "stats", "data": self.stats})
 
         # Send to all monitors
         disconnected = set()
-        for monitor in self.monitors:
+        _monitors = self.monitors.copy()
+        for monitor in _monitors:
             try:
                 await monitor.send(message)
             except websockets.exceptions.ConnectionClosed:
                 disconnected.add(monitor)
 
+        # send updated leaderboard
+        try:
+            contributors = await self.storage.get_top_contributors(10)
+            enhanced_contributors = []
+            worker_counts = (
+                self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+            )
+
+            for contributor in contributors:
+                contrib_dict = {
+                    "contributor_id": contributor.contributor_id,
+                    "name": contributor.name,
+                    "total_captions": contributor.total_captions,
+                    "trust_level": contributor.trust_level,
+                    "active_workers": len(
+                        worker_counts.get(contributor.contributor_id, {}).get("worker_ids", [])
+                    ),
+                }
+                enhanced_contributors.append(contrib_dict)
+
+            leaderboard_message = safe_json_dumps(
+                {"type": "leaderboard", "data": enhanced_contributors}
+            )
+
+            # Send to all monitors
+            disconnected = set()
+            for monitor in self.monitors.copy():
+                try:
+                    await monitor.send(leaderboard_message)
+                except websockets.exceptions.ConnectionClosed:
+                    disconnected.add(monitor)
+
+            self.monitors -= disconnected
+
+        except Exception as e:
+            logger.error(f"Error sending leaderboard update: {e}")
+
         # Clean up disconnected monitors
         self.monitors -= disconnected
 
+    async def _flush_processed_items(self):
+        """Flush batched processed items to chunk tracker."""
+        with self.item_batch_lock:
+            if not self.pending_processed_items:
+                return
+
+            for chunk_id, indices in self.pending_processed_items.items():
+                if not indices:
+                    continue
+
+                # Indices here are ABSOLUTE dataset indices
+                # Sort indices
+                indices.sort()
+
+                # Group consecutive indices into ranges
+                ranges = []
+                start = indices[0]
+                end = indices[0]
+
+                for i in range(1, len(indices)):
+                    if indices[i] == end + 1:
+                        # Consecutive, extend range
+                        end = indices[i]
+                    else:
+                        # Gap found, save current range and start new one
+                        ranges.append((start, end))
+                        start = indices[i]
+                        end = indices[i]
+
+                # Don't forget the last range
+                ranges.append((start, end))
+
+                # Mark ranges as processed (mark_items_processed expects absolute indices)
+                for start_idx, end_idx in ranges:
+                    self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
+
+            # Clear pending items
+            self.pending_processed_items.clear()
+            self.last_item_batch_flush = time.time()
+
+    def get_workers_by_user_stats(self) -> Dict[str, Any]:
+        """Get statistics about workers grouped by user/token."""
+        if not hasattr(self, "workers_by_user"):
+            return {}
+
+        stats = {}
+        for user, worker_ids in self.workers_by_user.items():
+            stats[user] = {"worker_count": len(worker_ids), "worker_ids": list(worker_ids)}
+        return stats
+
     async def _send_activity(self, activity: str):
         """Send activity update to monitors."""
         if not self.monitors:
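
Note on `_flush_processed_items` above: sorted absolute indices are coalesced into inclusive (start, end) runs before being handed to `chunk_tracker.mark_items_processed`. A worked example of that step, using the same loop:

    indices = sorted([20502, 20500, 20501, 20511, 20510, 20600])
    ranges = []
    start = end = indices[0]
    for i in indices[1:]:
        if i == end + 1:
            end = i
        else:
            ranges.append((start, end))
            start = end = i
    ranges.append((start, end))
    print(ranges)  # [(20500, 20502), (20510, 20511), (20600, 20600)]
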
@@ -1172,36 +1644,52 @@ class Orchestrator:
         while True:
             await asyncio.sleep(60)
 
+            # Get current caption count from storage
+            storage_stats = await self.storage.get_storage_stats()
+            total_captions = storage_stats["total_captions"]
+
             # Force checkpoint at regular intervals
-            if self.stats["total_captions"] > 0 and self.stats["total_captions"] % interval == 0:
-                logger.info(f"Triggering checkpoint at {self.stats['total_captions']} captions")
+            if total_captions > 0 and total_captions % interval == 0:
+                logger.info(f"Triggering checkpoint at {total_captions} captions")
                 await self.storage.checkpoint()
 
                 # Update stats
                 self.stats["last_checkpoint"] = datetime.utcnow().isoformat()
-                self.stats["total_written"] = self.storage.total_captions_written
-                self.stats["buffer_size"] = len(self.storage.caption_buffer)
+                # No need to update total_written or buffer_size - they come from storage
 
                 await self._broadcast_stats()
                 logger.info(
-                    f"Checkpoint complete. Total written to disk: {self.stats['total_written']}"
+                    f"Checkpoint complete. Total written to disk: {storage_stats['total_written']}"
                 )
 
     async def _stats_update_loop(self):
         """Periodically update and broadcast stats."""
         # Track session start values
-        session_start_captions = self.stats["total_captions"]
+        storage_stats = await self.storage.get_storage_stats()
+        session_start_outputs = storage_stats["total_captions"]  # This now counts ALL outputs
         session_start_time = time.time()
 
+        # Track the last known total to detect flushes
+        last_known_total = session_start_outputs
+
         while True:
             await asyncio.sleep(10)
 
             # Update chunk stats
             chunk_stats = self.chunk_manager.get_stats()
+            storage_stats = await self.storage.get_storage_stats()
+            current_total_outputs = storage_stats["total_captions"]  # ALL outputs
+            if self.chunk_tracker:
+                await self._flush_processed_items()
+
             self.stats["total_chunks"] = chunk_stats["total"]
             self.stats["completed_chunks"] = chunk_stats["completed"]
             self.stats["failed_chunks"] = chunk_stats["failed"]
 
+            # Update total outputs stat (rename from total_captions for clarity)
+            self.stats["total_outputs"] = current_total_outputs
+            self.stats["total_captions"] = current_total_outputs  # Keep for backward compatibility
+
             # Add queue information
             with self.chunk_manager.lock:
                 self.stats["pending_chunks"] = len(self.chunk_manager.pending_chunks)
@@ -1220,33 +1708,57 @@ class Orchestrator:
                 elapsed_since_update = current_time - self.rate_tracker["last_update_time"]
 
                 if elapsed_since_update > 0:
-                    # Calculate current rate (captions per minute)
-                    caption_diff = (
-                        self.stats["total_captions"] - self.rate_tracker["last_caption_count"]
-                    )
-                    self.rate_tracker["current_rate"] = (caption_diff / elapsed_since_update) * 60
+                    # FIX: Handle the case where duplicates were skipped during save
+                    # If current total is less than last known, it means duplicates were skipped
+                    # We should not count this as negative progress
+                    if current_total_outputs < last_known_total:
+                        logger.debug(
+                            f"Detected duplicate skip during save: {last_known_total} -> {current_total_outputs}"
+                        )
+                        # Don't calculate negative rate, just update the baseline
+                        self.rate_tracker["last_caption_count"] = current_total_outputs
+                        self.rate_tracker["current_rate"] = 0.0  # Set to 0 during flush
+                    else:
+                        # Normal rate calculation
+                        output_diff = current_total_outputs - self.rate_tracker["last_caption_count"]
+                        self.rate_tracker["current_rate"] = (output_diff / elapsed_since_update) * 60
+                        self.rate_tracker["last_caption_count"] = current_total_outputs
 
                     # Calculate average rate since THIS SESSION started
                     session_elapsed = current_time - session_start_time
                     if session_elapsed > 0:
-                        session_captions = self.stats["total_captions"] - session_start_captions
-                        self.rate_tracker["average_rate"] = (session_captions / session_elapsed) * 60
+                        # Always use the difference from session start for average
+                        session_outputs = current_total_outputs - session_start_outputs
+                        self.rate_tracker["average_rate"] = (session_outputs / session_elapsed) * 60
 
-                    # Calculate expected rate based on workers
-                    # Assume each worker processes batch_size images every ~2 seconds with 3 captions each
+                    # Calculate expected rate based on workers and stages
                     batch_size = self.vllm_config.get("batch_size", 8)
-                    num_prompts = len(self.vllm_config.get("inference_prompts", ["", "", ""]))
+
+                    # Count total prompts across all stages
+                    total_prompts = 0
+                    stages = self.vllm_config.get("stages", [])
+                    if stages:
+                        for stage in stages:
+                            total_prompts += len(stage.get("prompts", []))
+                    else:
+                        # Backward compatibility
+                        total_prompts = len(self.vllm_config.get("inference_prompts", ["", "", ""]))
+
                     images_per_minute = 30  # Rough estimate: 30 images/min per worker
-                    self.rate_tracker["expected_rate"] = worker_count * images_per_minute * num_prompts
+                    self.rate_tracker["expected_rate"] = (
+                        worker_count * images_per_minute * total_prompts
+                    )
 
                 # Update trackers
                 self.rate_tracker["last_update_time"] = current_time
-                self.rate_tracker["last_caption_count"] = self.stats["total_captions"]
+                last_known_total = current_total_outputs
 
                 # Log rate information when workers are connected
-                if worker_count > 0:
+                if (
+                    worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+                ):  # Only log non-negative rates
                     logger.info(
-                        f"Rate: {self.rate_tracker['current_rate']:.1f} captions/min "
+                        f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
                         f"(avg: {self.rate_tracker['average_rate']:.1f}, "
                         f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
                         f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
@@ -1256,16 +1768,16 @@ class Orchestrator:
 
     async def _restore_state(self):
         """Restore state from storage on startup."""
-        # Update statistics
-        self.stats["total_captions"] = await self.storage.count_captions()
-
-        logger.info(f"Restored state: {self.stats['total_captions']} captions")
+        total_captions = await self.storage.count_captions()
+        logger.info(f"Restored state: {total_captions} captions")
 
     async def shutdown(self):
         """Graceful shutdown."""
         logger.info("Shutting down orchestrator...")
 
         # Stop chunk creation
+        if self.chunk_tracker:
+            await self._flush_processed_items()
         self.stop_chunk_creation.set()
         if self.chunk_creation_thread:
             self.chunk_creation_thread.join(timeout=5)
@@ -1287,7 +1799,7 @@ class Orchestrator:
 
         # Save chunk state
         if self.chunk_tracker:
-            self.chunk_tracker.save_checkpoint()
+            self.chunk_tracker.save()
 
         # Final checkpoint
         logger.info(f"Final flush: {len(self.storage.caption_buffer)} captions in buffer")