caption-flow 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +1 -1
- caption_flow/cli.py +307 -0
- caption_flow/models.py +26 -0
- caption_flow/orchestrator.py +9 -9
- caption_flow/processors/huggingface.py +636 -464
- caption_flow/processors/webdataset.py +379 -534
- caption_flow/storage/__init__.py +1 -0
- caption_flow/storage/exporter.py +550 -0
- caption_flow/{storage.py → storage/manager.py} +410 -303
- caption_flow/utils/__init__.py +0 -2
- caption_flow/utils/chunk_tracker.py +196 -164
- caption_flow/utils/image_processor.py +19 -132
- caption_flow/viewer.py +594 -0
- caption_flow/workers/caption.py +164 -129
- {caption_flow-0.2.3.dist-info → caption_flow-0.3.1.dist-info}/METADATA +45 -177
- caption_flow-0.3.1.dist-info/RECORD +33 -0
- caption_flow/utils/dataset_loader.py +0 -222
- caption_flow/utils/dataset_metadata_cache.py +0 -67
- caption_flow/utils/job_queue.py +0 -41
- caption_flow/utils/shard_processor.py +0 -119
- caption_flow/utils/shard_tracker.py +0 -83
- caption_flow-0.2.3.dist-info/RECORD +0 -35
- {caption_flow-0.2.3.dist-info → caption_flow-0.3.1.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.3.dist-info → caption_flow-0.3.1.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.3.dist-info → caption_flow-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.3.dist-info → caption_flow-0.3.1.dist-info}/top_level.txt +0 -0
caption_flow/processors/huggingface.py

@@ -1,22 +1,24 @@
-"""HuggingFace Datasets processor implementation."""
+"""HuggingFace Datasets processor implementation - Memory Optimized Version."""
 
 import logging
 import threading
 import re
+import queue
 import requests
+import json
+import io
+import os
+import gc
+import psutil
+from concurrent.futures import ThreadPoolExecutor, Future
 from typing import Dict, Any, List, Optional, Iterator, Set, Deque, Tuple
 from collections import deque, defaultdict
 from pathlib import Path
-import json
-import io
 from datetime import datetime
 from PIL import Image
-
-
-
-    get_dataset_split_names,
-    load_dataset_builder,
-)
+import pyarrow as pa
+import pyarrow.parquet as pq
+from datasets import get_dataset_config_names, get_dataset_split_names
 from huggingface_hub import hf_hub_download, get_token
 from caption_flow.storage import StorageManager
 
@@ -28,11 +30,82 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
 
+def log_memory(location: str):
+    """Log memory usage at specific location."""
+    process = psutil.Process(os.getpid())
+    mem_info = process.memory_info()
+    logger.info(
+        f"Memory at {location}: RSS={mem_info.rss/1024/1024:.1f}MB, VMS={mem_info.vms/1024/1024:.1f}MB"
+    )
+    # Force garbage collection
+    gc.collect()
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+
+class NonBlockingQueueHandler:
+    """Handles non-blocking retrieval from queues using concurrent futures."""
+
+    def __init__(self, max_workers: int = 1):
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self.pending_futures: Dict[int, Future] = {}  # queue_id -> Future
+
+    def get_from_queue_async(self, response_queue: queue.Queue, timeout: float = None) -> Future:
+        """Start an async queue retrieval."""
+        queue_id = id(response_queue)
+
+        # Check if we already have a pending future for this queue
+        if queue_id in self.pending_futures and not self.pending_futures[queue_id].done():
+            return self.pending_futures[queue_id]
+
+        # Start new async retrieval
+        future = self.executor.submit(response_queue.get, timeout=timeout)
+        self.pending_futures[queue_id] = future
+        return future
+
+    def check_response(self, response_queue: queue.Queue, timeout: float = None) -> Optional[Any]:
+        """Non-blocking check for queue response."""
+        queue_id = id(response_queue)
+
+        # Start async retrieval if needed
+        future = self.get_from_queue_async(response_queue, timeout)
+
+        # Check if result is ready (non-blocking)
+        if future.done():
+            try:
+                result = future.result(timeout=0)
+                # Clear future for next retrieval
+                if queue_id in self.pending_futures:
+                    del self.pending_futures[queue_id]
+                return result
+            except queue.Empty:
+                # Queue was empty, clear future
+                if queue_id in self.pending_futures:
+                    del self.pending_futures[queue_id]
+                return None
+            except Exception as e:
+                logger.error(f"Error retrieving from queue: {e}")
+                if queue_id in self.pending_futures:
+                    del self.pending_futures[queue_id]
+                return None
+
+        # Result not ready yet
+        return None
+
+    def shutdown(self):
+        """Shutdown the executor."""
+        self.executor.shutdown(wait=True)
+
+
 class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
-    """
+    """Memory-optimized orchestrator processor for HuggingFace datasets with non-blocking operations."""
 
     def __init__(self):
-        logger.debug(
+        logger.debug(
+            "Initializing HuggingFaceDatasetOrchestratorProcessor (Optimized + Non-blocking)"
+        )
         self.dataset_name: Optional[str] = None
         self.config: Optional[str] = None
         self.split: Optional[str] = None
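Note on the non-blocking queue handling added above: NonBlockingQueueHandler hands the blocking queue.Queue.get call to a ThreadPoolExecutor and then polls the returned Future, so the orchestrator loop never has to block on an empty queue. A minimal standalone sketch of that pattern follows; the queue contents and variable names are illustrative only, not part of caption_flow.

import queue
from concurrent.futures import ThreadPoolExecutor

# Sketch: submit the blocking get() to a worker thread and poll the Future.
response_queue: queue.Queue = queue.Queue()
executor = ThreadPoolExecutor(max_workers=1)

future = executor.submit(response_queue.get, timeout=5.0)
response_queue.put({"unit_id": "shard-0:chunk:0"})  # placeholder payload

# Poll without blocking; fall back to a short bounded wait for the demo.
if future.done():
    result = future.result(timeout=0)
else:
    result = future.result(timeout=1.0)
print(result)  # {'unit_id': 'shard-0:chunk:0'}

executor.shutdown(wait=True)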
@@ -44,19 +117,33 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         self.shard_info: Dict[int, Dict[str, Any]] = {}
         self.total_items: int = 0
 
-        # Work unit management
-        self.work_units: Dict[str, WorkUnit] = {}
+        # Work unit management - only store active units
         self.pending_units: Deque[str] = deque()
-        self.assigned_units: Dict[str, Set[str]] = defaultdict(set)
+        self.assigned_units: Dict[str, Set[str]] = defaultdict(set)
         self.lock = threading.Lock()
 
+        # Track current chunk index for on-demand creation
+        self.current_chunk_index = 0
+
+        # Cache data files info instead of loading builder repeatedly
+        self.data_files: List[str] = []
+
         # Background thread for creating work units
         self.unit_creation_thread: Optional[threading.Thread] = None
         self.stop_creation = threading.Event()
 
+        # Non-blocking queue handler
+        self.queue_handler = NonBlockingQueueHandler()
+
+        # Response processing state
+        self.last_maintenance_time = datetime.now()
+        self.maintenance_interval = 30  # seconds
+
     def initialize(self, config: ProcessorConfig, storage: StorageManager) -> None:
         """Initialize HuggingFace dataset processor."""
         logger.debug("Initializing orchestrator with config: %s", config.config)
+        log_memory("start of initialize")
+
         cfg = config.config
 
         # Dataset configuration
@@ -83,12 +170,12 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         self.buffer_multiplier = cfg.get("chunk_buffer_multiplier", 3)
 
         # Initialize chunk tracking
-        checkpoint_dir = Path(cfg.get("checkpoint_dir", "./checkpoints"))
-        checkpoint_dir.mkdir(parents=True, exist_ok=True)
-        self.chunk_tracker = ChunkTracker(checkpoint_dir / "chunks.json")
+        self.checkpoint_dir = Path(cfg.get("checkpoint_dir", "./checkpoints"))
+        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        self.chunk_tracker = ChunkTracker(self.checkpoint_dir / "chunks.json")
 
-        # Discover shards
-        self.
+        # Discover shards (optimized)
+        self._discover_shards_optimized()
 
         # Restore existing state
         self._restore_state(storage=storage)
@@ -98,7 +185,8 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             target=self._create_units_background, daemon=True
         )
         self.unit_creation_thread.start()
-
+
+        log_memory("end of initialize")
 
     def _detect_config(self, provided_config: Optional[str]) -> str:
         """Auto-detect config if not provided."""
@@ -110,14 +198,12 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             if not configs:
                 return "default"
 
-            # Prefer common config names
             preferred = ["default", "en", "train", "main"]
             for pref in preferred:
                 if pref in configs:
                     logger.info(f"Auto-selected config: {pref}")
                     return pref
 
-            # Otherwise use first available
             logger.info(f"Auto-selected first available config: {configs[0]}")
             return configs[0]
         except Exception as e:
@@ -134,17 +220,14 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 self.dataset_name, config_name=self.config, token=self.token
             )
             if not splits:
-                logger.warning("No splits found, using 'train'")
                 return "train"
 
-            # Prefer training splits
             preferred = ["train", "training", "test", "validation", "dev"]
             for pref in preferred:
                 if pref in splits:
                     logger.info(f"Auto-selected split: {pref}")
                     return pref
 
-            # Otherwise use first available
             logger.info(f"Auto-selected first available split: {splits[0]}")
             return splits[0]
         except Exception as e:
@@ -153,18 +236,16 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
 
     def _extract_filename_from_url(self, url: str) -> str:
         """Extract filename from HF URL format."""
-        # Format: hf://datasets/user/dataset@hash/filename
         match = re.search(r"@[a-f0-9]+/(.+)$", url)
         if match:
             return match.group(1)
-        # Fallback: just get last part
         return url.split("/")[-1]
 
-    def
-        """
-
+    def _get_data_files_from_builder(self) -> List[str]:
+        """Get data files using dataset builder with minimal memory usage."""
+        # Load builder to get correct file structure
+        from datasets import load_dataset_builder
 
-        # Load dataset builder to get file info
         builder = load_dataset_builder(self.dataset_name, self.config)
 
         # Get data files for our split
@@ -176,81 +257,114 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 files = [files]
             data_files = files
 
-
+        # Explicitly delete builder to free memory
+        del builder
+        gc.collect()
+
+        return data_files
+
+    def _discover_shards_optimized(self):
+        """Discover all shards using dataset builder but release memory immediately."""
+        logger.info("Discovering shards...")
+
+        # Try to load cached shard info first
+        shard_info_cache_path = (
+            self.checkpoint_dir / f"{self.dataset_name}_{self.config}_{self.split}_shard_info.json"
+        )
+
+        if shard_info_cache_path.exists():
+            try:
+                with open(shard_info_cache_path, "r") as f:
+                    cached_info = json.load(f)
+                if (
+                    cached_info.get("dataset") == self.dataset_name
+                    and cached_info.get("config") == self.config
+                    and cached_info.get("split") == self.split
+                ):
+                    self.shard_info = {int(k): v for k, v in cached_info["shards"].items()}
+                    self.total_items = cached_info["total_items"]
+                    self.data_files = cached_info.get("data_files", [])
+                    logger.info(
+                        f"Loaded cached shard info: {len(self.shard_info)} shards, {self.total_items} total items"
+                    )
+                    return
+            except Exception as e:
+                logger.warning(f"Failed to load cached shard info: {e}")
+
+        # Get data files using dataset builder
+        self.data_files = self._get_data_files_from_builder()
+
+        if not self.data_files:
             raise ValueError(f"No data files found for split '{self.split}'")
 
-        logger.info(f"Found {len(data_files)} data files")
+        logger.info(f"Found {len(self.data_files)} data files")
 
-        # Get
+        # Get metadata for each shard
         cumulative_offset = 0
-        for i, file_url in enumerate(data_files):
+        for i, file_url in enumerate(self.data_files):
             filename = self._extract_filename_from_url(file_url)
             logger.info(f"Discovering shard {i}: {filename}")
 
-
-
-
-
-
-
-
-
-                "size": None,
-                "end_offset": None,
-            }
-
-            # Try to get size from builder info if available
-            if hasattr(builder.info, "splits") and self.split in builder.info.splits:
-                split_info = builder.info.splits[self.split]
-                if split_info.num_examples and len(data_files) == 1:
-                    # Single shard case
-                    self.shard_info[i]["size"] = split_info.num_examples
-                    self.shard_info[i]["end_offset"] = (
-                        cumulative_offset + split_info.num_examples - 1
-                    )
-                    cumulative_offset += split_info.num_examples
-
-        # If we couldn't get sizes, we'll need to load shards on demand
-        if self.shard_info[0]["size"] is None:
-            logger.warning("Shard sizes not available from metadata, will load on demand")
-        else:
-            self.total_items = cumulative_offset
-            logger.info(f"Total items across all shards: {self.total_items}")
-
-    def _get_shard_size(self, shard_id: int) -> int:
-        """Get size of a shard, loading it if necessary."""
-        if self.shard_info[shard_id]["size"] is not None:
-            return self.shard_info[shard_id]["size"]
+            try:
+                # Download file to get metadata
+                local_path = hf_hub_download(
+                    repo_id=self.dataset_name,
+                    filename=filename,
+                    repo_type="dataset",
+                    token=self.token,
+                )
 
-
-
-
+                # Read only metadata
+                metadata = pq.read_metadata(local_path)
+                size = metadata.num_rows
+
+                self.shard_info[i] = {
+                    "shard_id": i,
+                    "file_url": file_url,
+                    "filename": filename,
+                    "start_offset": cumulative_offset,
+                    "size": size,
+                    "end_offset": cumulative_offset + size - 1,
+                }
 
-
-
-            )
+                cumulative_offset += size
+                logger.info(f"Shard {i} ({filename}): {size} rows")
 
-
-
-
+            except Exception as e:
+                logger.error(f"Failed to discover shard {i}: {e}")
+                # Skip this shard
+                continue
 
-
-
+        self.total_items = cumulative_offset
+        logger.info(f"Total items across all shards: {self.total_items}")
 
-        #
-
-
-
-
-
-
+        # Cache shard info
+        try:
+            cache_data = {
+                "dataset": self.dataset_name,
+                "config": self.config,
+                "split": self.split,
+                "shards": self.shard_info,
+                "total_items": self.total_items,
+                "data_files": self.data_files,
+            }
+            with open(shard_info_cache_path, "w") as f:
+                json.dump(cache_data, f)
+            logger.info(f"Cached shard info to {shard_info_cache_path}")
+        except Exception as e:
+            logger.warning(f"Failed to cache shard info: {e}")
 
-        #
-
-
-        logger.info(f"Total items: {self.total_items}")
+        # Force garbage collection
+        gc.collect()
+        log_memory("after discovering shards")
 
-
+    def _get_shard_for_index(self, global_index: int) -> Tuple[int, int]:
+        """Get shard ID and local index for a global index."""
+        for shard_id, sinfo in self.shard_info.items():
+            if sinfo["start_offset"] <= global_index <= sinfo["end_offset"]:
+                local_index = global_index - sinfo["start_offset"]
+                return shard_id, local_index
+        raise ValueError(f"Global index {global_index} not found in any shard")
 
     def _restore_state(self, storage: StorageManager) -> None:
         """Restore state from chunk tracker."""
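Note on the shard discovery rewrite above: instead of materializing each shard as a Dataset, the new code downloads the Parquet file and reads only its footer via pyarrow.parquet.read_metadata to learn the row count. A rough sketch of that idea, using a placeholder file path rather than anything from the package:

import pyarrow.parquet as pq

# Sketch: size a Parquet shard from its footer without reading row data.
# "shard-00000.parquet" is a hypothetical local path.
metadata = pq.read_metadata("shard-00000.parquet")
print(metadata.num_rows)        # total rows in the shard
print(metadata.num_row_groups)  # row groups available for selective reads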
@@ -258,73 +372,83 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         if not self.chunk_tracker:
             return
 
-        all_processed_jobs = storage.get_all_processed_job_ids()
-
         with self.lock:
+            max_chunk_index = -1
+
             for chunk_id, chunk_state in self.chunk_tracker.chunks.items():
-
-
-
-
-
+                chunk_index = chunk_state.start_index // self.chunk_size
+                max_chunk_index = max(max_chunk_index, chunk_index)
+
+                # Only add incomplete chunks to pending
+                if chunk_state.status != "completed":
+                    self.pending_units.append(chunk_id)
+                elif chunk_state.status == "completed" and chunk_state.processed_ranges:
+                    logger.warning(
+                        f"Chunk {chunk_id} has processed_ranges stored in the checkpoint."
+                    )
+
+            self.current_chunk_index = max_chunk_index + 1
+            logger.info(f"Resuming from chunk index {self.current_chunk_index}")
 
-
-
-
+    def _create_work_unit(self, chunk_index: int) -> Optional[WorkUnit]:
+        """Create a single work unit for a chunk index."""
+        current_index = chunk_index * self.chunk_size
+
+        if current_index >= self.total_items:
+            return None
+
+        chunk_size = min(self.chunk_size, self.total_items - current_index)
+
+        # Find shard for this chunk
+        shard_id, _ = self._get_shard_for_index(current_index)
+        shard_name = Path(self.shard_info[shard_id]["filename"]).stem
+
+        job_id_obj = JobId(shard_id=shard_name, chunk_id=chunk_index, sample_id=current_index)
+        unit_id = job_id_obj.get_chunk_str()
+
+        # Calculate unprocessed ranges based on existing chunk state
+        unprocessed_ranges = [(current_index, current_index + chunk_size - 1)]
+
+        if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+            chunk_state = self.chunk_tracker.chunks[unit_id]
+            if chunk_state.processed_ranges:
+                # Subtract processed ranges from total range
+                unprocessed_ranges = self._subtract_ranges(
+                    [(current_index, current_index + chunk_size - 1)], chunk_state.processed_ranges
                 )
 
-
-
-
-            if unprocessed_ranges:
-                # Find which shard(s) this chunk belongs to
-                shard_ids = []
-                for sid, sinfo in self.shard_info.items():
-                    # Need size to check
-                    if sinfo["size"] is None:
-                        self._get_shard_size(sid)
-
-                    if (
-                        sinfo["start_offset"]
-                        <= chunk_state.start_index + chunk_state.chunk_size - 1
-                        and sinfo["end_offset"] >= chunk_state.start_index
-                    ):
-                        shard_ids.append(sid)
-                        logger.info(f"Found shard {sid} for chunk {chunk_id}: {sinfo}")
-
-                chunk_index = chunk_state.start_index // self.chunk_size
-                shard_name = Path(self.shard_info[shard_ids[0]]["filename"]).stem
-                unit = WorkUnit(
-                    unit_id=chunk_id,
-                    chunk_id=chunk_id,
-                    source_id=shard_name,
-                    data={
-                        "dataset_name": self.dataset_name,
-                        "config": self.config,
-                        "split": self.split,
-                        "start_index": chunk_state.start_index,
-                        "chunk_size": chunk_state.chunk_size,
-                        "unprocessed_ranges": unprocessed_ranges,
-                        "shard_ids": shard_ids,
-                    },
-                    metadata={
-                        "dataset": self.dataset_name,
-                        "shard_name": shard_name,
-                        "chunk_index": chunk_index,
-                    },
-                )
+        # If all ranges are processed, return None (shouldn't happen if status tracking is correct)
+        if not unprocessed_ranges:
+            return None
 
-
-
+        unit = WorkUnit(
+            unit_id=unit_id,
+            chunk_id=unit_id,
+            source_id=shard_name,
+            data={
+                "dataset_name": self.dataset_name,
+                "config": self.config,
+                "split": self.split,
+                "start_index": current_index,
+                "chunk_size": chunk_size,
+                "unprocessed_ranges": unprocessed_ranges,  # Use calculated ranges
+                "shard_ids": [shard_id],
+                "data_files": self.data_files,
+            },
+            metadata={
+                "dataset": self.dataset_name,
+                "shard_name": shard_name,
+                "chunk_index": chunk_index,
+            },
+        )
+
+        return unit
 
     def _create_units_background(self) -> None:
         """Background thread to create work units on demand."""
         logger.info("Starting work unit creation thread")
 
-        current_index = 0
-
         while not self.stop_creation.is_set():
-            # Check if we need more units
             with self.lock:
                 pending_count = len(self.pending_units)
                 assigned_count = sum(len(units) for units in self.assigned_units.values())
@@ -337,127 +461,114 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 threading.Event().wait(5)
                 continue
 
-            # Make sure we know total items
-            if self.total_items == 0:
-                # Load all shard sizes
-                for sid in range(len(self.shard_info)):
-                    self._get_shard_size(sid)
-
             # Create units as needed
             units_created = 0
 
-            while units_created < units_needed
-
-
-
-
-
-
-
-
-
-                    ):
-                        shard_ids.append(sid)
-                shard_name = Path(self.shard_info[shard_ids[0]]["filename"]).stem
+            while units_created < units_needed:
+                logger.debug(f"Creating work unit for chunk {self.current_chunk_index}")
+                if self.current_chunk_index * self.chunk_size >= self.total_items:
+                    threading.Event().wait(30)
+                    break
+                # Get shard info for proper unit_id
+                current_index = self.current_chunk_index * self.chunk_size
+                if current_index < self.total_items:
+                    shard_id, _ = self._get_shard_for_index(current_index)
+                    shard_name = Path(self.shard_info[shard_id]["filename"]).stem
 
                 job_id_obj = JobId(
-                    shard_id=shard_name,
+                    shard_id=shard_name,
+                    chunk_id=self.current_chunk_index,
+                    sample_id=current_index,
                 )
-                unit_id = (
-                    job_id_obj.get_chunk_str()
-                )  # just the chunk part, eg pixel-images:chunk:0
-                if unit_id in self.work_units:
-                    current_index += self.chunk_size
-                    continue
-
-                # Check if chunk is already completed
-                if self.chunk_tracker:
-                    chunk_state = self.chunk_tracker.chunks.get(unit_id)
-                    if chunk_state and chunk_state.status == "completed":
-                        current_index += self.chunk_size
-                        continue
+                unit_id = job_id_obj.get_chunk_str()
 
-
-
-
-
-
-
-
-                        "dataset_name": self.dataset_name,
-                        "config": self.config,
-                        "split": self.split,
-                        "start_index": current_index,
-                        "chunk_size": chunk_size,
-                        "unprocessed_ranges": [(current_index, current_index + chunk_size - 1)],
-                        "shard_ids": shard_ids,
-                    },
-                    metadata={
-                        "dataset": self.dataset_name,
-                        "shard_name": shard_name,
-                        "chunk_index": chunk_id,
-                    },
-                )
-                logger.debug(f"Created WorkUnit: {unit}")
+                with self.lock:
+                    # Check if already tracked
+                    if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+                        chunk_state = self.chunk_tracker.chunks[unit_id]
+                        if chunk_state.status == "completed":
+                            self.current_chunk_index += 1
+                            continue
 
-
+                    # Add to pending
                     self.pending_units.append(unit_id)
 
+                    # Track in chunk tracker
                     if self.chunk_tracker:
+                        start_index = self.current_chunk_index * self.chunk_size
+                        chunk_size = min(self.chunk_size, self.total_items - start_index)
                         self.chunk_tracker.add_chunk(
                             unit_id,
                             self.dataset_name,
-                            "",
-
+                            "",
+                            start_index,
                             chunk_size,
                         )
 
                 units_created += 1
-
-                current_index += self.chunk_size
+                self.current_chunk_index += 1
 
             if units_created > 0:
-                logger.debug(f"Created {units_created} work
-
-
-
-    ) ->
-        """
-
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                logger.debug(f"Created {units_created} work unit IDs")
+
+        logger.info("Thread for creating units has completed. Exiting thread.")
+
+    def process_responses_non_blocking(self, response_queue: queue.Queue) -> Optional[WorkResult]:
+        """
+        Non-blocking method to process responses from workers.
+        Returns a WorkResult if one is available, None otherwise.
+        """
+        # Check for response without blocking
+        response = self.queue_handler.check_response(response_queue, timeout=0.1)
+
+        if response is not None:
+            # Process the response
+            if isinstance(response, WorkResult):
+                logger.debug(f"Processing response for unit {response.unit_id}")
+                return response
+            else:
+                logger.warning(f"Unexpected response type: {type(response)}")
+
+        # Perform periodic maintenance tasks
+        now = datetime.now()
+        if (now - self.last_maintenance_time).total_seconds() > self.maintenance_interval:
+            self._perform_maintenance()
+            self.last_maintenance_time = now
+
+        return None
+
+    def _perform_maintenance(self):
+        """Perform periodic maintenance tasks."""
+        with self.lock:
+            # Log current state
+            pending_count = len(self.pending_units)
+            assigned_count = sum(len(units) for units in self.assigned_units.values())
+            logger.debug(f"Maintenance: {pending_count} pending, {assigned_count} assigned units")
 
-
-
+            # Check for stale assignments (workers that might have disconnected)
+            # This would be implemented based on your worker heartbeat mechanism
 
-
+        # Force checkpoint save if needed
+        if self.chunk_tracker:
+            self.chunk_tracker.save_checkpoint()
 
     def get_work_units(self, count: int, worker_id: str) -> List[WorkUnit]:
         """Get available work units for a worker."""
-        logger.debug("get_work_units called: count=%d worker_id=%s", count, worker_id)
-        assigned = []
 
+        logger.debug(
+            "get_work_units called: count=%d worker_id=%s, pending: %d",
+            count,
+            worker_id,
+            len(self.pending_units),
+        )
+        assigned = []
         with self.lock:
             while len(assigned) < count and self.pending_units:
                 unit_id = self.pending_units.popleft()
-
+
+                # Create work unit on demand
+                chunk_index = int(unit_id.split(":")[-1])
+                unit = self._create_work_unit(chunk_index)
 
                 if unit:
                     self.assigned_units[worker_id].add(unit_id)
@@ -474,22 +585,26 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         """Mark a work unit as completed."""
         logger.debug("Marking unit %s as completed by worker %s", unit_id, worker_id)
         with self.lock:
-
-            self.assigned_units[worker_id].discard(unit_id)
+            self.assigned_units[worker_id].discard(unit_id)
 
-
-
+            if self.chunk_tracker:
+                self.chunk_tracker.mark_completed(unit_id)
+
+            # remove from pending deque if it's there.
+            try:
+                self.pending_units.remove(unit_id)
+            except:
+                pass
 
     def mark_failed(self, unit_id: str, worker_id: str, error: str) -> None:
         """Mark a work unit as failed."""
-        logger.
+        logger.error("Marking unit %s as failed by worker %s, error: %s", unit_id, worker_id, error)
         with self.lock:
-
-
-            self.pending_units.append(unit_id)
+            self.assigned_units[worker_id].discard(unit_id)
+            self.pending_units.append(unit_id)
 
-
-
+            if self.chunk_tracker:
+                self.chunk_tracker.mark_failed(unit_id)
 
     def release_assignments(self, worker_id: str) -> None:
         """Release all assignments for a disconnected worker."""
@@ -498,8 +613,8 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             unit_ids = list(self.assigned_units.get(worker_id, []))
 
             for unit_id in unit_ids:
-
-
+                logger.debug(f"Adding {unit_id} to pending queue")
+                self.pending_units.append(unit_id)
 
             if worker_id in self.assigned_units:
                 del self.assigned_units[worker_id]
@@ -509,57 +624,8 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
 
     def update_from_storage(self, processed_job_ids: Set[str]) -> None:
         """Update work units based on what's been processed."""
-        logger.info(f"Updating
-
-        with self.lock:
-            for unit_id, unit in self.work_units.items():
-                # Extract chunk info from unit
-                logger.debug(f"Checking unit {unit_id} for updates")
-                logger.debug(f"Unit data: {unit.data}")
-                logger.debug(f"Unit metadata: {unit.metadata}")
-                start_index = unit.data["start_index"]
-                chunk_size = unit.data["chunk_size"]
-                shard_name = unit.metadata["shard_name"]
-                chunk_index = unit.metadata["chunk_index"]
-
-                # Find processed indices for this chunk
-                processed_indices = []
-                for job_id in processed_job_ids:
-                    # Parse job_id format: "data-0000:chunk:0:idx:42"
-                    job_id = JobId.from_str(job_id=job_id)
-                    if job_id.shard_id == shard_name and int(job_id.chunk_id) == chunk_index:
-                        idx = int(job_id.sample_id)
-                        if start_index <= idx < start_index + chunk_size:
-                            processed_indices.append(idx)
-
-                if processed_indices:
-                    # Convert to ranges
-                    processed_indices.sort()
-                    processed_ranges = []
-                    start = processed_indices[0]
-                    end = processed_indices[0]
-
-                    for idx in processed_indices[1:]:
-                        if idx == end + 1:
-                            end = idx
-                        else:
-                            processed_ranges.append((start, end))
-                            start = idx
-                            end = idx
-
-                    processed_ranges.append((start, end))
-
-                    # Calculate unprocessed ranges
-                    total_range = [(start_index, start_index + chunk_size - 1)]
-                    unprocessed_ranges = self._subtract_ranges(total_range, processed_ranges)
-
-                    # Update unit
-                    unit.data["unprocessed_ranges"] = unprocessed_ranges
-
-                    logger.debug(
-                        f"Updated unit {unit_id}: {len(processed_indices)} processed, "
-                        f"unprocessed ranges: {unprocessed_ranges}"
-                    )
+        logger.info(f"Updating from storage with {len(processed_job_ids)} processed jobs")
+        # No need to update in-memory work units since we create on demand
 
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
@@ -568,12 +634,12 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             "dataset": self.dataset_name,
             "config": self.config,
             "split": self.split,
-            "total_units": len(self.work_units),
             "pending_units": len(self.pending_units),
             "assigned_units": sum(len(units) for units in self.assigned_units.values()),
             "total_shards": len(self.shard_info),
             "total_items": self.total_items,
             "workers": len(self.assigned_units),
+            "current_chunk_index": self.current_chunk_index,
         }
         return stats
 
@@ -581,71 +647,111 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         """Handle result processing."""
         base_result = super().handle_result(result)
 
-        # Track processed items
         if self.chunk_tracker:
-            if "item_indices"
-
-
+            if "item_indices" in result.metadata:
+                indices = result.metadata["item_indices"]
+                if indices:
+                    # Convert to ranges for efficient tracking
+                    indices.sort()
+                    ranges = []
+                    start = indices[0]
+                    end = indices[0]
+
+                    for i in range(1, len(indices)):
+                        if indices[i] == end + 1:
+                            end = indices[i]
+                        else:
+                            ranges.append((start, end))
+                            start = indices[i]
+                            end = indices[i]
 
-
-                indices.sort()
-                ranges = []
-                start = indices[0]
-                end = indices[0]
+                    ranges.append((start, end))
 
-
-
-                        end = indices[i]
-                    else:
-                        ranges.append((start, end))
-                        start = indices[i]
-                        end = indices[i]
+                    for start_idx, end_idx in ranges:
+                        self.chunk_tracker.mark_items_processed(result.chunk_id, start_idx, end_idx)
 
-
+        return base_result
 
-
-
+    def _subtract_ranges(
+        self, total_ranges: List[Tuple[int, int]], processed_ranges: List[Tuple[int, int]]
+    ) -> List[Tuple[int, int]]:
+        """Subtract processed ranges from total ranges."""
+        if not processed_ranges:
+            return total_ranges
 
-
+        # Create a set of all processed indices
+        processed_indices = set()
+        for start, end in processed_ranges:
+            processed_indices.update(range(start, end + 1))
+
+        # Find unprocessed ranges
+        unprocessed_ranges = []
+        for start, end in total_ranges:
+            current_start = None
+            for i in range(start, end + 1):
+                if i not in processed_indices:
+                    if current_start is None:
+                        current_start = i
+                else:
+                    if current_start is not None:
+                        unprocessed_ranges.append((current_start, i - 1))
+                        current_start = None
+
+            if current_start is not None:
+                unprocessed_ranges.append((current_start, end))
+
+        return unprocessed_ranges
+
+    def cleanup(self):
+        """Clean up resources."""
+        logger.info("Cleaning up orchestrator resources")
+
+        # Stop background threads
+        self.stop_creation.set()
+        if self.unit_creation_thread:
+            self.unit_creation_thread.join(timeout=5)
+
+        # Shutdown queue handler
+        self.queue_handler.shutdown()
+
+        # Save final state
+        if self.chunk_tracker:
+            self.chunk_tracker.save_checkpoint()
 
 
 class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
-    """
+    """Memory-optimized worker processor for HuggingFace datasets."""
 
     def __init__(self):
-        logger.debug("Initializing HuggingFaceDatasetWorkerProcessor")
+        logger.debug("Initializing HuggingFaceDatasetWorkerProcessor (Optimized)")
         self.dataset_config: Dict[str, Any] = {}
         self.token = get_token()
-        self.shard_cache: Dict[int, Dataset] = {}  # Cache loaded shards
        self.image_column: Optional[str] = None
         self.url_column: Optional[str] = None
 
+        # Thread-local storage for shard info to avoid repeated builder loading
+        self._thread_local = threading.local()
+
     def initialize(self, config: ProcessorConfig) -> None:
         """Initialize processor."""
         logger.debug("Initializing worker with config: %s", config.config)
         self.dataset_config = config.config.get("dataset", {})
 
-        # Determine if this is an image URL dataset or binary image dataset
         self.image_column = self.dataset_config.get("dataset_image_column", "image")
         self.url_column = self.dataset_config.get("dataset_url_column", "image_url")
         self.dataset_path = self.dataset_config.get("dataset_path", None)
 
-
-
-        if
-
-
-        logger.info(f"Loading shard {shard_id}: {shard_filename}")
+        # Add mock results flag
+        self.mock_results = self.dataset_config.get("mock_results", False)
+        if self.mock_results:
+            logger.info("Mock results mode enabled - will generate dummy images")
 
-
+    def _get_shard_path(self, dataset_name: str, shard_filename: str) -> str:
+        """Get local path for a shard, downloading if needed."""
+        return hf_hub_download(
             repo_id=dataset_name, filename=shard_filename, repo_type="dataset", token=self.token
         )
 
-        dataset = Dataset.from_parquet(local_path)
-        self.shard_cache[shard_id] = dataset
-
-        return dataset
-
     def _extract_filename_from_url(self, url: str) -> str:
         """Extract filename from HF URL format."""
         match = re.search(r"@[a-f0-9]+/(.+)$", url)
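Note on the range bookkeeping above: _subtract_ranges expands the processed ranges into an index set and re-compacts whatever is left into contiguous (start, end) pairs, which handle_result then feeds back to the chunk tracker. A small worked example of the expected behaviour, with values chosen purely for illustration:

# Sketch of the range arithmetic _subtract_ranges implements:
# removing processed [(10, 14), (40, 59)] from the chunk [(0, 99)]
# should leave [(0, 9), (15, 39), (60, 99)].
total_start, total_end = 0, 99
processed_ranges = [(10, 14), (40, 59)]

processed = set()
for start, end in processed_ranges:
    processed.update(range(start, end + 1))

unprocessed = []
run_start = None
for i in range(total_start, total_end + 1):
    if i not in processed:
        if run_start is None:
            run_start = i
    elif run_start is not None:
        unprocessed.append((run_start, i - 1))
        run_start = None
if run_start is not None:
    unprocessed.append((run_start, total_end))

print(unprocessed)  # [(0, 9), (15, 39), (60, 99)]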
@@ -653,161 +759,227 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
             return match.group(1)
         return url.split("/")[-1]
 
+    def _create_dummy_image(self, index: int, metadata: Dict[str, Any]) -> Image.Image:
+        """Create a dummy image"""
+        color = (0, 0, 0)
+        width, height = 128, 128
+        image = Image.new("RGB", (width, height), color=color)
+
+        return image
+
     def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
         """Process a work unit, yielding items to be captioned."""
-        logger.debug("Processing unit: %s", unit.unit_id)
+        logger.debug("Processing unit: %s (mock_results=%s)", unit.unit_id, self.mock_results)
+        log_memory(f"start processing unit {unit.unit_id}")
 
         dataset_name = unit.data["dataset_name"]
-        config = unit.data["config"]
-        split = unit.data["split"]
         start_index = unit.data["start_index"]
         chunk_size = unit.data["chunk_size"]
         unprocessed_ranges = unit.data.get(
             "unprocessed_ranges", [(start_index, start_index + chunk_size - 1)]
         )
         shard_ids = unit.data.get("shard_ids", [])
+        data_files = unit.data.get("data_files", [])
 
         logger.info(f"Processing unit {unit.unit_id} with ranges: {unprocessed_ranges}")
 
-        #
-        # For now, we'll need to load dataset builder to get file info
-        from datasets import load_dataset_builder
-
-        builder = load_dataset_builder(dataset_name, config)
-
-        data_files = []
-        if hasattr(builder.config, "data_files"):
-            if isinstance(builder.config.data_files, dict):
-                files = builder.config.data_files.get(split, [])
-                if isinstance(files, str):
-                    files = [files]
-                data_files = files
-
-        # Build shard info
+        # Build shard info from provided data files (no dataset builder needed)
         shard_info = {}
-        cumulative_offset = 0
-
-        for i, file_url in enumerate(data_files):
-            if i not in shard_ids:
-                # Skip loading this shard, but we need its size for offsets
-                # This is inefficient - in real implementation, orchestrator should pass this info
-                filename = self._extract_filename_from_url(file_url)
-                dataset = self._load_shard(dataset_name, filename, i)
-                size = len(dataset)
-                cumulative_offset += size
-                continue
 
-
-
+        if data_files:
+            # Use provided data files
+            for i, file_url in enumerate(data_files):
+                if i in shard_ids:
+                    filename = self._extract_filename_from_url(file_url)
+                    shard_path = self._get_shard_path(dataset_name, filename)
+
+                    # Get size from metadata
+                    metadata = pq.read_metadata(shard_path)
+                    size = metadata.num_rows
+
+                    shard_info[i] = {
+                        "path": shard_path,
+                        "start_offset": 0,  # Will be set below
+                        "end_offset": 0,  # Will be set below
+                        "size": size,
+                        "metadata": metadata,
+                    }
 
-
-
-
-
-
-
-
+            # Calculate offsets
+            cumulative_offset = 0
+            for i in range(max(shard_info.keys()) + 1):
+                if i in shard_info:
+                    shard_info[i]["start_offset"] = cumulative_offset
+                    shard_info[i]["end_offset"] = cumulative_offset + shard_info[i]["size"] - 1
+                    cumulative_offset += shard_info[i]["size"]
+                else:
+                    # Need to get size for offset calculation
+                    filename = self._extract_filename_from_url(data_files[i])
+                    shard_path = self._get_shard_path(dataset_name, filename)
+                    metadata = pq.read_metadata(shard_path)
+                    cumulative_offset += metadata.num_rows
+        else:
+            # This should never happen with the new orchestrator
+            raise ValueError("No data files provided in work unit")
 
         # Create set of indices to process
         indices_to_process = set()
         for start, end in unprocessed_ranges:
             indices_to_process.update(range(start, end + 1))
 
-
-
-
-
-            # Find which shard contains this index
-            shard_id = None
-            local_idx = None
-
-            for sid, sinfo in shard_info.items():
+        # Group indices by shard
+        indices_by_shard = defaultdict(list)
+        for global_idx in indices_to_process:
+            for shard_id, sinfo in shard_info.items():
                 if sinfo["start_offset"] <= global_idx <= sinfo["end_offset"]:
-                    shard_id = sid
                     local_idx = global_idx - sinfo["start_offset"]
+                    indices_by_shard[shard_id].append((global_idx, local_idx))
                     break
 
-
-                logger.warning(f"Could not find shard for global index {global_idx}")
-                continue
-
-            try:
-                # Get item from shard
-                item = shard_info[shard_id]["dataset"][local_idx]
-
-                # Check if this is a URL dataset or binary image dataset
-                image = None
-                image_url = None
-
-                # Try URL column first
-                if self.url_column and self.url_column in item:
-                    image_url = item[self.url_column]
-                    # Download image from URL
-                    try:
-                        response = requests.get(image_url, timeout=30)
-                        response.raise_for_status()
-                        image = Image.open(io.BytesIO(response.content))
-                    except Exception as e:
-                        logger.error(f"Error downloading image from {image_url}: {e}")
-                        continue
-
-                # Try binary image column
-                elif self.image_column and self.image_column in item:
-                    image_data = item[self.image_column]
-                    if isinstance(image_data, Image.Image):
-                        image = image_data
-                    elif isinstance(image_data, dict) and "bytes" in image_data:
-                        # Handle datasets Image feature
-                        image = Image.open(io.BytesIO(image_data["bytes"]))
-                    elif isinstance(image_data, bytes):
-                        image = Image.open(io.BytesIO(image_data))
-
-                if image is None:
-                    logger.warning(f"No image found for item at index {global_idx}")
-                    continue
-
-                # Build job ID
-                chunk_index = unit.metadata["chunk_index"]
-                shard_name = unit.metadata["shard_name"]
-                job_id_obj = JobId(
-                    shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(global_idx)
-                )
-                job_id = job_id_obj.get_sample_str()
-
-                # Clean metadata
-                clean_metadata = {
-                    k: v
-                    for k, v in item.items()
-                    if k not in [self.image_column, self.url_column] and not k.startswith("_")
-                }
-
-                clean_metadata.update(
-                    {
-                        "_item_index": global_idx,
-                        "_chunk_relative_index": global_idx - start_index,
-                        "_job_id": job_id,
-                        "_shard_id": shard_id,
-                        "_local_index": local_idx,
-                        "_url": image_url,
-                    }
-                )
-
-                yield {
-                    "image": image,
-                    "item_key": str(global_idx),
-                    "item_index": global_idx,
-                    "metadata": clean_metadata,
-                    "job_id": job_id,
-                }
-
-                processed_indices.append(global_idx)
+        processed_indices = []
 
-
-
+        # Process items shard by shard
+        for shard_id, idx_pairs in indices_by_shard.items():
+            shard_path = shard_info[shard_id]["path"]
+
+            # Process in batches to avoid loading entire table
+            batch_size = 100
+            for batch_start in range(0, len(idx_pairs), batch_size):
+                batch_pairs = idx_pairs[batch_start : batch_start + batch_size]
+                local_indices = [local_idx for _, local_idx in batch_pairs]
+
+                # Read only specific rows using PyArrow
+                try:
+                    # Create row group filters based on metadata
+                    metadata = shard_info[shard_id]["metadata"]
+                    row_groups_to_read = set()
+
+                    # Find which row groups contain our indices
+                    current_row = 0
+                    for rg_idx in range(metadata.num_row_groups):
+                        rg_metadata = metadata.row_group(rg_idx)
+                        rg_num_rows = rg_metadata.num_rows
+
+                        # Check if any of our indices are in this row group
+                        for local_idx in local_indices:
+                            if current_row <= local_idx < current_row + rg_num_rows:
+                                row_groups_to_read.add(rg_idx)
+
+                        current_row += rg_num_rows
+
+                    # Read only necessary row groups
+                    parquet_file = pq.ParquetFile(shard_path)
+                    table = parquet_file.read_row_groups(list(row_groups_to_read))
+
+                    # Process items
+                    for global_idx, local_idx in batch_pairs:
+                        try:
+                            # Get item as dictionary (efficient row extraction)
+                            row_dict = table.slice(local_idx, 1).to_pydict()
+                            item = {k: v[0] for k, v in row_dict.items()}
+
+                            # Process image
+                            image = None
+                            image_url = None
+
+                            if self.mock_results:
+                                # In mock mode, create a dummy image
+                                logger.debug(f"Creating mock image for index {global_idx}")
+
+                                # Still extract URL if available for metadata
+                                if self.url_column and self.url_column in item:
+                                    image_url = item[self.url_column]
+
+                                # Create dummy image with metadata context
+                                image = self._create_dummy_image(
+                                    global_idx,
+                                    {
+                                        "_shard_id": shard_id,
+                                        "_local_index": local_idx,
+                                    },
+                                )
+                            else:
+                                # Normal processing - load real images
+                                if self.url_column and self.url_column in item:
+                                    image_url = item[self.url_column]
+                                    try:
+                                        response = requests.get(image_url, timeout=30)
+                                        response.raise_for_status()
+                                        image = Image.open(io.BytesIO(response.content))
+                                    except Exception as e:
+                                        logger.error(
+                                            f"Error downloading image from {image_url}: {e}"
+                                        )
+                                        continue
+
+                                elif self.image_column and self.image_column in item:
+                                    image_data = item[self.image_column]
+                                    if isinstance(image_data, dict) and "bytes" in image_data:
+                                        image = Image.open(io.BytesIO(image_data["bytes"]))
+                                    elif isinstance(image_data, bytes):
+                                        image = Image.open(io.BytesIO(image_data))
+
+                            if image is None:
+                                logger.warning(f"No image found for item at index {global_idx}")
+                                continue
+
+                            # Build job ID
+                            chunk_index = unit.metadata["chunk_index"]
+                            shard_name = unit.metadata["shard_name"]
+                            job_id_obj = JobId(
+                                shard_id=shard_name,
+                                chunk_id=str(chunk_index),
+                                sample_id=str(global_idx),
+                            )
+                            job_id = job_id_obj.get_sample_str()
+
+                            # Clean metadata
+                            clean_metadata = {
+                                k: v
+                                for k, v in item.items()
+                                if k not in [self.image_column, self.url_column]
+                                and not k.startswith("_")
+                            }
+
+                            clean_metadata.update(
+                                {
+                                    "_item_index": global_idx,
+                                    "_chunk_relative_index": global_idx - start_index,
+                                    "_job_id": job_id,
+                                    "_shard_id": shard_id,
+                                    "_local_index": local_idx,
+                                    "_url": image_url,
+                                    "_mock": self.mock_results,  # Add flag to indicate mock data
+                                }
+                            )
+
+                            yield {
+                                "image": image,
+                                "item_key": str(global_idx),
+                                "item_index": global_idx,
+                                "metadata": clean_metadata,
+                                "job_id": job_id,
+                                "_processed_indices": processed_indices,
+                            }
+
+                            processed_indices.append(global_idx)
+
+                        except Exception as e:
+                            logger.error(f"Error processing item at index {global_idx}: {e}")
+
+                    # Explicitly delete table to free memory
+                    del table
+                    gc.collect()
+
+                except Exception as e:
+                    logger.error(f"Error reading batch from shard {shard_id}: {e}")
 
         # Store processed indices in context
         context["_processed_indices"] = processed_indices
-        logger.debug(
+        logger.debug(
+            f"Processed {len(processed_indices)} indices for unit {unit.unit_id}: {processed_indices}, {context}"
+        )
+        log_memory(f"end processing unit {unit.unit_id}")
 
     def prepare_result(
         self, unit: WorkUnit, outputs: List[Dict[str, Any]], processing_time_ms: float