caption-flow 0.2.1-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caption_flow/cli.py CHANGED
@@ -124,7 +124,7 @@ def setup_logging(verbose: bool = False):
  level = logging.DEBUG if verbose else logging.INFO
  logging.basicConfig(
  level=level,
- format="%(asctime)s %(message)s",
+ format="%(message)s",
  datefmt="[%Y-%m-%d %H:%M:%S]",
  handlers=[
  RichHandler(
caption_flow/orchestrator.py CHANGED
@@ -16,7 +16,7 @@ import uuid
  from dataclasses import dataclass, asdict
  from datetime import datetime
  from pathlib import Path
- from typing import Dict, Set, Optional, Any, List, Deque
+ from typing import Dict, Set, Optional, Any, List, Deque, Tuple
  from collections import deque, defaultdict
  import threading
  from queue import Queue, Empty
@@ -97,27 +97,9 @@ class ChunkManager:
  self.lock = threading.Lock()
  self.tracker = tracker # Reference to chunk tracker

- def create_chunks_from_shard(
- self, shard_url: str, shard_name: str, total_items: int
- ) -> List[ShardChunk]:
- """Create chunks from a shard."""
- chunks = []
-
- for start_idx in range(0, total_items, self.chunk_size):
- chunk = ShardChunk.create(
- shard_url=shard_url,
- shard_name=shard_name,
- start_index=start_idx,
- chunk_size=min(self.chunk_size, total_items - start_idx),
- )
-
- with self.lock:
- self.chunks[chunk.chunk_id] = chunk
- self.pending_chunks.append(chunk.chunk_id)
-
- chunks.append(chunk)
-
- return chunks
+ # NEW: Track assigned ranges to prevent double allocation
+ # Format: {chunk_id: {(start, end): worker_id}}
+ self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

  def get_chunks_for_worker(
  self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
@@ -127,7 +109,6 @@ class ChunkManager:

  with self.lock:
  # FIRST PRIORITY: Check if this worker already has assigned chunks
- # Workers should complete their current chunks before getting new ones
  if worker_id in self.assigned_chunks:
  existing_chunk_ids = list(self.assigned_chunks[worker_id])
  for chunk_id in existing_chunk_ids:
@@ -142,12 +123,29 @@ class ChunkManager:
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info and chunk_info["unprocessed_ranges"]:
- assigned.append(
- {
- "chunk": chunk,
- "unprocessed_ranges": chunk_info["unprocessed_ranges"],
- }
- )
+ # Filter out ranges that are assigned to other workers
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker != worker_id:
+ # Skip this range - it's assigned to another worker
+ logger.warning(
+ f"Skipping range {start}-{end} in chunk {chunk_id} "
+ f"(assigned to {assigned_worker}, not {worker_id})"
+ )
+ continue
+ # else: this worker already owns this range, include it
+ clean_ranges.append((start, end))
+
+ if clean_ranges:
+ assigned.append(
+ {
+ "chunk": chunk,
+ "unprocessed_ranges": clean_ranges,
+ }
+ )
  else:
  # No tracker, assume chunk needs processing
  assigned.append(
@@ -158,7 +156,6 @@ class ChunkManager:
  )

  # SECOND PRIORITY: Get new pending chunks
- # Only if worker doesn't have enough chunks already
  while len(assigned) < count and self.pending_chunks:
  chunk_id = self.pending_chunks.popleft()
  chunk = self.chunks.get(chunk_id)
@@ -166,7 +163,7 @@ class ChunkManager:
  if not chunk:
  continue

- # Verify chunk is truly pending (defensive check)
+ # Verify chunk is truly pending
  if chunk.status != "pending" or chunk.assigned_to is not None:
  logger.warning(
  f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
@@ -179,15 +176,48 @@ class ChunkManager:
  chunk.assigned_at = datetime.utcnow()
  self.assigned_chunks[worker_id].add(chunk_id)

- # Get unprocessed ranges
+ # Get unprocessed ranges and filter out any that are somehow already assigned
  unprocessed_ranges = [(0, chunk.chunk_size - 1)] # Default
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info:
- unprocessed_ranges = chunk_info["unprocessed_ranges"]
+ # Filter out any ranges that are already assigned (shouldn't happen for new chunks)
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key not in self.assigned_ranges[chunk_id]:
+ clean_ranges.append((start, end))
+ else:
+ logger.error(
+ f"Range {start}-{end} in newly assigned chunk {chunk_id} "
+ f"is already assigned to {self.assigned_ranges[chunk_id][range_key]}!"
+ )
+ unprocessed_ranges = clean_ranges if clean_ranges else []
+
  tracker.mark_assigned(chunk_id, worker_id)

- assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+ if unprocessed_ranges:
+ assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+ # Track assigned ranges and verify no double allocation
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ for start, end in info["unprocessed_ranges"]:
+ range_key = (start, end)
+
+ # Check if this range is already assigned
+ if range_key in self.assigned_ranges[chunk_id]:
+ existing_worker = self.assigned_ranges[chunk_id][range_key]
+ if existing_worker != worker_id:
+ # This should never happen - raise assertion
+ raise AssertionError(
+ f"CRITICAL: Attempting to assign range {start}-{end} in chunk {chunk_id} "
+ f"to worker {worker_id}, but it's already assigned to {existing_worker}! "
+ f"This would cause duplicate processing."
+ )
+
+ # Track this assignment
+ self.assigned_ranges[chunk_id][range_key] = worker_id

  # Log what we're assigning
  if assigned:
@@ -199,6 +229,12 @@ class ChunkManager:
  )
  logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")

+ # Detailed range logging for debugging
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ ranges_str = ", ".join([f"{s}-{e}" for s, e in info["unprocessed_ranges"]])
+ logger.debug(f" Chunk {chunk_id} ranges: {ranges_str}")
+
  return assigned

  def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
@@ -210,6 +246,16 @@ class ChunkManager:
  chunk.status = "completed"
  chunk.completed_at = datetime.utcnow()
  self.assigned_chunks[worker_id].discard(chunk_id)
+
+ # Clear assigned ranges for this chunk
+ if chunk_id in self.assigned_ranges:
+ # Log what ranges we're clearing
+ ranges_to_clear = list(self.assigned_ranges[chunk_id].keys())
+ logger.debug(
+ f"Clearing {len(ranges_to_clear)} assigned ranges for completed chunk {chunk_id}"
+ )
+ del self.assigned_ranges[chunk_id]
+
  return True
  return False

@@ -224,6 +270,20 @@ class ChunkManager:
  chunk.assigned_at = None
  self.assigned_chunks[worker_id].discard(chunk_id)
  self.pending_chunks.append(chunk_id)
+
+ # Clear assigned ranges for this chunk/worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[chunk_id].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared {len(ranges_to_clear)} assigned ranges for failed chunk {chunk_id}"
+ )
+
  return True
  return False

@@ -240,18 +300,62 @@ class ChunkManager:
  chunk.assigned_at = None
  self.pending_chunks.append(chunk_id)

+ # Clear assigned ranges for this worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[
+ chunk_id
+ ].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+
+ if ranges_to_clear:
+ logger.info(
+ f"Released {len(ranges_to_clear)} ranges from chunk {chunk_id} "
+ f"previously assigned to disconnected worker {worker_id}"
+ )
+
  if worker_id in self.assigned_chunks:
  del self.assigned_chunks[worker_id]

+ def mark_ranges_processed(
+ self, chunk_id: str, processed_ranges: List[Tuple[int, int]], worker_id: str
+ ):
+ """Remove ranges from assignment tracking once they're processed."""
+ with self.lock:
+ if chunk_id in self.assigned_ranges:
+ for start, end in processed_ranges:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker == worker_id:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared assignment of range {start}-{end} in chunk {chunk_id} "
+ f"after processing by {worker_id}"
+ )
+ else:
+ logger.warning(
+ f"Worker {worker_id} claims to have processed range {start}-{end} "
+ f"in chunk {chunk_id}, but it was assigned to {assigned_worker}"
+ )
+
  def get_stats(self) -> Dict[str, int]:
  """Get chunk statistics."""
  with self.lock:
+ # Count total assigned ranges
+ total_assigned_ranges = sum(len(ranges) for ranges in self.assigned_ranges.values())
+
  stats = {
  "total": len(self.chunks),
  "pending": len(self.pending_chunks),
  "assigned": sum(len(chunks) for chunks in self.assigned_chunks.values()),
  "completed": sum(1 for c in self.chunks.values() if c.status == "completed"),
  "failed": sum(1 for c in self.chunks.values() if c.status == "failed"),
+ "assigned_ranges": total_assigned_ranges,
  }
  return stats

@@ -491,13 +595,15 @@ class Orchestrator:
  with self.chunk_manager.lock:
  for chunk_state in shard_info["chunks"]:
  if chunk_state.status in ["pending", "failed", "assigned"]:
- # ChunkState already has shard_url stored
+ # For assigned chunks, reset them to pending since workers don't exist
  chunk = ShardChunk(
  chunk_id=chunk_state.chunk_id,
  shard_url=chunk_state.shard_url,
  shard_name=chunk_state.shard_name,
  start_index=chunk_state.start_index,
  chunk_size=chunk_state.chunk_size,
+ status="pending", # Reset to pending
+ assigned_to=None, # Clear assignment
  )
  self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
  self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
@@ -1811,10 +1917,24 @@ class Orchestrator:
  # Don't forget the last range
  ranges.append((start, end))

- # Mark ranges as processed (mark_items_processed expects absolute indices)
+ # Mark ranges as processed
  for start_idx, end_idx in ranges:
  self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

+ with self.chunk_manager.lock:
+ if chunk_id in self.chunk_manager.assigned_ranges:
+ for start_idx, end_idx in ranges:
+ # Clear any assignments in this range
+ to_remove = []
+ for range_start, range_end in self.chunk_manager.assigned_ranges[
+ chunk_id
+ ]:
+ if range_start >= start_idx and range_end <= end_idx:
+ to_remove.append((range_start, range_end))
+
+ for range_key in to_remove:
+ del self.chunk_manager.assigned_ranges[chunk_id][range_key]
+
  # Clear pending items
  self.pending_processed_items.clear()
  self.last_item_batch_flush = time.time()
@@ -2027,15 +2147,15 @@ class Orchestrator:
  last_known_total = current_total_outputs

  # Log rate information when workers are connected
- if (
- worker_count > 0 and self.rate_tracker["current_rate"] >= 0
- ): # Only log non-negative rates
- logger.info(
- f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
- f"(avg: {self.rate_tracker['average_rate']:.1f}, "
- f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
- f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
- )
+ # if (
+ # worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+ # ): # Only log non-negative rates
+ # logger.info(
+ # f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
+ # f"(avg: {self.rate_tracker['average_rate']:.1f}, "
+ # f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
+ # f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
+ # )

  await self._broadcast_stats()

caption_flow/storage.py CHANGED
@@ -386,10 +386,15 @@ class StorageManager:

  # Filter new data to exclude duplicates
  new_rows = []
+ duplicate_rows = []
  for row in prepared_buffer:
  if row["job_id"] not in existing_job_ids:
  new_rows.append(row)
+ elif row not in duplicate_rows:
+ duplicate_rows.append(row)

+ if duplicate_rows:
+ logger.info(f"Example duplicate row: {duplicate_rows[0]}")
  if new_rows:
  # Create table from new rows only
  new_table = pa.Table.from_pylist(new_rows, schema=self.caption_schema)
caption_flow/utils/chunk_tracker.py CHANGED
@@ -441,9 +441,27 @@ class ChunkTracker(CheckpointTracker):
  )

  def get_chunk_with_unprocessed_items(self, chunk_id: str) -> Optional[Dict[str, Any]]:
- """Get chunk info including unprocessed ranges."""
- if chunk_id not in self.chunks:
+ """Get chunk info with unprocessed item ranges."""
+ chunk_state = self.chunks.get(chunk_id)
+ if not chunk_state:
  return None

- chunk = self.chunks[chunk_id]
- return {"chunk": chunk.to_dict(), "unprocessed_ranges": chunk.get_unprocessed_ranges()}
+ # During startup or if no worker is assigned, treat all unprocessed as available
+ if not hasattr(self, "_startup_complete"):
+ self._startup_complete = False
+
+ if not self._startup_complete or not chunk_state.assigned_to:
+ # Return all unprocessed ranges
+ return {
+ "chunk_id": chunk_id,
+ "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+ "status": chunk_state.status,
+ }
+
+ # Normal operation - only return ranges not being worked on
+ # This would need more complex tracking of which ranges each worker is processing
+ return {
+ "chunk_id": chunk_id,
+ "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+ "status": chunk_state.status,
+ }
caption_flow/utils/dataset_loader.py CHANGED
@@ -217,17 +217,26 @@ class DatasetLoader:
  return dataset_path, start_idx, chunk_size

  def iterate_shard(
- self, shard_url: str, processed_keys: Optional[set] = None
+ self,
+ shard_url: str,
+ processed_keys: Optional[set] = None,
+ unprocessed_ranges: Optional[List[Tuple[int, int]]] = None,
  ) -> Generator[Tuple[str, str, bytes], None, None]:
  """
  Iterate over items in a shard.

+ Args:
+ shard_url: URL or identifier of the shard
+ processed_keys: Set of already processed keys to skip
+ unprocessed_ranges: Specific ranges to process (for HF datasets)
+
  Yields:
  Tuple of (key, url, image_bytes)
  """
- # Check if this is a virtual HuggingFace dataset shard
  if shard_url.startswith("hf_dataset:"):
- yield from self._iterate_hf_dataset_shard(shard_url, processed_keys)
+ raise ValueError(
+ "Virtual HuggingFace dataset shards should use iterate_shard_with_metadata()"
+ )
  else:
  # Regular WebDataset shard
  ds = self.load_shard(shard_url, processed_keys)
@@ -296,296 +305,69 @@ class DatasetLoader:
  )

  try:
- # Try optimized approach for large skips
- if start_idx > 100:
- dataset = self._create_dataset_at_position(dataset_path, self.split, start_idx)
- if dataset:
- items_processed = 0
-
- for item in dataset:
- # Stop after processing chunk_size items
- if items_processed >= chunk_size:
- break
-
- # Generate a unique key for this item
- key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
- if key in processed_keys:
- items_processed += 1
- continue
-
- try:
- # Extract image data
- if self.image_column in item:
- img_data = item[self.image_column]
-
- # Process image to bytes
- image_bytes = ImageProcessor.process_image_data(img_data)
-
- if image_bytes:
- # Extract all metadata (excluding the image column)
- metadata = {
- k: v for k, v in item.items() if k != self.image_column
- }
-
- # URL is virtual for HF datasets
- url = f"hf://{dataset_path}#{start_idx + items_processed}"
- items_processed += 1
- yield key, url, image_bytes, metadata
- else:
- logger.warning(
- f"Failed to process image for item at index {start_idx + items_processed}"
- )
- items_processed += 1
- continue
- else:
- logger.warning(
- f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
- f"Available columns: {list(item.keys())}"
- )
- items_processed += 1
-
- except Exception as e:
- logger.error(
- f"Error processing item at index {start_idx + items_processed}: {e}"
- )
- items_processed += 1
- continue
-
- return
-
- # Fall back to regular approach for small skips or if StatefulDataLoader not available
- dataset = load_dataset(
- dataset_path,
- split=self.split,
- streaming=True,
- token=self.token,
- )
-
- # Skip to start index if needed
- if start_idx > 0:
- dataset = dataset.skip(start_idx)
-
+ # For HF datasets, we iterate through the full chunk range
+ # The actual range filtering happens in the shard processor
  items_processed = 0
+ current_abs_idx = start_idx
+
+ while items_processed < chunk_size:
+ # Create a fresh dataset iterator for each batch
+ # This avoids issues with stateful iterators
+ batch_size = min(1000, chunk_size - items_processed) # Process in smaller batches
+
+ dataset = load_dataset(
+ dataset_path,
+ split=self.split,
+ streaming=True,
+ token=self.token,
+ )

- for item in dataset:
- # Stop after processing chunk_size items
- if items_processed >= chunk_size:
- break
-
- # Generate a unique key for this item
- key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
- if key in processed_keys:
- items_processed += 1
- continue
-
- try:
- # Extract image data
- if self.image_column in item:
- img_data = item[self.image_column]
+ # Skip to current position
+ if current_abs_idx > 0:
+ dataset = dataset.skip(current_abs_idx)

- # Process image to bytes
- image_bytes = ImageProcessor.process_image_data(img_data)
+ batch_processed = 0
+ for item in dataset:
+ if batch_processed >= batch_size or items_processed >= chunk_size:
+ break

- if image_bytes:
- # Extract all metadata (excluding the image column)
- metadata = {k: v for k, v in item.items() if k != self.image_column}
+ # Generate key
+ key = f"{dataset_path.replace('/', '_')}_{current_abs_idx:08d}"

- # URL is virtual for HF datasets
- url = f"hf://{dataset_path}#{start_idx + items_processed}"
- items_processed += 1
- yield key, url, image_bytes, metadata
- else:
- logger.warning(
- f"Failed to process image for item at index {start_idx + items_processed}"
- )
- items_processed += 1
- continue
- else:
- logger.warning(
- f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
- f"Available columns: {list(item.keys())}"
- )
+ if key in processed_keys:
+ current_abs_idx += 1
+ batch_processed += 1
  items_processed += 1
+ continue

- except Exception as e:
- logger.error(
- f"Error processing item at index {start_idx + items_processed}: {e}"
- )
- items_processed += 1
- continue
+ try:
+ if self.image_column in item:
+ img_data = item[self.image_column]
+ image_bytes = ImageProcessor.process_image_data(img_data)

- except Exception as e:
- logger.error(f"Error loading HuggingFace dataset: {e}")
- return
+ if image_bytes:
+ metadata = {k: v for k, v in item.items() if k != self.image_column}
+ url = f"hf://{dataset_path}#{current_abs_idx}"

- def _iterate_hf_dataset_shard(
- self, shard_url: str, processed_keys: Optional[set] = None
- ) -> Generator[Tuple[str, str, bytes], None, None]:
- """Iterate over a virtual HuggingFace dataset shard."""
- if processed_keys is None:
- processed_keys = set()
+ yield key, url, image_bytes, metadata

- dataset_path, start_idx, chunk_size = self._parse_virtual_shard(shard_url)
-
- # IMPORTANT: Check if start_idx is beyond dataset bounds
- if self._hf_total_items is not None and start_idx >= self._hf_total_items:
- logger.warning(
- f"Virtual shard starts at index {start_idx} but dataset only has "
- f"{self._hf_total_items} items. Skipping this shard."
- )
- return
-
- logger.info(
- f"Loading HuggingFace dataset in streaming mode: {dataset_path} "
- f"(split: {self.split}, start: {start_idx}, chunk_size: {chunk_size})"
- )
-
- try:
- # Try optimized approach for large skips
- if start_idx > 100:
- dataset = self._create_dataset_at_position(dataset_path, self.split, start_idx)
- if dataset:
- items_processed = 0
-
- for item in dataset:
- # Stop after processing chunk_size items
- if items_processed >= chunk_size:
- logger.info(f"Completed chunk: processed {items_processed} items")
- break
-
- # Also stop if we've reached the dataset end
- if (
- self._hf_total_items
- and (start_idx + items_processed) >= self._hf_total_items
- ):
- logger.info(
- f"Reached dataset end at item {start_idx + items_processed} "
- f"(total: {self._hf_total_items})"
- )
- break
-
- # Generate a unique key for this item
- key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
- if key in processed_keys:
- items_processed += 1
- continue
-
- try:
- # Extract image data
- if self.image_column in item:
- img_data = item[self.image_column]
-
- # Delegate image processing to ImageProcessor
- image_bytes = ImageProcessor.process_image_data(img_data)
-
- if image_bytes:
- # URL is virtual for HF datasets
- url = f"hf://{dataset_path}#{start_idx + items_processed}"
- items_processed += 1
- yield key, url, image_bytes
- else:
- logger.warning(
- f"Failed to process image for item at index {start_idx + items_processed}"
- )
- items_processed += 1
- continue
- else:
- logger.warning(
- f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
- f"Available columns: {list(item.keys())}"
- )
- items_processed += 1
-
- except Exception as e:
- logger.error(
- f"Error processing item at index {start_idx + items_processed}: {e}"
- )
+ current_abs_idx += 1
+ batch_processed += 1
  items_processed += 1
- continue
-
- logger.info(
- f"Virtual shard complete: processed {items_processed} items "
- f"(start_idx: {start_idx})"
- )
- return
-
- # Fall back to regular approach for small skips or if StatefulDataLoader not available
- dataset = load_dataset(
- dataset_path,
- split=self.split,
- streaming=True,
- token=self.token,
- )
-
- # Use dataset.skip() for efficient skipping
- if start_idx > 0:
- dataset = dataset.skip(start_idx)
- logger.info(f"Skipped to index {start_idx}")
-
- items_processed = 0
-
- # Now enumerate starts from 0 after skip
- for item in dataset:
- # Stop after processing chunk_size items
- if items_processed >= chunk_size:
- logger.info(f"Completed chunk: processed {items_processed} items")
- break
-
- # Also stop if we've reached the dataset end
- if self._hf_total_items and (start_idx + items_processed) >= self._hf_total_items:
- logger.info(
- f"Reached dataset end at item {start_idx + items_processed} "
- f"(total: {self._hf_total_items})"
- )
- break
-
- # Generate a unique key for this item - ensure proper formatting
- key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
- if key in processed_keys:
- items_processed += 1
- continue
-
- try:
- # Extract image data - check configured column name
- if self.image_column in item:
- img_data = item[self.image_column]
-
- # Delegate image processing to ImageProcessor
- image_bytes = ImageProcessor.process_image_data(img_data)
-
- if image_bytes:
- # URL is virtual for HF datasets
- url = f"hf://{dataset_path}#{start_idx + items_processed}"
- items_processed += 1
- yield key, url, image_bytes
  else:
  logger.warning(
- f"Failed to process image for item at index {start_idx + items_processed}"
+ f"No image column '{self.image_column}' at index {current_abs_idx}"
  )
+ current_abs_idx += 1
+ batch_processed += 1
  items_processed += 1
- continue
- else:
- logger.warning(
- f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
- f"Available columns: {list(item.keys())}"
- )
- items_processed += 1
-
- except Exception as e:
- logger.error(
- f"Error processing item at index {start_idx + items_processed}: {e}"
- )
- items_processed += 1
- continue

- logger.info(
- f"Virtual shard complete: processed {items_processed} items "
- f"(start_idx: {start_idx})"
- )
+ except Exception as e:
+ logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+ current_abs_idx += 1
+ batch_processed += 1
+ items_processed += 1
+ continue

  except Exception as e:
  logger.error(f"Error loading HuggingFace dataset: {e}")
caption_flow/utils/shard_processor.py CHANGED
@@ -7,6 +7,8 @@ from abc import ABC, abstractmethod
  from pathlib import Path
  from typing import Generator, Tuple, Optional, Dict, Any
  from dataclasses import dataclass
+ from datasets import load_dataset
+ from .image_processor import ImageProcessor
  from threading import Event
  import shlex

@@ -108,10 +110,7 @@ class HFDatasetShardProcessor(ShardProcessor):
  connected: Event,
  ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
  """
- Process HuggingFace virtual shard chunk with metadata.
-
- Yields:
- Tuple of (key, url, image_data, metadata)
+ Process HuggingFace virtual shard chunk with metadata, range by range.
  """
  if not dataset_loader:
  logger.error("No dataset loader configured for HuggingFace dataset shard")
@@ -121,49 +120,114 @@ class HFDatasetShardProcessor(ShardProcessor):
  unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])

  logger.info(
- f"Processing HF dataset chunk {chunk.chunk_id} with ranges: {unprocessed_ranges}"
+ f"Processing HF dataset chunk {chunk.chunk_id} with {len(unprocessed_ranges)} ranges"
  )

- items_processed = 0
- current_idx = 0
-
- # Construct proper virtual shard URL
- parts = chunk.shard_url.split("_chunk_")
- if len(parts) == 2:
- base_path = parts[0]
- virtual_shard_url = f"{base_path}:chunk:{chunk.start_index}"
- else:
- virtual_shard_url = chunk.shard_url
-
- logger.debug(f"Using virtual shard URL: {virtual_shard_url}")
+ items_yielded = 0

- # Use the new iterate method that includes metadata
- for key, url, image_data, metadata in dataset_loader.iterate_shard_with_metadata(
- virtual_shard_url
- ):
- # Check if we should stop
+ # Process each range independently with its own iterator
+ for range_start, range_end in unprocessed_ranges:
  if should_stop.is_set() or not connected.is_set():
  logger.info(f"Stopping chunk processing early due to disconnect")
  break

- # Check if current index is in any unprocessed range
- in_range = any(start <= current_idx <= end for start, end in unprocessed_ranges)
-
- if not in_range:
- current_idx += 1
- continue # Skip already processed items
+ # Calculate absolute indices for this range
+ abs_start = chunk.start_index + range_start
+ abs_end = chunk.start_index + range_end
+ range_size = range_end - range_start + 1

- # Check if we've processed enough for this chunk
- if current_idx >= chunk.chunk_size:
- break
+ logger.debug(
+ f"Processing range [{range_start}, {range_end}] "
+ f"(absolute: [{abs_start}, {abs_end}])"
+ )

- items_processed += 1
- current_idx += 1
- yield key, url, image_data, metadata
+ try:
+ # Create a fresh dataset iterator for this range
+ dataset = load_dataset(
+ dataset_loader.dataset_path,
+ split=dataset_loader.split,
+ streaming=True,
+ token=dataset_loader.token,
+ )
+
+ # Use state_dict if available for efficient positioning
+ if hasattr(dataset, "load_state_dict") and hasattr(dataset, "state_dict"):
+ try:
+ state = dataset.state_dict()
+ # Modify state to jump to abs_start
+ if "num_examples_since_previous_state" in state:
+ state["num_examples_since_previous_state"] = abs_start
+ if "examples_iterable" in state and isinstance(
+ state["examples_iterable"], dict
+ ):
+ if "shard_example_idx" in state["examples_iterable"]:
+ state["examples_iterable"]["shard_example_idx"] = abs_start
+ dataset.load_state_dict(state)
+ logger.debug(f"Positioned dataset at index {abs_start} using state_dict")
+ except Exception as e:
+ logger.debug(f"Could not use state_dict, falling back to skip: {e}")
+ dataset = dataset.skip(abs_start)
+ else:
+ # Fall back to skip
+ dataset = dataset.skip(abs_start)
+
+ # Process items in this range
+ range_items = 0
+ for item in dataset:
+ if range_items >= range_size:
+ break
+
+ if should_stop.is_set() or not connected.is_set():
+ break
+
+ # Generate key for this item
+ current_abs_idx = abs_start + range_items
+ key = f"{dataset_loader.dataset_path.replace('/', '_')}_{current_abs_idx:08d}"
+
+ try:
+ if dataset_loader.image_column in item:
+ img_data = item[dataset_loader.image_column]
+ image_bytes = ImageProcessor.process_image_data(img_data)
+
+ if image_bytes:
+ # Extract metadata
+ metadata = {
+ k: v
+ for k, v in item.items()
+ if k != dataset_loader.image_column
+ }
+ # Add chunk-relative index to metadata
+ metadata["_chunk_relative_index"] = range_start + range_items
+
+ url = f"hf://{dataset_loader.dataset_path}#{current_abs_idx}"
+
+ items_yielded += 1
+ range_items += 1
+
+ yield key, url, image_bytes, metadata
+ else:
+ logger.warning(
+ f"Failed to process image at index {current_abs_idx}"
+ )
+ range_items += 1
+ else:
+ logger.warning(
+ f"No image column '{dataset_loader.image_column}' at index {current_abs_idx}"
+ )
+ range_items += 1
+
+ except Exception as e:
+ logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+ range_items += 1
+ continue
+
+ except Exception as e:
+ logger.error(f"Error processing range [{range_start}, {range_end}]: {e}")
+ continue

  logger.info(
- f"HF dataset chunk {chunk.chunk_id}: yielded {items_processed} items "
- f"from ranges {unprocessed_ranges}"
+ f"HF dataset chunk {chunk.chunk_id}: yielded {items_yielded} items "
+ f"from {len(unprocessed_ranges)} ranges"
  )

caption_flow-0.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.2.1
+ Version: 0.2.2
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT
caption_flow-0.2.2.dist-info/RECORD CHANGED
@@ -1,29 +1,29 @@
  caption_flow/__init__.py,sha256=NLPJ25lRN7xHqncXweINDNwbt0q8lgjZ30G21zlPdRs,303
- caption_flow/cli.py,sha256=bHxx66CPsCmSieaH3pw8NZBojIIbniRTdU9mEBHMmWA,28832
+ caption_flow/cli.py,sha256=fkyQHzs5kei6-9ftkbJjko-K67TARxd7yNf7x9e7KSs,28820
  caption_flow/models.py,sha256=qo6lQiO10UISbaBVr6Cs-fSW_pmjwE6kmiTmmU_l3Wk,2140
  caption_flow/monitor.py,sha256=ZZCSasYLKJ-UzA3-RoAtytv-tbNA-m3h5YjlZg_vukg,7870
- caption_flow/orchestrator.py,sha256=bZ8NnGdqoXSmu7Nq-_7cOSH1DLHkBT88cne0uDyPeNY,89112
- caption_flow/storage.py,sha256=hC6ZHT_PHFoUVjqD5JUwy3_79oAD1e1H30neA_xsz7s,40748
+ caption_flow/orchestrator.py,sha256=9yWKVcaR-S6naNQSd7Np8AemwV5lNDmB_lCufpvVrS0,96282
+ caption_flow/storage.py,sha256=kGv9iQAgxwLLlAIPU6TBrlagdfxA339eBz1xG0yYRsc,40981
  caption_flow/utils/__init__.py,sha256=F1BChVoCsj9zn1GJRBOLHET1kLW6xrAmsbzcR7hHy6Y,202
  caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
  caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
  caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
  caption_flow/utils/checkpoint_tracker.py,sha256=8tsTFF-HcygitK92YcS-QWzeg-qRm9AuCpQoQRfC8M0,3335
- caption_flow/utils/chunk_tracker.py,sha256=hKn8CN6ubErc9kuCWZMj12ZCZKxVlqXqAEocbzjfa-k,17296
- caption_flow/utils/dataset_loader.py,sha256=ZplJv655ZMyUbaZC4BBiL5II18sBy4JSJhxGZtK_VmA,29107
+ caption_flow/utils/chunk_tracker.py,sha256=SO6ERvEwGXuikGDVaXFota_3Ix8BnePMU7CiZJKBAnQ,18025
+ caption_flow/utils/dataset_loader.py,sha256=Bvo-aa5jWtjzqXW0rEisdiWaN7Q-aH02rXXUu9uXqGo,19194
  caption_flow/utils/image_processor.py,sha256=Zl8TAv9gYPdAYat3UiTuuNdIb2fXNfZ35AxsxuovJTs,5650
  caption_flow/utils/job_queue.py,sha256=itdfXcrkvGjmXn4qtpgMF63k1ufRBaejDe4V6WcxzgU,1104
  caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
  caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
- caption_flow/utils/shard_processor.py,sha256=CRda6M4xh4U0vwvYlzq9nJEzz4d_4yzUBosYAeBcPEA,10854
+ caption_flow/utils/shard_processor.py,sha256=c6COBKhFzZyUeJqot5uGVR3ANeOReBfs8-DR27mrdcA,14242
  caption_flow/utils/shard_tracker.py,sha256=Wt2oE-O85F2FxSnqIocJiaYeFn00OVVjIiklZIZRGL8,3233
  caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
  caption_flow/workers/base.py,sha256=jPm_Xw4Lxd0cnrPs-biBqKRQKkTOJLvHLolmp0Gb1CI,7530
  caption_flow/workers/caption.py,sha256=NZ9kTjk2uOoNwyyNSkB_arYk213vLr5mowHN-OjiFkk,54631
  caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
- caption_flow-0.2.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
- caption_flow-0.2.1.dist-info/METADATA,sha256=fxNfSOqkCklb96aq3ZFU7SvRuXEBUQ11xbjkQn7Yzuo,11941
- caption_flow-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- caption_flow-0.2.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
- caption_flow-0.2.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
- caption_flow-0.2.1.dist-info/RECORD,,
+ caption_flow-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ caption_flow-0.2.2.dist-info/METADATA,sha256=h9VN2ZWXVDH935Eavb-1kfsBpuW7m4Oph3tjh9ucc3w,11941
+ caption_flow-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ caption_flow-0.2.2.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+ caption_flow-0.2.2.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+ caption_flow-0.2.2.dist-info/RECORD,,
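
The core behavioral change in 0.2.2 is the orchestrator's new per-chunk map of item ranges to the worker that owns them ({chunk_id: {(start, end): worker_id}}), which every assignment and release path now consults. The standalone sketch below is illustrative only and not part of the package (the name RangeLedger is made up); it assumes inclusive (start, end) tuples with a single owner per range, mirroring the bookkeeping the diff adds to ChunkManager.

from collections import defaultdict
from typing import Dict, List, Tuple

class RangeLedger:
    """Toy model of the 0.2.2 range-assignment bookkeeping (illustrative only)."""

    def __init__(self) -> None:
        # {chunk_id: {(start, end): worker_id}}
        self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

    def claim(self, chunk_id: str, ranges: List[Tuple[int, int]], worker_id: str) -> List[Tuple[int, int]]:
        """Return only the ranges this worker may process, recording ownership."""
        granted: List[Tuple[int, int]] = []
        for key in ranges:
            owner = self.assigned_ranges[chunk_id].get(key)
            if owner is None or owner == worker_id:
                self.assigned_ranges[chunk_id][key] = worker_id
                granted.append(key)
            # A range owned by a different worker is skipped: no double allocation.
        return granted

    def release_worker(self, chunk_id: str, worker_id: str) -> None:
        """Free every range a failed or disconnected worker held in a chunk."""
        held = [k for k, w in self.assigned_ranges[chunk_id].items() if w == worker_id]
        for key in held:
            del self.assigned_ranges[chunk_id][key]

    def complete_chunk(self, chunk_id: str) -> None:
        """Drop all tracking once a chunk is fully processed."""
        self.assigned_ranges.pop(chunk_id, None)

if __name__ == "__main__":
    ledger = RangeLedger()
    print(ledger.claim("c0", [(0, 99), (100, 199)], "worker-a"))     # both granted
    print(ledger.claim("c0", [(100, 199), (200, 299)], "worker-b"))  # only (200, 299)
    ledger.release_worker("c0", "worker-a")
    print(ledger.claim("c0", [(0, 99)], "worker-b"))                 # now grantable

Ranges held by another worker are skipped at assignment time, and the ledger entry for a chunk is dropped when the chunk completes or its worker disconnects, which is the same lifecycle the completion and worker-release paths implement in the orchestrator diff above.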