PyPI - caption-flow - Versions diffs - 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

caption-flow 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

caption_flow/__init__.py +3 -3
caption_flow/cli.py +934 -415
caption_flow/models.py +45 -3
caption_flow/monitor.py +2 -3
caption_flow/orchestrator.py +153 -104
caption_flow/processors/__init__.py +3 -3
caption_flow/processors/base.py +8 -7
caption_flow/processors/huggingface.py +439 -67
caption_flow/processors/local_filesystem.py +24 -28
caption_flow/processors/webdataset.py +28 -22
caption_flow/storage/exporter.py +420 -339
caption_flow/storage/manager.py +636 -756
caption_flow/utils/__init__.py +1 -1
caption_flow/utils/auth.py +1 -1
caption_flow/utils/caption_utils.py +1 -1
caption_flow/utils/certificates.py +15 -8
caption_flow/utils/checkpoint_tracker.py +30 -28
caption_flow/utils/chunk_tracker.py +153 -56
caption_flow/utils/image_processor.py +9 -9
caption_flow/utils/json_utils.py +37 -20
caption_flow/utils/prompt_template.py +24 -16
caption_flow/utils/vllm_config.py +5 -4
caption_flow/viewer.py +4 -12
caption_flow/workers/base.py +5 -4
caption_flow/workers/caption.py +265 -90
caption_flow/workers/data.py +6 -8
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
caption_flow-0.4.0.dist-info/RECORD +33 -0
caption_flow-0.3.4.dist-info/RECORD +0 -33
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0

caption_flow/processors/huggingface.py CHANGED Viewed

@@ -1,33 +1,36 @@
 """HuggingFace Datasets processor implementation - Memory Optimized Version."""
-import logging
-import threading
-import re
-import queue
-import requests
-import json
+import gc
 import io
+import json
+import logging
 import os
-import gc
-import psutil
-from concurrent.futures import ThreadPoolExecutor, Future
-from typing import Dict, Any, List, Optional, Iterator, Set, Deque, Tuple
-from collections import deque, defaultdict
-from pathlib import Path
+import queue
+import re
+import threading
+import time
+from collections import defaultdict, deque
+from concurrent.futures import Future, ThreadPoolExecutor
 from datetime import datetime
-from PIL import Image
-import pyarrow as pa
+from pathlib import Path
+from typing import Any, Deque, Dict, Iterator, List, Optional, Set, Tuple
+import psutil
 import pyarrow.parquet as pq
+import requests
 from datasets import get_dataset_config_names, get_dataset_split_names
-from huggingface_hub import hf_hub_download, get_token
+from huggingface_hub import get_token, hf_hub_download
+from PIL import Image
+from tqdm import tqdm
 from caption_flow.storage import StorageManager
-from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
-from ..utils import ChunkTracker
 from ..models import JobId
+from ..utils import ChunkTracker
+from .base import OrchestratorProcessor, ProcessorConfig, WorkerProcessor, WorkResult, WorkUnit
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 def log_memory(location: str):
@@ -41,10 +44,6 @@ def log_memory(location: str):
     gc.collect()
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
 class NonBlockingQueueHandler:
     """Handles non-blocking retrieval from queues using concurrent futures."""
@@ -146,6 +145,9 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         cfg = config.config
+        # Store storage reference for chunk state synchronization
+        self.storage = storage
         # Dataset configuration
         dataset_cfg = cfg.get("dataset", {})
         self.dataset_name = dataset_cfg.get("dataset_path")
@@ -340,6 +342,8 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         # Cache shard info
         try:
+            # make dir if it doesn't exist already
+            shard_info_cache_path.parent.mkdir(parents=True, exist_ok=True)
             cache_data = {
                 "dataset": self.dataset_name,
                 "config": self.config,
@@ -367,9 +371,18 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         raise ValueError(f"Global index {global_index} not found in any shard")
     def _restore_state(self, storage: StorageManager) -> None:
-        """Restore state from chunk tracker."""
-        logger.debug("Restoring state from chunk tracker")
+        """Restore state from chunk tracker and synchronize with storage."""
+        logger.debug("Restoring state from chunk tracker and synchronizing with storage")
+        # FIRST: Update chunk tracker from storage (like WebDataset does)
+        if storage:
+            processed_job_ids = storage.get_all_processed_job_ids()
+            if processed_job_ids:
+                self.update_from_storage(processed_job_ids)
+        # THEN: Restore work units from chunk tracker
         if not self.chunk_tracker:
+            logger.warning("No chunk tracker available for state restoration")
             return
         with self.lock:
@@ -382,14 +395,120 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 # Only add incomplete chunks to pending
                 if chunk_state.status != "completed":
                     self.pending_units.append(chunk_id)
-                elif chunk_state.status == "completed" and chunk_state.processed_ranges:
-                    logger.warning(
-                        f"Chunk {chunk_id} has processed_ranges stored in the checkpoint."
-                    )
             self.current_chunk_index = max_chunk_index + 1
             logger.info(f"Resuming from chunk index {self.current_chunk_index}")
+        # Flush checkpoint after major update
+        self.chunk_tracker.flush()
+    def _create_work_units_from_chunk(self, chunk_index: int) -> List[WorkUnit]:
+        """Create one or more work units from a chunk, splitting large gaps in unprocessed ranges."""
+        units = []
+        base_unit = self._create_work_unit(chunk_index)
+        if not base_unit:
+            return []
+        # Check if we should split this into multiple work units based on gaps
+        unprocessed_ranges = base_unit.data["unprocessed_ranges"]
+        if len(unprocessed_ranges) <= 1:
+            # Single range or no ranges, return as-is
+            return [base_unit]
+        # Check for large gaps between ranges that suggest we should split
+        total_span = unprocessed_ranges[-1][1] - unprocessed_ranges[0][0] + 1
+        total_work = sum(end - start + 1 for start, end in unprocessed_ranges)
+        gap_ratio = (total_span - total_work) / total_span if total_span > 0 else 0
+        # If gaps are more than 50% of the span, split into separate work units
+        if gap_ratio > 0.5 and len(unprocessed_ranges) > 1:
+            logger.debug(
+                f"Splitting chunk {chunk_index} with {len(unprocessed_ranges)} ranges (gap ratio: {gap_ratio:.1%})"
+            )
+            # Create separate work units for each contiguous range group
+            current_group = []
+            for i, (start, end) in enumerate(unprocessed_ranges):
+                if not current_group:
+                    current_group = [(start, end)]
+                else:
+                    # Check gap to previous range
+                    prev_end = current_group[-1][1]
+                    gap_size = start - prev_end - 1
+                    # If gap is large (>100 items), start new group
+                    if gap_size > 100:
+                        # Create work unit for current group
+                        units.append(self._create_range_work_unit(chunk_index, current_group))
+                        current_group = [(start, end)]
+                    else:
+                        # Add to current group
+                        current_group.append((start, end))
+            # Don't forget the last group
+            if current_group:
+                units.append(self._create_range_work_unit(chunk_index, current_group))
+            return [unit for unit in units if unit is not None]
+        else:
+            # Keep as single unit
+            return [base_unit]
+    def _create_range_work_unit(
+        self, chunk_index: int, ranges: List[Tuple[int, int]]
+    ) -> Optional[WorkUnit]:
+        """Create a work unit for specific ranges within a chunk."""
+        if not ranges:
+            return None
+        current_index = chunk_index * self.chunk_size
+        chunk_size = min(self.chunk_size, self.total_items - current_index)
+        # Find shard for this chunk
+        shard_id, local_idx = self._get_shard_for_index(current_index)
+        shard_name = Path(self.shard_info[shard_id]["filename"]).stem
+        # Create unique unit ID that includes range info
+        range_suffix = f"r{len(ranges)}"  # r2 = 2 ranges, etc.
+        job_id_obj = JobId(
+            shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(current_index)
+        )
+        base_unit_id = job_id_obj.get_chunk_str()
+        unit_id = f"{base_unit_id}_{range_suffix}"
+        unprocessed_items = sum(end - start + 1 for start, end in ranges)
+        unit = WorkUnit(
+            unit_id=unit_id,
+            chunk_id=base_unit_id,  # Keep original chunk_id for tracking
+            source_id=shard_name,
+            unit_size=unprocessed_items,
+            data={
+                "dataset_name": self.dataset_name,
+                "config": self.config,
+                "split": self.split,
+                "start_index": current_index,
+                "chunk_size": chunk_size,
+                "actual_work_size": unprocessed_items,
+                "unprocessed_ranges": ranges,
+                "range_based": True,
+                "is_split_unit": True,  # Flag to indicate this is a split from larger chunk
+                "shard_ids": [shard_id],
+                "data_files": self.data_files,
+            },
+            metadata={
+                "dataset": self.dataset_name,
+                "shard_name": shard_name,
+                "chunk_index": chunk_index,
+                "range_count": len(ranges),
+            },
+        )
+        return unit
     def _create_work_unit(self, chunk_index: int) -> Optional[WorkUnit]:
         """Create a single work unit for a chunk index."""
         current_index = chunk_index * self.chunk_size
@@ -400,39 +519,65 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
         chunk_size = min(self.chunk_size, self.total_items - current_index)
         # Find shard for this chunk
-        shard_id, _ = self._get_shard_for_index(current_index)
+        shard_id, local_idx = self._get_shard_for_index(current_index)
         shard_name = Path(self.shard_info[shard_id]["filename"]).stem
-        job_id_obj = JobId(shard_id=shard_name, chunk_id=chunk_index, sample_id=current_index)
+        # Calculate RELATIVE chunk index within the shard
+        job_id_obj = JobId(
+            shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(current_index)
+        )
         unit_id = job_id_obj.get_chunk_str()
         # Calculate unprocessed ranges based on existing chunk state
         unprocessed_ranges = [(current_index, current_index + chunk_size - 1)]
         if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
             chunk_state = self.chunk_tracker.chunks[unit_id]
             if chunk_state.processed_ranges:
+                # Convert relative processed ranges to absolute ranges
+                absolute_processed_ranges = [
+                    (start + current_index, end + current_index)
+                    for start, end in chunk_state.processed_ranges
+                ]
                 # Subtract processed ranges from total range
+                range_to_subtract = (current_index, current_index + chunk_size - 1)
+                logger.debug(
+                    f"Chunk {unit_id} has processed ranges: {chunk_state.processed_ranges} (relative), {absolute_processed_ranges} (absolute)"
+                )
                 unprocessed_ranges = self._subtract_ranges(
-                    [(current_index, current_index + chunk_size - 1)], chunk_state.processed_ranges
+                    [range_to_subtract], absolute_processed_ranges
                 )
         # If all ranges are processed, return None (shouldn't happen if status tracking is correct)
         if not unprocessed_ranges:
+            logger.debug(f"Chunk {unit_id} has no unprocessed ranges, skipping")
+            return None
+        # Calculate actual unprocessed items and total work to be assigned
+        unprocessed_items = sum(end - start + 1 for start, end in unprocessed_ranges)
+        # Skip assignment if there are very few unprocessed items (< 10 items)
+        if unprocessed_items < 10:
+            logger.debug(
+                f"Chunk {unit_id} has only {unprocessed_items} unprocessed items, skipping assignment"
+            )
             return None
+        # Create work unit that represents ONLY the unprocessed ranges
+        # This is the key fix: don't assign the full chunk, assign only unprocessed parts
         unit = WorkUnit(
             unit_id=unit_id,
             chunk_id=unit_id,
             source_id=shard_name,
-            unit_size=chunk_size,
+            unit_size=unprocessed_items,  # Only the unprocessed items
             data={
                 "dataset_name": self.dataset_name,
                 "config": self.config,
                 "split": self.split,
-                "start_index": current_index,
-                "chunk_size": chunk_size,
-                "unprocessed_ranges": unprocessed_ranges,  # Use calculated ranges
+                "start_index": current_index,  # Keep original chunk start for reference
+                "chunk_size": chunk_size,  # Keep original chunk size for reference
+                "actual_work_size": unprocessed_items,  # NEW: actual work to be done
+                "unprocessed_ranges": unprocessed_ranges,  # The specific ranges to process
+                "range_based": True,  # NEW: flag to indicate this is range-based
                 "shard_ids": [shard_id],
                 "data_files": self.data_files,
             },
@@ -455,23 +600,35 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 assigned_count = sum(len(units) for units in self.assigned_units.values())
                 worker_count = max(1, len(self.assigned_units))
+                # Check if all data has been processed
+                if self.current_chunk_index * self.chunk_size >= self.total_items:
+                    # All chunks processed - exit the background thread
+                    logger.debug("All chunks processed, exiting background thread")
+                    break
                 target_buffer = max(self.min_buffer, worker_count * self.buffer_multiplier)
                 units_needed = max(0, target_buffer - (pending_count + assigned_count))
             if units_needed == 0:
-                threading.Event().wait(5)
+                self.stop_creation.wait(5)
                 continue
             # Create units as needed
             units_created = 0
+            # Progress bar
+            progress_bar = tqdm(total=units_needed, desc="Creating work units", unit="unit")
             while units_created < units_needed:
-                logger.debug(f"Creating work unit for chunk {self.current_chunk_index}")
+                # logger.debug(f"Creating work unit for chunk {self.current_chunk_index}")
                 if self.current_chunk_index * self.chunk_size >= self.total_items:
-                    threading.Event().wait(30)
+                    # No more data available - exit immediately instead of waiting
+                    logger.debug(
+                        f"All chunks processed (chunk_index={self.current_chunk_index}, total_items={self.total_items})"
+                    )
                     break
                 # Get shard info for proper unit_id
-                current_index = self.current_chunk_index * self.chunk_size
+                current_index = self.current_chunk_index
                 if current_index < self.total_items:
                     shard_id, _ = self._get_shard_for_index(current_index)
                     shard_name = Path(self.shard_info[shard_id]["filename"]).stem
@@ -509,14 +666,14 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                     units_created += 1
                     self.current_chunk_index += 1
+                progress_bar.update(1)
             if units_created > 0:
                 logger.debug(f"Created {units_created} work unit IDs")
         logger.info("Thread for creating units has completed. Exiting thread.")
     def process_responses_non_blocking(self, response_queue: queue.Queue) -> Optional[WorkResult]:
-        """
-        Non-blocking method to process responses from workers.
+        """Non-blocking method to process responses from workers.
         Returns a WorkResult if one is available, None otherwise.
         """
         # Check for response without blocking
@@ -551,33 +708,64 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             # Force checkpoint save if needed
             if self.chunk_tracker:
-                self.chunk_tracker.save()
+                # Flush statistics updates immediately
+                self.chunk_tracker.flush()
     def get_work_units(self, count: int, worker_id: str) -> List[WorkUnit]:
         """Get available work units for a worker."""
         logger.debug(
             "get_work_units called: count=%d worker_id=%s, pending: %d",
             count,
             worker_id,
             len(self.pending_units),
         )
+        # Periodically sync with storage to ensure we don't assign already-completed work
+        # This is especially important when workers reconnect
+        if hasattr(self, "storage") and self.storage and len(self.pending_units) > 0:
+            try:
+                processed_job_ids = self.storage.get_all_processed_job_ids()
+                if processed_job_ids:
+                    logger.debug(
+                        f"Syncing chunk tracker with {len(processed_job_ids)} processed items before assignment"
+                    )
+                    self.update_from_storage(processed_job_ids)
+                    # Flush after storage sync to ensure consistency
+                    self.chunk_tracker.flush()
+            except Exception as e:
+                logger.warning(f"Failed to sync with storage during work assignment: {e}")
         assigned = []
         with self.lock:
             while len(assigned) < count and self.pending_units:
                 unit_id = self.pending_units.popleft()
-                # Create work unit on demand
+                # Create work units on demand (may create multiple units from one chunk)
                 chunk_index = int(unit_id.split(":")[-1])
-                unit = self._create_work_unit(chunk_index)
+                units = self._create_work_units_from_chunk(chunk_index)
-                if unit:
-                    self.assigned_units[worker_id].add(unit_id)
+                for unit in units:
+                    if len(assigned) >= count:
+                        # Put remaining units back in queue for next worker
+                        self.pending_units.appendleft(
+                            f"{unit.metadata['shard_name']}:chunk:{chunk_index}"
+                        )
+                        break
+                    # Use the unit's actual unit_id for tracking
+                    actual_unit_id = unit.unit_id
+                    self.assigned_units[worker_id].add(actual_unit_id)
                     assigned.append(unit)
-                    logger.debug("Assigning unit %s to worker %s", unit_id, worker_id)
+                    logger.debug(
+                        "Assigning unit %s (%d items) to worker %s",
+                        actual_unit_id,
+                        unit.data.get("actual_work_size", unit.unit_size),
+                        worker_id,
+                    )
                     if self.chunk_tracker:
-                        self.chunk_tracker.mark_assigned(unit_id, worker_id)
+                        # Track assignment using the base chunk_id for chunk tracker compatibility
+                        self.chunk_tracker.mark_assigned(unit.chunk_id, worker_id)
         logger.debug("Returning %d work units to worker %s", len(assigned), worker_id)
         return assigned
@@ -610,12 +798,52 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
     def release_assignments(self, worker_id: str) -> None:
         """Release all assignments for a disconnected worker."""
         logger.debug("Releasing assignments for worker %s", worker_id)
+        # FIRST: Sync with storage to ensure chunk tracker is up-to-date
+        # This prevents reassigning work that was already completed by this or other workers
+        if hasattr(self, "storage") and self.storage:
+            try:
+                processed_job_ids = self.storage.get_all_processed_job_ids()
+                if processed_job_ids:
+                    logger.info(
+                        f"Syncing chunk tracker with {len(processed_job_ids)} processed items before releasing assignments"
+                    )
+                    self.update_from_storage(processed_job_ids)
+                    # Flush after storage sync to ensure consistency
+                    self.chunk_tracker.flush()
+            except Exception as e:
+                logger.warning(f"Failed to sync with storage before releasing assignments: {e}")
         with self.lock:
             unit_ids = list(self.assigned_units.get(worker_id, []))
             for unit_id in unit_ids:
-                logger.debug(f"Adding {unit_id} to pending queue")
-                self.pending_units.append(unit_id)
+                # Check if this chunk is already fully processed before re-queuing
+                should_requeue = True
+                if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+                    chunk_state = self.chunk_tracker.chunks[unit_id]
+                    if chunk_state.status == "completed":
+                        logger.info(f"Not re-queuing completed chunk {unit_id}")
+                        should_requeue = False
+                    elif chunk_state.processed_ranges:
+                        # Check if chunk is mostly complete (>90% processed)
+                        total_items = chunk_state.chunk_size
+                        processed_items = sum(
+                            end - start + 1 for start, end in chunk_state.processed_ranges
+                        )
+                        completion_rate = processed_items / total_items if total_items > 0 else 0
+                        if completion_rate > 0.9:
+                            logger.info(
+                                f"Not re-queuing nearly complete chunk {unit_id} ({completion_rate:.1%} done)"
+                            )
+                            should_requeue = False
+                if should_requeue:
+                    logger.debug(f"Adding {unit_id} to pending queue")
+                    self.pending_units.append(unit_id)
+                else:
+                    logger.debug(f"Skipping re-queue of {unit_id} (already processed)")
             if worker_id in self.assigned_units:
                 del self.assigned_units[worker_id]
@@ -624,9 +852,87 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
                 self.chunk_tracker.release_worker_chunks(worker_id)
     def update_from_storage(self, processed_job_ids: Set[str]) -> None:
-        """Update work units based on what's been processed."""
+        """Update chunk tracker based on what's been processed in storage."""
         logger.info(f"Updating from storage with {len(processed_job_ids)} processed jobs")
-        # No need to update in-memory work units since we create on demand
+        if not self.chunk_tracker:
+            return
+        # Group by chunk
+        processed_by_chunk = defaultdict(set)
+        for job_id_str in processed_job_ids:
+            try:
+                # Parse job ID to get chunk and sample index
+                job_id = JobId.from_str(job_id_str)
+                chunk_id = job_id.get_chunk_str()
+                sample_idx = int(job_id.sample_id)
+                processed_by_chunk[chunk_id].add(sample_idx)
+            except ValueError as e:
+                logger.warning(f"Invalid job ID format: {job_id_str} - {e}")
+                continue
+        # Update chunk tracker with processed items
+        for chunk_id, indices in processed_by_chunk.items():
+            if not indices:
+                continue
+            # Get or create chunk state
+            chunk_state = self.chunk_tracker.chunks.get(chunk_id)
+            if not chunk_state:
+                # Parse chunk_id using JobId to get info (reuse existing validation)
+                try:
+                    # Reconstruct a valid job_id to parse chunk info
+                    sample_job_id = f"{chunk_id}:idx:0"  # Use dummy sample_id
+                    job_id = JobId.from_str(sample_job_id)
+                    shard_name = job_id.shard_id
+                    chunk_idx = int(job_id.chunk_id)
+                    start_index = chunk_idx * self.chunk_size
+                    # Add chunk to tracker
+                    self.chunk_tracker.add_chunk(
+                        chunk_id,
+                        shard_name,
+                        "",  # URL not needed for HuggingFace
+                        start_index,
+                        self.chunk_size,
+                    )
+                    chunk_state = self.chunk_tracker.chunks[chunk_id]
+                    logger.info(f"Created chunk state for {chunk_id} from storage")
+                except ValueError as e:
+                    logger.error(f"Failed to parse chunk_id {chunk_id}: {e}")
+                    continue
+            # Get chunk start index for conversion (not used in this implementation but kept for clarity)
+            # chunk_start = chunk_state.start_index
+            # Sort absolute indices for range creation
+            sorted_indices = sorted(indices)
+            # Convert to contiguous ranges using absolute indices
+            ranges = []
+            start_range = sorted_indices[0]
+            end_range = sorted_indices[0]
+            for i in range(1, len(sorted_indices)):
+                if sorted_indices[i] == end_range + 1:
+                    end_range = sorted_indices[i]
+                else:
+                    ranges.append((start_range, end_range))
+                    start_range = sorted_indices[i]
+                    end_range = sorted_indices[i]
+            ranges.append((start_range, end_range))
+            # Mark ranges as processed (WITH ABSOLUTE INDICES)
+            logger.info(f"Marking {len(ranges)} ranges as processed in chunk {chunk_id}")
+            for start_idx, end_idx in ranges:
+                self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
+        # Save updated chunk tracker
+        self.chunk_tracker.save()
+        logger.info("Chunk tracker synchronized with storage")
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
@@ -761,7 +1067,7 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
         return url.split("/")[-1]
     def _create_dummy_image(self, index: int, metadata: Dict[str, Any]) -> Image.Image:
-        """Create a dummy image"""
+        """Create a dummy image."""
         color = (0, 0, 0)
         width, height = 128, 128
         image = Image.new("RGB", (width, height), color=color)
@@ -901,17 +1207,83 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 )
                             else:
                                 # Normal processing - load real images
-                                if self.url_column and self.url_column in item:
-                                    image_url = item[self.url_column]
-                                    try:
-                                        response = requests.get(image_url, timeout=30)
-                                        response.raise_for_status()
-                                        image = Image.open(io.BytesIO(response.content))
-                                    except Exception as e:
-                                        logger.error(
-                                            f"Error downloading image from {image_url}: {e}"
+                                if self.url_column:
+                                    if self.url_column in item:
+                                        image_url = item[self.url_column]
+                                        try:
+                                            max_retries = 3
+                                            backoff_factor = 2
+                                            initial_delay = 1  # seconds
+                                            response = None
+                                            for attempt in range(max_retries):
+                                                try:
+                                                    response = requests.get(image_url, timeout=30)
+                                                    response.raise_for_status()
+                                                    break  # Success
+                                                except requests.exceptions.HTTPError as http_err:
+                                                    if (
+                                                        response is not None
+                                                        and response.status_code == 429
+                                                    ):
+                                                        retry_after = response.headers.get(
+                                                            "Retry-After"
+                                                        )
+                                                        sleep_time = initial_delay * (
+                                                            backoff_factor**attempt
+                                                        )
+                                                        if retry_after:
+                                                            try:
+                                                                sleep_time = int(retry_after)
+                                                            except ValueError:
+                                                                pass  # Keep exponential backoff
+                                                        logger.warning(
+                                                            f"Rate limited (429) for {image_url}. Retrying in {sleep_time}s..."
+                                                        )
+                                                        time.sleep(sleep_time)
+                                                    elif (
+                                                        response is not None
+                                                        and 500 <= response.status_code < 600
+                                                    ):
+                                                        delay = initial_delay * (
+                                                            backoff_factor**attempt
+                                                        )
+                                                        logger.warning(
+                                                            f"Server error ({response.status_code}) for {image_url}. Retrying in {delay:.1f}s..."
+                                                        )
+                                                        time.sleep(delay)
+                                                    else:
+                                                        # Non-retriable HTTP error
+                                                        raise http_err
+                                                except (
+                                                    requests.exceptions.RequestException
+                                                ) as req_err:
+                                                    if attempt == max_retries - 1:
+                                                        raise req_err  # Re-raise on last attempt
+                                                    delay = initial_delay * (
+                                                        backoff_factor**attempt
+                                                    )
+                                                    logger.warning(
+                                                        f"Request failed for {image_url}. Retrying in {delay:.1f}s... Error: {req_err}"
+                                                    )
+                                                    time.sleep(delay)
+                                            if response is None or not response.ok:
+                                                logger.error(
+                                                    f"Failed to download image from {image_url} after {max_retries} retries."
+                                                )
+                                                continue
+                                            image = Image.open(io.BytesIO(response.content))
+                                        except Exception as e:
+                                            logger.error(
+                                                f"Error downloading image from {image_url}: {e}"
+                                            )
+                                            continue
+                                    else:
+                                        logger.warning(
+                                            f"URL column '{self.url_column}' not found in item at index {global_idx}"
                                         )
-                                        continue
                                 elif self.image_column and self.image_column in item:
                                     image_data = item[self.image_column]
@@ -930,7 +1302,7 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                             job_id_obj = JobId(
                                 shard_id=shard_name,
                                 chunk_id=str(chunk_index),
-                                sample_id=str(global_idx),
+                                sample_id=str(local_idx),
                             )
                             job_id = job_id_obj.get_sample_str()
@@ -963,7 +1335,7 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 "_processed_indices": processed_indices,
                             }
-                            processed_indices.append(global_idx)
+                            processed_indices.append(local_idx)
                         except Exception as e:
                             logger.error(f"Error processing item at index {global_idx}: {e}")

caption-flow 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

caption-flow 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl