caption-flow 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- caption_flow/__init__.py +1 -1
- caption_flow/orchestrator.py +9 -9
- caption_flow/processors/base.py +3 -0
- caption_flow/processors/huggingface.py +637 -464
- caption_flow/processors/local_filesystem.py +2 -0
- caption_flow/processors/webdataset.py +438 -538
- caption_flow/storage/manager.py +328 -305
- caption_flow/utils/__init__.py +0 -2
- caption_flow/utils/chunk_tracker.py +197 -164
- caption_flow/utils/image_processor.py +19 -132
- caption_flow/workers/caption.py +191 -138
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/METADATA +2 -1
- caption_flow-0.3.2.dist-info/RECORD +33 -0
- caption_flow/utils/dataset_loader.py +0 -222
- caption_flow/utils/dataset_metadata_cache.py +0 -67
- caption_flow/utils/job_queue.py +0 -41
- caption_flow/utils/shard_processor.py +0 -119
- caption_flow/utils/shard_tracker.py +0 -83
- caption_flow-0.2.4.dist-info/RECORD +0 -38
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/top_level.txt +0 -0
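
The diff below covers `caption_flow/processors/webdataset.py`, which was rewritten in this release: the removed `DatasetLoader`/shard utilities (`dataset_loader.py`, `shard_processor.py`, `shard_tracker.py`, and related modules in the list above) are replaced by `webshart` dataset discovery plus its `TarDataLoader`, with resume bookkeeping consolidated onto `ChunkTracker`. Lines that the diff view truncated are marked with `…`; runs of removed lines whose content was not preserved are collapsed into placeholders.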
@@ -1,104 +1,113 @@
-"""WebDataset processor implementation."""
+"""WebDataset processor implementation using webshart TarDataLoader."""
 
 import logging
 import threading
+import gc
+import os
 from typing import Dict, Any, List, Optional, Iterator, Set, Deque, Tuple
 from collections import deque, defaultdict
 from pathlib import Path
 import json
-import io
 from datetime import datetime
 from PIL import Image
-
+import io
 
+from caption_flow.models import JobId
+from caption_flow.storage import StorageManager
 from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
-from ..utils import …
+from ..utils import ChunkTracker
+
+import webshart
+import cv2
+import numpy as np
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
 class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
-    """Orchestrator processor for WebDataset shards."""
+    """Orchestrator processor for WebDataset shards using webshart with ChunkTracker."""
 
     def __init__(self):
-        logger.…
-        self.…
+        logger.info("Initializing WebDatasetOrchestratorProcessor with webshart + ChunkTracker")
+        self.dataset: Optional[webshart.DiscoveredDataset] = None
         self.chunk_tracker: Optional[ChunkTracker] = None
         self.chunk_size: int = 1000
 
         # Work unit management
         self.work_units: Dict[str, WorkUnit] = {}
         self.pending_units: Deque[str] = deque()
-        self.assigned_units: Dict[str, Set[str]] = defaultdict(set)
+        self.assigned_units: Dict[str, Set[str]] = defaultdict(set)
         self.lock = threading.Lock()
 
-        # Shard …
-        self.…
-        self.current_shard_index = 0
-        self.current_shard_items = 0
+        # Shard info cache
+        self.shard_info_cache: Dict[int, Dict] = {}
 
         # Background thread for creating work units
         self.unit_creation_thread: Optional[threading.Thread] = None
         self.stop_creation = threading.Event()
+        self.min_buffer = 10
+        self.buffer_multiplier = 3
 
     def initialize(self, config: ProcessorConfig, storage: StorageManager) -> None:
-        """Initialize …
-        logger.…
-        cfg = config.config
+        """Initialize with webshart dataset discovery and ChunkTracker."""
+        logger.info("Initializing orchestrator with config")
 
-
+        cfg = config.config
         dataset_cfg = cfg.get("dataset", {})
-        dataset_path = dataset_cfg.get("dataset_path")
-
-        dataset_split = dataset_cfg.get("dataset_split", "train")
-        image_column = dataset_cfg.get("dataset_image_column", "image")
+        self.dataset_path = dataset_cfg.get("dataset_path")
+        metadata_path = dataset_cfg.get("metadata_path", None)
 
         # Chunk settings
         self.chunk_size = cfg.get("chunk_size", 1000)
         self.min_buffer = cfg.get("min_chunk_buffer", 10)
         self.buffer_multiplier = cfg.get("chunk_buffer_multiplier", 3)
 
-… (3 removed lines not preserved in this view)
-            self.min_buffer,
-            self.buffer_multiplier,
-        )
+        # Cache configuration
+        cache_dir = Path(cfg.get("cache_dir", "./webshart_cache"))
+        cache_dir.mkdir(parents=True, exist_ok=True)
 
-… (6 removed lines not preserved in this view)
-        self.dataset_loader = DatasetLoader(
-            dataset_path=dataset_path,
-            dataset_type=dataset_type,
-            split=dataset_split,
-            image_column=image_column,
-            cache_dir=checkpoint_dir,
+        if self.dataset_path:
+            # Initialize dataset with webshart
+            self.dataset = webshart.discover_dataset(
+                source=self.dataset_path,
+                metadata=metadata_path,
             )
-        logger.debug("DatasetLoader initialized")
 
-
-
+            # Enable caching for efficient access
+            self.dataset.enable_metadata_cache(location=str(cache_dir / "metadata_cache"))
+            self.dataset.enable_shard_cache(
+                location=str(cache_dir / "shard_cache"),
+                cache_limit_gb=cfg.get("shard_cache_gb", 10.0),
+            )
+
+            logger.info(f"Dataset discovered: {self.dataset.num_shards} shards")
 
-            # …
-
-
+            # Initialize chunk tracker
+            checkpoint_dir = Path(cfg.get("checkpoint_dir", "./checkpoints"))
+            checkpoint_dir.mkdir(parents=True, exist_ok=True)
+            self.chunk_tracker = ChunkTracker(checkpoint_dir / "chunks.json")
 
             # Restore existing state from chunk tracker
-            self._restore_state(storage…
+            self._restore_state(storage)
 
             # Start background unit creation
             self.unit_creation_thread = threading.Thread(
                 target=self._create_units_background, daemon=True
             )
             self.unit_creation_thread.start()
-            logger.debug("Unit creation thread started")
         else:
-            logger.error("No dataset_path provided…
+            logger.error("No dataset_path provided")
+
+    def _get_shard_info_cached(self, shard_idx: int) -> Optional[Dict]:
+        """Get shard info with caching."""
+        if shard_idx not in self.shard_info_cache:
+            try:
+                self.shard_info_cache[shard_idx] = self.dataset.get_shard_info(shard_idx)
+            except Exception as e:
+                logger.error(f"Error getting shard info for idx {shard_idx}: {e}")
+                return None
+        return self.shard_info_cache[shard_idx]
 
     def _restore_state(self, storage: StorageManager) -> None:
         """Restore state from chunk tracker."""
@@ -107,39 +116,43 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             return
 
         shards_summary = self.chunk_tracker.get_shards_summary()
-
-        # Get all processed job_ids from storage
-        all_processed_jobs = storage.get_all_processed_job_ids()
+        logger.debug(f"Restoring state: {shards_summary}")
 
         with self.lock:
             for shard_name, shard_info in shards_summary.items():
-… (11 removed lines not preserved in this view)
+                chunks = shard_info.get("chunks", [])
+                logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
+                for chunk_state in chunks:
+                    # Only add incomplete chunks
+                    if chunk_state.status != "completed":
+                        logger.debug(f"Restoring incomplete chunk {chunk_state}")
+
+                        # Get unprocessed ranges
+                        unprocessed_ranges = chunk_state.get_unprocessed_ranges()
+                        logger.debug(
+                            f"Chunk {chunk_state.chunk_id} unprocessed ranges: {unprocessed_ranges}"
+                        )
+                        if not unprocessed_ranges:
+                            continue
 
-… (2 removed lines not preserved in this view)
+                        # Convert relative ranges to absolute file indices
+                        absolute_ranges = []
+                        for start, end in unprocessed_ranges:
+                            abs_start = chunk_state.start_index + start
+                            abs_end = chunk_state.start_index + end
+                            absolute_ranges.append((abs_start, abs_end))
 
-                if unprocessed_ranges:
-                    # Create work unit for unprocessed items
-                    logger.debug(f"Creating WorkUnit for chunk {chunk_state}")
                         unit = WorkUnit(
                             unit_id=chunk_state.chunk_id,
                             chunk_id=chunk_state.chunk_id,
                             source_id=shard_name,
+                            unit_size=chunk_state.chunk_size,
                             data={
                                 "shard_url": chunk_state.shard_url,
+                                "shard_name": shard_name,
                                 "start_index": chunk_state.start_index,
                                 "chunk_size": chunk_state.chunk_size,
-                                "unprocessed_ranges": …
+                                "unprocessed_ranges": absolute_ranges,
                             },
                             metadata={
                                 "shard_name": shard_name,
@@ -154,11 +167,8 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         """Background thread to create work units on demand."""
         logger.info("Starting work unit creation thread")
 
-… (2 removed lines not preserved in this view)
-        current_shard_name = None
-        current_shard_items = 0
-        current_index = 0
+        current_shard_idx = 0
+        current_file_idx = 0
 
         while not self.stop_creation.is_set():
             # Check if we need more units
@@ -169,14 +179,6 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
 
             target_buffer = max(self.min_buffer, worker_count * self.buffer_multiplier)
             units_needed = max(0, target_buffer - (pending_count + assigned_count))
-            logger.debug(
-                "pending_count=%d assigned_count=%d worker_count=%d target_buffer=%d units_needed=%d",
-                pending_count,
-                assigned_count,
-                worker_count,
-                target_buffer,
-                units_needed,
-            )
 
             if units_needed == 0:
                 threading.Event().wait(5)
@@ -184,318 +186,192 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
 
             # Create units as needed
             units_created = 0
-
             while units_created < units_needed and not self.stop_creation.is_set():
-                # …
-                if …
-… (2 removed lines not preserved in this view)
-                current_shard_name = Path(current_shard_url).stem
+                # Get current shard info
+                if current_shard_idx >= self.dataset.num_shards:
+                    logger.info("All shards processed")
+                    break
 
-… (29 removed lines not preserved in this view)
+                shard_info = self._get_shard_info_cached(current_shard_idx)
+                if not shard_info:
+                    current_shard_idx += 1
+                    current_file_idx = 0
+                    continue
+
+                shard_name = shard_info["name"]
+                shard_files = shard_info["num_files"]
+
+                # Check if we need to move to next shard
+                if current_file_idx >= shard_files:
+                    current_shard_idx += 1
+                    current_file_idx = 0
+                    continue
+
+                # Create chunk for current position
+                chunk_size = min(self.chunk_size, shard_files - current_file_idx)
+                self.current_chunk_index = current_file_idx // self.chunk_size
+                job_id_obj = JobId(
+                    shard_id=shard_name,
+                    chunk_id=self.current_chunk_index,
+                    sample_id=current_file_idx,
+                )
+                chunk_id = job_id_obj.get_chunk_str()
+
+                with self.lock:
+                    # Skip if already exists
+                    if chunk_id in self.work_units:
+                        current_file_idx += self.chunk_size
+                        continue
+
+                    # Check if chunk is already completed
+                    if self.chunk_tracker:
+                        chunk_state = self.chunk_tracker.chunks.get(chunk_id)
+                        if chunk_state and chunk_state.status == "completed":
+                            current_file_idx += self.chunk_size
                             continue
 
-… (33 removed lines not preserved in this view)
-                            "unprocessed_ranges": [
-                                (
-                                    r[0] + chunk_state.start_index,
-                                    r[1] + chunk_state.start_index,
-                                )
-                                for r in unprocessed_ranges
-                            ],  # Convert relative to absolute
-                        },
-                        metadata={
-                            "shard_name": current_shard_name,
-                            "chunk_index": current_index // self.chunk_size,
-                        },
-                    )
-                else:
-                    # New chunk
-                    logger.debug(
-                        "Creating new work unit: unit_id=%s shard=%s start_index=%d chunk_size=%d",
-                        unit_id,
-                        current_shard_name,
-                        current_index,
-                        chunk_size,
-                    )
-
-                    unit = WorkUnit(
-                        unit_id=unit_id,
-                        chunk_id=unit_id,
-                        source_id=current_shard_name,
-                        data={
-                            "shard_url": current_shard_url,
-                            "start_index": current_index,
-                            "chunk_size": chunk_size,
-                            "unprocessed_ranges": [
-                                (current_index, current_index + chunk_size - 1)
-                            ],
-                        },
-                        metadata={
-                            "shard_name": current_shard_name,
-                            "chunk_index": current_index // self.chunk_size,
-                        },
-                    )
-                else:
-                    # No chunk tracker, create normally
-                    unit = WorkUnit(
-                        unit_id=unit_id,
-                        chunk_id=unit_id,
-                        source_id=current_shard_name,
-                        data={
-                            "shard_url": current_shard_url,
-                            "start_index": current_index,
-                            "chunk_size": chunk_size,
-                            "unprocessed_ranges": [
-                                (current_index, current_index + chunk_size - 1)
-                            ],
-                        },
-                        metadata={
-                            "shard_name": current_shard_name,
-                            "chunk_index": current_index // self.chunk_size,
-                        },
-                    )
-
-                    self.work_units[unit_id] = unit
-                    self.pending_units.append(unit_id)
-                    logger.debug("Added work unit %s to pending_units", unit_id)
-
-                    if self.chunk_tracker:
-                        added_chunk = self.chunk_tracker.add_chunk(
-                            unit_id,
-                            current_shard_name,
-                            current_shard_url,
-                            current_index,
-                            chunk_size,
-                        )
-                        if added_chunk:
-                            logger.debug("Added chunk to chunk_tracker: %s", unit_id)
-                        else:
-                            logger.debug("Chunk already exists in chunk_tracker: %s", unit_id)
+                    # Get shard URL (path for webshart)
+                    shard_url = shard_info.get("path", f"{shard_name}.tar")
+
+                    # Create work unit
+                    unit = WorkUnit(
+                        unit_id=chunk_id,
+                        chunk_id=chunk_id,
+                        source_id=shard_name,
+                        unit_size=chunk_size,
+                        data={
+                            "shard_url": shard_url,
+                            "shard_name": shard_name,
+                            "shard_idx": current_shard_idx,
+                            "start_index": current_file_idx,
+                            "chunk_size": chunk_size,
+                            "unprocessed_ranges": [
+                                (current_file_idx, current_file_idx + chunk_size - 1)
+                            ],
+                        },
+                        metadata={
+                            "shard_name": shard_name,
+                            "chunk_index": current_file_idx // self.chunk_size,
+                        },
+                    )
+
+                    self.work_units[chunk_id] = unit
+                    self.pending_units.append(chunk_id)
+
+                    # Add to chunk tracker
+                    if self.chunk_tracker:
+                        self.chunk_tracker.add_chunk(
+                            chunk_id, shard_name, shard_url, current_file_idx, chunk_size
+                        )
 
-…
+                units_created += 1
 
-…
+                current_file_idx += self.chunk_size
 
             if units_created > 0:
                 logger.debug(f"Created {units_created} work units")
 
+        logger.info("Work unit creation thread exiting")
+
     def get_work_units(self, count: int, worker_id: str) -> List[WorkUnit]:
         """Get available work units for a worker."""
-        logger.debug("get_work_units called: count=%d worker_id=%s", count, worker_id)
         assigned = []
 
         with self.lock:
-            # Get new units if needed
             while len(assigned) < count and self.pending_units:
                 unit_id = self.pending_units.popleft()
                 unit = self.work_units.get(unit_id)
 
                 if unit:
+                    # Update unprocessed ranges from chunk tracker before assigning
+                    if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+                        chunk_state = self.chunk_tracker.chunks[unit_id]
+                        relative_unprocessed = chunk_state.get_unprocessed_ranges()
+
+                        # Convert relative to absolute indices
+                        absolute_ranges = []
+                        for start, end in relative_unprocessed:
+                            abs_start = chunk_state.start_index + start
+                            abs_end = chunk_state.start_index + end
+                            absolute_ranges.append((abs_start, abs_end))
+
+                        # Update the work unit's unprocessed ranges
+                        unit.data["unprocessed_ranges"] = absolute_ranges
+
+                        logger.debug(
+                            f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
+                        )
+
                     self.assigned_units[worker_id].add(unit_id)
                     assigned.append(unit)
-                    logger.debug("Assigning new unit %s to worker %s", unit_id, worker_id)
 
                     if self.chunk_tracker:
                         self.chunk_tracker.mark_assigned(unit_id, worker_id)
 
-        logger.debug("…
+        logger.debug(f"Assigned {len(assigned)} units to worker {worker_id}")
         return assigned
 
-    def _has_unprocessed_items(self, unit: WorkUnit) -> bool:
-        """Check if a work unit has unprocessed items."""
-        if not self.chunk_tracker:
-            logger.debug("No chunk_tracker, assuming unit %s has unprocessed items", unit.unit_id)
-            return True
-
-        chunk_info = self.chunk_tracker.get_chunk_with_unprocessed_items(unit.unit_id)
-        has_unprocessed = bool(chunk_info and chunk_info.get("unprocessed_ranges"))
-        logger.debug("Unit %s has unprocessed items: %s", unit.unit_id, has_unprocessed)
-        return has_unprocessed
-
     def mark_completed(self, unit_id: str, worker_id: str) -> None:
         """Mark a work unit as completed."""
-        logger.debug("Marking unit %s as completed by worker %s", unit_id, worker_id)
         with self.lock:
             if unit_id in self.work_units:
                 self.assigned_units[worker_id].discard(unit_id)
-                logger.debug(
-                    "Removed unit %s from assigned_units for worker %s", unit_id, worker_id
-                )
 
                 if self.chunk_tracker:
                     self.chunk_tracker.mark_completed(unit_id)
-
+
+                # Remove from memory
+                del self.work_units[unit_id]
 
     def mark_failed(self, unit_id: str, worker_id: str, error: str) -> None:
         """Mark a work unit as failed."""
-        logger.…
+        logger.error(f"Unit {unit_id} failed on {worker_id}: {error}")
         with self.lock:
             if unit_id in self.work_units:
                 self.assigned_units[worker_id].discard(unit_id)
                 self.pending_units.append(unit_id)
-                logger.debug("Returned unit %s to pending_units", unit_id)
 
             if self.chunk_tracker:
                 self.chunk_tracker.mark_failed(unit_id)
-                logger.debug("Marked unit %s as failed in chunk_tracker", unit_id)
 
     def release_assignments(self, worker_id: str) -> None:
         """Release all assignments for a disconnected worker."""
-        logger.debug("Releasing assignments for worker %s", worker_id)
        with self.lock:
             unit_ids = list(self.assigned_units.get(worker_id, []))
 
             for unit_id in unit_ids:
-                if unit_id in self.work_units:
-… (3 removed lines not preserved in this view)
-                    if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
-                        chunk_state = self.chunk_tracker.chunks[unit_id]
+                if unit_id in self.work_units and self.chunk_tracker:
+                    # Get updated unprocessed ranges from chunk tracker
+                    chunk_state = self.chunk_tracker.chunks.get(unit_id)
+                    if chunk_state:
                         unprocessed_ranges = chunk_state.get_unprocessed_ranges()
-
-                        # Convert relative ranges back to absolute
+                        # Convert relative to absolute
                         absolute_ranges = []
                         for start, end in unprocessed_ranges:
                             abs_start = chunk_state.start_index + start
                             abs_end = chunk_state.start_index + end
                             absolute_ranges.append((abs_start, abs_end))
 
-                        # Update …
-… (2 removed lines not preserved in this view)
-                        logger.debug(
-                            f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
-                        )
+                        # Update work unit
+                        self.work_units[unit_id].data["unprocessed_ranges"] = absolute_ranges
 
                     self.pending_units.append(unit_id)
-                    logger.debug("Returned unit %s to pending_units", unit_id)
 
             if worker_id in self.assigned_units:
                 del self.assigned_units[worker_id]
-                logger.debug("Deleted worker %s from assigned_units", worker_id)
 
             if self.chunk_tracker:
                 self.chunk_tracker.release_worker_chunks(worker_id)
-                logger.debug("Released worker %s chunks in chunk_tracker", worker_id)
-
-    def _subtract_ranges(
-        self, total_ranges: List[Tuple[int, int]], processed_ranges: List[Tuple[int, int]]
-    ) -> List[Tuple[int, int]]:
-        """Subtract processed ranges from total ranges."""
-        if not processed_ranges:
-            return total_ranges
-
-        # Create a set of all processed indices
-        processed_indices = set()
-        for start, end in processed_ranges:
-            processed_indices.update(range(start, end + 1))
-
-        # Find unprocessed ranges
-        unprocessed_ranges = []
-        for start, end in total_ranges:
-            current_start = None
-            for i in range(start, end + 1):
-                if i not in processed_indices:
-                    if current_start is None:
-                        current_start = i
-                else:
-                    if current_start is not None:
-                        unprocessed_ranges.append((current_start, i - 1))
-                        current_start = None
-
-            if current_start is not None:
-                unprocessed_ranges.append((current_start, end))
-
-        return unprocessed_ranges
 
-…
-        """Get processor statistics."""
-        with self.lock:
-            stats = {
-                "total_units": len(self.work_units),
-                "pending_units": len(self.pending_units),
-                "assigned_units": sum(len(units) for units in self.assigned_units.values()),
-                "total_shards": len(self.all_shards),
-                "workers": len(self.assigned_units),
-            }
-            logger.debug("Stats: %s", stats)
-            return stats
+        logger.info(f"Released {len(unit_ids)} assignments from {worker_id}")
 
     def handle_result(self, result: WorkResult) -> Dict[str, Any]:
-        """Handle …
-        # logger.debug("Handling result for unit %s", result.unit_id)
-        base_result = super().handle_result(result)
-
+        """Handle result from worker."""
         # Track processed items if we have chunk tracker
-        if self.chunk_tracker:
-            if "item_indices" not in result.metadata:
-                result.metadata["item_indices"] = [result.metadata.get("_item_index")]
+        if self.chunk_tracker and "item_indices" in result.metadata:
             indices = result.metadata["item_indices"]
-            logger.debug("Result metadata item_indices: %s", indices)
 
-            # …
+            # Convert to ranges and mark as processed
             if indices:
                 indices.sort()
                 ranges = []
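
For reference between these two hunks: both the old and new `update_from_storage` key their bookkeeping off the job-id string. A minimal illustration of the parsing the new code performs (the sample value is taken from the old code's own comment):

```python
# Job IDs follow "shard:chunk:X:idx:Y", e.g. "data-0000:chunk:0:idx:42".
job_id = "data-0000:chunk:0:idx:42"
parts = job_id.split(":")          # ["data-0000", "chunk", "0", "idx", "42"]
assert len(parts) >= 5 and parts[3] == "idx"
chunk_id = ":".join(parts[:3])     # "data-0000:chunk:0" -> key into the chunk tracker
idx = int(parts[4])                # 42 -> absolute sample index within the shard
```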
@@ -514,269 +390,293 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
 
                 # Mark ranges as processed
                 for start_idx, end_idx in ranges:
-                    logger.debug(f"Marking chunk as processed: {result.to_repr()}")
                     self.chunk_tracker.mark_items_processed(result.chunk_id, start_idx, end_idx)
-                    logger.debug(
-                        "Marked items processed for unit %s: %d-%d",
-                        result.unit_id,
-                        start_idx,
-                        end_idx,
-                    )
-        else:
-            logger.error(
-                f"No chunk tracker? {self.chunk_tracker} or no item_indices in {result.metadata}"
-            )
 
-        return …
+        return {
+            "source_id": result.source_id,
+            "chunk_id": result.chunk_id,
+            "outputs": result.outputs,
+            "metadata": result.metadata,
+        }
 
     def update_from_storage(self, processed_job_ids: Set[str]) -> None:
         """Update work units based on what's been processed."""
-        logger.info(f"Updating …
+        logger.info(f"Updating from {len(processed_job_ids)} processed jobs")
 
         with self.lock:
-… (10 removed lines not preserved in this view)
-                    # Parse job_id format: "data-0000:chunk:0:idx:42"
-                    parts = job_id.split(":")
-                    if (
-                        len(parts) == 5
-                        and parts[0] == shard_name
-                        and parts[1] == "chunk"
-                        and int(parts[2]) == chunk_index
-                        and parts[3] == "idx"
-                    ):
-
+            # Group by chunk
+            processed_by_chunk = defaultdict(set)
+
+            for job_id in processed_job_ids:
+                # Parse job_id to extract chunk and index
+                # Expected format: "shard:chunk:X:idx:Y"
+                parts = job_id.split(":")
+                if len(parts) >= 5 and parts[3] == "idx":
+                    chunk_id = ":".join(parts[:3])  # "shard:chunk:X"
+                    try:
                         idx = int(parts[4])
-
-…
+                        processed_by_chunk[chunk_id].add(idx)
+                    except ValueError:
+                        continue
 
-… (8 removed lines not preserved in this view)
-                        if idx == end + 1:
-                            end = idx
-                        else:
-                            processed_ranges.append((start, end))
-                            start = idx
-                            end = idx
-
-                    processed_ranges.append((start, end))
-
-                    # Calculate unprocessed ranges
-                    total_range = [(start_index, start_index + chunk_size - 1)]
-                    unprocessed_ranges = self._subtract_ranges(total_range, processed_ranges)
-
-                    # Update unit
-                    unit.data["unprocessed_ranges"] = unprocessed_ranges
-
-                    logger.debug(
-                        f"Updated unit {unit_id}: {len(processed_indices)} processed, "
-                        f"unprocessed ranges: {unprocessed_ranges}"
-                    )
+            # Update chunk tracker with processed items
+            if self.chunk_tracker:
+                for chunk_id, indices in processed_by_chunk.items():
+                    if indices:
+                        # Sort indices and convert to ranges
+                        sorted_indices = sorted(indices)
+                        if not sorted_indices:
+                            continue
 
+                        # Condense into contiguous ranges
+                        ranges = []
+                        start_range = sorted_indices[0]
+                        end_range = sorted_indices[0]
 
-… (2 removed lines not preserved in this view)
+                        for i in range(1, len(sorted_indices)):
+                            if sorted_indices[i] == end_range + 1:
+                                end_range = sorted_indices[i]
+                            else:
+                                ranges.append((start_range, end_range))
+                                start_range = sorted_indices[i]
+                                end_range = sorted_indices[i]
+                        ranges.append((start_range, end_range))
 
-… (4 removed lines not preserved in this view)
-        self.dataset_name: Optional[str] = None
+                        # Mark each contiguous range as processed
+                        logger.debug(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
+                        for start_idx, end_idx in ranges:
+                            self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
 
-    def …
-        """…
-… (14 removed lines not preserved in this view)
-        self.dataset_loader = DatasetLoader(
-            dataset_path=dataset_path,
-            dataset_type=dataset_type,
-            split=dataset_split,
-            image_column=image_column,
-        )
-        logger.debug("DatasetLoader initialized for worker")
-        else:
-            logger.error("No dataset_path provided in worker config")
+    def get_stats(self) -> Dict[str, Any]:
+        """Get processor statistics."""
+        with self.lock:
+            # Get chunk tracker stats if available
+            if self.chunk_tracker:
+                shards_summary = self.chunk_tracker.get_shards_summary()
+                total_chunks = sum(len(s.get("chunks", [])) for s in shards_summary.values())
+                completed_chunks = sum(
+                    1
+                    for s in shards_summary.values()
+                    for c in s.get("chunks", [])
+                    if c.status == "completed"
+                )
+            else:
+                total_chunks = len(self.work_units)
+                completed_chunks = 0
 
-… (6 removed lines not preserved in this view)
+            return {
+                "total_shards": self.dataset.num_shards if self.dataset else 0,
+                "total_chunks": total_chunks,
+                "pending_units": len(self.pending_units),
+                "assigned_units": sum(len(units) for units in self.assigned_units.values()),
+                "completed_chunks": completed_chunks,
+                "workers": len(self.assigned_units),
+            }
 
-… (3 removed lines not preserved in this view)
-        start_index = unit.data["start_index"]
-        chunk_size = unit.data["chunk_size"]
-        unprocessed_ranges = unit.data.get(
-            "unprocessed_ranges", [(start_index, start_index + chunk_size - 1)]
-        )
+    def cleanup(self):
+        """Clean up resources."""
+        logger.info("Cleaning up orchestrator")
 
-…
+        # Stop background threads
+        self.stop_creation.set()
+        if self.unit_creation_thread:
+            self.unit_creation_thread.join(timeout=5)
 
-        # …
-… (2 removed lines not preserved in this view)
-        indices_to_process.update(range(start, end + 1))
-        logger.debug("Indices to process: %s", indices_to_process)
+        # Save checkpoint
+        if self.chunk_tracker:
+            self.chunk_tracker.save_checkpoint()
 
-        processed_indices = []
 
-… (2 removed lines not preserved in this view)
-            self._iterate_shard_with_metadata(shard_url)
-        ):
-            # Skip if not in our chunk range
-            if idx < start_index or idx >= start_index + chunk_size:
-                # logger.debug(f"Skipping idx={idx} not in chunk range")
-                continue
+class WebDatasetWorkerProcessor(WorkerProcessor):
+    """Worker processor for WebDataset shards using webshart."""
 
-… (4 removed lines not preserved in this view)
+    def __init__(self):
+        logger.info("Initializing WebDatasetWorkerProcessor with webshart")
+        self.loader: Optional[webshart.TarDataLoader] = None
+        self.dataset: Optional[webshart.DiscoveredDataset] = None
+        self.mock_results = False
 
-… (5 removed lines not preserved in this view)
-        # Clean metadata - remove sensitive and redundant fields
-        clean_metadata = {
-            k: v
-            for k, v in metadata.items()
-            if k not in ["url", "_shard_url", "shard_name"]  # Remove these fields
-        }
-
-        # Add only necessary index information
-        clean_metadata.update(
-            {
-                "_item_index": idx,
-                "_chunk_relative_index": idx - start_index,
-                "_job_id": job_id,
-            }
-        )
+    def initialize(self, config: ProcessorConfig) -> None:
+        """Initialize worker with webshart loader."""
+        cfg = config.config
+        dataset_cfg = cfg.get("dataset", {})
 
-… (3 removed lines not preserved in this view)
-            "image": image,
-            "item_key": key,
-            "item_index": idx,
-            "metadata": clean_metadata,
-            "job_id": job_id,
-        }
+        self.dataset_path = dataset_cfg.get("dataset_path")
+        metadata_path = dataset_cfg.get("metadata_path", None)
+        self.mock_results = dataset_cfg.get("mock_results", False)
 
-…
+        # Cache configuration
+        cache_dir = Path(cfg.get("cache_dir", "./webshart_cache"))
+        cache_dir.mkdir(parents=True, exist_ok=True)
 
-… (2 removed lines not preserved in this view)
+        if self.dataset_path and not self.mock_results:
+            # Discover dataset
+            self.dataset = webshart.discover_dataset(
+                source=self.dataset_path,
+                metadata=metadata_path,
+            )
 
-… (3 removed lines not preserved in this view)
+            # Enable caching
+            self.dataset.enable_metadata_cache(location=str(cache_dir / "metadata_cache"))
+            self.dataset.enable_shard_cache(
+                location=str(cache_dir / "shard_cache"),
+                cache_limit_gb=cfg.get("shard_cache_gb", 10.0),
+            )
 
-… (5 removed lines not preserved in this view)
+            # Create loader
+            self.loader = webshart.TarDataLoader(
+                self.dataset,
+                buffer_size=cfg.get("buffer_size", 10),
+                max_file_size=cfg.get("max_file_size", 100 * 1024 * 1024),
+                load_file_data=True,
+            )
 
-…
-            logger.error("Dataset loader not initialized")
-            return
+            logger.info("webshart TarDataLoader initialized")
 
-… (5 removed lines not preserved in this view)
+    def _create_mock_image(self, idx: int) -> Image.Image:
+        """Create a dummy test image."""
+        color = ((idx * 37) % 256, (idx * 53) % 256, (idx * 71) % 256)
+        image = Image.new("RGB", (256, 256), color=color)
+        return image
 
-… (2 removed lines not preserved in this view)
+    def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
+        """Process a work unit by iterating specified ranges."""
+        logger.debug(f"Processing unit: {unit}")
 
-… (3 removed lines not preserved in this view)
-        for ext in ["jpg", "jpeg", "png", "webp", "bmp", "jxl"]:
-            if ext in sample:
-                image_data = sample[ext]
-                image_ext = ext
-                break
+        shard_name = unit.data["shard_name"]
+        shard_idx = unit.data.get("shard_idx")
+        unprocessed_ranges = unit.data.get("unprocessed_ranges", [])
 
-… (3 removed lines not preserved in this view)
-            key,
-            list(sample.keys()),
-        )
-        continue
+        # For chunk tracking
+        chunk_index = unit.metadata.get("chunk_index", 0)
+        processed_indices = []
 
-… (3 removed lines not preserved in this view)
-        for …
-… (2 removed lines not preserved in this view)
+        if self.mock_results:
+            # Generate mock results for unprocessed ranges
+            for start_idx, end_idx in unprocessed_ranges:
+                for idx in range(start_idx, end_idx + 1):
+                    job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
+
+                    yield {
+                        "image": self._create_mock_image(idx),
+                        "image_data": None,
+                        "item_key": f"mock_{idx}",
+                        "item_index": idx,
+                        "metadata": {
+                            "_item_index": idx,
+                            "_chunk_relative_index": idx - unit.data["start_index"],
+                            "_job_id": job_id,
+                            "_mock": True,
+                            "_processed_indices": processed_indices,
+                        },
+                        "job_id": job_id,
+                    }
 
-… (3 removed lines not preserved in this view)
+                    processed_indices.append(idx)
+        else:
+            # Use webshart to process unprocessed ranges
+            for start_idx, end_idx in unprocessed_ranges:
+                try:
+                    # Jump to shard and starting position
+                    if shard_idx is not None:
+                        self.loader.shard(shard_idx=shard_idx, cursor_idx=start_idx)
+                    else:
+                        # Try to find shard by name
+                        self.loader.shard(filename=shard_name, cursor_idx=start_idx)
+
+                    # Iterate through the range
+                    for idx in range(start_idx, end_idx + 1):
+                        try:
+                            entry = next(self.loader)
+
+                            # Decode image
+                            image = None
+                            if entry.data:
+                                try:
+                                    # Use cv2 to decode from memory
+                                    nparr = np.frombuffer(entry.data, np.uint8)
+                                    img_np = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+                                    if img_np is not None:
+                                        # Convert from BGR (OpenCV default) to RGB (PIL default)
+                                        img_rgb = cv2.cvtColor(img_np, cv2.COLOR_BGR2RGB)
+                                        image = Image.fromarray(img_rgb)
+                                    else:
+                                        logger.warning(f"cv2.imdecode failed for {entry.path}")
+
+                                except ImportError:
+                                    logger.warning(
+                                        "cv2 or numpy not installed, falling back to PIL"
+                                    )
+                                    image = Image.open(io.BytesIO(entry.data))
+                                except Exception as img_e:
+                                    logger.error(
+                                        f"Error decoding image {entry.path} with cv2: {img_e}"
+                                    )
 
-…
+                            # Generate job ID compatible with chunk tracker
+                            job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
+
+                            yield {
+                                "image": image,
+                                "image_data": entry.data,
+                                "item_key": Path(entry.path).stem,
+                                "item_index": idx,
+                                "metadata": {
+                                    "_item_index": idx,
+                                    "_chunk_relative_index": idx - unit.data["start_index"],
+                                    "_job_id": job_id,
+                                    "_filename": entry.path,
+                                    "_file_size": entry.size,
+                                    "_processed_indices": processed_indices,
+                                },
+                                "job_id": job_id,
+                            }
+
+                            processed_indices.append(idx)
+
+                            if len(processed_indices) % 10 == 0:
+                                gc.collect()
+
+                        except StopIteration:
+                            logger.warning(f"Unexpected end of shard at index {idx}")
+                            break
+                        except Exception as e:
+                            logger.error(f"Error processing index {idx}: {e}")
+                            continue
+
+                except Exception as e:
+                    logger.error(f"Error processing range {start_idx}-{end_idx}: {e}")
+                    continue
+
+        # Store processed indices for result
+        context["_processed_indices"] = processed_indices
+        logger.info(f"Processed {len(processed_indices)} items from unit {unit.unit_id}")
 
     def prepare_result(
         self, unit: WorkUnit, outputs: List[Dict[str, Any]], processing_time_ms: float
     ) -> WorkResult:
-        """Prepare …
-        logger.debug("Preparing result for unit %s", unit.unit_id)
+        """Prepare result with processing details."""
         result = super().prepare_result(unit, outputs, processing_time_ms)
 
-        # Add processed indices
-        if …
-            result.metadata["item_indices"] = …
-            logger.debug(
-                "Added item_indices to result metadata: %s", result.metadata["item_indices"]
-            )
+        # Add processed indices for chunk tracker
+        if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
+            result.metadata["item_indices"] = self._last_context["_processed_indices"]
 
         return result
 
     def get_dataset_info(self) -> Dict[str, Any]:
         """Get dataset information."""
-        if self.…
-… (6 removed lines not preserved in this view)
+        if self.dataset:
+            stats = self.dataset.get_stats()
+            return {
+                "dataset_name": self.dataset.name,
+                "format": self.dataset.dataset_format,
+                "total_shards": stats["total_shards"],
+                "total_files": stats.get("total_files", "Unknown"),
+                "mock_results": self.mock_results,
+            }
+        return {
+            "dataset_name": "Mock Dataset" if self.mock_results else "Unknown",
+            "mock_results": self.mock_results,
         }
-        logger.debug("Dataset info (no loader): %s", info)
-        return info