caption-flow 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- caption_flow/cli.py +2 -1
- caption_flow/models.py +108 -1
- caption_flow/monitor.py +1 -1
- caption_flow/orchestrator.py +423 -1595
- caption_flow/processors/__init__.py +11 -0
- caption_flow/processors/base.py +219 -0
- caption_flow/processors/huggingface.py +832 -0
- caption_flow/processors/local_filesystem.py +683 -0
- caption_flow/processors/webdataset.py +782 -0
- caption_flow/storage.py +415 -406
- caption_flow/utils/checkpoint_tracker.py +2 -2
- caption_flow/utils/chunk_tracker.py +94 -35
- caption_flow/utils/dataset_loader.py +64 -522
- caption_flow/utils/dataset_metadata_cache.py +67 -0
- caption_flow/utils/image_processor.py +1 -4
- caption_flow/utils/shard_processor.py +4 -200
- caption_flow/utils/shard_tracker.py +1 -5
- caption_flow/workers/base.py +3 -3
- caption_flow/workers/caption.py +416 -792
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/METADATA +29 -27
- caption_flow-0.2.3.dist-info/RECORD +35 -0
- caption_flow-0.2.1.dist-info/RECORD +0 -29
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/top_level.txt +0 -0
caption_flow/utils/dataset_metadata_cache.py
ADDED
@@ -0,0 +1,67 @@
+"""Dataset metadata caching for efficient HuggingFace dataset handling."""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetMetadataCache:
+    """Caches dataset metadata to avoid repeated full iterations."""
+
+    def __init__(self, cache_dir: Path):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.cache_file = self.cache_dir / "dataset_metadata.json"
+        self.metadata: Dict[str, Any] = {}
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load cached metadata from disk."""
+        if self.cache_file.exists():
+            try:
+                with open(self.cache_file, "r") as f:
+                    self.metadata = json.load(f)
+                logger.info(f"Loaded dataset metadata cache with {len(self.metadata)} datasets")
+            except Exception as e:
+                logger.error(f"Failed to load metadata cache: {e}")
+                self.metadata = {}
+
+    def _save_cache(self):
+        """Save metadata cache to disk."""
+        try:
+            with open(self.cache_file, "w") as f:
+                json.dump(self.metadata, f, indent=2)
+            logger.debug("Saved dataset metadata cache")
+        except Exception as e:
+            logger.error(f"Failed to save metadata cache: {e}")
+
+    def get_dataset_key(self, dataset_path: str, split: str) -> str:
+        """Generate a unique key for a dataset+split combination."""
+        return f"{dataset_path}:{split}"
+
+    def get_metadata(self, dataset_path: str, split: str) -> Optional[Dict[str, Any]]:
+        """Get cached metadata for a dataset."""
+        key = self.get_dataset_key(dataset_path, split)
+        return self.metadata.get(key)
+
+    def set_metadata(self, dataset_path: str, split: str, metadata: Dict[str, Any]):
+        """Cache metadata for a dataset."""
+        key = self.get_dataset_key(dataset_path, split)
+        metadata["cached_at"] = datetime.utcnow().isoformat()
+        metadata["dataset_path"] = dataset_path
+        metadata["split"] = split
+        self.metadata[key] = metadata
+        self._save_cache()
+        logger.info(f"Cached metadata for {key}: {metadata.get('total_items', 0)} items")
+
+    def invalidate(self, dataset_path: str, split: str):
+        """Remove cached metadata for a dataset."""
+        key = self.get_dataset_key(dataset_path, split)
+        if key in self.metadata:
+            del self.metadata[key]
+            self._save_cache()
+            logger.info(f"Invalidated metadata cache for {key}")
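For orientation, a minimal usage sketch of the cache added above. The cache directory, dataset identifiers, and metadata fields (total_items, columns) are illustrative assumptions; only the DatasetMetadataCache API itself comes from the diff.

    from pathlib import Path
    from caption_flow.utils.dataset_metadata_cache import DatasetMetadataCache

    # Hypothetical cache location and dataset name, for illustration only.
    cache = DatasetMetadataCache(Path("./metadata_cache"))
    meta = cache.get_metadata("some-user/some-dataset", "train")
    if meta is None:
        # In practice this would come from iterating the dataset once;
        # the dict below is made-up example data.
        meta = {"total_items": 1000, "columns": ["image", "caption"]}
        cache.set_metadata("some-user/some-dataset", "train", meta)
    # Subsequent runs read the JSON cache instead of re-iterating the dataset.
    print(meta["total_items"])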
caption_flow/utils/image_processor.py
CHANGED
@@ -112,10 +112,7 @@ class ImageProcessor:
             return None
 
         except Exception as e:
-            logger.error(f"Error processing image data: {e}")
-            import traceback
-
-            logger.error(traceback.format_exc())
+            logger.error(f"Error processing image data: {e}", exc_info=True)
             return None
 
     @staticmethod
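This hunk, like the caption_flow/workers/base.py hunks further down, replaces manual traceback formatting with logging's built-in exc_info flag. A minimal standalone sketch of the pattern; the function name and the error are invented for illustration:

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger(__name__)

    def process(data: bytes):
        try:
            raise ValueError("bad image data")  # stand-in for a real failure
        except Exception as e:
            # exc_info=True attaches the full traceback to the log record,
            # so the explicit import-traceback / format_exc() calls are no longer needed.
            logger.error(f"Error processing image data: {e}", exc_info=True)
            return None

    process(b"")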
caption_flow/utils/shard_processor.py
CHANGED
@@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Generator, Tuple, Optional, Dict, Any
 from dataclasses import dataclass
+from .image_processor import ImageProcessor
 from threading import Event
 import shlex
 
@@ -22,84 +23,6 @@ class ShardProcessor(ABC):
     """Abstract base for processing dataset shards."""
 
     @abstractmethod
-    def iterate_chunk(
-        self,
-        chunk,
-        dataset_loader: Optional[DatasetLoader],
-        should_stop: Event,
-        connected: Event,
-    ) -> Generator[Tuple[str, str, bytes], None, None]:
-        """
-        Iterate through items in a chunk.
-
-        Yields:
-            Tuple of (key, url, image_data)
-        """
-        pass
-
-
-class HFDatasetShardProcessor(ShardProcessor):
-    """Processor for HuggingFace virtual dataset shards."""
-
-    def iterate_chunk(
-        self,
-        chunk,
-        dataset_loader: Optional[DatasetLoader],
-        should_stop: Event,
-        connected: Event,
-    ) -> Generator[Tuple[str, str, bytes], None, None]:
-        """Process HuggingFace virtual shard chunk."""
-        if not dataset_loader:
-            logger.error("No dataset loader configured for HuggingFace dataset shard")
-            return
-
-        # Get unprocessed ranges
-        unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])
-
-        logger.info(
-            f"Processing HF dataset chunk {chunk.chunk_id} with ranges: {unprocessed_ranges}"
-        )
-
-        items_processed = 0
-        current_idx = 0
-
-        # Construct proper virtual shard URL
-        parts = chunk.shard_url.split("_chunk_")
-        if len(parts) == 2:
-            base_path = parts[0]
-            virtual_shard_url = f"{base_path}:chunk:{chunk.start_index}"
-        else:
-            virtual_shard_url = chunk.shard_url
-
-        logger.debug(f"Using virtual shard URL: {virtual_shard_url}")
-
-        # Iterate through the virtual shard
-        for key, url, image_data in dataset_loader.iterate_shard(virtual_shard_url):
-            # Check if we should stop
-            if should_stop.is_set() or not connected.is_set():
-                logger.info(f"Stopping chunk processing early due to disconnect")
-                break
-
-            # Check if current index is in any unprocessed range
-            in_range = any(start <= current_idx <= end for start, end in unprocessed_ranges)
-
-            if not in_range:
-                current_idx += 1
-                continue  # Skip already processed items
-
-            # Check if we've processed enough for this chunk
-            if current_idx >= chunk.chunk_size:
-                break
-
-            items_processed += 1
-            current_idx += 1
-            yield key, url, image_data
-
-        logger.info(
-            f"HF dataset chunk {chunk.chunk_id}: yielded {items_processed} items "
-            f"from ranges {unprocessed_ranges}"
-        )
-
     def iterate_chunk_with_metadata(
         self,
         chunk,
@@ -108,63 +31,12 @@ class HFDatasetShardProcessor(ShardProcessor):
         connected: Event,
     ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
         """
-
+        Iterate through items in a chunk with metadata.
 
         Yields:
             Tuple of (key, url, image_data, metadata)
         """
-
-        logger.error("No dataset loader configured for HuggingFace dataset shard")
-        return
-
-        # Get unprocessed ranges
-        unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])
-
-        logger.info(
-            f"Processing HF dataset chunk {chunk.chunk_id} with ranges: {unprocessed_ranges}"
-        )
-
-        items_processed = 0
-        current_idx = 0
-
-        # Construct proper virtual shard URL
-        parts = chunk.shard_url.split("_chunk_")
-        if len(parts) == 2:
-            base_path = parts[0]
-            virtual_shard_url = f"{base_path}:chunk:{chunk.start_index}"
-        else:
-            virtual_shard_url = chunk.shard_url
-
-        logger.debug(f"Using virtual shard URL: {virtual_shard_url}")
-
-        # Use the new iterate method that includes metadata
-        for key, url, image_data, metadata in dataset_loader.iterate_shard_with_metadata(
-            virtual_shard_url
-        ):
-            # Check if we should stop
-            if should_stop.is_set() or not connected.is_set():
-                logger.info(f"Stopping chunk processing early due to disconnect")
-                break
-
-            # Check if current index is in any unprocessed range
-            in_range = any(start <= current_idx <= end for start, end in unprocessed_ranges)
-
-            if not in_range:
-                current_idx += 1
-                continue  # Skip already processed items
-
-            # Check if we've processed enough for this chunk
-            if current_idx >= chunk.chunk_size:
-                break
-
-            items_processed += 1
-            current_idx += 1
-            yield key, url, image_data, metadata
-
-        logger.info(
-            f"HF dataset chunk {chunk.chunk_id}: yielded {items_processed} items "
-            f"from ranges {unprocessed_ranges}"
-        )
+        pass
 
 
 class WebDatasetShardProcessor(ShardProcessor):
@@ -174,74 +46,6 @@ class WebDatasetShardProcessor(ShardProcessor):
         self.hf_token = hf_token
         self.dataset_type = dataset_type
 
-    def iterate_chunk(
-        self,
-        chunk,
-        dataset_loader: Optional[DatasetLoader],
-        should_stop: Event,
-        connected: Event,
-    ) -> Generator[Tuple[str, str, bytes], None, None]:
-        """Process WebDataset shard chunk with unprocessed ranges."""
-        # Get unprocessed ranges
-        unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])
-
-        logger.info(
-            f"Processing WebDataset chunk {chunk.chunk_id} with ranges: {unprocessed_ranges}"
-        )
-
-        # Create WebDataset pipeline
-        if self.dataset_type == "huggingface" and not chunk.shard_url.startswith("hf_dataset:"):
-            # Use curl with auth for HuggingFace WebDataset
-            url_cmd = f"pipe:curl -s -L -H 'Authorization:Bearer {shlex.quote(self.hf_token)}' {shlex.quote(chunk.shard_url)} || true"
-            ds = wds.DataPipeline(
-                wds.SimpleShardList(url_cmd),
-                wds.tarfile_to_samples(),
-                wds.to_tuple("__key__", "jpg;png;jpeg;webp;jxl"),
-            )
-        else:
-            # Local file
-            ds = wds.DataPipeline(
-                wds.SimpleShardList(chunk.shard_url),
-                wds.tarfile_to_samples(),
-                wds.to_tuple("__key__", "jpg;png;jpeg;webp;jxl"),
-            )
-
-        # Process items
-        current_idx = 0
-        items_yielded = 0
-
-        for key, image_data in ds:
-            # Check if we should stop
-            if should_stop.is_set() or not connected.is_set():
-                logger.info(f"Stopping WebDataset chunk processing early due to disconnect")
-                break
-
-            # Calculate relative index within chunk
-            relative_idx = current_idx - chunk.start_index
-
-            # Skip items before chunk start
-            if current_idx < chunk.start_index:
-                current_idx += 1
-                continue
-
-            # Stop if beyond chunk
-            if relative_idx >= chunk.chunk_size:
-                break
-
-            # Check if current index is in any unprocessed range
-            in_range = any(start <= relative_idx <= end for start, end in unprocessed_ranges)
-
-            if in_range:
-                items_yielded += 1
-                yield key, chunk.shard_url, image_data
-
-            current_idx += 1
-
-        logger.info(
-            f"WebDataset chunk {chunk.chunk_id}: yielded {items_yielded} items "
-            f"from ranges {unprocessed_ranges}"
-        )
-
     def iterate_chunk_with_metadata(
         self,
         chunk,
@@ -258,7 +62,7 @@ class WebDatasetShardProcessor(ShardProcessor):
         )
 
         # Create WebDataset pipeline
-        if self.dataset_type == "huggingface"
+        if self.dataset_type == "huggingface":
             # Use curl with auth for HuggingFace WebDataset
             url_cmd = f"pipe:curl -s -L -H 'Authorization:Bearer {shlex.quote(self.hf_token)}' {shlex.quote(chunk.shard_url)} || true"
             ds = wds.DataPipeline(
caption_flow/utils/shard_tracker.py
CHANGED
@@ -61,11 +61,7 @@ class ShardTracker(CheckpointTracker):
         """Get list of shards that still need processing."""
         remaining = []
         for s in all_shards:
-
-            if s.startswith("hf_dataset:"):
-                shard_name = s  # Use full virtual shard ID
-            else:
-                shard_name = Path(s).stem
+            shard_name = Path(s).stem
 
             if shard_name not in self.completed_shards:
                 remaining.append(s)
caption_flow/workers/base.py
CHANGED
@@ -74,7 +74,7 @@ class BaseWorker(ABC):
                 await self._connect_and_run()
                 reconnect_delay = 5  # Reset delay on successful connection
             except Exception as e:
-                logger.error(f"Connection error: {e}")
+                logger.error(f"Connection error: {e}", exc_info=True)
                 self.connected.clear()
                 self.websocket = None
 
@@ -159,13 +159,13 @@ class BaseWorker(ABC):
                 except json.JSONDecodeError as e:
                     logger.error(f"Invalid message format: {e}")
                 except Exception as e:
-                    logger.error(f"Error handling message: {e}")
+                    logger.error(f"Error handling message: {e}", exc_info=True)
 
         except websockets.exceptions.ConnectionClosed as e:
             logger.info(f"Connection closed by orchestrator: {e}")
             raise
         except Exception as e:
-            logger.error(f"Message handler error: {e}")
+            logger.error(f"Message handler error: {e}", exc_info=True)
             raise
 
     async def shutdown(self):
|