caption_flow-0.3.3-py3-none-any.whl → caption_flow-0.4.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (33)
  1. caption_flow/__init__.py +3 -3
  2. caption_flow/cli.py +937 -416
  3. caption_flow/models.py +45 -3
  4. caption_flow/monitor.py +5 -3
  5. caption_flow/orchestrator.py +186 -116
  6. caption_flow/processors/__init__.py +3 -3
  7. caption_flow/processors/base.py +8 -7
  8. caption_flow/processors/huggingface.py +440 -68
  9. caption_flow/processors/local_filesystem.py +24 -28
  10. caption_flow/processors/webdataset.py +66 -25
  11. caption_flow/storage/exporter.py +420 -339
  12. caption_flow/storage/manager.py +636 -756
  13. caption_flow/utils/__init__.py +1 -1
  14. caption_flow/utils/auth.py +1 -1
  15. caption_flow/utils/caption_utils.py +1 -1
  16. caption_flow/utils/certificates.py +15 -8
  17. caption_flow/utils/checkpoint_tracker.py +41 -19
  18. caption_flow/utils/chunk_tracker.py +200 -65
  19. caption_flow/utils/image_processor.py +9 -9
  20. caption_flow/utils/json_utils.py +37 -20
  21. caption_flow/utils/prompt_template.py +24 -16
  22. caption_flow/utils/vllm_config.py +5 -4
  23. caption_flow/viewer.py +4 -12
  24. caption_flow/workers/base.py +12 -6
  25. caption_flow/workers/caption.py +272 -91
  26. caption_flow/workers/data.py +6 -8
  27. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
  28. caption_flow-0.4.0.dist-info/RECORD +33 -0
  29. caption_flow-0.3.3.dist-info/RECORD +0 -33
  30. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
  31. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  32. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  33. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
--- a/caption_flow/processors/local_filesystem.py
+++ b/caption_flow/processors/local_filesystem.py
@@ -1,27 +1,27 @@
 """Local filesystem datasets processor implementation."""
 
+import asyncio
+import io
 import logging
-import threading
+import mimetypes
 import os
-from typing import Dict, Any, List, Optional, Iterator, Set, Deque, Tuple
-from collections import deque, defaultdict
+import threading
+from collections import defaultdict, deque
 from pathlib import Path
-import json
-import io
-import mimetypes
-from datetime import datetime
-from PIL import Image
+from typing import Any, Deque, Dict, Iterator, List, Optional, Set, Tuple
+
 import aiofiles
-from fastapi import FastAPI, HTTPException, Response
-from fastapi.responses import StreamingResponse
-import uvicorn
-import asyncio
 import requests
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from PIL import Image
 
 from caption_flow.storage import StorageManager
-from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
-from ..utils import ChunkTracker
+
 from ..models import JobId
+from ..utils import ChunkTracker
+from .base import OrchestratorProcessor, ProcessorConfig, WorkerProcessor, WorkResult, WorkUnit
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -217,23 +217,19 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
         if not self.chunk_tracker:
             return
 
-        all_processed_jobs = storage.get_all_processed_job_ids()
+        storage.get_all_processed_job_ids()
 
         with self.lock:
             for chunk_id, chunk_state in self.chunk_tracker.chunks.items():
-                # Calculate actual unprocessed ranges
-                chunk_range = (
-                    chunk_state.start_index,
-                    chunk_state.start_index + chunk_state.chunk_size - 1,
-                )
-
-                # Get processed indices for this chunk
-                processed_ranges = self.chunk_tracker.get_processed_indices_for_chunk(
-                    chunk_id, all_processed_jobs
-                )
+                # Get unprocessed ranges (relative coordinates from ChunkTracker)
+                relative_unprocessed_ranges = chunk_state.get_unprocessed_ranges()
 
-                # Calculate unprocessed ranges
-                unprocessed_ranges = self._subtract_ranges([chunk_range], processed_ranges)
+                # Convert relative ranges to absolute ranges
+                unprocessed_ranges = []
+                for start, end in relative_unprocessed_ranges:
+                    abs_start = chunk_state.start_index + start
+                    abs_end = chunk_state.start_index + end
+                    unprocessed_ranges.append((abs_start, abs_end))
 
                 if unprocessed_ranges:
                     # Create work unit for unprocessed items
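For context, the tracker now reports unprocessed ranges relative to each chunk's start, so the orchestrator only shifts them by start_index. A minimal standalone sketch of that conversion, assuming inclusive (start, end) pairs as the hunk above implies (the helper name is illustrative, not the package's API):

```python
# Hypothetical helper mirroring the relative -> absolute shift above.
def to_absolute(start_index, relative_ranges):
    """Shift chunk-relative inclusive (start, end) ranges to dataset coordinates."""
    return [(start_index + s, start_index + e) for s, e in relative_ranges]

# A chunk starting at item 1000 with items 0-4 and 10-12 still unprocessed:
assert to_absolute(1000, [(0, 4), (10, 12)]) == [(1000, 1004), (1010, 1012)]
```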
@@ -588,7 +584,7 @@ class LocalFilesystemWorkerProcessor(WorkerProcessor):
         processed_indices = []
 
         # Get orchestrator info if we need HTTP
-        orchestrator = context.get("orchestrator")
+        context.get("orchestrator")
 
         for idx in sorted(indices_to_process):
             try:
--- a/caption_flow/processors/webdataset.py
+++ b/caption_flow/processors/webdataset.py
@@ -1,28 +1,27 @@
 """WebDataset processor implementation using webshart TarDataLoader."""
 
-import logging
-import threading
 import gc
+import io
+import logging
 import os
-from typing import Dict, Any, List, Optional, Iterator, Set, Deque, Tuple
-from collections import deque, defaultdict
+import threading
+from collections import defaultdict, deque
 from pathlib import Path
-import json
-from datetime import datetime
+from typing import Any, Deque, Dict, Iterator, List, Optional, Set
+
+import cv2
+import numpy as np
+import webshart
 from PIL import Image
-import io
 
 from caption_flow.models import JobId
 from caption_flow.storage import StorageManager
-from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
-from ..utils import ChunkTracker
 
-import webshart
-import cv2
-import numpy as np
+from ..utils import ChunkTracker
+from .base import OrchestratorProcessor, ProcessorConfig, WorkerProcessor, WorkResult, WorkUnit
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
 class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
@@ -217,7 +216,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         while units_created < units_needed and not self.stop_creation.is_set():
             # Get current shard info
             if current_shard_idx >= self.dataset.num_shards:
-                logger.info("All shards processed")
+                threading.Event().wait(5)
                 break
 
             shard_info = self._get_shard_info_cached(current_shard_idx)
@@ -240,8 +239,8 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             self.current_chunk_index = current_file_idx // self.chunk_size
             job_id_obj = JobId(
                 shard_id=shard_name,
-                chunk_id=self.current_chunk_index,
-                sample_id=current_file_idx,
+                chunk_id=str(self.current_chunk_index),
+                sample_id=str(current_file_idx),
             )
             chunk_id = job_id_obj.get_chunk_str()
 
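The new str() casts suggest JobId's fields are string-typed in 0.4.0. A hedged approximation of that contract (the real dataclass lives in caption_flow/models.py; the composite-key format below is an assumption, not shown in this diff):

```python
from dataclasses import dataclass

# Illustrative stand-in for caption_flow.models.JobId; field names follow the
# call site above, but this get_chunk_str() format is assumed.
@dataclass
class JobId:
    shard_id: str
    chunk_id: str
    sample_id: str

    def get_chunk_str(self) -> str:
        return f"{self.shard_id}:chunk:{self.chunk_id}"

job = JobId(shard_id="data-0000", chunk_id=str(3), sample_id=str(3042))
print(job.get_chunk_str())  # data-0000:chunk:3
```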
@@ -306,8 +305,15 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         assigned = []
 
         with self.lock:
-            while len(assigned) < count and self.pending_units:
+            units_checked = 0
+            max_units_to_check = len(self.pending_units)
+
+            while len(assigned) < count and units_checked < max_units_to_check:
+                if not self.pending_units:
+                    break
+
                 unit_id = self.pending_units.popleft()
+                units_checked += 1
                 unit = self.work_units.get(unit_id)
 
                 if unit:
@@ -316,6 +322,16 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                     chunk_state = self.chunk_tracker.chunks[unit_id]
                     relative_unprocessed = chunk_state.get_unprocessed_ranges()
 
+                    # If no unprocessed ranges, mark as completed and skip
+                    if not relative_unprocessed:
+                        logger.info(
+                            f"Chunk {unit_id} has no unprocessed ranges, marking as completed"
+                        )
+                        self.chunk_tracker.mark_completed(unit_id)
+                        # Remove from work units
+                        del self.work_units[unit_id]
+                        continue
+
                     # Convert relative to absolute indices
                     absolute_ranges = []
                     for start, end in relative_unprocessed:
@@ -335,6 +351,9 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
 
                 if self.chunk_tracker:
                     self.chunk_tracker.mark_assigned(unit_id, worker_id)
+            else:
+                # Put it back if we couldn't get the unit
+                self.pending_units.append(unit_id)
 
         logger.debug(f"Assigned {len(assigned)} units to worker {worker_id}")
         return assigned
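Taken together, the three hunks above bound each assignment call to a single pass over the queue: IDs whose unit is missing are re-appended, and units_checked stops the loop before they come around again in the same call. A self-contained sketch of the pattern, with hypothetical names standing in for the processor's state:

```python
from collections import deque

# Standalone model of the bounded-scan assignment loop; `pending` and `units`
# stand in for the processor's pending_units deque and work_units dict.
def assign(pending: deque, units: dict, count: int) -> list:
    assigned = []
    units_checked = 0
    max_units_to_check = len(pending)  # one pass over the queue, no more

    while len(assigned) < count and units_checked < max_units_to_check:
        if not pending:
            break
        unit_id = pending.popleft()
        units_checked += 1
        unit = units.get(unit_id)
        if unit:
            assigned.append(unit)
        else:
            pending.append(unit_id)  # re-queue; not re-checked this call

    return assigned

print(assign(deque(["a", "b", "c"]), {"a": "unit-a", "c": "unit-c"}, 2))
# ['unit-a', 'unit-c'] -- "b" is re-queued for a later call
```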
@@ -394,8 +413,20 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         logger.info(f"Released {len(unit_ids)} assignments from {worker_id}")
 
     def handle_result(self, result: WorkResult) -> Dict[str, Any]:
-        """Handle result from worker."""
-        # Track processed items if we have chunk tracker
+        """Handle result from worker and update chunk tracker."""
+        # Extract the actual item index from the metadata
+        item_index = result.metadata.get("_item_index", None)
+
+        # If we have an item index, mark it as processed in the chunk tracker
+        if self.chunk_tracker and item_index is not None and result.chunk_id:
+            try:
+                # Mark single item as processed
+                self.chunk_tracker.mark_items_processed(result.chunk_id, item_index, item_index)
+                # logger.debug(f"Marked item {item_index} as processed in chunk {result.chunk_id}")
+            except Exception as e:
+                logger.error(f"Error marking item {item_index} as processed: {e}")
+
+        # Also handle batch results if present (backward compatibility)
         if self.chunk_tracker and "item_indices" in result.metadata:
             indices = result.metadata["item_indices"]
 
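Both paths funnel into mark_items_processed(chunk_id, start, end) with inclusive bounds; a single item is just the degenerate range (index, index). The batch path's conversion of indices to ranges (elided from this hunk) amounts to collapsing sorted indices into runs, roughly like this hypothetical helper (not the package's own code):

```python
# Collapse indices into sorted inclusive (start, end) runs.
def to_ranges(indices):
    ranges = []
    for idx in sorted(set(indices)):
        if ranges and idx == ranges[-1][1] + 1:
            ranges[-1] = (ranges[-1][0], idx)  # extend the current run
        else:
            ranges.append((idx, idx))  # single items stay degenerate
    return ranges

assert to_ranges([7, 3, 4, 5, 9]) == [(3, 5), (7, 7), (9, 9)]
```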
@@ -419,6 +450,9 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             # Mark ranges as processed
             for start_idx, end_idx in ranges:
                 self.chunk_tracker.mark_items_processed(result.chunk_id, start_idx, end_idx)
+                logger.debug(
+                    f"Marked range {start_idx}-{end_idx} as processed in chunk {result.chunk_id}"
+                )
 
         return {
             "source_id": result.source_id,
@@ -499,8 +533,8 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             for start_idx, end_idx in ranges:
                 self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
 
-        # Save checkpoint after updating
-        self.chunk_tracker.save()
+        # Flush checkpoint after major update
+        self.chunk_tracker.flush()
 
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
@@ -537,9 +571,9 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         if self.unit_creation_thread:
             self.unit_creation_thread.join(timeout=5)
 
-        # Save checkpoint
+        # Flush final checkpoint on cleanup
         if self.chunk_tracker:
-            self.chunk_tracker.save_checkpoint()
+            self.chunk_tracker.flush()
 
 
 class WebDatasetWorkerProcessor(WorkerProcessor):
@@ -559,6 +593,9 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         self.dataset_path = dataset_cfg.get("dataset_path")
         metadata_path = dataset_cfg.get("metadata_path", None)
         self.mock_results = dataset_cfg.get("mock_results", False)
+        split_worker_cache = dataset_cfg.get(
+            "split_worker_cache", True
+        )  # multiple workers get their own cache by default
 
         # Cache configuration
         cache_dir = Path(cfg.get("cache_dir", "./webshart_cache"))
@@ -574,7 +611,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         # Enable caching
         self.dataset.enable_metadata_cache(location=str(cache_dir / "metadata_cache"))
         self.dataset.enable_shard_cache(
-            location=str(cache_dir / "shard_cache"),
+            location=(
+                str(cache_dir / "shard_cache" / str(self.gpu_id))
+                if split_worker_cache
+                else str(cache_dir / "shard_cache")
+            ),
             cache_limit_gb=cfg.get("shard_cache_gb", 10.0),
         )
 
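With split_worker_cache defaulting to True, each worker keys its shard cache under its gpu_id, presumably so concurrent workers on one host do not contend for a single size-limited cache directory. An illustrative resolution of the path (the config keys mirror the two hunks above; the surrounding cfg structure is assumed):

```python
from pathlib import Path

# Assumed config shape; only cache_dir, split_worker_cache, and gpu_id matter here.
cfg = {"cache_dir": "./webshart_cache"}
dataset_cfg = {"split_worker_cache": True}
gpu_id = 0  # each worker process would pass its own id

cache_dir = Path(cfg.get("cache_dir", "./webshart_cache"))
split_worker_cache = dataset_cfg.get("split_worker_cache", True)
location = (
    str(cache_dir / "shard_cache" / str(gpu_id))
    if split_worker_cache
    else str(cache_dir / "shard_cache")
)
print(location)  # webshart_cache/shard_cache/0
```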
@@ -646,7 +687,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
             # Iterate through the range
             for idx in range(start_idx, end_idx + 1):
                 try:
-                    entry = next(self.loader)
+                    entry = webshart.next_with_cache_wait(self.loader)
 
                     # Decode image
                     image = None
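Finally, the worker now pulls entries via webshart.next_with_cache_wait rather than bare next(). A hedged sketch of the consuming loop, assuming the call behaves like next() but waits for the shard cache and raises StopIteration when the loader is exhausted (those semantics are inferred from the one-line change above, not documented here):

```python
import webshart  # imported at the top of webdataset.py in 0.4.0

# Hypothetical consumption loop around next_with_cache_wait.
def iter_entries(loader, start_idx, end_idx):
    for idx in range(start_idx, end_idx + 1):  # inclusive bounds, as above
        try:
            entry = webshart.next_with_cache_wait(loader)
        except StopIteration:
            break  # loader exhausted before the requested range ended
        yield idx, entry
```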