caption-flow: caption_flow-0.3.4-py3-none-any.whl → caption_flow-0.4.0-py3-none-any.whl
This diff shows the contents of publicly released package versions as published to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- caption_flow/__init__.py +3 -3
- caption_flow/cli.py +934 -415
- caption_flow/models.py +45 -3
- caption_flow/monitor.py +2 -3
- caption_flow/orchestrator.py +153 -104
- caption_flow/processors/__init__.py +3 -3
- caption_flow/processors/base.py +8 -7
- caption_flow/processors/huggingface.py +439 -67
- caption_flow/processors/local_filesystem.py +24 -28
- caption_flow/processors/webdataset.py +28 -22
- caption_flow/storage/exporter.py +420 -339
- caption_flow/storage/manager.py +636 -756
- caption_flow/utils/__init__.py +1 -1
- caption_flow/utils/auth.py +1 -1
- caption_flow/utils/caption_utils.py +1 -1
- caption_flow/utils/certificates.py +15 -8
- caption_flow/utils/checkpoint_tracker.py +30 -28
- caption_flow/utils/chunk_tracker.py +153 -56
- caption_flow/utils/image_processor.py +9 -9
- caption_flow/utils/json_utils.py +37 -20
- caption_flow/utils/prompt_template.py +24 -16
- caption_flow/utils/vllm_config.py +5 -4
- caption_flow/viewer.py +4 -12
- caption_flow/workers/base.py +5 -4
- caption_flow/workers/caption.py +265 -90
- caption_flow/workers/data.py +6 -8
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
- caption_flow-0.4.0.dist-info/RECORD +33 -0
- caption_flow-0.3.4.dist-info/RECORD +0 -33
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/processors/local_filesystem.py

```diff
@@ -1,27 +1,27 @@
 """Local filesystem datasets processor implementation."""
 
+import asyncio
+import io
 import logging
-import …
+import mimetypes
 import os
-
-from collections import …
+import threading
+from collections import defaultdict, deque
 from pathlib import Path
-import …
-
-import mimetypes
-from datetime import datetime
-from PIL import Image
+from typing import Any, Deque, Dict, Iterator, List, Optional, Set, Tuple
+
 import aiofiles
-from fastapi import FastAPI, HTTPException, Response
-from fastapi.responses import StreamingResponse
-import uvicorn
-import asyncio
 import requests
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from PIL import Image
 
 from caption_flow.storage import StorageManager
-
-from ..utils import ChunkTracker
+
 from ..models import JobId
+from ..utils import ChunkTracker
+from .base import OrchestratorProcessor, ProcessorConfig, WorkerProcessor, WorkResult, WorkUnit
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
```
```diff
@@ -217,23 +217,19 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
         if not self.chunk_tracker:
             return
 
-
+        storage.get_all_processed_job_ids()
 
         with self.lock:
             for chunk_id, chunk_state in self.chunk_tracker.chunks.items():
-                # …
-
-                    chunk_state.start_index,
-                    chunk_state.start_index + chunk_state.chunk_size - 1,
-                )
-
-                # Get processed indices for this chunk
-                processed_ranges = self.chunk_tracker.get_processed_indices_for_chunk(
-                    chunk_id, all_processed_jobs
-                )
+                # Get unprocessed ranges (relative coordinates from ChunkTracker)
+                relative_unprocessed_ranges = chunk_state.get_unprocessed_ranges()
 
-                # …
-                unprocessed_ranges = …
+                # Convert relative ranges to absolute ranges
+                unprocessed_ranges = []
+                for start, end in relative_unprocessed_ranges:
+                    abs_start = chunk_state.start_index + start
+                    abs_end = chunk_state.start_index + end
+                    unprocessed_ranges.append((abs_start, abs_end))
 
                 if unprocessed_ranges:
                     # Create work unit for unprocessed items
```
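The rewritten restore path asks each chunk's tracker state for its unprocessed ranges in chunk-relative coordinates and shifts them by `chunk_state.start_index`. A minimal sketch of that conversion; `ChunkStateStub` and its `get_unprocessed_ranges` are hypothetical stand-ins for the real ChunkTracker state, shown only to illustrate the coordinate shift:

```python
# Sketch of the relative-to-absolute range conversion shown above.
# ChunkStateStub is a hypothetical stand-in, not the real ChunkTracker state class.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class ChunkStateStub:
    start_index: int                   # absolute index of the chunk's first sample
    chunk_size: int
    processed: List[Tuple[int, int]]   # processed ranges, chunk-relative

    def get_unprocessed_ranges(self) -> List[Tuple[int, int]]:
        """Return chunk-relative (start, end) ranges that still need work."""
        covered = set()
        for lo, hi in self.processed:
            covered.update(range(lo, hi + 1))
        gaps, run = [], None
        for i in range(self.chunk_size):
            if i not in covered:
                run = (run[0], i) if run else (i, i)
            elif run:
                gaps.append(run)
                run = None
        if run:
            gaps.append(run)
        return gaps


chunk = ChunkStateStub(start_index=1000, chunk_size=10, processed=[(0, 3), (7, 9)])
unprocessed = [
    (chunk.start_index + start, chunk.start_index + end)
    for start, end in chunk.get_unprocessed_ranges()
]
print(unprocessed)  # [(1004, 1006)] -- absolute sample indices 1004..1006 remain
```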
```diff
@@ -588,7 +584,7 @@ class LocalFilesystemWorkerProcessor(WorkerProcessor):
         processed_indices = []
 
         # Get orchestrator info if we need HTTP
-
+        context.get("orchestrator")
 
         for idx in sorted(indices_to_process):
             try:
```
caption_flow/processors/webdataset.py

```diff
@@ -1,28 +1,27 @@
 """WebDataset processor implementation using webshart TarDataLoader."""
 
-import logging
-import threading
 import gc
+import io
+import logging
 import os
-
-from collections import …
+import threading
+from collections import defaultdict, deque
 from pathlib import Path
-import …
-
+from typing import Any, Deque, Dict, Iterator, List, Optional, Set
+
+import cv2
+import numpy as np
+import webshart
 from PIL import Image
-import io
 
 from caption_flow.models import JobId
 from caption_flow.storage import StorageManager
-from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
-from ..utils import ChunkTracker
 
-import …
-import …
-import numpy as np
+from ..utils import ChunkTracker
+from .base import OrchestratorProcessor, ProcessorConfig, WorkerProcessor, WorkResult, WorkUnit
 
 logger = logging.getLogger(__name__)
-logger.setLevel(…
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
 class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
```
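Beyond the import reordering, the functional change here is the logger level: instead of being hard-coded, it is now read from the `CAPTIONFLOW_LOG_LEVEL` environment variable, defaulting to INFO. A standalone sketch of the same pattern (not caption-flow code):

```python
# Sketch of the environment-driven log level used above.
import logging
import os

logging.basicConfig()
logger = logging.getLogger("example")

# Logger.setLevel accepts level names as strings, so "debug" from the
# environment becomes "DEBUG" and maps to logging.DEBUG.
logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())

logger.debug("only emitted when CAPTIONFLOW_LOG_LEVEL=debug is exported")
logger.info("emitted at the default INFO level")
```

Unknown level names raise `ValueError`, so a typo in the variable fails loudly rather than silently falling back.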
```diff
@@ -217,7 +216,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         while units_created < units_needed and not self.stop_creation.is_set():
             # Get current shard info
             if current_shard_idx >= self.dataset.num_shards:
-
+                threading.Event().wait(5)
                 break
 
             shard_info = self._get_shard_info_cached(current_shard_idx)
```
```diff
@@ -240,8 +239,8 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             self.current_chunk_index = current_file_idx // self.chunk_size
             job_id_obj = JobId(
                 shard_id=shard_name,
-                chunk_id=self.current_chunk_index,
-                sample_id=current_file_idx,
+                chunk_id=str(self.current_chunk_index),
+                sample_id=str(current_file_idx),
             )
             chunk_id = job_id_obj.get_chunk_str()
 
```
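`chunk_id` and `sample_id` are now passed to `JobId` as strings rather than ints, so identifiers built from them compare and key consistently. The sketch below uses a hypothetical `JobIdStub`; its fields and the `get_chunk_str` format are illustrative and not the real `caption_flow.models.JobId`:

```python
# Hypothetical stand-in for caption_flow.models.JobId, only to illustrate why the
# orchestrator stringifies chunk_id and sample_id before constructing the job id.
from dataclasses import dataclass


@dataclass(frozen=True)
class JobIdStub:
    shard_id: str
    chunk_id: str
    sample_id: str

    def get_chunk_str(self) -> str:
        # Illustrative format only.
        return f"{self.shard_id}:chunk:{self.chunk_id}"


# Mixing int and str forms would produce ids that compare unequal even though they
# name the same chunk; stringifying at construction time keeps lookups consistent.
a = JobIdStub(shard_id="shard-0000", chunk_id=str(3), sample_id=str(1500))
b = JobIdStub(shard_id="shard-0000", chunk_id="3", sample_id="1500")
assert a == b and a.get_chunk_str() == "shard-0000:chunk:3"
```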
```diff
@@ -534,8 +533,8 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             for start_idx, end_idx in ranges:
                 self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
 
-            # …
-            self.chunk_tracker.…
+            # Flush checkpoint after major update
+            self.chunk_tracker.flush()
 
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
```
```diff
@@ -572,9 +571,9 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         if self.unit_creation_thread:
             self.unit_creation_thread.join(timeout=5)
 
-        # …
+        # Flush final checkpoint on cleanup
         if self.chunk_tracker:
-            self.chunk_tracker.…
+            self.chunk_tracker.flush()
 
 
 class WebDatasetWorkerProcessor(WorkerProcessor):
```
```diff
@@ -594,6 +593,9 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         self.dataset_path = dataset_cfg.get("dataset_path")
         metadata_path = dataset_cfg.get("metadata_path", None)
         self.mock_results = dataset_cfg.get("mock_results", False)
+        split_worker_cache = dataset_cfg.get(
+            "split_worker_cache", True
+        )  # multiple workers get their own cache by default
 
         # Cache configuration
         cache_dir = Path(cfg.get("cache_dir", "./webshart_cache"))
```
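This hunk introduces a `split_worker_cache` flag, read from the dataset config with a default of `True`. A hedged example of a config dict carrying the keys this code reads; the nesting under a `dataset` section is an assumption inferred from the `dataset_cfg`/`cfg` split, not documented caption-flow configuration:

```python
# Example config dict assembled from the keys read via dataset_cfg.get() and
# cfg.get() in this hunk; the surrounding structure is an assumption.
cfg = {
    "dataset": {
        "dataset_path": "hf://datasets/example/webdataset-shards",  # placeholder path
        "metadata_path": None,
        "mock_results": False,
        "split_worker_cache": True,  # give each worker its own shard cache (default)
    },
    "cache_dir": "./webshart_cache",
    "shard_cache_gb": 10.0,
}

dataset_cfg = cfg["dataset"]
split_worker_cache = dataset_cfg.get("split_worker_cache", True)
print(split_worker_cache)  # True unless explicitly disabled in the config
```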
```diff
@@ -609,7 +611,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         # Enable caching
         self.dataset.enable_metadata_cache(location=str(cache_dir / "metadata_cache"))
         self.dataset.enable_shard_cache(
-            location=…
+            location=(
+                str(cache_dir / "shard_cache" / str(self.gpu_id))
+                if split_worker_cache
+                else str(cache_dir / "shard_cache")
+            ),
             cache_limit_gb=cfg.get("shard_cache_gb", 10.0),
         )
 
```
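With `split_worker_cache` enabled, each worker derives its own shard-cache directory from its `gpu_id`; otherwise all workers share one directory. A small sketch of the path selection (the `gpu_id` values and base directory are illustrative):

```python
# Sketch of the cache-path selection above.
from pathlib import Path

cache_dir = Path("./webshart_cache")
split_worker_cache = True

for gpu_id in (0, 1):
    location = (
        str(cache_dir / "shard_cache" / str(gpu_id))
        if split_worker_cache
        else str(cache_dir / "shard_cache")
    )
    print(location)
# POSIX output:
#   webshart_cache/shard_cache/0
#   webshart_cache/shard_cache/1
```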
```diff
@@ -681,7 +687,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         # Iterate through the range
         for idx in range(start_idx, end_idx + 1):
             try:
-                entry = …
+                entry = webshart.next_with_cache_wait(self.loader)
 
                 # Decode image
                 image = None
```