caption-flow 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +3 -3
- caption_flow/cli.py +934 -415
- caption_flow/models.py +45 -3
- caption_flow/monitor.py +2 -3
- caption_flow/orchestrator.py +153 -104
- caption_flow/processors/__init__.py +3 -3
- caption_flow/processors/base.py +8 -7
- caption_flow/processors/huggingface.py +439 -67
- caption_flow/processors/local_filesystem.py +24 -28
- caption_flow/processors/webdataset.py +28 -22
- caption_flow/storage/exporter.py +420 -339
- caption_flow/storage/manager.py +636 -756
- caption_flow/utils/__init__.py +1 -1
- caption_flow/utils/auth.py +1 -1
- caption_flow/utils/caption_utils.py +1 -1
- caption_flow/utils/certificates.py +15 -8
- caption_flow/utils/checkpoint_tracker.py +30 -28
- caption_flow/utils/chunk_tracker.py +153 -56
- caption_flow/utils/image_processor.py +9 -9
- caption_flow/utils/json_utils.py +37 -20
- caption_flow/utils/prompt_template.py +24 -16
- caption_flow/utils/vllm_config.py +5 -4
- caption_flow/viewer.py +4 -12
- caption_flow/workers/base.py +5 -4
- caption_flow/workers/caption.py +265 -90
- caption_flow/workers/data.py +6 -8
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
- caption_flow-0.4.0.dist-info/RECORD +33 -0
- caption_flow-0.3.4.dist-info/RECORD +0 -33
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/models.py
CHANGED

@@ -1,12 +1,18 @@
 """Data models for CaptionFlow."""
 
-import
+import datetime as _datetime
+import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple
+
 from PIL import Image
 
+logger = logging.getLogger(__name__)
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
+
 
 class JobStatus(Enum):
     """Job processing status."""
@@ -37,7 +43,7 @@ class Job:
 
     def __post_init__(self):
         if self.created_at is None:
-            self.created_at = datetime.
+            self.created_at = datetime.now(_datetime.UTC)
 
 
 @dataclass
@@ -69,7 +75,43 @@ class JobId:
         parts = job_id.split(":")
         if len(parts) != 5:
             raise ValueError(f"Invalid job_id format: {job_id}")
-
+
+        shard_id = parts[0]
+        chunk_keyword = parts[1]
+        chunk_id = parts[2]
+        idx_keyword = parts[3]
+        sample_id = parts[4]
+
+        # Validate format
+        if not shard_id:
+            raise ValueError(f"Invalid job_id format: empty shard_id in {job_id}")
+        if chunk_keyword != "chunk":
+            raise ValueError(
+                f"Invalid job_id format: expected 'chunk' keyword, got '{chunk_keyword}' in {job_id}"
+            )
+        if idx_keyword != "idx":
+            raise ValueError(
+                f"Invalid job_id format: expected 'idx' keyword, got '{idx_keyword}' in {job_id}"
+            )
+
+        # Validate numeric fields
+        try:
+            int(chunk_id)
+        except ValueError:
+            raise ValueError(
+                f"Invalid job_id format: chunk_id must be numeric, got '{chunk_id}' in {job_id}"
+            )
+
+        # sample_id can be empty/None for some use cases, but if provided must be numeric
+        if sample_id:
+            try:
+                int(sample_id)
+            except ValueError:
+                raise ValueError(
+                    f"Invalid job_id format: sample_id must be numeric if provided, got '{sample_id}' in {job_id}"
+                )
+
+        return JobId(shard_id=shard_id, chunk_id=chunk_id, sample_id=sample_id)
 
 
 @dataclass
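The new `JobId.from_str` validation pins down the job ID layout: five colon-separated fields of the form `<shard>:chunk:<chunk_id>:idx:<sample_id>`, with literal `chunk` and `idx` keywords and numeric IDs. A minimal sketch of what that accepts and rejects (the shard name below is made up for illustration):

from caption_flow.models import JobId

# Well-formed: five fields, literal "chunk" and "idx", numeric chunk_id and sample_id.
job = JobId.from_str("my-shard-0001:chunk:42:idx:7")
print(job.shard_id, job.chunk_id, job.sample_id)  # my-shard-0001 42 7

# Malformed: a wrong keyword in the second field raises ValueError.
try:
    JobId.from_str("my-shard-0001:part:42:idx:7")
except ValueError as exc:
    print(exc)

Worth noting: `datetime.now(_datetime.UTC)` relies on the `datetime.UTC` alias introduced in Python 3.11, so the timezone-aware timestamps in this release assume a 3.11+ runtime (older interpreters would need `datetime.timezone.utc`).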
caption_flow/monitor.py
CHANGED

@@ -4,9 +4,8 @@ import asyncio
 import json
 import logging
 import ssl
-import time
 from datetime import datetime
-from typing import
+from typing import Any, Dict, Optional
 
 import websockets
 from rich.console import Console
@@ -110,7 +109,7 @@ class Monitor:
         """Main display update loop."""
         layout = self._create_layout()
 
-        with Live(layout, console=self.console, refresh_per_second=1, screen=True)
+        with Live(layout, console=self.console, refresh_per_second=1, screen=True):
             while self.running:
                 self._update_layout(layout)
                 await asyncio.sleep(0.25)
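The monitor change rewrites the `with Live(...)` line so the rich Live display is used as a plain context manager, and trims unused imports. A standalone sketch of the same pattern, independent of the Monitor class and its layout helpers:

import asyncio

from rich.layout import Layout
from rich.live import Live


async def display_loop() -> None:
    layout = Layout(name="root")
    layout.update("starting...")
    # Live re-renders the layout in place instead of printing new lines.
    with Live(layout, refresh_per_second=1, screen=True):
        for tick in range(5):
            layout.update(f"tick {tick}")
            await asyncio.sleep(0.25)


if __name__ == "__main__":
    asyncio.run(display_loop())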
caption_flow/orchestrator.py
CHANGED

@@ -1,34 +1,34 @@
-import time
 import asyncio
+import datetime as _datetime
 import json
 import logging
+import os
 import ssl
+import time
 import uuid
+from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
-from typing import
-from collections import defaultdict
-import threading
+from typing import Any, Dict, Optional, Set
 
 import websockets
-from websockets.server import
+from websockets.asyncio.server import ServerConnection
 
-from .storage import StorageManager
 from .models import Caption, Contributor, JobId
-from .utils.auth import AuthManager
-from .utils.json_utils import safe_json_dumps
 from .processors import (
+    HuggingFaceDatasetOrchestratorProcessor,
+    LocalFilesystemOrchestratorProcessor,
     ProcessorConfig,
+    WebDatasetOrchestratorProcessor,
     WorkAssignment,
     WorkResult,
-    WorkUnit,
-    WebDatasetOrchestratorProcessor,
-    HuggingFaceDatasetOrchestratorProcessor,
-    LocalFilesystemOrchestratorProcessor,
 )
+from .storage import StorageManager
+from .utils.auth import AuthManager
+from .utils.json_utils import safe_json_dumps
 
 logger = logging.getLogger(__name__)
-logger.setLevel(
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
 class Orchestrator:
@@ -69,8 +69,8 @@ class Orchestrator:
         self.chunks_per_request = config.get("chunks_per_request", 2)
 
         # Track connections
-        self.workers: Dict[str,
-        self.monitors: Set[
+        self.workers: Dict[str, ServerConnection] = {}
+        self.monitors: Set[ServerConnection] = set()
         self.workers_by_user = defaultdict(set)
 
         # SSL configuration
@@ -160,11 +160,11 @@ class Orchestrator:
         processed_job_ids = self.storage.get_all_processed_job_ids()
         self.processor.update_from_storage(processed_job_ids)
 
-    async def _send_leaderboard_to_monitor(self, websocket:
+    async def _send_leaderboard_to_monitor(self, websocket: ServerConnection):
         """Alias for _send_monitor_leaderboard for backward compatibility."""
         await self._send_monitor_leaderboard(websocket)
 
-    async def handle_connection(self, websocket:
+    async def handle_connection(self, websocket: ServerConnection):
         """Handle new WebSocket connection."""
         try:
             # Authenticate
@@ -193,7 +193,7 @@ class Orchestrator:
             logger.error(f"Connection error: {e}", exc_info=True)
             await websocket.close()
 
-    async def _handle_worker(self, websocket:
+    async def _handle_worker(self, websocket: ServerConnection, auth_ticket):
         """Handle worker connection lifecycle."""
         # Generate unique worker ID
         base_name = getattr(auth_ticket, "name", "worker")
@@ -250,9 +250,38 @@ class Orchestrator:
         self.processor.release_assignments(worker_id)
         logger.info(f"Worker {worker_id} has safely disconnected")
 
-
-        self,
-    ):
+    def _auth_configs_equal(
+        self, current_config: Dict[str, Any], new_config: Dict[str, Any]
+    ) -> bool:
+        """Compare two auth configurations for equality."""
+
+        # Helper function to normalize token lists for comparison
+        def normalize_tokens(tokens):
+            if not tokens:
+                return []
+            # Sort by token for consistent comparison
+            return sorted(
+                [{"name": t.get("name"), "token": t.get("token")} for t in tokens],
+                key=lambda x: x.get("token", ""),
+            )
+
+        # Compare each token type
+        current_workers = normalize_tokens(current_config.get("worker_tokens", []))
+        new_workers = normalize_tokens(new_config.get("worker_tokens", []))
+
+        current_admins = normalize_tokens(current_config.get("admin_tokens", []))
+        new_admins = normalize_tokens(new_config.get("admin_tokens", []))
+
+        current_monitors = normalize_tokens(current_config.get("monitor_tokens", []))
+        new_monitors = normalize_tokens(new_config.get("monitor_tokens", []))
+
+        return (
+            current_workers == new_workers
+            and current_admins == new_admins
+            and current_monitors == new_monitors
+        )
+
+    async def _handle_config_reload(self, websocket: ServerConnection, new_config: Dict[str, Any]):
         """Handle configuration reload request."""
         logger.info("Processing configuration reload request")
 
@@ -293,8 +322,16 @@ class Orchestrator:
         # Update auth configuration
         if "auth" in orchestrator_config:
             try:
-
-
+                current_auth_config = self.config.get("auth", {})
+                new_auth_config = orchestrator_config["auth"]
+
+                # Only recreate AuthManager if auth config has actually changed
+                if not self._auth_configs_equal(current_auth_config, new_auth_config):
+                    self.auth = AuthManager(new_auth_config)
+                    updated_sections.append("auth")
+                    logger.info("Auth configuration updated due to changes")
+                else:
+                    logger.info("Auth configuration unchanged, preserving existing AuthManager")
             except Exception as e:
                 logger.error(f"Failed to update AuthManager: {e}")
                 warnings.append(f"Auth update failed: {e}")
@@ -344,7 +381,7 @@ class Orchestrator:
             assignment_id=str(uuid.uuid4()),
             worker_id=worker_id,
             units=units,
-            assigned_at=datetime.
+            assigned_at=datetime.now(_datetime.UTC),
         )
 
         await self.workers[worker_id].send(
@@ -374,80 +411,93 @@ class Orchestrator:
         logger.debug(f"Heartbeat from {worker_id}: {data}")
 
     async def _handle_results_submission(self, worker_id: str, data: Dict):
-        """Process results submission from worker."""
-        #
-        # Create work result
-        _job_id = data.get("job_id")
-        job_id = JobId.from_str(_job_id)
-        shard_name = job_id.shard_id
-        chunk_name = job_id.chunk_id
-
-        result = WorkResult(
-            unit_id=data["unit_id"],
-            source_id=shard_name,
-            chunk_id=job_id.get_chunk_str(),
-            sample_id=data["sample_id"],
-            dataset=data["dataset"],
-            outputs=data["outputs"],
-            metadata=data.get("metadata", {}),
-            processing_time_ms=data.get("processing_time_ms", 0),
-        )
+        """Process results submission from worker - fires off async task and returns immediately."""
+        # Fire and forget - process in background
+        asyncio.create_task(self._process_result_async(worker_id, data))
 
+    async def _process_result_async(self, worker_id: str, data: Dict):
+        """Actually process the result in background."""
+        try:
+            # Extract user from worker_id
+            worker_user = worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+
+            # Create work result
+            _job_id = data.get("job_id")
+            job_id = JobId.from_str(_job_id)
+            shard_name = job_id.shard_id
+            chunk_name = job_id.chunk_id
+
+            result = WorkResult(
+                unit_id=data["unit_id"],
+                source_id=shard_name,
+                chunk_id=job_id.get_chunk_str(),
+                sample_id=data["sample_id"],
+                dataset=data["dataset"],
+                outputs=data["outputs"],
+                metadata=data.get("metadata", {}),
+                processing_time_ms=data.get("processing_time_ms", 0),
+            )
+
+            # Let processor handle any custom processing - this updates chunk tracker
+            # IMPORTANT: Call this BEFORE saving to storage so chunk tracker is updated
+            # regardless of whether the item is a duplicate
+            processed = self.processor.handle_result(result)
+
+            # Create caption record for storage
+            total_outputs = sum(len(v) for v in result.outputs.values())
+
+            filename = result.metadata.pop("_filename", None)
+            url = result.metadata.pop("_url", None)
+            image_height = result.metadata.pop("image_height", None)
+            image_width = result.metadata.pop("image_width", None)
+            file_size = result.metadata.pop("file_size", None)
+            image_format = result.metadata.pop("image_format", None)
+            result.metadata.pop("item_index", None)
+            item_key = result.metadata.pop("item_key", None)
+
+            to_delete_metadata_keys = ["_image_format", "_job_id"]
+            for key in to_delete_metadata_keys:
+                if key in result.metadata:
+                    del result.metadata[key]
+
+            caption = Caption(
+                job_id=job_id,
+                dataset=result.dataset,
+                shard=processed["source_id"],
+                chunk_id=chunk_name,
+                item_key=item_key,
+                captions=result.outputs.get("captions", []),
+                outputs=result.outputs,
+                contributor_id=worker_user,
+                timestamp=datetime.now(_datetime.UTC),
+                caption_count=total_outputs,
+                processing_time_ms=result.processing_time_ms,
+                metadata=result.metadata,
+                image_height=image_height,
+                image_width=image_width,
+                filename=filename,
+                url=url,
+                file_size=file_size,
+                image_format=image_format,
+            )
 
+            # Save to storage (might skip if duplicate)
+            saved = await self.storage.save_caption(caption)
 
+            # Update contributor stats only if actually saved
+            if saved:
+                contributor = await self.storage.get_contributor(worker_user)
+                if contributor:
+                    contributor.total_captions += total_outputs
+                    await self.storage.save_contributor(contributor)
 
+        except Exception as e:
+            logger.error(
+                f"Error processing result from {worker_id} for unit {data.get('unit_id', 'unknown')}: {e}",
+                exc_info=True,
+            )
+
+    async def _handle_monitor(self, websocket: ServerConnection):
         """Handle monitor connection."""
         self.monitors.add(websocket)
         logger.info(f"Monitor connected (total: {len(self.monitors)})")
@@ -460,7 +510,7 @@ class Orchestrator:
             await self._send_monitor_stats(websocket)
 
             # Keep connection alive
-            async for
+            async for _message in websocket:
                 pass
 
         except websockets.exceptions.ConnectionClosed:
@@ -468,7 +518,7 @@ class Orchestrator:
         finally:
             self.monitors.discard(websocket)
 
-    async def _handle_admin(self, websocket:
+    async def _handle_admin(self, websocket: ServerConnection, auth_ticket):
         """Handle admin connection."""
         admin_id = getattr(auth_ticket, "name", "admin")
         logger.info(f"Admin {admin_id} connected")
@@ -490,7 +540,7 @@ class Orchestrator:
         except websockets.exceptions.ConnectionClosed:
             logger.info(f"Admin {admin_id} disconnected")
 
-    async def _handle_data_worker(self, websocket:
+    async def _handle_data_worker(self, websocket: ServerConnection, auth_ticket):
         """Handle data worker connection."""
         worker_id = getattr(auth_ticket, "name", str(uuid.uuid4()))
         self.data_workers[worker_id] = websocket
@@ -559,7 +609,7 @@ class Orchestrator:
         finally:
             del self.data_workers[worker_id]
 
-    async def _send_monitor_initial_data(self, websocket:
+    async def _send_monitor_initial_data(self, websocket: ServerConnection):
        """Send initial data to monitor in a separate task to avoid blocking."""
        total_start = time.time()
        try:
@@ -616,7 +666,7 @@ class Orchestrator:
         except Exception as e:
             logger.error(f"Error sending initial monitor data: {e}")
 
-    async def _send_monitor_leaderboard(self, websocket:
+    async def _send_monitor_leaderboard(self, websocket: ServerConnection):
         """Send leaderboard data to a specific monitor."""
         total_start = time.time()
         try:
@@ -681,7 +731,7 @@ class Orchestrator:
         except Exception as e:
             logger.error(f"Error sending leaderboard to monitor: {e}")
 
-    async def _send_monitor_stats(self, websocket:
+    async def _send_monitor_stats(self, websocket: ServerConnection):
         """Send current stats to a monitor."""
         # Get processor stats
         processor_stats = self.processor.get_stats()
@@ -793,7 +843,7 @@ class Orchestrator:
         # Remove disconnected
         disconnected = {
             m
-            for m, r in zip(monitors_copy, results)
+            for m, r in zip(monitors_copy, results, strict=False)
             if r is not None and not isinstance(r, Exception)
         }
         self.monitors -= disconnected
@@ -864,9 +914,8 @@ class Orchestrator:
         # Log status
         if active_workers:
             logger.debug(
-                f"
+                f"Inactive workers: {len(self.workers) - len(active_workers)}/{len(active_workers)} - {', '.join(active_workers[:5])}"
             )
-            logger.debug(f"Inactive workers: {len(self.workers) - len(active_workers)}")
         # add to self.stats
         self.stats["active_workers"] = len(active_workers)
         self.stats["inactive_workers"] = len(self.workers) - len(active_workers)
@@ -890,7 +939,7 @@ class Orchestrator:
             )
             logger.debug("Saved chunk tracker checkpoint")
 
-            self.stats["last_checkpoint"] = datetime.
+            self.stats["last_checkpoint"] = datetime.now(_datetime.UTC).isoformat()
             logger.info("Storage and chunk tracker checkpoint complete")
         except Exception as e:
             logger.error(f"Error during checkpoint: {e}", exc_info=True)
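The most significant change above is structural: `_handle_results_submission` no longer processes results inline but schedules `_process_result_async` with `asyncio.create_task`, so the websocket handler returns immediately and failures are logged inside the background task. A stripped-down sketch of that fire-and-forget pattern, using a toy storage stand-in rather than CaptionFlow's real StorageManager:

import asyncio
import logging
from typing import Any, Dict

logger = logging.getLogger(__name__)


class ToyStorage:
    """Stand-in for the orchestrator's storage layer (illustrative only)."""

    async def save_caption(self, data: Dict[str, Any]) -> bool:
        await asyncio.sleep(0.01)  # pretend I/O
        return True


class ResultHandler:
    def __init__(self) -> None:
        self.storage = ToyStorage()

    async def handle_results_submission(self, worker_id: str, data: Dict[str, Any]) -> None:
        # Fire and forget: the caller is not blocked while the result is persisted.
        asyncio.create_task(self._process_result_async(worker_id, data))

    async def _process_result_async(self, worker_id: str, data: Dict[str, Any]) -> None:
        try:
            saved = await self.storage.save_caption(data)
            if saved:
                logger.info("saved result %s from %s", data.get("unit_id"), worker_id)
        except Exception:
            # Errors must be caught here; exceptions in fire-and-forget tasks
            # would otherwise only surface as "task exception was never retrieved".
            logger.exception("error processing result from %s", worker_id)


async def main() -> None:
    handler = ResultHandler()
    await handler.handle_results_submission("alice_0", {"unit_id": "u1"})
    await asyncio.sleep(0.1)  # give the background task time to complete


if __name__ == "__main__":
    asyncio.run(main())

A trade-off visible in the diff is that the orchestrator now acknowledges a submission before the caption is persisted, so deduplication and contributor-stat updates happen asynchronously in the background task.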
caption_flow/processors/__init__.py
CHANGED

@@ -1,11 +1,11 @@
 from .base import (
     OrchestratorProcessor,
-    WorkerProcessor,
     ProcessorConfig,
-    WorkUnit,
     WorkAssignment,
+    WorkerProcessor,
     WorkResult,
+    WorkUnit,
 )
 from .huggingface import HuggingFaceDatasetOrchestratorProcessor, HuggingFaceDatasetWorkerProcessor
-from .webdataset import WebDatasetOrchestratorProcessor, WebDatasetWorkerProcessor
 from .local_filesystem import LocalFilesystemOrchestratorProcessor, LocalFilesystemWorkerProcessor
+from .webdataset import WebDatasetOrchestratorProcessor, WebDatasetWorkerProcessor
caption_flow/processors/base.py
CHANGED

@@ -2,9 +2,8 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Dict, Any, List, Optional, Iterator, Tuple
 from datetime import datetime
-from
+from typing import Any, Dict, Iterator, List, Optional
 
 
 @dataclass
@@ -98,9 +97,7 @@ class WorkResult:
         return self.error is None and bool(self.outputs)
 
     def to_repr(self, filter_outputs: bool = True):
-        """
-        Print the WorkResult, optionally without captions to save on screen wall-of-text dumpage.
-        """
+        """Print the WorkResult, optionally without captions to save on screen wall-of-text dumpage."""
         if filter_outputs:
             outputs = "...filtered from logs..."
         else:
@@ -172,6 +169,8 @@ class OrchestratorProcessor(ABC):
 class WorkerProcessor(ABC):
     """Base processor for worker side - processes work units."""
 
+    gpu_id: Optional[int] = None
+
     @abstractmethod
     def initialize(self, config: ProcessorConfig) -> None:
         """Initialize the processor with configuration."""
@@ -179,18 +178,20 @@ class WorkerProcessor(ABC):
 
     @abstractmethod
     def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
-        """
-        Process a single work unit, yielding items to be captioned.
+        """Process a single work unit, yielding items to be captioned.
 
         Args:
+        ----
            unit: The work unit to process
            context: Runtime context (e.g., models, sampling params)
 
         Yields:
+        ------
            Dict containing:
            - image: PIL Image
            - metadata: Dict of metadata
            - item_key: Unique identifier for this item
+
        """
        pass
 
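The reformatted `process_unit` docstring above states the worker-side contract: yield one dict per item with `image` (a PIL image), `metadata`, and `item_key`. A minimal illustrative generator following that shape; it deliberately avoids the real `WorkUnit` and `WorkerProcessor` classes and uses a plain dict as a hypothetical unit:

from typing import Any, Dict, Iterator

from PIL import Image


def process_unit(unit: Dict[str, Any], context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
    """Yield caption-ready items in the shape the base class documents."""
    # "unit" is a plain dict standing in for a WorkUnit; the real field names
    # on WorkUnit may differ.
    for index, path in enumerate(unit.get("paths", [])):
        yield {
            "image": Image.open(path),
            "metadata": {"path": path, "index": index},
            "item_key": f"{unit.get('unit_id', 'unit')}_{index}",
        }


if __name__ == "__main__":
    # Dry run with no files: the generator simply yields nothing.
    for item in process_unit({"unit_id": "demo", "paths": []}, context={}):
        print(item["item_key"])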