caption-flow 0.1.0 (caption_flow-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
+ """JSON serialization utilities for handling special types like datetime."""
+
+ import json
+ from datetime import datetime, date
+ from decimal import Decimal
+ from pathlib import Path
+ from typing import Any, Dict, List, Union
+ from dataclasses import asdict, is_dataclass
+ from enum import Enum
+
+
+ def safe_json_dumps(obj: Any, **kwargs) -> str:
+     """
+     Safely serialize objects to JSON, handling special types.
+
+     Args:
+         obj: Object to serialize
+         **kwargs: Additional arguments to pass to json.dumps
+
+     Returns:
+         JSON string representation
+     """
+     return json.dumps(obj, default=json_serializer, **kwargs)
+
+
+ def safe_dict(obj: Any) -> Dict[str, Any]:
+     """
+     Convert an object to a dictionary, handling special types.
+
+     Args:
+         obj: Object to convert (dataclass, dict, etc.)
+
+     Returns:
+         Dictionary with JSON-serializable values
+     """
+     if is_dataclass(obj):
+         data = asdict(obj)
+     elif hasattr(obj, "__dict__"):
+         data = obj.__dict__.copy()
+     elif isinstance(obj, dict):
+         data = obj.copy()
+     else:
+         return obj
+
+     return sanitize_dict(data)
+
+
+ def sanitize_dict(data: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Recursively sanitize a dictionary to ensure all values are JSON-serializable.
+
+     Args:
+         data: Dictionary to sanitize
+
+     Returns:
+         Sanitized dictionary
+     """
+     result = {}
+
+     for key, value in data.items():
+         if value is None:
+             result[key] = None
+         elif isinstance(value, (datetime, date)):
+             result[key] = value.isoformat()
+         elif isinstance(value, Decimal):
+             result[key] = float(value)
+         elif isinstance(value, Path):
+             result[key] = str(value)
+         elif isinstance(value, Enum):
+             result[key] = value.value
+         elif isinstance(value, (list, tuple)):
+             result[key] = [sanitize_value(item) for item in value]
+         elif isinstance(value, dict):
+             result[key] = sanitize_dict(value)
+         elif is_dataclass(value):
+             result[key] = sanitize_dict(asdict(value))
+         elif hasattr(value, "__dict__"):
+             result[key] = sanitize_dict(value.__dict__)
+         else:
+             result[key] = value
+
+     return result
+
+
+ def sanitize_value(value: Any) -> Any:
+     """
+     Sanitize a single value for JSON serialization.
+
+     Args:
+         value: Value to sanitize
+
+     Returns:
+         JSON-serializable value
+     """
+     if value is None:
+         return None
+     elif isinstance(value, (datetime, date)):
+         return value.isoformat()
+     elif isinstance(value, Decimal):
+         return float(value)
+     elif isinstance(value, Path):
+         return str(value)
+     elif isinstance(value, Enum):
+         return value.value
+     elif isinstance(value, dict):
+         return sanitize_dict(value)
+     elif isinstance(value, (list, tuple)):
+         return [sanitize_value(item) for item in value]
+     elif is_dataclass(value):
+         return sanitize_dict(asdict(value))
+     elif hasattr(value, "__dict__"):
+         return sanitize_dict(value.__dict__)
+     else:
+         return value
+
+
+ def json_serializer(obj: Any) -> Any:
+     """
+     Default JSON serializer for special types.
+
+     Args:
+         obj: Object to serialize
+
+     Returns:
+         JSON-serializable representation
+
+     Raises:
+         TypeError: If object type is not supported
+     """
+     if isinstance(obj, (datetime, date)):
+         return obj.isoformat()
+     elif isinstance(obj, Decimal):
+         return float(obj)
+     elif isinstance(obj, Path):
+         return str(obj)
+     elif isinstance(obj, Enum):
+         return obj.value
+     elif type(obj).__name__ == "int64":
+         return int(obj)
+     elif is_dataclass(obj):
+         return sanitize_dict(asdict(obj))
+     elif hasattr(obj, "__dict__"):
+         return sanitize_dict(obj.__dict__)
+     else:
+         raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+
+ def parse_datetime(dt_string: Union[str, datetime, None]) -> Union[datetime, None]:
+     """
+     Parse a datetime string or return existing datetime.
+
+     Args:
+         dt_string: ISO format datetime string, datetime object, or None
+
+     Returns:
+         datetime object or None
+     """
+     if dt_string is None:
+         return None
+     elif isinstance(dt_string, datetime):
+         return dt_string
+     elif isinstance(dt_string, str):
+         try:
+             return datetime.fromisoformat(dt_string.replace("Z", "+00:00"))
+         except ValueError:
+             # Try parsing without timezone
+             return datetime.fromisoformat(dt_string)
+     else:
+         raise ValueError(f"Cannot parse datetime from {type(dt_string).__name__}")
+
+
+ # Convenience functions for common use cases
+ def to_json_dict(obj: Any) -> Dict[str, Any]:
+     """
+     Convert any object to a JSON-serializable dictionary.
+
+     This is a convenience wrapper around safe_dict.
+
+     Args:
+         obj: Object to convert
+
+     Returns:
+         JSON-serializable dictionary
+     """
+     return safe_dict(obj)
+
+
+ def to_json_string(obj: Any, indent: int = None) -> str:
+     """
+     Convert any object to a JSON string.
+
+     This is a convenience wrapper around safe_json_dumps.
+
+     Args:
+         obj: Object to convert
+         indent: Number of spaces for indentation (None for compact)
+
+     Returns:
+         JSON string
+     """
+     return safe_json_dumps(obj, indent=indent)
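
A minimal usage sketch for the serialization helpers above, not part of the wheel itself. The import path caption_flow.utils.json_utils is an assumption (this hunk's file header is not shown in the diff), and JobRecord is a made-up dataclass used only for illustration.

from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path

# Assumed module path; adjust to wherever these helpers actually live in the package.
from caption_flow.utils.json_utils import safe_dict, safe_json_dumps


@dataclass
class JobRecord:
    """Hypothetical dataclass used only for this example."""

    job_id: str
    created_at: datetime
    output_dir: Path


record = JobRecord("job-1", datetime(2024, 1, 1, tzinfo=timezone.utc), Path("/tmp/captions"))

# safe_dict: dataclass -> plain dict; the datetime becomes an ISO string, the Path becomes str.
print(safe_dict(record))

# safe_json_dumps: json.dumps with json_serializer wired in as the default= hook.
print(safe_json_dumps(record, indent=2))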
@@ -0,0 +1,164 @@
+ """vLLM configuration management utilities."""
+
+ import logging
+ from typing import Dict, Any, Optional, Tuple, List
+ from dataclasses import dataclass, field
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class VLLMConfigChange:
+     """Represents changes between vLLM configurations."""
+
+     requires_reload: bool = False
+     model_changed: bool = False
+     sampling_changed: bool = False
+     prompts_changed: bool = False
+     changed_fields: List[str] = field(default_factory=list)
+
+
+ class VLLMConfigManager:
+     """Manages vLLM configuration changes and reloading."""
+
+     # Fields that require full vLLM reload
+     RELOAD_REQUIRED_FIELDS = {
+         "model",
+         "tensor_parallel_size",
+         "max_model_len",
+         "dtype",
+         "gpu_memory_utilization",
+         "enforce_eager",
+         "limit_mm_per_prompt",
+         "disable_mm_preprocessor_cache",
+     }
+
+     # Fields that can be updated without reload
+     RUNTIME_UPDATEABLE_FIELDS = {
+         "batch_size",
+         "sampling",
+         "inference_prompts",
+     }
+
+     def __init__(self):
+         self.current_config: Optional[Dict[str, Any]] = None
+         self.current_sampling_params = None
+
+     def analyze_config_change(
+         self, old_config: Optional[Dict[str, Any]], new_config: Dict[str, Any]
+     ) -> VLLMConfigChange:
+         """Analyze differences between configs to determine required actions."""
+         change = VLLMConfigChange()
+
+         if not old_config:
+             # First time setup
+             change.requires_reload = True
+             change.model_changed = True
+             logger.info("Initial vLLM configuration - full load required")
+             return change
+
+         # Check each field for changes
+         all_keys = set(old_config.keys()) | set(new_config.keys())
+
+         for key in all_keys:
+             old_value = old_config.get(key)
+             new_value = new_config.get(key)
+
+             if old_value != new_value:
+                 change.changed_fields.append(key)
+
+                 if key in self.RELOAD_REQUIRED_FIELDS:
+                     change.requires_reload = True
+                     if key == "model":
+                         change.model_changed = True
+                         logger.info(f"Model changed from {old_value} to {new_value}")
+                 elif key == "sampling":
+                     change.sampling_changed = True
+                 elif key == "inference_prompts":
+                     change.prompts_changed = True
+
+         if change.changed_fields:
+             logger.info(f"vLLM config changes detected: {change.changed_fields}")
+             if change.requires_reload:
+                 logger.info("Changes require vLLM reload")
+             else:
+                 logger.info("Changes can be applied without reload")
+         else:
+             logger.debug("No vLLM config changes detected")
+
+         return change
+
+     def create_sampling_params(self, vllm_config: Dict[str, Any]):
+         """Create SamplingParams from config."""
+         from vllm import SamplingParams
+
+         sampling_config = vllm_config.get("sampling", {})
+
+         params = SamplingParams(
+             temperature=sampling_config.get("temperature", 0.7),
+             top_p=sampling_config.get("top_p", 0.95),
+             max_tokens=sampling_config.get("max_tokens", 256),
+             stop=sampling_config.get("stop", ["<|end|>", "<|endoftext|>", "<|im_end|>"]),
+             repetition_penalty=sampling_config.get("repetition_penalty", 1.05),
+             skip_special_tokens=sampling_config.get("skip_special_tokens", True),
+         )
+
+         self.current_sampling_params = params
+         return params
+
+     def should_reload_vllm(
+         self, old_config: Optional[Dict[str, Any]], new_config: Dict[str, Any]
+     ) -> bool:
+         """Quick check if vLLM needs to be reloaded."""
+         change = self.analyze_config_change(old_config, new_config)
+         return change.requires_reload
+
+     def get_vllm_init_params(self, vllm_config: Dict[str, Any]) -> Dict[str, Any]:
+         """Extract vLLM initialization parameters from config."""
+         return {
+             "model": vllm_config["model"],
+             "trust_remote_code": True,
+             "tensor_parallel_size": vllm_config.get("tensor_parallel_size", 1),
+             "max_model_len": vllm_config.get("max_model_len", 16384),
+             "enforce_eager": vllm_config.get("enforce_eager", True),
+             "gpu_memory_utilization": vllm_config.get("gpu_memory_utilization", 0.92),
+             "dtype": vllm_config.get("dtype", "float16"),
+             "limit_mm_per_prompt": vllm_config.get("limit_mm_per_prompt", {"image": 1}),
+             "disable_mm_preprocessor_cache": vllm_config.get("disable_mm_preprocessor_cache", True),
+         }
+
+     def requires_tokenizer_reload(
+         self, old_config: Optional[Dict[str, Any]], new_config: Dict[str, Any]
+     ) -> bool:
+         """Check if tokenizer/processor need to be reloaded."""
+         if not old_config:
+             return True
+
+         # Tokenizer/processor depend on the model
+         return old_config.get("model") != new_config.get("model")
+
+     def update_runtime_config(
+         self, vllm_instance, old_config: Dict[str, Any], new_config: Dict[str, Any]
+     ) -> Tuple[bool, Optional[Any]]:
+         """
+         Update vLLM configuration at runtime without reload.
+
+         Returns:
+             Tuple of (success, new_sampling_params)
+         """
+         change = self.analyze_config_change(old_config, new_config)
+
+         if change.requires_reload:
+             logger.warning("Config changes require reload, cannot update at runtime")
+             return False, None
+
+         # Update sampling params if changed
+         new_sampling_params = None
+         if change.sampling_changed:
+             new_sampling_params = self.create_sampling_params(new_config)
+             logger.info("Updated sampling parameters")
+
+         # Note: batch_size and prompts are handled by the worker directly
+         # as they don't affect the vLLM instance itself
+
+         return True, new_sampling_params
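
A short sketch of how VLLMConfigManager's change analysis behaves, again illustrative rather than part of the wheel. The import path and the model identifiers are placeholders; only the field names come from the source above.

# Assumed module path; this hunk's file header is not shown in the diff.
from caption_flow.utils.vllm_config import VLLMConfigManager

manager = VLLMConfigManager()

old_cfg = {
    "model": "some-org/some-vl-model",  # placeholder model id
    "batch_size": 8,
    "sampling": {"temperature": 0.7, "max_tokens": 256},
}
new_cfg = {
    "model": "some-org/some-vl-model",
    "batch_size": 16,  # runtime-updateable field
    "sampling": {"temperature": 0.5, "max_tokens": 256},
}

change = manager.analyze_config_change(old_cfg, new_cfg)
# batch_size and sampling are runtime-updateable, so no reload is required.
assert not change.requires_reload
assert change.sampling_changed
assert "batch_size" in change.changed_fields

# Changing a RELOAD_REQUIRED_FIELDS entry such as "model" flips requires_reload.
new_cfg["model"] = "some-org/another-vl-model"
assert manager.should_reload_vllm(old_cfg, new_cfg)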
caption_flow/worker.py ADDED
@@ -0,0 +1,300 @@
+ """Worker node for distributed captioning."""
+
+ import asyncio
+ import json
+ import logging
+ import ssl
+ from typing import Dict, Any, Optional
+ from pathlib import Path
+
+ import websockets
+ import websockets.exceptions
+ from websockets.client import WebSocketClientProtocol
+
+ from .models import Job, JobStatus
+ from .utils.image_processor import ImageProcessor
+
+ logger = logging.getLogger(__name__)
+
+
+ class Worker:
+     """Worker node that processes captioning jobs."""
+
+     def __init__(self, config: Dict[str, Any]):
+         self.config = config
+         self.server_url = config["server"]
+         self.token = config["token"]
+         self.name = config.get("name", "worker")
+         self.batch_size = config.get("batch_size", 32)
+
+         # Dataset configuration will be received from orchestrator
+         self.dataset_config = None
+         self.dataset_type = None
+         self.dataset_path = None
+
+         # SSL configuration
+         self.ssl_context = self._setup_ssl()
+
+         # Components
+         self.image_processor = ImageProcessor()
+
+         # State
+         self.worker_id: Optional[str] = None
+         self.websocket: Optional[WebSocketClientProtocol] = None
+         self.running = False
+         self.current_job: Optional[Job] = None
+
+         # Metrics
+         self.processed_count = 0
+         self.error_count = 0
+
+     def _setup_ssl(self) -> Optional[ssl.SSLContext]:
+         """Configure SSL context."""
+         # Check if URL is WSS (requires SSL)
+         if self.server_url.startswith("ws://"):
+             logger.warning(
+                 "Using insecure WebSocket connection (ws://). Consider using wss:// for production."
+             )
+             return None  # No SSL for ws://
+
+         if not self.config.get("verify_ssl", True):
+             # Disable SSL verification for development
+             context = ssl.create_default_context()
+             context.check_hostname = False
+             context.verify_mode = ssl.CERT_NONE
+             return context
+
+         return ssl.create_default_context()
+
+     async def start(self):
+         """Start the worker and connect to orchestrator."""
+         self.running = True
+
+         while self.running:
+             try:
+                 await self._connect_and_run()
+             except websockets.exceptions.ConnectionClosed as e:
+                 logger.warning(f"Connection closed: {e}")
+                 if self.running:
+                     logger.info("Reconnecting in 5 seconds...")
+                     await asyncio.sleep(5)
+             except Exception as e:
+                 logger.error(f"Connection error: {e}")
+                 if self.running:
+                     logger.info("Reconnecting in 5 seconds...")
+                     await asyncio.sleep(5)
+
+     async def _connect_and_run(self):
+         """Connect to orchestrator and process jobs."""
+         logger.info(f"Connecting to {self.server_url}")
+
+         try:
+             async with websockets.connect(self.server_url, ssl=self.ssl_context) as websocket:
+                 self.websocket = websocket
+
+                 # Authenticate
+                 await websocket.send(json.dumps({"token": self.token, "name": self.name}))
+
+                 # Wait for welcome message with dataset configuration
+                 welcome = await websocket.recv()
+                 welcome_data = json.loads(welcome)
+
+                 if "error" in welcome_data:
+                     logger.error(f"Authentication failed: {welcome_data['error']}")
+                     self.running = False
+                     return
+
+                 self.worker_id = welcome_data.get("worker_id")
+
+                 # Extract and store dataset configuration from orchestrator
+                 if "dataset_config" in welcome_data:
+                     self.dataset_config = welcome_data["dataset_config"]
+                     self.dataset_type = self.dataset_config.get("dataset_type")
+                     self.dataset_path = self.dataset_config.get("dataset_path")
+                     logger.info(
+                         f"Received dataset configuration from orchestrator: "
+                         f"type={self.dataset_type}, path={self.dataset_path}"
+                     )
+                 else:
+                     logger.warning("No dataset configuration received from orchestrator")
+
+                 logger.info(f"Connected as {self.worker_id}")
+
+                 # Create tasks for concurrent operations
+                 tasks = [
+                     asyncio.create_task(self._heartbeat_loop()),
+                     asyncio.create_task(self._job_processing_loop()),
+                     asyncio.create_task(self._message_handler()),
+                 ]
+
+                 try:
+                     # Wait for any task to complete (usually due to connection close)
+                     done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+
+                     # Cancel remaining tasks
+                     for task in pending:
+                         task.cancel()
+                         try:
+                             await task
+                         except asyncio.CancelledError:
+                             pass
+
+                     # Check if we had an error in completed tasks
+                     for task in done:
+                         try:
+                             task.result()
+                         except websockets.exceptions.ConnectionClosed:
+                             logger.info("WebSocket connection closed")
+                         except Exception as e:
+                             logger.error(f"Task error: {e}")
+
+                 except websockets.exceptions.ConnectionClosed:
+                     logger.info("Connection closed by orchestrator")
+
+         except websockets.exceptions.ConnectionClosed as e:
+             logger.info(f"Failed to connect: {e}")
+             raise
+         except Exception as e:
+             logger.error(f"Unexpected error in connection: {e}")
+             raise
+         finally:
+             self.websocket = None
+             self.current_job = None
+
+     async def _job_processing_loop(self):
+         """Main loop for requesting and processing jobs."""
+         while self.running and self.websocket:
+             try:
+                 # Request a job
+                 await self.websocket.send(json.dumps({"type": "request_job"}))
+
+                 # Wait a bit for response
+                 await asyncio.sleep(1)
+
+                 if self.current_job:
+                     await self._process_job(self.current_job)
+                     self.current_job = None
+                 else:
+                     # No job available, wait before requesting again
+                     await asyncio.sleep(5)
+
+             except websockets.exceptions.ConnectionClosed:
+                 logger.info("Connection closed during job processing")
+                 break
+             except Exception as e:
+                 logger.error(f"Job processing error: {e}")
+                 self.error_count += 1
+                 await asyncio.sleep(1)
+
+     async def _message_handler(self):
+         """Handle incoming messages from orchestrator."""
+         try:
+             async for message in self.websocket:
+                 try:
+                     data = json.loads(message)
+                     msg_type = data.get("type")
+
+                     if msg_type == "job":
+                         job_data = data["job"]
+                         self.current_job = Job(**job_data)
+                         logger.info(f"Received job {self.current_job.job_id}")
+
+                     elif msg_type == "no_jobs":
+                         logger.debug("No jobs available")
+
+                     elif msg_type == "ack":
+                         logger.debug(f"Job {data['job_id']} acknowledged")
+                         self.processed_count += 1
+
+                 except json.JSONDecodeError as e:
+                     logger.error(f"Invalid message format: {e}")
+                 except Exception as e:
+                     logger.error(f"Error handling message: {e}")
+
+         except websockets.exceptions.ConnectionClosed:
+             logger.info("Connection closed while waiting for messages")
+         except Exception as e:
+             logger.error(f"Message handler error: {e}")
+
+     async def _process_job(self, job: Job):
+         """Process a single captioning job."""
+         if not self.websocket:
+             logger.warning(f"No websocket connection, skipping job {job.job_id}")
+             return
+
+         logger.info(f"Processing job {job.job_id}")
+
+         try:
+             # Load and preprocess images
+             images = await self._load_images(job)
+
+             # TODO: Here you would integrate your captioning model
+             # For now, using placeholder
+             caption = f"[Generated caption for {job.item_key}]"
+
+             # Submit result
+             await self.websocket.send(
+                 json.dumps(
+                     {
+                         "type": "submit_caption",
+                         "job_id": job.job_id,
+                         "dataset": job.dataset,
+                         "shard": job.shard,
+                         "item_key": job.item_key,
+                         "caption": caption,
+                     }
+                 )
+             )
+
+             logger.info(f"Completed job {job.job_id}")
+
+         except websockets.exceptions.ConnectionClosed:
+             logger.warning(f"Connection lost while processing job {job.job_id}")
+             raise  # Re-raise to trigger reconnection
+         except Exception as e:
+             logger.error(f"Failed to process job {job.job_id}: {e}")
+
+             # Report failure if still connected
+             if self.websocket:
+                 try:
+                     await self.websocket.send(
+                         json.dumps({"type": "job_failed", "job_id": job.job_id, "error": str(e)})
+                     )
+                 except:
+                     pass  # Connection might be closed
+
+     async def _load_images(self, job: Job):
+         """Load and preprocess images for a job."""
+         # This would load actual images from the dataset
+         # Now can use self.dataset_type and self.dataset_path received from orchestrator
+         # For now, returning placeholder
+         return []
+
+     async def _heartbeat_loop(self):
+         """Send periodic heartbeats to orchestrator."""
+         while self.running and self.websocket:
+             try:
+                 await self.websocket.send(
+                     json.dumps(
+                         {
+                             "type": "heartbeat",
+                             "processed": self.processed_count,
+                             "errors": self.error_count,
+                         }
+                     )
+                 )
+                 await asyncio.sleep(30)
+             except websockets.exceptions.ConnectionClosed:
+                 logger.info("Connection closed during heartbeat")
+                 break
+             except Exception as e:
+                 logger.error(f"Heartbeat error: {e}")
+                 break
+
+     async def shutdown(self):
+         """Graceful shutdown."""
+         logger.info("Shutting down worker...")
+         self.running = False
+
+         if self.websocket:
+             await self.websocket.close()
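
Finally, a sketch of driving the Worker from a small launcher script. The configuration keys match those read in Worker.__init__; the server URL and token are placeholders, and this launcher is not part of the wheel (the package's own CLI entry point, if any, is not shown in this diff).

import asyncio

from caption_flow.worker import Worker  # module path taken from the file header above

config = {
    "server": "wss://orchestrator.example.com:8765",  # placeholder endpoint
    "token": "worker-token",                          # placeholder credential
    "name": "gpu-worker-01",
    "batch_size": 32,
    "verify_ssl": True,  # set False only for development against self-signed certs
}

worker = Worker(config)


async def main():
    try:
        # start() loops: connect, authenticate, then run the heartbeat,
        # job-request, and message-handler tasks until shutdown() is called.
        await worker.start()
    finally:
        await worker.shutdown()


if __name__ == "__main__":
    asyncio.run(main())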