vec-inf 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/client/api.py ADDED
@@ -0,0 +1,302 @@
+ """Vector Inference client for programmatic access.
+
+ This module provides the main client class for interacting with Vector Inference
+ services programmatically. It includes functionality for launching models, monitoring
+ their status, collecting metrics, and managing their lifecycle.
+
+ See Also
+ --------
+ vec_inf.client._helper : Helper classes for model inference server management
+ vec_inf.client.models : Data models for API responses
+ """
+
+ import time
+ import warnings
+ from typing import Any, Optional, Union
+
+ from vec_inf.client._exceptions import (
+     ServerError,
+     SlurmJobError,
+ )
+ from vec_inf.client._helper import (
+     ModelLauncher,
+     ModelRegistry,
+     ModelStatusMonitor,
+     PerformanceMetricsCollector,
+ )
+ from vec_inf.client._utils import run_bash_command
+ from vec_inf.client.config import ModelConfig
+ from vec_inf.client.models import (
+     LaunchOptions,
+     LaunchResponse,
+     MetricsResponse,
+     ModelInfo,
+     ModelStatus,
+     StatusResponse,
+ )
+
+
+ class VecInfClient:
+     """Client for interacting with Vector Inference programmatically.
+
+     This class provides methods for launching models, checking their status,
+     retrieving metrics, and shutting down models using the Vector Inference
+     infrastructure.
+
+     Methods
+     -------
+     list_models()
+         List all available models
+     get_model_config(model_name)
+         Get configuration for a specific model
+     launch_model(model_name, options)
+         Launch a model on the cluster
+     get_status(slurm_job_id, log_dir)
+         Get status of a running model
+     get_metrics(slurm_job_id, log_dir)
+         Get performance metrics of a running model
+     shutdown_model(slurm_job_id)
+         Shut down a running model
+     wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir)
+         Wait for a model to become ready
+
+     Examples
+     --------
+     >>> from vec_inf.client.api import VecInfClient
+     >>> client = VecInfClient()
+     >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+     >>> job_id = response.slurm_job_id
+     >>> status = client.get_status(job_id)
+     >>> if status.server_status == ModelStatus.READY:
+     ...     print(f"Model is ready at {status.base_url}")
+     >>> client.shutdown_model(job_id)
+     """
+
+     def __init__(self) -> None:
+         """Initialize the Vector Inference client."""
+         pass
+
+     def list_models(self) -> list[ModelInfo]:
+         """List all available models.
+
+         Returns
+         -------
+         list[ModelInfo]
+             List of ModelInfo objects containing information about available models,
+             including their configurations and specifications.
+         """
+         model_registry = ModelRegistry()
+         return model_registry.get_all_models()
+
+     def get_model_config(self, model_name: str) -> ModelConfig:
+         """Get the configuration for a specific model.
+
+         Parameters
+         ----------
+         model_name : str
+             Name of the model to get configuration for
+
+         Returns
+         -------
+         ModelConfig
+             Complete configuration for the specified model
+
+         Raises
+         ------
+         ModelNotFoundError
+             If the specified model is not found in the configuration
+         """
+         model_registry = ModelRegistry()
+         return model_registry.get_single_model_config(model_name)
+
+     def launch_model(
+         self, model_name: str, options: Optional[LaunchOptions] = None
+     ) -> LaunchResponse:
+         """Launch a model on the cluster.
+
+         Parameters
+         ----------
+         model_name : str
+             Name of the model to launch
+         options : LaunchOptions, optional
+             Launch options to override default configuration
+
+         Returns
+         -------
+         LaunchResponse
+             Response containing launch details including:
+             - SLURM job ID
+             - Model configuration
+             - Launch status
+
+         Raises
+         ------
+         ModelConfigurationError
+             If the model configuration is invalid
+         SlurmJobError
+             If there's an error launching the SLURM job
+         """
+         # Convert LaunchOptions to dictionary if provided
+         options_dict: dict[str, Any] = {}
+         if options:
+             options_dict = {k: v for k, v in vars(options).items() if v is not None}
+
+         # Create and use the API Launch Helper
+         model_launcher = ModelLauncher(model_name, options_dict)
+         return model_launcher.launch()
+
+     def get_status(
+         self, slurm_job_id: int, log_dir: Optional[str] = None
+     ) -> StatusResponse:
+         """Get the status of a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to check
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         StatusResponse
+             Status information including:
+             - Model name
+             - Server status
+             - Job state
+             - Base URL (if ready)
+             - Error information (if failed)
+         """
+         model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
+         return model_status_monitor.process_model_status()
+
+     def get_metrics(
+         self, slurm_job_id: int, log_dir: Optional[str] = None
+     ) -> MetricsResponse:
+         """Get the performance metrics of a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to get metrics for
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         MetricsResponse
+             Response containing:
+             - Model name
+             - Performance metrics or error message
+             - Timestamp of collection
+         """
+         performance_metrics_collector = PerformanceMetricsCollector(
+             slurm_job_id, log_dir
+         )
+
+         metrics: Union[dict[str, float], str]
+         if not performance_metrics_collector.metrics_url.startswith("http"):
+             metrics = performance_metrics_collector.metrics_url
+         else:
+             metrics = performance_metrics_collector.fetch_metrics()
+
+         return MetricsResponse(
+             model_name=performance_metrics_collector.status_info.model_name,
+             metrics=metrics,
+             timestamp=time.time(),
+         )
+
+     def shutdown_model(self, slurm_job_id: int) -> bool:
+         """Shut down a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to shut down
+
+         Returns
+         -------
+         bool
+             True if the model was successfully shut down
+
+         Raises
+         ------
+         SlurmJobError
+             If there was an error shutting down the model
+         """
+         shutdown_cmd = f"scancel {slurm_job_id}"
+         _, stderr = run_bash_command(shutdown_cmd)
+         if stderr:
+             raise SlurmJobError(f"Failed to shutdown model: {stderr}")
+         return True
+
+     def wait_until_ready(
+         self,
+         slurm_job_id: int,
+         timeout_seconds: int = 1800,
+         poll_interval_seconds: int = 10,
+         log_dir: Optional[str] = None,
+     ) -> StatusResponse:
+         """Wait until a model is ready or fails.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to wait for
+         timeout_seconds : int, optional
+             Maximum time to wait in seconds, by default 1800 (30 mins)
+         poll_interval_seconds : int, optional
+             How often to check status in seconds, by default 10
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         StatusResponse
+             Status information when the model becomes ready
+
+         Raises
+         ------
+         SlurmJobError
+             If the specified job is not found or there's an error with the job
+         ServerError
+             If the server fails to start within the timeout period
+         APIError
+             If there was an error checking the status
+
+         Notes
+         -----
+         The timeout is reset if the model is still in PENDING state after the
+         initial timeout period. This allows for longer queue times in the SLURM
+         scheduler.
+         """
+         start_time = time.time()
+
+         while True:
+             status_info = self.get_status(slurm_job_id, log_dir)
+
+             if status_info.server_status == ModelStatus.READY:
+                 return status_info
+
+             if status_info.server_status == ModelStatus.FAILED:
+                 error_message = status_info.failed_reason or "Unknown error"
+                 raise ServerError(f"Model failed to start: {error_message}")
+
+             if status_info.server_status == ModelStatus.SHUTDOWN:
+                 raise ServerError("Model was shutdown before it became ready")
+
+             # Check timeout
+             if time.time() - start_time > timeout_seconds:
+                 if status_info.server_status == ModelStatus.PENDING:
+                     warnings.warn(
+                         f"Model is still pending after {timeout_seconds} seconds, resetting timer...",
+                         UserWarning,
+                         stacklevel=2,
+                     )
+                     start_time = time.time()
+                 else:
+                     raise ServerError(
+                         f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
+                     )
+
+             # Wait before checking again
+             time.sleep(poll_interval_seconds)
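
For readers skimming the new API surface, here is a minimal end-to-end sketch using only the methods defined in this file. The model name and resource overrides are illustrative values, not defaults shipped with the package.

    from vec_inf.client.api import VecInfClient
    from vec_inf.client.models import LaunchOptions

    client = VecInfClient()

    # Launch with a couple of overrides; any LaunchOptions field left as None
    # falls back to the model's configured defaults.
    options = LaunchOptions(num_nodes=1, gpus_per_node=4)
    response = client.launch_model("Meta-Llama-3.1-8B-Instruct", options=options)

    # Block until the server is READY (raises ServerError on failure or timeout).
    status = client.wait_until_ready(response.slurm_job_id)
    print(f"Serving at {status.base_url}")

    # Grab one metrics snapshot, then cancel the SLURM job.
    metrics = client.get_metrics(response.slurm_job_id)
    print(metrics.metrics)
    client.shutdown_model(response.slurm_job_id)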
vec_inf/client/config.py ADDED
@@ -0,0 +1,128 @@
+ """Model configuration.
+
+ This module provides a Pydantic model for validating and managing model deployment
+ configurations, including hardware requirements and model specifications.
+ """
+
+ from pathlib import Path
+ from typing import Any, Optional, Union, cast
+
+ from pydantic import BaseModel, ConfigDict, Field
+ from typing_extensions import Literal
+
+ from vec_inf.client.slurm_vars import (
+     DEFAULT_ARGS,
+     MAX_CPUS_PER_TASK,
+     MAX_GPUS_PER_NODE,
+     MAX_NUM_NODES,
+     PARTITION,
+     QOS,
+ )
+
+
+ class ModelConfig(BaseModel):
+     """Pydantic model for validating and managing model deployment configurations.
+
+     A configuration class that handles validation and management of model deployment
+     settings, including model specifications, hardware requirements, and runtime
+     parameters.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model, must be alphanumeric with allowed characters: '-', '_', '.'
+     model_family : str
+         Family/architecture of the model
+     model_variant : str, optional
+         Specific variant or version of the model family
+     model_type : {'LLM', 'VLM', 'Text_Embedding', 'Reward_Modeling'}
+         Type of model architecture
+     gpus_per_node : int
+         Number of GPUs to use per node (1-MAX_GPUS_PER_NODE)
+     num_nodes : int
+         Number of nodes to use for deployment (1-MAX_NUM_NODES)
+     cpus_per_task : int, optional
+         Number of CPU cores per task (1-MAX_CPUS_PER_TASK)
+     mem_per_node : str, optional
+         Memory allocation per node in GB format (e.g., '32G')
+     vocab_size : int
+         Size of the model's vocabulary (1-1,000,000)
+     account : str, optional
+         Charge resources used by this job to the specified account
+     qos : Union[QOS, str], optional
+         Quality of Service tier for job scheduling
+     time : str, optional
+         Time limit for the job in HH:MM:SS format
+     partition : Union[PARTITION, str], optional
+         GPU partition type for job scheduling
+     venv : str, optional
+         Virtual environment or container system to use
+     log_dir : Path, optional
+         Directory path for storing logs
+     model_weights_parent_dir : Path, optional
+         Base directory containing model weights
+     vllm_args : dict[str, Any], optional
+         Additional arguments for vLLM engine configuration
+
+     Notes
+     -----
+     All fields are validated using Pydantic's validation system. The model is
+     configured to be immutable (frozen) and forbids extra fields.
+     """
+
+     model_name: str = Field(..., min_length=3, pattern=r"^[a-zA-Z0-9\-_\.]+$")
+     model_family: str = Field(..., min_length=2)
+     model_variant: Optional[str] = Field(
+         default=None, description="Specific variant/version of the model family"
+     )
+     model_type: Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"] = Field(
+         ..., description="Type of model architecture"
+     )
+     gpus_per_node: int = Field(
+         ..., gt=0, le=MAX_GPUS_PER_NODE, description="GPUs per node"
+     )
+     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
+     cpus_per_task: int = Field(
+         default=cast(int, DEFAULT_ARGS["cpus_per_task"]),
+         gt=0,
+         le=MAX_CPUS_PER_TASK,
+         description="CPUs per task",
+     )
+     mem_per_node: str = Field(
+         default=cast(str, DEFAULT_ARGS["mem_per_node"]),
+         pattern=r"^\d{1,4}G$",
+         description="Memory per node",
+     )
+     vocab_size: int = Field(..., gt=0, le=1_000_000)
+     account: Optional[str] = Field(
+         default=None, description="Account name for job scheduling"
+     )
+     qos: Union[QOS, str] = Field(
+         default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
+     )
+     time: str = Field(
+         default=cast(str, DEFAULT_ARGS["time"]),
+         pattern=r"^\d{2}:\d{2}:\d{2}$",
+         description="HH:MM:SS time limit",
+     )
+     partition: Union[PARTITION, str] = Field(
+         default=cast(str, DEFAULT_ARGS["partition"]), description="GPU partition type"
+     )
+     venv: str = Field(
+         default="singularity", description="Virtual environment/container system"
+     )
+     log_dir: Path = Field(
+         default=Path(cast(str, DEFAULT_ARGS["log_dir"])),
+         description="Log directory path",
+     )
+     model_weights_parent_dir: Path = Field(
+         default=Path(cast(str, DEFAULT_ARGS["model_weights_parent_dir"])),
+         description="Base directory for model weights",
+     )
+     vllm_args: Optional[dict[str, Any]] = Field(
+         default={}, description="vLLM engine arguments"
+     )
+
+     model_config = ConfigDict(
+         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
+     )
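
To make the validation behavior concrete, here is a small sketch of constructing a ModelConfig directly. The field values are illustrative (the vocabulary size shown is the published Llama 3.1 value); omitted fields fall back to the DEFAULT_ARGS-derived defaults above, and the frozen config rejects mutation.

    from vec_inf.client.config import ModelConfig

    config = ModelConfig(
        model_name="Meta-Llama-3.1-8B-Instruct",
        model_family="Llama-3.1",
        model_variant="8B-Instruct",
        model_type="LLM",
        gpus_per_node=4,
        num_nodes=1,
        vocab_size=128256,
    )

    # Unset fields are filled from DEFAULT_ARGS (qos, time, partition, ...).
    print(config.partition, config.qos, config.time)

    # frozen=True makes instances immutable; uncommenting this raises a
    # pydantic ValidationError.
    # config.num_nodes = 2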
vec_inf/client/models.py ADDED
@@ -0,0 +1,225 @@
+ """Data models for Vector Inference API.
+
+ This module contains the data model classes used by the Vector Inference API
+ for both request parameters and response objects.
+
+ Classes
+ -------
+ ModelStatus : Enum
+     Status states of a model
+ ModelType : Enum
+     Types of supported models
+ LaunchResponse : dataclass
+     Response from model launch operation
+ StatusResponse : dataclass
+     Response from model status check
+ MetricsResponse : dataclass
+     Response from metrics collection
+ LaunchOptions : dataclass
+     Options for model launch
+ LaunchOptionsDict : TypedDict
+     Dictionary representation of launch options
+ ModelInfo : dataclass
+     Information about available models
+ """
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any, Optional, Union
+
+
+ class ModelStatus(str, Enum):
+     """Enum representing the possible status states of a model.
+
+     Attributes
+     ----------
+     PENDING : str
+         Model is waiting for SLURM to allocate resources
+     LAUNCHING : str
+         Model is in the process of starting
+     READY : str
+         Model is running and ready to serve requests
+     FAILED : str
+         Model failed to start or encountered an error
+     SHUTDOWN : str
+         Model was intentionally stopped
+     UNAVAILABLE : str
+         Model status cannot be determined
+     """
+
+     PENDING = "PENDING"
+     LAUNCHING = "LAUNCHING"
+     READY = "READY"
+     FAILED = "FAILED"
+     SHUTDOWN = "SHUTDOWN"
+     UNAVAILABLE = "UNAVAILABLE"
+
+
+ class ModelType(str, Enum):
+     """Enum representing the possible model types.
+
+     Attributes
+     ----------
+     LLM : str
+         Large Language Model
+     VLM : str
+         Vision Language Model
+     TEXT_EMBEDDING : str
+         Text Embedding Model
+     REWARD_MODELING : str
+         Reward Modeling Model
+     """
+
+     LLM = "LLM"
+     VLM = "VLM"
+     TEXT_EMBEDDING = "Text_Embedding"
+     REWARD_MODELING = "Reward_Modeling"
+
+
+ @dataclass
+ class LaunchResponse:
+     """Response from launching a model.
+
+     Parameters
+     ----------
+     slurm_job_id : int
+         ID of the launched SLURM job
+     model_name : str
+         Name of the launched model
+     config : dict[str, Any]
+         Configuration used for the launch
+     raw_output : str
+         Raw output from the launch command (hidden from repr)
+     """
+
+     slurm_job_id: int
+     model_name: str
+     config: dict[str, Any]
+     raw_output: str = field(repr=False)
+
+
+ @dataclass
+ class StatusResponse:
+     """Response from checking a model's status.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model
+     server_status : ModelStatus
+         Current status of the server
+     job_state : Union[str, ModelStatus]
+         Current state of the SLURM job
+     raw_output : str
+         Raw output from status check (hidden from repr)
+     base_url : str, optional
+         Base URL of the model server if ready
+     pending_reason : str, optional
+         Reason for pending state if applicable
+     failed_reason : str, optional
+         Reason for failure if applicable
+     """
+
+     model_name: str
+     server_status: ModelStatus
+     job_state: Union[str, ModelStatus]
+     raw_output: str = field(repr=False)
+     base_url: Optional[str] = None
+     pending_reason: Optional[str] = None
+     failed_reason: Optional[str] = None
+
+
+ @dataclass
+ class MetricsResponse:
+     """Response from retrieving model metrics.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model
+     metrics : Union[dict[str, float], str]
+         Either a dictionary of metrics or an error message
+     timestamp : float
+         Unix timestamp of when metrics were collected
+     """
+
+     model_name: str
+     metrics: Union[dict[str, float], str]
+     timestamp: float
+
+
+ @dataclass
+ class LaunchOptions:
+     """Options for launching a model.
+
+     Parameters
+     ----------
+     model_family : str, optional
+         Family/architecture of the model
+     model_variant : str, optional
+         Specific variant/version of the model
+     partition : str, optional
+         SLURM partition to use
+     num_nodes : int, optional
+         Number of nodes to allocate
+     gpus_per_node : int, optional
+         Number of GPUs per node
+     account : str, optional
+         Account name for job scheduling
+     qos : str, optional
+         Quality of Service level
+     time : str, optional
+         Time limit for the job
+     vocab_size : int, optional
+         Size of model vocabulary
+     data_type : str, optional
+         Data type for model weights
+     venv : str, optional
+         Virtual environment to use
+     log_dir : str, optional
+         Directory for logs
+     model_weights_parent_dir : str, optional
+         Parent directory containing model weights
+     vllm_args : str, optional
+         Additional arguments for vLLM
+     """
+
+     model_family: Optional[str] = None
+     model_variant: Optional[str] = None
+     partition: Optional[str] = None
+     num_nodes: Optional[int] = None
+     gpus_per_node: Optional[int] = None
+     account: Optional[str] = None
+     qos: Optional[str] = None
+     time: Optional[str] = None
+     vocab_size: Optional[int] = None
+     data_type: Optional[str] = None
+     venv: Optional[str] = None
+     log_dir: Optional[str] = None
+     model_weights_parent_dir: Optional[str] = None
+     vllm_args: Optional[str] = None
+
+
+ @dataclass
+ class ModelInfo:
+     """Information about an available model.
+
+     Parameters
+     ----------
+     name : str
+         Name of the model
+     family : str
+         Family/architecture of the model
+     variant : str, optional
+         Specific variant/version of the model
+     model_type : ModelType
+         Type of the model
+     config : dict[str, Any]
+         Additional configuration parameters
+     """
+
+     name: str
+     family: str
+     variant: Optional[str]
+     model_type: ModelType
+     config: dict[str, Any]
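
A brief sketch of how these response objects behave in practice; the values below are illustrative, not output from a real cluster. Because ModelStatus and ModelType subclass str, members compare equal to their string values, which keeps raw SLURM state strings and enum members interchangeable in comparisons.

    from vec_inf.client.models import ModelStatus, StatusResponse

    status = StatusResponse(
        model_name="Meta-Llama-3.1-8B-Instruct",
        server_status=ModelStatus.READY,
        job_state="RUNNING",
        raw_output="...",                  # hidden from repr by field(repr=False)
        base_url="http://gpu001:8080/v1",  # hypothetical endpoint
    )

    assert status.server_status == "READY"  # str-backed enum comparison
    if status.server_status == ModelStatus.READY:
        print(f"Model ready at {status.base_url}")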
vec_inf/client/slurm_vars.py ADDED
@@ -0,0 +1,49 @@
+ """Slurm cluster configuration variables."""
+
+ from pathlib import Path
+
+ from typing_extensions import Literal
+
+
+ CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models_latest.yaml")
+ LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
+ SINGULARITY_IMAGE = "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+ SINGULARITY_LOAD_CMD = "module load singularity-ce/3.8.2"
+ VLLM_NCCL_SO_PATH = "/vec-inf/nccl/libnccl.so.2.18.1"
+ MAX_GPUS_PER_NODE = 8
+ MAX_NUM_NODES = 16
+ MAX_CPUS_PER_TASK = 128
+
+ QOS = Literal[
+     "normal",
+     "m",
+     "m2",
+     "m3",
+     "m4",
+     "m5",
+     "long",
+     "deadline",
+     "high",
+     "scavenger",
+     "llm",
+     "a100",
+ ]
+
+ PARTITION = Literal[
+     "a40",
+     "a100",
+     "t4v1",
+     "t4v2",
+     "rtx6000",
+ ]
+
+ DEFAULT_ARGS = {
+     "cpus_per_task": 16,
+     "mem_per_node": "64G",
+     "qos": "m2",
+     "time": "08:00:00",
+     "partition": "a40",
+     "data_type": "auto",
+     "log_dir": "~/.vec-inf-logs",
+     "model_weights_parent_dir": "/model-weights",
+ }
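
These constants anchor the rest of the client: ModelConfig (config.py above) takes its defaults from DEFAULT_ARGS and bounds-checks requests against the MAX_* limits. A minimal sketch of that layering, assuming dict-merge semantics for overrides and with the override values chosen purely for illustration:

    from vec_inf.client.slurm_vars import DEFAULT_ARGS, MAX_GPUS_PER_NODE

    requested_gpus = 8
    if requested_gpus > MAX_GPUS_PER_NODE:
        raise ValueError(f"At most {MAX_GPUS_PER_NODE} GPUs per node are supported")

    # Layer user overrides on top of the cluster defaults.
    params = {**DEFAULT_ARGS, "partition": "a100", "time": "04:00:00"}
    print(params["qos"], params["partition"], params["time"])  # m2 a100 04:00:00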