vec-inf 0.6.0-py3-none-any.whl → 0.7.0-py3-none-any.whl
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +212 -30
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +19 -152
- vec_inf/client/_helper.py +386 -53
- vec_inf/client/_slurm_script_generator.py +210 -43
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +82 -0
- vec_inf/client/_utils.py +190 -71
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +46 -15
- vec_inf/client/models.py +51 -2
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +31 -0
- vec_inf/config/models.yaml +102 -281
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/METADATA +25 -67
- vec_inf-0.7.0.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.0.dist-info/RECORD +0 -25
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/client/_helper.py
CHANGED
@@ -5,10 +5,10 @@ metrics collection, and model registry operations.
 """
 
 import json
-import os
 import time
 import warnings
 from pathlib import Path
+from shutil import copy2
 from typing import Any, Optional, Union, cast
 from urllib.parse import urlparse, urlunparse
 
@@ -16,8 +16,8 @@ import requests
 
 import vec_inf.client._utils as utils
 from vec_inf.client._client_vars import (
+    BATCH_MODE_REQUIRED_MATCHING_ARGS,
     KEY_METRICS,
-    REQUIRED_FIELDS,
     SRC_DIR,
     VLLM_SHORT_TO_LONG_MAP,
 )
@@ -27,19 +27,19 @@ from vec_inf.client._exceptions import (
     ModelNotFoundError,
     SlurmJobError,
 )
-from vec_inf.client._slurm_script_generator import SlurmScriptGenerator
+from vec_inf.client._slurm_script_generator import (
+    BatchSlurmScriptGenerator,
+    SlurmScriptGenerator,
+)
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
+    BatchLaunchResponse,
     LaunchResponse,
     ModelInfo,
     ModelStatus,
     ModelType,
     StatusResponse,
 )
-from vec_inf.client.slurm_vars import (
-    LD_LIBRARY_PATH,
-    VLLM_NCCL_SO_PATH,
-)
 
 
 class ModelLauncher:
@@ -50,27 +50,18 @@ class ModelLauncher:
 
     Parameters
     ----------
-    model_name
+    model_name: str
         Name of the model to launch
-    kwargs
+    kwargs: Optional[dict[str, Any]]
         Optional launch keyword arguments to override default configuration
     """
 
     def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]):
-        """Initialize the model launcher.
-
-        Parameters
-        ----------
-        model_name: str
-            Name of the model to launch
-        kwargs: Optional[dict[str, Any]]
-            Optional launch keyword arguments to override default configuration
-        """
         self.model_name = model_name
         self.kwargs = kwargs or {}
         self.slurm_job_id = ""
         self.slurm_script_path = Path("")
-        self.model_config = self._get_model_configuration()
+        self.model_config = self._get_model_configuration(self.kwargs.get("config"))
         self.params = self._get_launch_params()
 
     def _warn(self, message: str) -> None:
@@ -83,9 +74,14 @@ class ModelLauncher:
         """
         warnings.warn(message, UserWarning, stacklevel=2)
 
-    def _get_model_configuration(self) -> ModelConfig:
+    def _get_model_configuration(self, config_path: str | None = None) -> ModelConfig:
         """Load and validate model configuration.
 
+        Parameters
+        ----------
+        config_path : str | None, optional
+            Path to a yaml file with custom model config to use in place of the default
+
         Returns
         -------
         ModelConfig
@@ -98,7 +94,7 @@ class ModelLauncher:
         ModelConfigurationError
             If model configuration is not found and weights don't exist
         """
-        model_configs = utils.load_config()
+        model_configs = utils.load_config(config_path=config_path)
         config = next(
             (m for m in model_configs if m.model_name == self.model_name), None
         )
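The `config` kwarg is how a user-supplied registry file reaches `utils.load_config`. A minimal usage sketch, assuming a hypothetical model name and YAML path (neither appears in the diff):

```python
from vec_inf.client._helper import ModelLauncher

# Hypothetical model name and config path. __init__ reads the "config"
# kwarg and hands it to _get_model_configuration(), which forwards it
# as utils.load_config(config_path=...), so this YAML is consulted in
# place of the bundled models.yaml.
launcher = ModelLauncher(
    "my-model",
    {"config": "/path/to/custom_models.yaml"},
)
response = launcher.launch()
```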
@@ -167,6 +163,38 @@ class ModelLauncher:
                 vllm_args[arg.strip()] = True
         return vllm_args
 
+    def _process_env_vars(self, env_arg: str) -> dict[str, str]:
+        """Process the env string into a dictionary of environment variables.
+
+        Parameters
+        ----------
+        env_arg : str
+            String containing comma separated list of environment variable definitions
+            (eg. MY_VAR=1), file paths containing environment variable definitions
+            (separated by newlines), or a combination of both
+            (eg. 'MY_VAR=5,my_env.env')
+
+        Returns
+        -------
+        dict[str, str]
+            Processed environment variables as key-value pairs.
+        """
+        env_vars: dict[str, str] = {}
+        for arg in env_arg.split(","):
+            if "=" in arg:  # Arg is an env var definition
+                key, value = arg.split("=")
+                env_vars[key.strip()] = value.strip()
+            else:  # Arg is a path to a file
+                with open(arg, "r") as file:
+                    lines = [line.rstrip() for line in file]
+                    for line in lines:
+                        if "=" in line:
+                            key, value = line.split("=")
+                            env_vars[key.strip()] = value.strip()
+                        else:
+                            print(f"WARNING: Could not parse env var: {line}")
+        return env_vars
+
     def _get_launch_params(self) -> dict[str, Any]:
         """Prepare launch parameters, set log dir, and validate required fields.
 
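The parser accepts inline `KEY=VALUE` entries and `.env` file paths in one comma-separated string. A standalone sketch of the same parsing rules; the file contents and names here are fabricated for illustration:

```python
import tempfile

# Fabricated .env file with one good and one malformed line.
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("HF_HOME=/scratch/hf_cache\nnot-a-definition\n")
    env_file = f.name

def parse_env(env_arg: str) -> dict[str, str]:
    """Mirror of _process_env_vars' rules: comma-separated KEY=VALUE
    entries are taken as-is, anything else is treated as a file path."""
    env_vars: dict[str, str] = {}
    for arg in env_arg.split(","):
        if "=" in arg:
            key, value = arg.split("=")
            env_vars[key.strip()] = value.strip()
        else:
            with open(arg) as fh:
                for line in fh:
                    line = line.rstrip()
                    if "=" in line:
                        key, value = line.split("=")
                        env_vars[key.strip()] = value.strip()
                    else:
                        print(f"WARNING: Could not parse env var: {line}")
    return env_vars

print(parse_env(f"MY_VAR=5,{env_file}"))
# -> {'MY_VAR': '5', 'HF_HOME': '/scratch/hf_cache'}, plus one warning
```

Note that the two-value unpacking of `split("=")` assumes exactly one `=` per definition; a value containing its own `=` would raise, a constraint the sketch shares with the diffed implementation.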
@@ -190,14 +218,19 @@ class ModelLauncher:
                 params["vllm_args"][key] = value
             del self.kwargs["vllm_args"]
 
+        if self.kwargs.get("env"):
+            env_vars = self._process_env_vars(self.kwargs["env"])
+            for key, value in env_vars.items():
+                params["env"][key] = str(value)
+            del self.kwargs["env"]
+
         for key, value in self.kwargs.items():
             params[key] = value
 
-        #
-
-
-
-        )
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        # Validate resource allocation and parallelization settings
         if (
             int(params["gpus_per_node"]) > 1
             and params["vllm_args"].get("--tensor-parallel-size") is None
@@ -206,6 +239,25 @@ class ModelLauncher:
                 "--tensor-parallel-size is required when gpus_per_node > 1"
             )
 
+        total_gpus_requested = int(params["gpus_per_node"]) * int(params["num_nodes"])
+        if not utils.is_power_of_two(total_gpus_requested):
+            raise ValueError("Total number of GPUs requested must be a power of two")
+
+        total_parallel_sizes = int(
+            params["vllm_args"].get("--tensor-parallel-size", "1")
+        ) * int(params["vllm_args"].get("--pipeline-parallel-size", "1"))
+        if total_gpus_requested != total_parallel_sizes:
+            raise ValueError(
+                "Mismatch between total number of GPUs requested and parallelization settings"
+            )
+
+        # Convert gpus_per_node and resource_type to gres
+        resource_type = params.get("resource_type")
+        if resource_type:
+            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+        else:
+            params["gres"] = f"gpu:{params['gpus_per_node']}"
+
         # Create log directory
         params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
         params["log_dir"].mkdir(parents=True, exist_ok=True)
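The three checks reduce to simple arithmetic over the requested topology. A sketch of what they enforce; the body of `is_power_of_two` is an assumption here, since the diff only shows the call through `vec_inf.client._utils`:

```python
def is_power_of_two(n: int) -> bool:
    # Assumed implementation; only the helper's name appears in the diff.
    return n > 0 and (n & (n - 1)) == 0

# Example topology: 2 nodes x 4 GPUs per node.
gpus_per_node, num_nodes = 4, 2
total_gpus = gpus_per_node * num_nodes          # 8 -> a power of two, OK
assert is_power_of_two(total_gpus)

# tensor-parallel x pipeline-parallel must cover every requested GPU:
tensor_parallel, pipeline_parallel = 4, 2
assert tensor_parallel * pipeline_parallel == total_gpus

# The resulting SLURM gres strings:
print(f"gpu:a100:{gpus_per_node}")  # resource_type set (here: "a100")
print(f"gpu:{gpus_per_node}")       # no resource_type configured
```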
@@ -224,17 +276,12 @@ class ModelLauncher:
 
         # Convert path to string for JSON serialization
         for field in params:
-            if field
+            if field in ["vllm_args", "env"]:
                 continue
             params[field] = str(params[field])
 
         return params
 
-    def _set_env_vars(self) -> None:
-        """Set environment variables for the launch command."""
-        os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH
-        os.environ["VLLM_NCCL_SO_PATH"] = VLLM_NCCL_SO_PATH
-
     def _build_launch_command(self) -> str:
         """Generate the slurm script and construct the launch command.
 
@@ -259,9 +306,6 @@ class ModelLauncher:
         SlurmJobError
             If SLURM job submission fails
         """
-        # Set environment variables
-        self._set_env_vars()
-
         # Build and execute the launch command
         command_output, stderr = utils.run_bash_command(self._build_launch_command())
 
@@ -285,20 +329,288 @@ class ModelLauncher:
         job_json.touch(exist_ok=True)
 
         self.slurm_script_path.rename(
-            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.
+            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.sbatch"
         )
 
         with job_json.open("w") as file:
             json.dump(self.params, file, indent=4)
 
         return LaunchResponse(
-            slurm_job_id=
+            slurm_job_id=self.slurm_job_id,
             model_name=self.model_name,
             config=self.params,
             raw_output=command_output,
         )
 
 
+class BatchModelLauncher:
+    """Helper class for handling batch inference server launch.
+
+    A class that manages the launch process of multiple inference servers, including
+    configuration validation, and SLURM job submission.
+
+    Parameters
+    ----------
+    model_names : list[str]
+        List of model names to launch
+    """
+
+    def __init__(
+        self,
+        model_names: list[str],
+        batch_config: Optional[str] = None,
+        account: Optional[str] = None,
+        work_dir: Optional[str] = None,
+    ):
+        self.model_names = model_names
+        self.batch_config = batch_config
+        self.slurm_job_id = ""
+        self.slurm_job_name = self._get_slurm_job_name()
+        self.batch_script_path = Path("")
+        self.launch_script_paths: list[Path] = []
+        self.model_configs = self._get_model_configurations()
+        self.params = self._get_launch_params(account, work_dir)
+
+    def _get_slurm_job_name(self) -> str:
+        """Get the SLURM job name from the model names.
+
+        Returns
+        -------
+        str
+            SLURM job name
+        """
+        return "BATCH-" + "-".join(self.model_names)
+
+    def _get_model_configurations(self) -> dict[str, ModelConfig]:
+        """Load and validate model configurations.
+
+        Returns
+        -------
+        dict[str, ModelConfig]
+            Dictionary of validated model configurations
+
+        Raises
+        ------
+        ModelNotFoundError
+            If model weights parent directory cannot be determined
+        ModelConfigurationError
+            If model configuration is not found and weights don't exist
+        """
+        model_configs = utils.load_config(self.batch_config)
+
+        model_configs_dict = {}
+        for model_name in self.model_names:
+            config = next(
+                (m for m in model_configs if m.model_name == model_name), None
+            )
+
+            if config:
+                model_configs_dict[model_name] = config
+            else:
+                raise ModelConfigurationError(
+                    f"'{model_name}' not found in configuration, batch launch requires all models to be present in the configuration file"
+                )
+
+        return model_configs_dict
+
+    def _get_launch_params(
+        self, account: Optional[str] = None, work_dir: Optional[str] = None
+    ) -> dict[str, Any]:
+        """Prepare launch parameters, set log dir, and validate required fields.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary of prepared launch parameters
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If required fields are missing or tensor parallel size is not specified
+            when using multiple GPUs
+        """
+        params: dict[str, Any] = {
+            "models": {},
+            "slurm_job_name": self.slurm_job_name,
+            "src_dir": str(SRC_DIR),
+            "account": account,
+            "work_dir": work_dir,
+        }
+
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        for i, (model_name, config) in enumerate(self.model_configs.items()):
+            params["models"][model_name] = config.model_dump(exclude_none=True)
+            params["models"][model_name]["het_group_id"] = i
+
+            # Validate resource allocation and parallelization settings
+            if (
+                int(config.gpus_per_node) > 1
+                and (config.vllm_args or {}).get("--tensor-parallel-size") is None
+            ):
+                raise MissingRequiredFieldsError(
+                    f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
+                )
+
+            total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
+            if not utils.is_power_of_two(total_gpus_requested):
+                raise ValueError(
+                    f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
+                )
+
+            total_parallel_sizes = int(
+                (config.vllm_args or {}).get("--tensor-parallel-size", "1")
+            ) * int((config.vllm_args or {}).get("--pipeline-parallel-size", "1"))
+            if total_gpus_requested != total_parallel_sizes:
+                raise ValueError(
+                    f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
+                )
+
+            # Convert gpus_per_node and resource_type to gres
+            params["models"][model_name]["gres"] = (
+                f"gpu:{config.resource_type}:{config.gpus_per_node}"
+            )
+
+            # Create log directory
+            log_dir = Path(
+                params["models"][model_name]["log_dir"], self.slurm_job_name
+            ).expanduser()
+            log_dir.mkdir(parents=True, exist_ok=True)
+            params["models"][model_name]["log_dir"] = str(log_dir)
+
+            # Convert model_weights_parent_dir to string for JSON serialization
+            params["models"][model_name]["model_weights_parent_dir"] = str(
+                params["models"][model_name]["model_weights_parent_dir"]
+            )
+
+            # Construct slurm log file paths
+            params["models"][model_name]["out_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.out"
+            )
+            params["models"][model_name]["err_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.err"
+            )
+            params["models"][model_name]["json_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"
+            )
+
+            # Create top level log files using the first model's log directory
+            if not params.get("out_file"):
+                params["out_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.out"
+                )
+            if not params.get("err_file"):
+                params["err_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.err"
+                )
+
+            # Check if required matching arguments are matched
+            for arg in BATCH_MODE_REQUIRED_MATCHING_ARGS:
+                if not params.get(arg):
+                    params[arg] = params["models"][model_name][arg]
+                elif params[arg] != params["models"][model_name][arg]:
+                    # Remove the created directory since we found a mismatch
+                    log_dir.rmdir()
+                    raise ValueError(
+                        f"Mismatch found for {arg}: {params[arg]} != {params['models'][model_name][arg]}, check your configuration"
+                    )
+
+        return params
+
+    def _build_launch_command(self) -> str:
+        """Generate the slurm script and construct the launch command.
+
+        Returns
+        -------
+        str
+            Complete SLURM launch command
+        """
+        batch_script_generator = BatchSlurmScriptGenerator(self.params)
+        self.batch_script_path = batch_script_generator.generate_batch_slurm_script()
+        self.launch_script_paths = batch_script_generator.script_paths
+        return f"sbatch {str(self.batch_script_path)}"
+
+    def launch(self) -> BatchLaunchResponse:
+        """Launch models in batch mode.
+
+        Returns
+        -------
+        BatchLaunchResponse
+            Response object containing launch details and status
+
+        Raises
+        ------
+        SlurmJobError
+            If SLURM job submission fails
+        """
+        # Build and execute the launch command
+        command_output, stderr = utils.run_bash_command(self._build_launch_command())
+
+        if stderr:
+            raise SlurmJobError(f"Error: {stderr}")
+
+        # Extract slurm job id from command output
+        self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n")
+        self.params["slurm_job_id"] = self.slurm_job_id
+
+        # Create log directory and job json file, move slurm script to job log directory
+        main_job_log_dir = Path("")
+
+        for model_name in self.model_names:
+            model_job_id = int(self.slurm_job_id) + int(
+                self.params["models"][model_name]["het_group_id"]
+            )
+
+            job_log_dir = Path(
+                self.params["log_dir"], f"{self.slurm_job_name}.{model_job_id}"
+            )
+            job_log_dir.mkdir(parents=True, exist_ok=True)
+
+            if main_job_log_dir == Path(""):
+                main_job_log_dir = job_log_dir
+
+            job_json = Path(
+                job_log_dir,
+                f"{model_name}.{model_job_id}.json",
+            )
+            job_json.touch(exist_ok=True)
+
+            with job_json.open("w") as file:
+                json.dump(self.params["models"][model_name], file, indent=4)
+
+        # Copy the launch scripts to the job log directory, the original scripts
+        # cannot be deleted otherwise slurm will not be able to find them
+        script_path_mapper = {}
+        for script_path in self.launch_script_paths:
+            old_path = script_path.name
+            file_name = old_path.split("/")[-1]
+            copy2(script_path, main_job_log_dir / file_name)
+            new_path = script_path.name
+            script_path_mapper[old_path] = new_path
+
+        # Replace old launch script paths with new paths in batch slurm script
+        with self.batch_script_path.open("r") as f:
+            script_content = f.read()
+        for old_path, new_path in script_path_mapper.items():
+            script_content = script_content.replace(old_path, new_path)
+        with self.batch_script_path.open("w") as f:
+            f.write(script_content)
+
+        # Move the batch script to the job log directory
+        self.batch_script_path.rename(
+            main_job_log_dir / f"{self.slurm_job_name}.{self.slurm_job_id}.sbatch"
+        )
+
+        return BatchLaunchResponse(
+            slurm_job_id=self.slurm_job_id,
+            slurm_job_name=self.slurm_job_name,
+            model_names=self.model_names,
+            config=self.params,
+            raw_output=command_output,
+        )
+
+
 class ModelStatusMonitor:
     """Class for handling server status information and monitoring.
 
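The per-model job IDs in `launch()` rely on SLURM heterogeneous jobs, whose components receive consecutive IDs offset from the base job ID; `het_group_id` records each model's offset. A toy illustration of the arithmetic, with a fabricated sbatch output:

```python
# Fabricated sbatch output for a two-model heterogeneous batch job.
command_output = "Submitted batch job 123456"
slurm_job_id = command_output.split(" ")[-1].strip()

models = {"model-a": {"het_group_id": 0}, "model-b": {"het_group_id": 1}}
for name, cfg in models.items():
    # Component job ID = base ID + het group offset, which is how
    # launch() derives each model's log directory and JSON file name.
    model_job_id = int(slurm_job_id) + cfg["het_group_id"]
    print(f"BATCH-model-a-model-b.{model_job_id}/{name}.{model_job_id}.json")
```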
@@ -307,16 +619,17 @@ class ModelStatusMonitor:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to monitor
-    log_dir : str, optional
-        Base directory containing log files
     """
 
-    def __init__(self, slurm_job_id:
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
         self.output = self._get_raw_status_output()
-        self.
+        self.job_status = dict(
+            field.split("=", 1) for field in self.output.split() if "=" in field
+        )
+        self.log_dir = self._get_log_dir()
         self.status_info = self._get_base_status_data()
 
     def _get_raw_status_output(self) -> str:
@@ -334,10 +647,28 @@ class ModelStatusMonitor:
         """
         status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner"
         output, stderr = utils.run_bash_command(status_cmd)
+
         if stderr:
             raise SlurmJobError(f"Error: {stderr}")
         return output
 
+    def _get_log_dir(self) -> str:
+        """Get the log directory for the job.
+
+        Returns
+        -------
+        str
+            Log directory for the job
+        """
+        try:
+            outfile_path = self.job_status["StdOut"]
+            directory = Path(outfile_path).parent
+            return str(directory)
+        except KeyError as err:
+            raise FileNotFoundError(
+                f"Output file not found for job {self.slurm_job_id}"
+            ) from err
+
     def _get_base_status_data(self) -> StatusResponse:
         """Extract basic job status information from scontrol output.
 
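Status fields are now taken from a single parse of `scontrol show job <id> --oneliner` rather than matched individually. A minimal sketch against a fabricated scontrol line:

```python
from pathlib import Path

# Fabricated one-line scontrol output, trimmed to a few fields.
output = (
    "JobId=123456 JobName=my-model JobState=PENDING Reason=Resources "
    "StdOut=/scratch/logs/family/my-model.123456/my-model.123456.out"
)

# Same rule as ModelStatusMonitor: whitespace-separated fields, split
# once on "=" so values may themselves contain "=".
job_status = dict(
    field.split("=", 1) for field in output.split() if "=" in field
)
print(job_status["JobState"])             # PENDING
print(job_status["Reason"])               # Resources
print(Path(job_status["StdOut"]).parent)  # log dir, as in _get_log_dir()
```

Because fields are split on whitespace, a value containing spaces would be fragmented; the sketch mirrors the implementation's assumption that the fields it reads (JobName, JobState, Reason, StdOut) are space-free.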
@@ -347,14 +678,15 @@ class ModelStatusMonitor:
             Basic status information for the job
         """
         try:
-            job_name = self.
-            job_state = self.
-        except
+            job_name = self.job_status["JobName"]
+            job_state = self.job_status["JobState"]
+        except KeyError:
             job_name = "UNAVAILABLE"
             job_state = ModelStatus.UNAVAILABLE
 
         return StatusResponse(
             model_name=job_name,
+            log_dir=self.log_dir,
             server_status=ModelStatus.UNAVAILABLE,
             job_state=job_state,
             raw_output=self.output,
@@ -399,9 +731,9 @@ class ModelStatusMonitor:
     def _process_pending_state(self) -> None:
         """Process PENDING job state and update status information."""
         try:
-            self.status_info.pending_reason = self.
+            self.status_info.pending_reason = self.job_status["Reason"]
             self.status_info.server_status = ModelStatus.PENDING
-        except
+        except KeyError:
             self.status_info.pending_reason = "Unknown pending reason"
 
     def process_model_status(self) -> StatusResponse:
@@ -428,16 +760,16 @@ class PerformanceMetricsCollector:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to collect metrics from
     log_dir : str, optional
         Directory containing log files
     """
 
-    def __init__(self, slurm_job_id:
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
-        self.log_dir = log_dir
         self.status_info = self._get_status_info()
+        self.log_dir = self.status_info.log_dir
         self.metrics_url = self._build_metrics_url()
         self.enabled_prefix_caching = self._check_prefix_caching()
 
@@ -454,7 +786,7 @@ class PerformanceMetricsCollector:
         StatusResponse
             Current status information for the model
         """
-        status_helper = ModelStatusMonitor(self.slurm_job_id
+        status_helper = ModelStatusMonitor(self.slurm_job_id)
         return status_helper.process_model_status()
 
     def _build_metrics_url(self) -> str:
@@ -646,7 +978,7 @@ class ModelRegistry:
                 config=config.model_dump(exclude={"model_name", "venv", "log_dir"}),
             )
            available_models.append(info)
-        return available_models
+        return sorted(available_models, key=lambda x: x.name)
 
     def get_single_model_config(self, model_name: str) -> ModelConfig:
         """Get configuration for a specific model.
@@ -667,7 +999,8 @@ class ModelRegistry:
             If the specified model is not found in configuration
         """
         config = next(
-            (c for c in self.model_configs if c.model_name == model_name),
+            (c for c in self.model_configs if c.model_name == model_name),
+            None,
        )
         if not config:
             raise ModelNotFoundError(f"Model '{model_name}' not found in configuration")
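Putting the pieces together, a batch launch through the new helper might look like the following sketch; the model names, account, and config path are hypothetical:

```python
from vec_inf.client._helper import BatchModelLauncher

# Both models must exist in the (optionally custom) config file,
# otherwise _get_model_configurations() raises ModelConfigurationError.
launcher = BatchModelLauncher(
    ["model-a", "model-b"],
    batch_config="/path/to/batch_models.yaml",
    account="my-slurm-account",
)
response = launcher.launch()  # submits a single heterogeneous sbatch job
print(response.slurm_job_id, response.slurm_job_name)
```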