PyPI - vec-inf - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

vec_inf/README.md +18 -4
vec_inf/cli/_cli.py +212 -30
vec_inf/cli/_helper.py +95 -14
vec_inf/client/_client_vars.py +19 -152
vec_inf/client/_helper.py +386 -53
vec_inf/client/_slurm_script_generator.py +210 -43
vec_inf/client/_slurm_templates.py +248 -0
vec_inf/client/_slurm_vars.py +82 -0
vec_inf/client/_utils.py +190 -71
vec_inf/client/api.py +96 -25
vec_inf/client/config.py +46 -15
vec_inf/client/models.py +51 -2
vec_inf/config/README.md +4 -243
vec_inf/config/environment.yaml +31 -0
vec_inf/config/models.yaml +102 -281
{vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/METADATA +25 -67
vec_inf-0.7.0.dist-info/RECORD +27 -0
vec_inf/client/slurm_vars.py +0 -49
vec_inf-0.6.0.dist-info/RECORD +0 -25
{vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/WHEEL +0 -0
{vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/entry_points.txt +0 -0
{vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/licenses/LICENSE +0 -0

vec_inf/client/_utils.py CHANGED Viewed

@@ -14,10 +14,11 @@ from typing import Any, Optional, Union, cast
 import requests
 import yaml
-from vec_inf.client._client_vars import MODEL_READY_SIGNATURE
+from vec_inf.client._client_vars import MODEL_READY_SIGNATURE, REQUIRED_ARGS
+from vec_inf.client._exceptions import MissingRequiredFieldsError
+from vec_inf.client._slurm_vars import CACHED_CONFIG_DIR
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import ModelStatus
-from vec_inf.client.slurm_vars import CACHED_CONFIG
 def run_bash_command(command: str) -> tuple[str, str]:
@@ -41,9 +42,9 @@ def run_bash_command(command: str) -> tuple[str, str]:
 def read_slurm_log(
     slurm_job_name: str,
-    slurm_job_id: int,
+    slurm_job_id: str,
     slurm_log_type: str,
-    log_dir: Optional[Union[str, Path]],
+    log_dir: str,
 ) -> Union[list[str], str, dict[str, str]]:
     """Read the slurm log file.
@@ -51,12 +52,12 @@ def read_slurm_log(
     ----------
     slurm_job_name : str
         Name of the SLURM job
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the SLURM job
     slurm_log_type : str
         Type of log file to read ('out', 'err', or 'json')
-    log_dir : Optional[Union[str, Path]]
-        Directory containing log files, if None uses default location
+    log_dir : str
+        Directory containing log files
     Returns
     -------
@@ -66,31 +67,11 @@ def read_slurm_log(
         - dict[str, str] for 'json' logs
         - str for error messages if file not found
     """
-    if not log_dir:
-        # Default log directory
-        models_dir = Path.home() / ".vec-inf-logs"
-        # Iterate over all dirs in models_dir, sorted by dir name length in desc order
-        for directory in sorted(
-            [d for d in models_dir.iterdir() if d.is_dir()],
-            key=lambda d: len(d.name),
-            reverse=True,
-        ):
-            if directory.name in slurm_job_name:
-                log_dir = directory
-                break
-    else:
-        log_dir = Path(log_dir)
-    # If log_dir is still not set, then didn't find the log dir at default location
-    if not log_dir:
-        return "LOG DIR NOT FOUND"
     try:
-        file_path = (
-            log_dir
-            / Path(f"{slurm_job_name}.{slurm_job_id}")
-            / f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}"
-        )
+        if "+" in slurm_job_id:
+            main_job_id, het_job_id = slurm_job_id.split("+")
+            slurm_job_id = str(int(main_job_id) + int(het_job_id))
+        file_path = Path(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
         if slurm_log_type == "json":
             with file_path.open("r") as file:
                 json_content: dict[str, str] = json.load(file)
@@ -103,7 +84,7 @@ def read_slurm_log(
 def is_server_running(
-    slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+    slurm_job_name: str, slurm_job_id: str, log_dir: str
 ) -> Union[str, ModelStatus, tuple[ModelStatus, str]]:
     """Check if a model is ready to serve requests.
@@ -111,9 +92,9 @@ def is_server_running(
     ----------
     slurm_job_name : str
         Name of the SLURM job
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the SLURM job
-    log_dir : Optional[str]
+    log_dir : str
         Directory containing log files
     Returns
@@ -138,16 +119,16 @@ def is_server_running(
     return status
-def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
+def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
     """Get the base URL of a model.
     Parameters
     ----------
     slurm_job_name : str
         Name of the SLURM job
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the SLURM job
-    log_dir : Optional[str]
+    log_dir : str
         Directory containing log files
     Returns
@@ -164,7 +145,7 @@ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str])
 def model_health_check(
-    slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+    slurm_job_name: str, slurm_job_id: str, log_dir: str
 ) -> tuple[ModelStatus, Union[str, int]]:
     """Check the health of a running model on the cluster.
@@ -172,9 +153,9 @@ def model_health_check(
     ----------
     slurm_job_name : str
         Name of the SLURM job
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the SLURM job
-    log_dir : Optional[str]
+    log_dir : str
         Directory containing log files
     Returns
@@ -199,12 +180,17 @@ def model_health_check(
         return (ModelStatus.FAILED, str(e))
-def load_config() -> list[ModelConfig]:
+def load_config(config_path: Optional[str] = None) -> list[ModelConfig]:
     """Load the model configuration.
     Loads configuration from default and user-specified paths, merging them
     if both exist. User configuration takes precedence over default values.
+    Parameters
+    ----------
+    config_path : Optional[str]
+        Path to the configuration file
     Returns
     -------
     list[ModelConfig]
@@ -213,44 +199,80 @@ def load_config() -> list[ModelConfig]:
     Notes
     -----
     Configuration is loaded from:
-    1. Default path: package's config/models.yaml
-    2. User path: specified by VEC_INF_CONFIG environment variable
+    1. User path: specified by config_path
+    2. Default path: package's config/models.yaml or CACHED_CONFIG if it exists
+    3. Environment variable: specified by VEC_INF_CONFIG environment variable
+        and merged with default config
     If user configuration exists, it will be merged with default configuration,
     with user values taking precedence for overlapping fields.
     """
+    def load_yaml_config(path: Path) -> dict[str, Any]:
+        """Load YAML config with error handling."""
+        try:
+            with path.open() as f:
+                return yaml.safe_load(f) or {}
+        except FileNotFoundError as err:
+            raise FileNotFoundError(f"Could not find config: {path}") from err
+        except yaml.YAMLError as err:
+            raise ValueError(f"Error parsing YAML config at {path}: {err}") from err
+    def process_config(config: dict[str, Any]) -> list[ModelConfig]:
+        """Process the config based on the config type."""
+        return [
+            ModelConfig(model_name=name, **model_data)
+            for name, model_data in config.get("models", {}).items()
+        ]
+    def resolve_config_path_from_env_var() -> Path | None:
+        """Resolve the config path from the environment variable."""
+        config_dir = os.getenv("VEC_INF_CONFIG_DIR")
+        config_path = os.getenv("VEC_INF_MODEL_CONFIG")
+        if config_path:
+            return Path(config_path)
+        if config_dir:
+            return Path(config_dir, "models.yaml")
+        return None
+    def update_config(
+        config: dict[str, Any], user_config: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Update the config with the user config."""
+        for name, data in user_config.get("models", {}).items():
+            if name in config.get("models", {}):
+                config["models"][name].update(data)
+            else:
+                config.setdefault("models", {})[name] = data
+        return config
+    # 1. If config_path is given, use only that
+    if config_path:
+        config = load_yaml_config(Path(config_path))
+        return process_config(config)
+    # 2. Otherwise, load default config
     default_path = (
-        CACHED_CONFIG
-        if CACHED_CONFIG.exists()
+        CACHED_CONFIG_DIR / "models.yaml"
+        if CACHED_CONFIG_DIR.exists()
         else Path(__file__).resolve().parent.parent / "config" / "models.yaml"
     )
+    config = load_yaml_config(default_path)
+    # 3. If user config exists, merge it
+    user_path = resolve_config_path_from_env_var()
+    if user_path and user_path.exists():
+        user_config = load_yaml_config(user_path)
+        config = update_config(config, user_config)
+    elif user_path:
+        warnings.warn(
+            f"WARNING: Could not find user config: {str(user_path)}, revert to default config located at {default_path}",
+            UserWarning,
+            stacklevel=2,
+        )
-    config: dict[str, Any] = {}
-    with open(default_path) as f:
-        config = yaml.safe_load(f) or {}
-    user_path = os.getenv("VEC_INF_CONFIG")
-    if user_path:
-        user_path_obj = Path(user_path)
-        if user_path_obj.exists():
-            with open(user_path_obj) as f:
-                user_config = yaml.safe_load(f) or {}
-                for name, data in user_config.get("models", {}).items():
-                    if name in config.get("models", {}):
-                        config["models"][name].update(data)
-                    else:
-                        config.setdefault("models", {})[name] = data
-        else:
-            warnings.warn(
-                f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}",
-                UserWarning,
-                stacklevel=2,
-            )
-    return [
-        ModelConfig(model_name=name, **model_data)
-        for name, model_data in config.get("models", {}).items()
-    ]
+    return process_config(config)
 def parse_launch_output(output: str) -> tuple[str, dict[str, str]]:
@@ -285,3 +307,100 @@ def parse_launch_output(output: str) -> tuple[str, dict[str, str]]:
             config_dict[key.lower().replace(" ", "_")] = value
     return slurm_job_id, config_dict
+def is_power_of_two(n: int) -> bool:
+    """Check if a number is a power of two.
+    Parameters
+    ----------
+    n : int
+        The number to check
+    """
+    return n > 0 and (n & (n - 1)) == 0
+def find_matching_dirs(
+    log_dir: Path,
+    model_family: Optional[str] = None,
+    model_name: Optional[str] = None,
+    job_id: Optional[int] = None,
+    before_job_id: Optional[int] = None,
+) -> list[Path]:
+    """
+    Find log directories based on filtering criteria.
+    Parameters
+    ----------
+    log_dir : Path
+        The base directory containing model family directories.
+    model_family : str, optional
+        Filter to only search inside this family.
+    model_name : str, optional
+        Filter to only match model names.
+    job_id : int, optional
+        Filter to only match this exact SLURM job ID.
+    before_job_id : int, optional
+        Filter to only include job IDs less than this value.
+    Returns
+    -------
+    list[Path]
+        List of directories that match the criteria and can be deleted.
+    """
+    matched = []
+    if not log_dir.exists() or not log_dir.is_dir():
+        raise FileNotFoundError(f"Log directory does not exist: {log_dir}")
+    if not model_family and not model_name and not job_id and not before_job_id:
+        return [log_dir]
+    for family_dir in log_dir.iterdir():
+        if not family_dir.is_dir():
+            continue
+        if model_family and family_dir.name != model_family:
+            continue
+        if model_family and not model_name and not job_id and not before_job_id:
+            return [family_dir]
+        for job_dir in family_dir.iterdir():
+            if not job_dir.is_dir():
+                continue
+            try:
+                name_part, id_part = job_dir.name.rsplit(".", 1)
+                parsed_id = int(id_part)
+            except ValueError:
+                continue
+            if model_name and name_part != model_name:
+                continue
+            if job_id is not None and parsed_id != job_id:
+                continue
+            if before_job_id is not None and parsed_id >= before_job_id:
+                continue
+            matched.append(job_dir)
+    return matched
+def check_required_fields(params: dict[str, Any]) -> None:
+    """Check for required fields without default vals and their corresponding env vars.
+    Parameters
+    ----------
+    params : dict[str, Any]
+        Dictionary of parameters to check.
+    """
+    for arg in REQUIRED_ARGS:
+        if not params.get(arg):
+            default_value = os.getenv(REQUIRED_ARGS[arg])
+            if default_value:
+                params[arg] = default_value
+            else:
+                raise MissingRequiredFieldsError(
+                    f"{arg} is required, please set it in the command arguments or environment variables"
+                )

vec_inf/client/api.py CHANGED Viewed

@@ -10,8 +10,10 @@ vec_inf.client._helper : Helper classes for model inference server management
 vec_inf.client.models : Data models for API responses
 """
+import shutil
 import time
 import warnings
+from pathlib import Path
 from typing import Any, Optional, Union
 from vec_inf.client._exceptions import (
@@ -19,14 +21,16 @@ from vec_inf.client._exceptions import (
     SlurmJobError,
 )
 from vec_inf.client._helper import (
+    BatchModelLauncher,
     ModelLauncher,
     ModelRegistry,
     ModelStatusMonitor,
     PerformanceMetricsCollector,
 )
-from vec_inf.client._utils import run_bash_command
+from vec_inf.client._utils import find_matching_dirs, run_bash_command
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
+    BatchLaunchResponse,
     LaunchOptions,
     LaunchResponse,
     MetricsResponse,
@@ -60,6 +64,9 @@ class VecInfClient:
     wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir)
         Wait for a model to become ready
+    cleanup_logs(log_dir, model_name, model_family, job_id, dry_run)
+        Remove logs from the log directory.
     Examples
     --------
     >>> from vec_inf.api import VecInfClient
@@ -145,17 +152,42 @@ class VecInfClient:
         model_launcher = ModelLauncher(model_name, options_dict)
         return model_launcher.launch()
-    def get_status(
-        self, slurm_job_id: int, log_dir: Optional[str] = None
-    ) -> StatusResponse:
+    def batch_launch_models(
+        self,
+        model_names: list[str],
+        batch_config: Optional[str] = None,
+        account: Optional[str] = None,
+        work_dir: Optional[str] = None,
+    ) -> BatchLaunchResponse:
+        """Launch multiple models on the cluster.
+        Parameters
+        ----------
+        model_names : list[str]
+            List of model names to launch
+        Returns
+        -------
+        BatchLaunchResponse
+            Response containing launch details for each model
+        Raises
+        ------
+        ModelConfigurationError
+            If the model configuration is invalid
+        """
+        model_launcher = BatchModelLauncher(
+            model_names, batch_config, account, work_dir
+        )
+        return model_launcher.launch()
+    def get_status(self, slurm_job_id: str) -> StatusResponse:
         """Get the status of a running model.
         Parameters
         ----------
-        slurm_job_id : int
+        slurm_job_id : str
             The SLURM job ID to check
-        log_dir : str, optional
-            Path to the SLURM log directory. If None, uses default location
         Returns
         -------
@@ -167,20 +199,16 @@ class VecInfClient:
             - Base URL (if ready)
             - Error information (if failed)
         """
-        model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
+        model_status_monitor = ModelStatusMonitor(slurm_job_id)
         return model_status_monitor.process_model_status()
-    def get_metrics(
-        self, slurm_job_id: int, log_dir: Optional[str] = None
-    ) -> MetricsResponse:
+    def get_metrics(self, slurm_job_id: str) -> MetricsResponse:
         """Get the performance metrics of a running model.
         Parameters
         ----------
-        slurm_job_id : int
+        slurm_job_id : str
             The SLURM job ID to get metrics for
-        log_dir : str, optional
-            Path to the SLURM log directory. If None, uses default location
         Returns
         -------
@@ -190,9 +218,7 @@ class VecInfClient:
             - Performance metrics or error message
             - Timestamp of collection
         """
-        performance_metrics_collector = PerformanceMetricsCollector(
-            slurm_job_id, log_dir
-        )
+        performance_metrics_collector = PerformanceMetricsCollector(slurm_job_id)
         metrics: Union[dict[str, float], str]
         if not performance_metrics_collector.metrics_url.startswith("http"):
@@ -206,12 +232,12 @@ class VecInfClient:
             timestamp=time.time(),
         )
-    def shutdown_model(self, slurm_job_id: int) -> bool:
+    def shutdown_model(self, slurm_job_id: str) -> bool:
         """Shutdown a running model.
         Parameters
         ----------
-        slurm_job_id : int
+        slurm_job_id : str
             The SLURM job ID to shut down
         Returns
@@ -232,23 +258,20 @@ class VecInfClient:
     def wait_until_ready(
         self,
-        slurm_job_id: int,
+        slurm_job_id: str,
         timeout_seconds: int = 1800,
         poll_interval_seconds: int = 10,
-        log_dir: Optional[str] = None,
     ) -> StatusResponse:
         """Wait until a model is ready or fails.
         Parameters
         ----------
-        slurm_job_id : int
+        slurm_job_id : str
             The SLURM job ID to wait for
         timeout_seconds : int, optional
             Maximum time to wait in seconds, by default 1800 (30 mins)
         poll_interval_seconds : int, optional
             How often to check status in seconds, by default 10
-        log_dir : str, optional
-            Path to the SLURM log directory. If None, uses default location
         Returns
         -------
@@ -273,7 +296,7 @@ class VecInfClient:
         start_time = time.time()
         while True:
-            status_info = self.get_status(slurm_job_id, log_dir)
+            status_info = self.get_status(slurm_job_id)
             if status_info.server_status == ModelStatus.READY:
                 return status_info
@@ -300,3 +323,51 @@ class VecInfClient:
             # Wait before checking again
             time.sleep(poll_interval_seconds)
+    def cleanup_logs(
+        self,
+        log_dir: Optional[Union[str, Path]] = None,
+        model_family: Optional[str] = None,
+        model_name: Optional[str] = None,
+        job_id: Optional[int] = None,
+        before_job_id: Optional[int] = None,
+        dry_run: bool = False,
+    ) -> list[Path]:
+        """Remove logs from the log directory.
+        Parameters
+        ----------
+        log_dir : str or Path, optional
+            Root directory containing log files. Defaults to ~/.vec-inf-logs.
+        model_family : str, optional
+            Only delete logs for this model family.
+        model_name : str, optional
+            Only delete logs for this model name.
+        job_id : int, optional
+            If provided, only match directories with this exact SLURM job ID.
+        before_job_id : int, optional
+            If provided, only delete logs with job ID less than this value.
+        dry_run : bool
+            If True, return matching files without deleting them.
+        Returns
+        -------
+        list[Path]
+            List of deleted (or matched if dry_run) log file paths.
+        """
+        log_root = Path(log_dir) if log_dir else Path.home() / ".vec-inf-logs"
+        matched = find_matching_dirs(
+            log_dir=log_root,
+            model_family=model_family,
+            model_name=model_name,
+            job_id=job_id,
+            before_job_id=before_job_id,
+        )
+        if dry_run:
+            return matched
+        for path in matched:
+            shutil.rmtree(path)
+        return matched

vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl