vec-inf 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -1,9 +1,9 @@
 # `vec-inf` Commands
 
-* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
-* `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
-* `metrics`: Streams performance metrics to the console.
+* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
 * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
 
 Use `--help` to see all available options
vec_inf/cli/_cli.py CHANGED
@@ -1,4 +1,22 @@
-"""Command line interface for Vector Inference."""
+"""Command line interface for Vector Inference.
+
+This module provides the command-line interface for interacting with Vector
+Inference services, including model launching, status checking, metrics
+monitoring, and shutdown operations.
+
+Commands
+--------
+launch
+    Launch a model on the cluster
+status
+    Check the status of a running model
+shutdown
+    Stop a running model
+list
+    List available models or get specific model configuration
+metrics
+    Stream real-time performance metrics
+"""
 
 import time
 from typing import Optional, Union
@@ -7,8 +25,13 @@ import click
 from rich.console import Console
 from rich.live import Live
 
-import vec_inf.cli._utils as utils
-from vec_inf.cli._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper
+from vec_inf.cli._helper import (
+    LaunchResponseFormatter,
+    ListCmdDisplay,
+    MetricsResponseFormatter,
+    StatusResponseFormatter,
+)
+from vec_inf.client import LaunchOptions, VecInfClient
 
 
 CONSOLE = Console()
@@ -24,36 +47,6 @@ def cli() -> None:
 @click.argument("model-name", type=str, nargs=1)
 @click.option("--model-family", type=str, help="The model family")
 @click.option("--model-variant", type=str, help="The model variant")
-@click.option(
-    "--max-model-len",
-    type=int,
-    help="Model context length. Default value set based on suggested resource allocation.",
-)
-@click.option(
-    "--max-num-seqs",
-    type=int,
-    help="Maximum number of sequences to process in a single request",
-)
-@click.option(
-    "--gpu-memory-utilization",
-    type=float,
-    help="GPU memory utilization, default to 0.9",
-)
-@click.option(
-    "--enable-prefix-caching",
-    is_flag=True,
-    help="Enables automatic prefix caching",
-)
-@click.option(
-    "--enable-chunked-prefill",
-    is_flag=True,
-    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
-)
-@click.option(
-    "--max-num-batched-tokens",
-    type=int,
-    help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
-)
 @click.option(
     "--partition",
     type=str,
@@ -69,6 +62,11 @@ def cli() -> None:
     type=int,
     help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
+@click.option(
+    "--account",
+    type=str,
+    help="Charge resources used by this job to specified account.",
+)
 @click.option(
     "--qos",
     type=str,
@@ -79,12 +77,6 @@ def cli() -> None:
     type=str,
     help="Time limit for job, this should comply with QoS limits",
 )
-@click.option(
-    "--vocab-size",
-    type=int,
-    help="Vocabulary size, this option is intended for custom models",
-)
-@click.option("--data-type", type=str, help="Model data type")
 @click.option(
     "--venv",
     type=str,
@@ -101,19 +93,9 @@ def cli() -> None:
     help="Path to parent directory containing model weights",
 )
 @click.option(
-    "--pipeline-parallelism",
-    is_flag=True,
-    help="Enable pipeline parallelism, enabled by default for supported models",
-)
-@click.option(
-    "--compilation-config",
-    type=click.Choice(["0", "3"]),
-    help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
-)
-@click.option(
-    "--enforce-eager",
-    is_flag=True,
-    help="Always use eager-mode PyTorch",
+    "--vllm-args",
+    type=str,
+    help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
 )
 @click.option(
     "--json-mode",
@@ -122,18 +104,64 @@ def cli() -> None:
 )
 def launch(
     model_name: str,
-    **cli_kwargs: Optional[Union[str, int, bool]],
+    **cli_kwargs: Optional[Union[str, int, float, bool]],
 ) -> None:
-    """Launch a model on the cluster."""
+    """Launch a model on the cluster.
+
+    Parameters
+    ----------
+    model_name : str
+        Name of the model to launch
+    **cli_kwargs : dict
+        Additional launch options including:
+        - model_family : str, optional
+            Family/architecture of the model
+        - model_variant : str, optional
+            Specific variant of the model
+        - partition : str, optional
+            Type of compute partition
+        - num_nodes : int, optional
+            Number of nodes to use
+        - gpus_per_node : int, optional
+            Number of GPUs per node
+        - qos : str, optional
+            Quality of service tier
+        - time : str, optional
+            Time limit for job
+        - venv : str, optional
+            Path to virtual environment
+        - log_dir : str, optional
+            Path to SLURM log directory
+        - model_weights_parent_dir : str, optional
+            Path to model weights directory
+        - vllm_args : str, optional
+            vLLM engine arguments
+        - json_mode : bool, optional
+            Output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If launch fails for any reason
+    """
     try:
-        launch_helper = LaunchHelper(model_name, cli_kwargs)
+        # Convert cli_kwargs to LaunchOptions
+        json_mode = cli_kwargs["json_mode"]
+        del cli_kwargs["json_mode"]
+
+        launch_options = LaunchOptions(**cli_kwargs)  # type: ignore
 
-        launch_helper.set_env_vars()
-        launch_command = launch_helper.build_launch_command()
-        command_output, stderr = utils.run_bash_command(launch_command)
-        if stderr:
-            raise click.ClickException(f"Error: {stderr}")
-        launch_helper.post_launch_processing(command_output, CONSOLE)
+        # Start the client and launch model inference server
+        client = VecInfClient()
+        launch_response = client.launch_model(model_name, launch_options)
+
+        # Display launch information
+        launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
+        if json_mode:
+            click.echo(launch_response.config)
+        else:
+            launch_info_table = launch_formatter.format_table_output()
+            CONSOLE.print(launch_info_table)
 
     except click.ClickException as e:
         raise e
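The rewritten `launch` body above no longer shells out through helper commands; it wraps the public client API. A minimal sketch of the same flow used programmatically, assuming `LaunchOptions` accepts the keyword fields enumerated in the docstring; the model name and option values below are placeholders:

```python
from vec_inf.client import LaunchOptions, VecInfClient

client = VecInfClient()

# Placeholder values; any of the launch options listed in the docstring
# (model_family, partition, num_nodes, gpus_per_node, vllm_args, ...)
# should be accepted as keyword arguments.
options = LaunchOptions(
    num_nodes=1,
    gpus_per_node=4,
    vllm_args="--max-model-len=8192,--enable-prefix-caching",
)
launch_response = client.launch_model("example-model-name", options)

# The CLI prints launch_response.config either as raw JSON or as a table.
print(launch_response.config)
```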
@@ -156,28 +184,61 @@ def launch(
 def status(
     slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
 ) -> None:
-    """Get the status of a running model on the cluster."""
-    status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
-    output, stderr = utils.run_bash_command(status_cmd)
-    if stderr:
-        raise click.ClickException(f"Error: {stderr}")
+    """Get the status of a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to check
+    log_dir : str, optional
+        Path to SLURM log directory
+    json_mode : bool, default=False
+        Whether to output in JSON format
 
-    status_helper = StatusHelper(slurm_job_id, output, log_dir)
+    Raises
+    ------
+    click.ClickException
+        If status check fails
+    """
+    try:
+        # Start the client and get model inference server status
+        client = VecInfClient()
+        status_response = client.get_status(slurm_job_id, log_dir)
+        # Display status information
+        status_formatter = StatusResponseFormatter(status_response)
+        if json_mode:
+            status_formatter.output_json()
+        else:
+            status_info_table = status_formatter.output_table()
+            CONSOLE.print(status_info_table)
 
-    status_helper.process_job_state()
-    if json_mode:
-        status_helper.output_json()
-    else:
-        status_helper.output_table(CONSOLE)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Status check failed: {str(e)}") from e
 
 
 @cli.command("shutdown")
 @click.argument("slurm_job_id", type=int, nargs=1)
 def shutdown(slurm_job_id: int) -> None:
-    """Shutdown a running model on the cluster."""
-    shutdown_cmd = f"scancel {slurm_job_id}"
-    utils.run_bash_command(shutdown_cmd)
-    click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+    """Shutdown a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to shut down
+
+    Raises
+    ------
+    click.ClickException
+        If shutdown operation fails
+    """
+    try:
+        client = VecInfClient()
+        client.shutdown_model(slurm_job_id)
+        click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+    except Exception as e:
+        raise click.ClickException(f"Shutdown failed: {str(e)}") from e
 
 
 @cli.command("list")
@@ -188,9 +249,34 @@ def shutdown(slurm_job_id: int) -> None:
     help="Output in JSON string",
 )
 def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
-    """List all available models, or get default setup of a specific model."""
-    list_helper = ListHelper(model_name, json_mode)
-    list_helper.process_list_command(CONSOLE)
+    """List all available models, or get default setup of a specific model.
+
+    Parameters
+    ----------
+    model_name : str, optional
+        Name of specific model to get information for
+    json_mode : bool, default=False
+        Whether to output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If list operation fails
+    """
+    try:
+        # Start the client
+        client = VecInfClient()
+        list_display = ListCmdDisplay(CONSOLE, json_mode)
+        if model_name:
+            model_config = client.get_model_config(model_name)
+            list_display.display_single_model_output(model_config)
+        else:
+            model_infos = client.list_models()
+            list_display.display_all_models_output(model_infos)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"List models failed: {str(e)}") from e
 
 
 @cli.command("metrics")
@@ -199,31 +285,55 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
     "--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
 )
 def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
-    """Stream real-time performance metrics from the model endpoint."""
-    helper = MetricsHelper(slurm_job_id, log_dir)
-
-    # Check if metrics URL is ready
-    if not helper.metrics_url.startswith("http"):
-        table = utils.create_table("Metric", "Value")
-        helper.display_failed_metrics(
-            table, f"Metrics endpoint unavailable - {helper.metrics_url}"
-        )
-        CONSOLE.print(table)
-        return
-
-    with Live(refresh_per_second=1, console=CONSOLE) as live:
-        while True:
-            metrics = helper.fetch_metrics()
-            table = utils.create_table("Metric", "Value")
-
-            if isinstance(metrics, str):
-                # Show status information if metrics aren't available
-                helper.display_failed_metrics(table, metrics)
-            else:
-                helper.display_metrics(table, metrics)
-
-            live.update(table)
-            time.sleep(2)
+    """Stream real-time performance metrics from the model endpoint.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to monitor
+    log_dir : str, optional
+        Path to SLURM log directory
+
+    Raises
+    ------
+    click.ClickException
+        If metrics collection fails
+
+    Notes
+    -----
+    This command continuously streams metrics with a 2-second refresh interval
+    until interrupted. If metrics are not available, it will display status
+    information instead.
+    """
+    try:
+        # Start the client and get inference server metrics
+        client = VecInfClient()
+        metrics_response = client.get_metrics(slurm_job_id, log_dir)
+        metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+        # Check if metrics response is ready
+        if isinstance(metrics_response.metrics, str):
+            metrics_formatter.format_failed_metrics(metrics_response.metrics)
+            CONSOLE.print(metrics_formatter.table)
+            return
+
+        with Live(refresh_per_second=1, console=CONSOLE) as live:
+            while True:
+                metrics_response = client.get_metrics(slurm_job_id, log_dir)
+                metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+                if isinstance(metrics_response.metrics, str):
+                    # Show status information if metrics aren't available
+                    metrics_formatter.format_failed_metrics(metrics_response.metrics)
+                else:
+                    metrics_formatter.format_metrics()
+
+                live.update(metrics_formatter.table)
+                time.sleep(2)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Metrics check failed: {str(e)}") from e
 
 
 if __name__ == "__main__":
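Outside the CLI's `Live` display, the same metrics call can be polled directly. Per the code above, `metrics_response.metrics` is a plain string while the endpoint is unavailable and structured data once it is serving. A bounded-poll sketch with a placeholder job ID:

```python
import time

from rich.console import Console

from vec_inf.cli._helper import MetricsResponseFormatter
from vec_inf.client import VecInfClient

console = Console()
client = VecInfClient()
slurm_job_id = 12345678  # placeholder Slurm job ID

for _ in range(5):  # a short, bounded poll instead of the CLI's endless loop
    metrics_response = client.get_metrics(slurm_job_id, None)
    formatter = MetricsResponseFormatter(metrics_response.metrics)
    if isinstance(metrics_response.metrics, str):
        # A string payload means the endpoint is not serving metrics yet.
        formatter.format_failed_metrics(metrics_response.metrics)
    else:
        formatter.format_metrics()
    console.print(formatter.table)
    time.sleep(2)  # same 2-second cadence the CLI uses
```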