vec-inf 0.3.3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -1,7 +1,8 @@
1
1
  # `vec-inf` Commands
2
2
 
3
3
  * `launch`: Specify a model family and other optional parameters to launch an OpenAI-compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for the complete list of available options.
4
- * `list`: List all available model names, `--json-mode` supported.
4
+ * `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
5
+ * `metrics`: Stream performance metrics to the console by providing the model's Slurm job ID.
5
6
  * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
6
7
  * `shutdown`: Shutdown a model by providing its Slurm job ID.
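
A quick illustrative session tying these commands together (the model name is one of the supported models listed in [`models/README.md`](./models/README.md); the Slurm job ID is hypothetical):

```bash
# Hypothetical walkthrough; 14933053 stands in for the job ID printed by `launch`.
vec-inf list                                   # browse all available models
vec-inf list Meta-Llama-3.1-8B-Instruct        # show a model's default configuration
vec-inf launch Meta-Llama-3.1-8B-Instruct      # submit the inference server job
vec-inf status 14933053                        # check whether the server is ready
vec-inf metrics 14933053                       # stream throughput metrics
vec-inf shutdown 14933053                      # cancel the job
```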
7
8
 
vec_inf/cli/_cli.py CHANGED
@@ -1,9 +1,13 @@
1
1
  import os
2
- from typing import Optional
2
+ import time
3
+ from typing import Optional, cast
3
4
 
4
5
  import click
6
+
7
+ import polars as pl
5
8
  from rich.columns import Columns
6
9
  from rich.console import Console
10
+ from rich.live import Live
7
11
  from rich.panel import Panel
8
12
 
9
13
  import vec_inf.cli._utils as utils
@@ -24,9 +28,19 @@ def cli():
24
28
  @click.option(
25
29
  "--max-model-len",
26
30
  type=int,
27
- help="Model context length. If unspecified, will be automatically derived from the model config.",
31
+ help="Model context length. Default value set based on suggested resource allocation.",
32
+ )
33
+ @click.option(
34
+ "--max-num-seqs",
35
+ type=int,
36
+ help="Maximum number of sequences to process in a single request",
37
+ )
38
+ @click.option(
39
+ "--partition",
40
+ type=str,
41
+ default="a40",
42
+ help="Type of compute partition, default to a40",
28
43
  )
29
- @click.option("--partition", type=str, help="Type of compute partition, default to a40")
30
44
  @click.option(
31
45
  "--num-nodes",
32
46
  type=int,
@@ -40,24 +54,48 @@ def cli():
40
54
  @click.option(
41
55
  "--qos",
42
56
  type=str,
43
- help="Quality of service, default depends on suggested resource allocation required for the model",
57
+ help="Quality of service",
44
58
  )
45
59
  @click.option(
46
60
  "--time",
47
61
  type=str,
48
- help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
62
+ help="Time limit for job, this should comply with QoS limits",
49
63
  )
50
64
  @click.option(
51
65
  "--vocab-size",
52
66
  type=int,
53
67
  help="Vocabulary size, this option is intended for custom models",
54
68
  )
55
- @click.option("--data-type", type=str, help="Model data type, default to auto")
56
- @click.option("--venv", type=str, help="Path to virtual environment")
69
+ @click.option(
70
+ "--data-type", type=str, default="auto", help="Model data type, default to auto"
71
+ )
72
+ @click.option(
73
+ "--venv",
74
+ type=str,
75
+ default="singularity",
76
+ help="Path to virtual environment, default to preconfigured singularity container",
77
+ )
57
78
  @click.option(
58
79
  "--log-dir",
59
80
  type=str,
60
- help="Path to slurm log directory, default to .vec-inf-logs in home directory",
81
+ default="default",
82
+ help="Path to slurm log directory, default to .vec-inf-logs in user home directory",
83
+ )
84
+ @click.option(
85
+ "--model-weights-parent-dir",
86
+ type=str,
87
+ default="/model-weights",
88
+ help="Path to parent directory containing model weights, default to '/model-weights' for supported models",
89
+ )
90
+ @click.option(
91
+ "--pipeline-parallelism",
92
+ type=str,
93
+ help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
94
+ )
95
+ @click.option(
96
+ "--enforce-eager",
97
+ type=str,
98
+ help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
61
99
  )
62
100
  @click.option(
63
101
  "--json-mode",
@@ -69,6 +107,7 @@ def launch(
69
107
  model_family: Optional[str] = None,
70
108
  model_variant: Optional[str] = None,
71
109
  max_model_len: Optional[int] = None,
110
+ max_num_seqs: Optional[int] = None,
72
111
  partition: Optional[str] = None,
73
112
  num_nodes: Optional[int] = None,
74
113
  num_gpus: Optional[int] = None,
@@ -78,11 +117,20 @@ def launch(
78
117
  data_type: Optional[str] = None,
79
118
  venv: Optional[str] = None,
80
119
  log_dir: Optional[str] = None,
120
+ model_weights_parent_dir: Optional[str] = None,
121
+ pipeline_parallelism: Optional[str] = None,
122
+ enforce_eager: Optional[str] = None,
81
123
  json_mode: bool = False,
82
124
  ) -> None:
83
125
  """
84
126
  Launch a model on the cluster
85
127
  """
128
+
129
+ if isinstance(pipeline_parallelism, str):
130
+ pipeline_parallelism = (
131
+ "True" if pipeline_parallelism.lower() == "true" else "False"
132
+ )
133
+
86
134
  launch_script_path = os.path.join(
87
135
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
88
136
  )
@@ -90,7 +138,7 @@ def launch(
90
138
 
91
139
  models_df = utils.load_models_df()
92
140
 
93
- if model_name in models_df["model_name"].values:
141
+ if model_name in models_df["model_name"].to_list():
94
142
  default_args = utils.load_default_args(models_df, model_name)
95
143
  for arg in default_args:
96
144
  if arg in locals() and locals()[arg] is not None:
@@ -98,10 +146,11 @@ def launch(
98
146
  renamed_arg = arg.replace("_", "-")
99
147
  launch_cmd += f" --{renamed_arg} {default_args[arg]}"
100
148
  else:
101
- model_args = models_df.columns.tolist()
102
- excluded_keys = ["model_name", "pipeline_parallelism"]
149
+ model_args = models_df.columns
150
+ model_args.remove("model_name")
151
+ model_args.remove("model_type")
103
152
  for arg in model_args:
104
- if arg not in excluded_keys and locals()[arg] is not None:
153
+ if locals()[arg] is not None:
105
154
  renamed_arg = arg.replace("_", "-")
106
155
  launch_cmd += f" --{renamed_arg} {locals()[arg]}"
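
In the launch flow above, options the user passes explicitly take precedence over the per-model defaults loaded from `models.csv`. A minimal sketch of that precedence rule (`build_launch_cmd` is a hypothetical helper, not part of the package):

```python
# Hypothetical helper illustrating the merge done in launch(): CLI options that
# were actually provided override the models.csv defaults for that model.
def build_launch_cmd(base_cmd: str, defaults: dict, cli_args: dict) -> str:
    merged = dict(defaults)
    merged.update({k: v for k, v in cli_args.items() if v is not None})
    for arg, value in merged.items():
        base_cmd += f" --{arg.replace('_', '-')} {value}"
    return base_cmd

# The user only overrides --max-num-seqs; every other flag keeps its default.
cmd = build_launch_cmd(
    "bash launch_server.sh",
    {"partition": "a40", "max_num_seqs": 256},
    {"partition": None, "max_num_seqs": 64},
)
# cmd == "bash launch_server.sh --partition a40 --max-num-seqs 64"
```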
107
156
 
@@ -225,40 +274,111 @@ def shutdown(slurm_job_id: int) -> None:
225
274
  is_flag=True,
226
275
  help="Output in JSON string",
227
276
  )
228
- def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
277
+ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
229
278
  """
230
279
  List all available models, or get default setup of a specific model
231
280
  """
232
- models_df = utils.load_models_df()
233
281
 
234
- if model_name:
235
- if model_name not in models_df["model_name"].values:
282
+ def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
283
+ if model_name not in models_df["model_name"].to_list():
236
284
  raise ValueError(f"Model name {model_name} not found in available models")
237
285
 
238
- excluded_keys = {"venv", "log_dir", "pipeline_parallelism"}
239
- model_row = models_df.loc[models_df["model_name"] == model_name]
286
+ excluded_keys = {"venv", "log_dir"}
287
+ model_row = models_df.filter(models_df["model_name"] == model_name)
240
288
 
241
289
  if json_mode:
242
- # click.echo(model_row.to_json(orient='records'))
243
- filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
244
- click.echo(filtered_model_row.to_json(orient="records"))
290
+ filtered_model_row = model_row.drop(excluded_keys, strict=False)
291
+ click.echo(filtered_model_row.to_dicts()[0])
245
292
  return
246
293
  table = utils.create_table(key_title="Model Config", value_title="Value")
247
- for _, row in model_row.iterrows():
294
+ for row in model_row.to_dicts():
248
295
  for key, value in row.items():
249
296
  if key not in excluded_keys:
250
297
  table.add_row(key, str(value))
251
298
  CONSOLE.print(table)
252
- return
253
299
 
254
- if json_mode:
255
- click.echo(models_df["model_name"].to_json(orient="records"))
256
- return
257
- panels = []
258
- for _, row in models_df.iterrows():
259
- styled_text = f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
260
- panels.append(Panel(styled_text, expand=True))
261
- CONSOLE.print(Columns(panels, equal=True))
300
+ def list_all(models_df: pl.DataFrame, json_mode: bool):
301
+ if json_mode:
302
+ click.echo(models_df["model_name"].to_list())
303
+ return
304
+ panels = []
305
+ model_type_colors = {
306
+ "LLM": "cyan",
307
+ "VLM": "bright_blue",
308
+ "Text Embedding": "purple",
309
+ "Reward Modeling": "bright_magenta",
310
+ }
311
+
312
+ models_df = models_df.with_columns(
313
+ pl.when(pl.col("model_type") == "LLM")
314
+ .then(0)
315
+ .when(pl.col("model_type") == "VLM")
316
+ .then(1)
317
+ .when(pl.col("model_type") == "Text Embedding")
318
+ .then(2)
319
+ .when(pl.col("model_type") == "Reward Modeling")
320
+ .then(3)
321
+ .otherwise(-1)
322
+ .alias("model_type_order")
323
+ )
324
+
325
+ models_df = models_df.sort("model_type_order")
326
+ models_df = models_df.drop("model_type_order")
327
+
328
+ for row in models_df.to_dicts():
329
+ panel_color = model_type_colors.get(row["model_type"], "white")
330
+ styled_text = (
331
+ f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
332
+ )
333
+ panels.append(Panel(styled_text, expand=True, border_style=panel_color))
334
+ CONSOLE.print(Columns(panels, equal=True))
335
+
336
+ models_df = utils.load_models_df()
337
+
338
+ if model_name:
339
+ list_model(model_name, models_df, json_mode)
340
+ else:
341
+ list_all(models_df, json_mode)
342
+
343
+
344
+ @cli.command("metrics")
345
+ @click.argument("slurm_job_id", type=int, nargs=1)
346
+ @click.option(
347
+ "--log-dir",
348
+ type=str,
349
+ help="Path to slurm log directory. This is required if --log-dir was set in model launch",
350
+ )
351
+ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
352
+ """
353
+ Stream performance metrics to the console
354
+ """
355
+ status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
356
+ output = utils.run_bash_command(status_cmd)
357
+ slurm_job_name = output.split(" ")[1].split("=")[1]
358
+
359
+ with Live(refresh_per_second=1, console=CONSOLE) as live:
360
+ while True:
361
+ out_logs = utils.read_slurm_log(
362
+ slurm_job_name, slurm_job_id, "out", log_dir
363
+ )
364
+ # if out_logs is a string, then it is an error message
365
+ if isinstance(out_logs, str):
366
+ live.update(out_logs)
367
+ break
368
+ out_logs = cast(list, out_logs)
369
+ latest_metrics = utils.get_latest_metric(out_logs)
370
+ # if latest_metrics is a string, then it is an error message
371
+ if isinstance(latest_metrics, str):
372
+ live.update(latest_metrics)
373
+ break
374
+ latest_metrics = cast(dict, latest_metrics)
375
+ table = utils.create_table(key_title="Metric", value_title="Value")
376
+ for key, value in latest_metrics.items():
377
+ table.add_row(key, value)
378
+
379
+ live.update(table)
380
+
381
+ time.sleep(2)
262
382
 
263
383
 
264
384
  if __name__ == "__main__":
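
For the new `metrics` command, a typical invocation looks like this (the job ID is hypothetical, and `--log-dir` is only needed when a custom log directory was passed to `launch`):

```bash
vec-inf metrics 14933053
# Only if the server was launched with a custom --log-dir (path shown is hypothetical):
vec-inf metrics 14933053 --log-dir /scratch/$USER/vec-inf-logs
```

The loop above re-reads the job's `.out` log every 2 seconds and stops as soon as an error message is returned instead of a metrics entry.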
vec_inf/cli/_utils.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import os
2
2
  import subprocess
3
- from typing import Optional, Union
3
+ from typing import Optional, Union, cast
4
4
 
5
- import pandas as pd
5
+ import polars as pl
6
6
  import requests
7
7
  from rich.table import Table
8
8
 
9
- MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
9
+ MODEL_READY_SIGNATURE = "INFO: Application startup complete."
10
10
  SERVER_ADDRESS_SIGNATURE = "Server address: "
11
11
 
12
12
 
@@ -25,7 +25,7 @@ def read_slurm_log(
25
25
  slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
26
26
  ) -> Union[list[str], str]:
27
27
  """
28
- Get the directory of a model
28
+ Read the slurm log file
29
29
  """
30
30
  if not log_dir:
31
31
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
@@ -35,9 +35,11 @@ def read_slurm_log(
35
35
  log_dir = os.path.join(models_dir, dir)
36
36
  break
37
37
 
38
+ log_dir = cast(str, log_dir)
39
+
38
40
  try:
39
41
  file_path = os.path.join(
40
- log_dir, # type: ignore
42
+ log_dir,
41
43
  f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
42
44
  )
43
45
  with open(file_path, "r") as file:
@@ -58,12 +60,15 @@ def is_server_running(
58
60
  if isinstance(log_content, str):
59
61
  return log_content
60
62
 
63
+ status: Union[str, tuple[str, str]] = "LAUNCHING"
64
+
61
65
  for line in log_content:
62
66
  if "error" in line.lower():
63
- return ("FAILED", line.strip("\n"))
67
+ status = ("FAILED", line.strip("\n"))
64
68
  if MODEL_READY_SIGNATURE in line:
65
- return "RUNNING"
66
- return "LAUNCHING"
69
+ status = "RUNNING"
70
+
71
+ return status
67
72
 
68
73
 
69
74
  def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
@@ -114,11 +119,11 @@ def create_table(
114
119
  return table
115
120
 
116
121
 
117
- def load_models_df() -> pd.DataFrame:
122
+ def load_models_df() -> pl.DataFrame:
118
123
  """
119
124
  Load the models dataframe
120
125
  """
121
- models_df = pd.read_csv(
126
+ models_df = pl.read_csv(
122
127
  os.path.join(
123
128
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
124
129
  "models/models.csv",
@@ -127,11 +132,32 @@ def load_models_df() -> pd.DataFrame:
127
132
  return models_df
128
133
 
129
134
 
130
- def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
135
+ def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
131
136
  """
132
137
  Load the default arguments for a model
133
138
  """
134
- row_data = models_df.loc[models_df["model_name"] == model_name]
135
- default_args = row_data.iloc[0].to_dict()
136
- default_args.pop("model_name")
139
+ row_data = models_df.filter(models_df["model_name"] == model_name)
140
+ default_args = row_data.to_dicts()[0]
141
+ default_args.pop("model_name", None)
142
+ default_args.pop("model_type", None)
137
143
  return default_args
144
+
145
+
146
+ def get_latest_metric(log_lines: list[str]) -> dict | str:
147
+ """Read the latest metric entry from the log file."""
148
+ latest_metric = {}
149
+
150
+ try:
151
+ for line in reversed(log_lines):
152
+ if "Avg prompt throughput" in line:
153
+ # Parse the metric values from the line
154
+ metrics_str = line.split("] ")[1].strip().strip(".")
155
+ metrics_list = metrics_str.split(", ")
156
+ for metric in metrics_list:
157
+ key, value = metric.split(": ")
158
+ latest_metric[key] = value
159
+ break
160
+ except Exception as e:
161
+ return f"[red]Error reading log file: {e}[/red]"
162
+
163
+ return latest_metric
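
`get_latest_metric` walks the `.out` log backwards until it finds vLLM's periodic throughput line and splits it into key/value pairs. A self-contained sketch of that parsing on a made-up log line (the exact wording and fields vary between vLLM versions):

```python
# Illustrative only: a fabricated log line in the general shape vLLM prints;
# the parsing below mirrors get_latest_metric.
sample = (
    "INFO 12-01 10:15:30 metrics.py:341] Avg prompt throughput: 410.2 tokens/s, "
    "Avg generation throughput: 38.7 tokens/s, Running: 4 reqs, Pending: 0 reqs."
)

metrics = {}
payload = sample.split("] ")[1].strip().strip(".")  # drop the "INFO ...]" prefix and trailing period
for item in payload.split(", "):
    key, value = item.split(": ")
    metrics[key] = value

print(metrics["Avg prompt throughput"])  # -> 410.2 tokens/s
```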
vec_inf/launch_server.sh CHANGED
@@ -12,21 +12,24 @@ while [[ "$#" -gt 0 ]]; do
12
12
  --num-nodes) num_nodes="$2"; shift ;;
13
13
  --num-gpus) num_gpus="$2"; shift ;;
14
14
  --max-model-len) max_model_len="$2"; shift ;;
15
+ --max-num-seqs) max_num_seqs="$2"; shift ;;
15
16
  --vocab-size) vocab_size="$2"; shift ;;
16
17
  --data-type) data_type="$2"; shift ;;
17
- --venv) virtual_env="$2"; shift ;;
18
+ --venv) venv="$2"; shift ;;
18
19
  --log-dir) log_dir="$2"; shift ;;
20
+ --model-weights-parent-dir) model_weights_parent_dir="$2"; shift ;;
19
21
  --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
22
+ --enforce-eager) enforce_eager="$2"; shift ;;
20
23
  *) echo "Unknown parameter passed: $1"; exit 1 ;;
21
24
  esac
22
25
  shift
23
26
  done
24
27
 
25
- required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size)
28
+ required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
26
29
 
27
30
  for var in "${required_vars[@]}"; do
28
31
  if [ -z "${!var}" ]; then
29
- echo "Error: Missing required --$var//_/- argument."
32
+ echo "Error: Missing required --$var argument."
30
33
  exit 1
31
34
  fi
32
35
  done
@@ -40,27 +43,27 @@ export NUM_NODES=$num_nodes
40
43
  export NUM_GPUS=$num_gpus
41
44
  export VLLM_MAX_MODEL_LEN=$max_model_len
42
45
  export VLLM_MAX_LOGPROBS=$vocab_size
43
- # For custom models, the following are set to default if not specified
44
- export VLLM_DATA_TYPE="auto"
45
- export VENV_BASE="singularity"
46
- export LOG_DIR="default"
47
- # Pipeline parallelism is disabled and can only be enabled if specified in models.csv as this is an experimental feature
48
- export PIPELINE_PARALLELISM="false"
49
-
50
- if [ -n "$data_type" ]; then
51
- export VLLM_DATA_TYPE=$data_type
52
- fi
53
-
54
- if [ -n "$virtual_env" ]; then
55
- export VENV_BASE=$virtual_env
56
- fi
57
-
58
- if [ -n "$log_dir" ]; then
59
- export LOG_DIR=$log_dir
46
+ export VLLM_DATA_TYPE=$data_type
47
+ export VENV_BASE=$venv
48
+ export LOG_DIR=$log_dir
49
+ export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
50
+
51
+ if [ -n "$max_num_seqs" ]; then
52
+ export VLLM_MAX_NUM_SEQS=$max_num_seqs
53
+ else
54
+ export VLLM_MAX_NUM_SEQS=256
60
55
  fi
61
56
 
62
57
  if [ -n "$pipeline_parallelism" ]; then
63
58
  export PIPELINE_PARALLELISM=$pipeline_parallelism
59
+ else
60
+ export PIPELINE_PARALLELISM="False"
61
+ fi
62
+
63
+ if [ -n "$enforce_eager" ]; then
64
+ export ENFORCE_EAGER=$enforce_eager
65
+ else
66
+ export ENFORCE_EAGER="False"
64
67
  fi
65
68
 
66
69
  # ================================= Set default environment variables ======================================
@@ -72,13 +75,12 @@ fi
72
75
  mkdir -p $LOG_DIR
73
76
 
74
77
  # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
75
- # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
78
+ # SLURM job
76
79
  export SRC_DIR="$(dirname "$0")"
77
80
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
78
- export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
79
81
 
80
82
  # Variables specific to your working environment, below are examples for the Vector cluster
81
- export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
83
+ export VLLM_MODEL_WEIGHTS="${MODEL_WEIGHTS_PARENT_DIR}/${JOB_NAME}"
82
84
  export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
83
85
 
84
86
 
@@ -93,11 +95,6 @@ if [[ $fp16_partitions =~ $JOB_PARTITION ]]; then
93
95
  echo "Data type set to due to non-Ampere GPUs used: $VLLM_DATA_TYPE"
94
96
  fi
95
97
 
96
- # Create a file to store the API server URL if it doesn't exist
97
- if [ -f $VLLM_BASE_URL_FILENAME ]; then
98
- touch $VLLM_BASE_URL_FILENAME
99
- fi
100
-
101
98
  echo Job Name: $JOB_NAME
102
99
  echo Partition: $JOB_PARTITION
103
100
  echo Num Nodes: $NUM_NODES
@@ -105,6 +102,13 @@ echo GPUs per Node: $NUM_GPUS
105
102
  echo QOS: $QOS
106
103
  echo Walltime: $WALLTIME
107
104
  echo Data Type: $VLLM_DATA_TYPE
105
+ echo Max Model Length: $VLLM_MAX_MODEL_LEN
106
+ echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
107
+ echo Vocabulary Size: $VLLM_MAX_LOGPROBS
108
+ echo Pipeline Parallelism: $PIPELINE_PARALLELISM
109
+ echo Enforce Eager: $ENFORCE_EAGER
110
+ echo Log Directory: $LOG_DIR
111
+ echo Model Weights Parent Directory: $MODEL_WEIGHTS_PARENT_DIR
108
112
 
109
113
  is_special=""
110
114
  if [ "$NUM_NODES" -gt 1 ]; then
vec_inf/models/README.md CHANGED
@@ -1,13 +1,17 @@
1
1
  # Available Models
2
2
  More profiling metrics coming soon!
3
3
 
4
- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
4
+ ## Text Generation Models
5
+
6
+ ### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
5
7
 
6
8
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
7
9
  |:----------:|:----------:|:----------:|:----------:|
8
- |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
10
+ | [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
11
+ | [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
12
+ | [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
9
13
 
10
- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
14
+ ### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
11
15
 
12
16
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
13
17
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
20
24
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
21
25
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
22
26
 
23
- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
27
+ ### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
24
28
 
25
29
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
26
30
  |:----------:|:----------:|:----------:|:----------:|
27
- |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
31
+ | [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
28
32
 
29
- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
33
+ ### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
30
34
 
31
35
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
32
36
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
35
39
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
36
40
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
37
41
 
38
- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
39
-
40
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
41
- |:----------:|:----------:|:----------:|:----------:|
42
- |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
43
- |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
44
-
45
- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
46
-
47
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
48
- |:----------:|:----------:|:----------:|:----------:|
49
- |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
50
- |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
51
-
52
- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
42
+ ### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
53
43
 
54
44
  | Variant | Suggested resource allocation |
55
45
  |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
60
50
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
61
51
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
62
52
 
63
- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
53
+ ### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
64
54
 
65
55
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
66
56
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
69
59
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
70
60
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
71
61
 
72
- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
62
+ ### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
73
63
 
74
64
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
65
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,135 @@ More profiling metrics coming soon!
79
69
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
80
70
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
81
71
 
82
- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
72
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
73
+
74
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
+ |:----------:|:----------:|:----------:|:----------:|
76
+ | [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
77
+ | [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
78
+ | [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
79
+ | [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
80
+
81
+ ### [Mistral AI: Mistral](https://huggingface.co/mistralai)
83
82
 
84
83
  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
85
84
  |:----------:|:----------:|:----------:|:----------:|
86
- |[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
87
- |[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
88
- |[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
89
- |[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
90
- |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
91
- |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 4x a40 | - tokens/s | - tokens/s|
85
+ | [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
86
+ | [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
87
+ | [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s|
88
+ | [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
89
+ | [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
90
+ | [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
91
+ | [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
92
92
 
93
- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
93
+ ### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
94
94
 
95
95
  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
96
96
  |:----------:|:----------:|:----------:|:----------:|
97
- |[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
98
- |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
- |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
97
+ | [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
98
+ | [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
+ | [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
100
100
 
101
- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
101
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
102
102
 
103
103
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
104
104
  |:----------:|:----------:|:----------:|:----------:|
105
105
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
106
+
107
+ ### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
108
+
109
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
110
+ |:----------:|:----------:|:----------:|:----------:|
111
+ | [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
112
+
113
+ ### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
114
+
115
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
116
+ |:----------:|:----------:|:----------:|:----------:|
117
+ | [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
118
+
119
+ ### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
120
+
121
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
122
+ |:----------:|:----------:|:----------:|:----------:|
123
+ | [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
124
+ | [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
125
+ | [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
126
+ | [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
127
+ | [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
128
+ | [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
129
+ | [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
130
+
131
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
132
+
133
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
134
+ |:----------:|:----------:|:----------:|:----------:|
135
+ | [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
136
+ | [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
137
+ | [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
138
+
139
+ ### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
140
+
141
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
142
+ |:----------:|:----------:|:----------:|:----------:|
143
+ | [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
144
+
145
+ ### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
146
+
147
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
148
+ |:----------:|:----------:|:----------:|:----------:|
149
+ | [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
150
+
151
+ ## Vision Language Models
152
+
153
+ ### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
154
+
155
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
156
+ |:----------:|:----------:|:----------:|:----------:|
157
+ | [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
158
+ | [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
159
+
160
+ ### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
161
+
162
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
163
+ |:----------:|:----------:|:----------:|:----------:|
164
+ | [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
165
+ | [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
166
+
167
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
168
+
169
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
170
+ |:----------:|:----------:|:----------:|:----------:|
106
171
  | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
172
+
173
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
174
+
175
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
176
+ |:----------:|:----------:|:----------:|:----------:|
177
+ | [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
178
+ | [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
179
+ | [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
180
+ | [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
181
+
182
+ **NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of sequences is reduced and eager mode is enforced.
183
+
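
The reduced defaults referred to above come straight from `models.csv` (see the rows for the `*-Vision*` variants). Spelled out as explicit launch flags they would look like this (illustrative only; the same values are applied automatically for supported models):

```bash
# Normally unnecessary: these values are already the models.csv defaults
# for the Mllama vision variants.
vec-inf launch Llama-3.2-11B-Vision-Instruct \
    --max-num-seqs 64 \
    --pipeline-parallelism False \
    --enforce-eager True
```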
184
+ ### [Mistral: Pixtral](https://huggingface.co/mistralai)
185
+
186
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
187
+ |:----------:|:----------:|:----------:|:----------:|
188
+ | [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
189
+
190
+ ## Text Embedding Models
191
+
192
+ ### [Liang Wang: e5](https://huggingface.co/intfloat)
193
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
194
+ |:----------:|:----------:|:----------:|:----------:|
195
+ | [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
196
+
197
+ ## Reward Modeling Models
198
+
199
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
200
+
201
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
202
+ |:----------:|:----------:|:----------:|:----------:|
203
+ | [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -1,46 +1,73 @@
1
- model_name,model_family,model_variant,partition,qos,time,num_gpus,num_nodes,vocab_size,max_model_len,data_type,venv,log_dir,pipeline_parallelism
2
- c4ai-command-r-plus,c4ai-command-r,plus,a40,m2,08:00:00,4,2,256000,8192,auto,singularity,default,false
3
- CodeLlama-7b-hf,CodeLlama,7b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
4
- CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
5
- CodeLlama-13b-hf,CodeLlama,13b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
6
- CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
7
- CodeLlama-34b-hf,CodeLlama,34b-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
8
- CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
9
- CodeLlama-70b-hf,CodeLlama,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
10
- CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
11
- dbrx-instruct,dbrx,instruct,a40,m2,08:00:00,4,2,100352,32000,auto,singularity,default,false
12
- gemma-2-9b,gemma-2,9b,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
13
- gemma-2-9b-it,gemma-2,9b-it,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
14
- gemma-2-27b,gemma-2,27b,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
15
- gemma-2-27b-it,gemma-2,27b-it,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
16
- Llama-2-7b-hf,Llama-2,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
17
- Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
18
- Llama-2-13b-hf,Llama-2,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
19
- Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
20
- Llama-2-70b-hf,Llama-2,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
21
- Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
22
- llava-1.5-7b-hf,llava-1.5,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
23
- llava-1.5-13b-hf,llava-1.5,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
24
- llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,a40,m2,08:00:00,1,1,32064,32768,auto,singularity,default,false
25
- llava-v1.6-34b-hf,llava-v1.6,34b-hf,a40,m2,08:00:00,2,1,64064,4096,auto,singularity,default,false
26
- Meta-Llama-3-8B,Meta-Llama-3,8B,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
27
- Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
28
- Meta-Llama-3-70B,Meta-Llama-3,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
29
- Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
30
- Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
31
- Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
32
- Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
33
- Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
34
- Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,a40,m4,02:00:00,4,8,128256,16384,auto,singularity,default,true
35
- Mistral-7B-v0.1,Mistral,7B-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
36
- Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
37
- Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
38
- Mistral-7B-v0.3,Mistral,7B-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
39
- Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
40
- Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,a40,m2,08:00:00,4,1,32768,131072,auto,singularity,default,false
41
- Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,32768,auto,singularity,default,false
42
- Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
43
- Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
44
- Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
45
- Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
46
- Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
1
+ model_name,model_family,model_variant,model_type,num_gpus,num_nodes,vocab_size,max_model_len,max_num_seqs,pipeline_parallelism,enforce_eager,qos,time,partition,data_type,venv,log_dir,model_weights_parent_dir
2
+ c4ai-command-r-plus,c4ai-command-r,plus,LLM,4,2,256000,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
3
+ c4ai-command-r-plus-08-2024,c4ai-command-r,plus-08-2024,LLM,4,2,256000,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
4
+ c4ai-command-r-08-2024,c4ai-command-r,08-2024,LLM,2,1,256000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
5
+ CodeLlama-7b-hf,CodeLlama,7b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
6
+ CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
7
+ CodeLlama-13b-hf,CodeLlama,13b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
8
+ CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
9
+ CodeLlama-34b-hf,CodeLlama,34b-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
10
+ CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
11
+ CodeLlama-70b-hf,CodeLlama,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
12
+ CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
13
+ dbrx-instruct,dbrx,instruct,LLM,4,2,100352,32000,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
14
+ gemma-2-9b,gemma-2,9b,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
15
+ gemma-2-9b-it,gemma-2,9b-it,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
16
+ gemma-2-27b,gemma-2,27b,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
17
+ gemma-2-27b-it,gemma-2,27b-it,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
18
+ Llama-2-7b-hf,Llama-2,7b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
19
+ Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
20
+ Llama-2-13b-hf,Llama-2,13b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
21
+ Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
22
+ Llama-2-70b-hf,Llama-2,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
23
+ Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
24
+ llava-1.5-7b-hf,llava-1.5,7b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
25
+ llava-1.5-13b-hf,llava-1.5,13b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
26
+ llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,VLM,1,1,32064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
27
+ llava-v1.6-34b-hf,llava-v1.6,34b-hf,VLM,2,1,64064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
28
+ Meta-Llama-3-8B,Meta-Llama-3,8B,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
29
+ Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
30
+ Meta-Llama-3-70B,Meta-Llama-3,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
31
+ Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
32
+ Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
33
+ Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
34
+ Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
35
+ Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
36
+ Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,LLM,4,8,128256,16384,256,true,false,m4,02:00:00,a40,auto,singularity,default,/model-weights
37
+ Mistral-7B-v0.1,Mistral,7B-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
38
+ Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
39
+ Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
40
+ Mistral-7B-v0.3,Mistral,7B-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
41
+ Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
42
+ Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
43
+ Mistral-Large-Instruct-2411,Mistral,Large-Instruct-2411,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
44
+ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,LLM,4,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
45
+ Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
46
+ Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
47
+ Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,LLM,2,1,32064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
48
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,VLM,2,1,32064,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
49
+ Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
50
+ Llama-3.1-Nemotron-70B-Instruct-HF,Llama-3.1-Nemotron,70B-Instruct-HF,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
51
+ Llama-3.2-1B,Llama-3.2,1B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
52
+ Llama-3.2-1B-Instruct,Llama-3.2,1B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
53
+ Llama-3.2-3B,Llama-3.2,3B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
54
+ Llama-3.2-3B-Instruct,Llama-3.2,3B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
55
+ Llama-3.2-11B-Vision,Llama-3.2,11B-Vision,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
56
+ Llama-3.2-11B-Vision-Instruct,Llama-3.2,11B-Vision-Instruct,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
57
+ Llama-3.2-90B-Vision,Llama-3.2,90B-Vision,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
58
+ Llama-3.2-90B-Vision-Instruct,Llama-3.2,90B-Vision-Instruct,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
59
+ Qwen2.5-0.5B-Instruct,Qwen2.5,0.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
60
+ Qwen2.5-1.5B-Instruct,Qwen2.5,1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
61
+ Qwen2.5-3B-Instruct,Qwen2.5,3B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
62
+ Qwen2.5-7B-Instruct,Qwen2.5,7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
63
+ Qwen2.5-14B-Instruct,Qwen2.5,14B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
64
+ Qwen2.5-32B-Instruct,Qwen2.5,32B-Instruct,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
65
+ Qwen2.5-72B-Instruct,Qwen2.5,72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
66
+ Qwen2.5-Math-1.5B-Instruct,Qwen2.5,Math-1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
67
+ Qwen2.5-Math-7B-Instruct,Qwen2.5,Math-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
68
+ Qwen2.5-Math-72B-Instruct,Qwen2.5,Math-72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
69
+ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
70
+ Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
71
+ QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
72
+ Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
73
+ e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
vec_inf/multinode_vllm.slurm CHANGED
@@ -5,13 +5,14 @@
5
5
  #SBATCH --tasks-per-node=1
6
6
 
7
7
  # Load CUDA, change to the cuda version on your environment if different
8
+ source /opt/lmod/lmod/init/profile
8
9
  module load cuda-12.3
9
10
  nvidia-smi
10
11
 
11
12
  source ${SRC_DIR}/find_port.sh
12
13
 
13
14
  if [ "$VENV_BASE" = "singularity" ]; then
14
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
15
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
15
16
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
16
17
  module load singularity-ce/3.8.2
17
18
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -35,7 +36,7 @@ echo "IP Head: $ip_head"
35
36
  echo "Starting HEAD at $head_node"
36
37
  if [ "$VENV_BASE" = "singularity" ]; then
37
38
  srun --nodes=1 --ntasks=1 -w "$head_node" \
38
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
39
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
39
40
  ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
40
41
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
41
42
  else
@@ -56,7 +57,7 @@ for ((i = 1; i <= worker_num; i++)); do
56
57
  echo "Starting WORKER $i at $node_i"
57
58
  if [ "$VENV_BASE" = "singularity" ]; then
58
59
  srun --nodes=1 --ntasks=1 -w "$node_i" \
59
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
60
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
60
61
  ray start --address "$ip_head" \
61
62
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
62
63
  else
@@ -72,9 +73,8 @@ done
72
73
  vllm_port_number=$(find_available_port $head_node_ip 8080 65535)
73
74
 
74
75
  echo "Server address: http://${head_node_ip}:${vllm_port_number}/v1"
75
- echo "http://${head_node_ip}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
76
76
 
77
- if [ "$PIPELINE_PARALLELISM" = "true" ]; then
77
+ if [ "$PIPELINE_PARALLELISM" = "True" ]; then
78
78
  export PIPELINE_PARALLEL_SIZE=$NUM_NODES
79
79
  export TENSOR_PARALLEL_SIZE=$NUM_GPUS
80
80
  else
@@ -82,9 +82,15 @@ else
82
82
  export TENSOR_PARALLEL_SIZE=$((NUM_NODES*NUM_GPUS))
83
83
  fi
84
84
 
85
+ if [ "$ENFORCE_EAGER" = "True" ]; then
86
+ export ENFORCE_EAGER="--enforce-eager"
87
+ else
88
+ export ENFORCE_EAGER=""
89
+ fi
90
+
85
91
  # Activate vllm venv
86
92
  if [ "$VENV_BASE" = "singularity" ]; then
87
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
93
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
88
94
  python3.10 -m vllm.entrypoints.openai.api_server \
89
95
  --model ${VLLM_MODEL_WEIGHTS} \
90
96
  --served-model-name ${JOB_NAME} \
@@ -95,7 +101,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
95
101
  --dtype ${VLLM_DATA_TYPE} \
96
102
  --trust-remote-code \
97
103
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
98
- --max-model-len ${VLLM_MAX_MODEL_LEN}
104
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
105
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
106
+ ${ENFORCE_EAGER}
99
107
  else
100
108
  source ${VENV_BASE}/bin/activate
101
109
  python3 -m vllm.entrypoints.openai.api_server \
@@ -108,5 +116,7 @@ else
108
116
  --dtype ${VLLM_DATA_TYPE} \
109
117
  --trust-remote-code \
110
118
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
111
- --max-model-len ${VLLM_MAX_MODEL_LEN}
119
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
120
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
121
+ ${ENFORCE_EAGER}
112
122
  fi
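
The `PIPELINE_PARALLELISM` branch earlier in this script sizes the two parallelism dimensions directly from the Slurm allocation; for example, for an 8-GPU job split across two nodes:

```bash
# Illustrative sizing for NUM_NODES=2, NUM_GPUS=4 (8 GPUs total):
# PIPELINE_PARALLELISM="True"  -> PIPELINE_PARALLEL_SIZE=2, TENSOR_PARALLEL_SIZE=4
# PIPELINE_PARALLELISM="False" -> TENSOR_PARALLEL_SIZE=8 (pure tensor parallelism across all GPUs)
```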
vec_inf/vllm.slurm CHANGED
@@ -3,6 +3,7 @@
3
3
  #SBATCH --mem=64G
4
4
 
5
5
  # Load CUDA, change to the cuda version on your environment if different
6
+ source /opt/lmod/lmod/init/profile
6
7
  module load cuda-12.3
7
8
  nvidia-smi
8
9
 
@@ -13,15 +14,20 @@ hostname=${SLURMD_NODENAME}
13
14
  vllm_port_number=$(find_available_port $hostname 8080 65535)
14
15
 
15
16
  echo "Server address: http://${hostname}:${vllm_port_number}/v1"
16
- echo "http://${hostname}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
17
+
18
+ if [ "$ENFORCE_EAGER" = "True" ]; then
19
+ export ENFORCE_EAGER="--enforce-eager"
20
+ else
21
+ export ENFORCE_EAGER=""
22
+ fi
17
23
 
18
24
  # Activate vllm venv
19
25
  if [ "$VENV_BASE" = "singularity" ]; then
20
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
26
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
21
27
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
22
28
  module load singularity-ce/3.8.2
23
29
  singularity exec $SINGULARITY_IMAGE ray stop
24
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
30
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
25
31
  python3.10 -m vllm.entrypoints.openai.api_server \
26
32
  --model ${VLLM_MODEL_WEIGHTS} \
27
33
  --served-model-name ${JOB_NAME} \
@@ -31,7 +37,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
31
37
  --dtype ${VLLM_DATA_TYPE} \
32
38
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
33
39
  --trust-remote-code \
34
- --max-model-len ${VLLM_MAX_MODEL_LEN}
40
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
41
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
42
+ ${ENFORCE_EAGER}
35
43
  else
36
44
  source ${VENV_BASE}/bin/activate
37
45
  python3 -m vllm.entrypoints.openai.api_server \
@@ -43,5 +51,7 @@ else
43
51
  --dtype ${VLLM_DATA_TYPE} \
44
52
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
45
53
  --trust-remote-code \
46
- --max-model-len ${VLLM_MAX_MODEL_LEN}
54
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
55
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
56
+ ${ENFORCE_EAGER}
47
57
  fi
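In both Slurm scripts the hard-coded `/model-weights` bind mount is replaced by `MODEL_WEIGHTS_PARENT_DIR`. A minimal sketch of the intent, assuming the launch script exports the variables shown (values hypothetical):

```bash
# Hypothetical values; the launch script is expected to export both variables.
export MODEL_WEIGHTS_PARENT_DIR=/h/user_name/my_weights          # was previously fixed to /model-weights
export VLLM_MODEL_WEIGHTS=${MODEL_WEIGHTS_PARENT_DIR}/Meta-Llama-3.1-8B-Instruct

# Binding the parent directory to the same path inside the container means
# ${VLLM_MODEL_WEIGHTS} resolves identically on the host and inside singularity.
singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
    ls ${VLLM_MODEL_WEIGHTS}
```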
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Vector Institute
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vec-inf
3
- Version: 0.3.3
3
+ Version: 0.4.0.post1
4
4
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
5
  License: MIT
6
6
  Author: Marshall Wang
@@ -11,19 +11,21 @@ Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
14
15
  Provides-Extra: dev
15
16
  Requires-Dist: click (>=8.1.0,<9.0.0)
16
17
  Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
17
- Requires-Dist: pandas (>=2.2.2,<3.0.0)
18
+ Requires-Dist: numpy (>=1.24.0,<2.0.0)
19
+ Requires-Dist: polars (>=1.15.0,<2.0.0)
18
20
  Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
19
21
  Requires-Dist: requests (>=2.31.0,<3.0.0)
20
22
  Requires-Dist: rich (>=13.7.0,<14.0.0)
21
- Requires-Dist: vllm (>=0.5.0,<0.6.0) ; extra == "dev"
23
+ Requires-Dist: vllm (>=0.6.0,<0.7.0) ; extra == "dev"
22
24
  Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
23
25
  Description-Content-Type: text/markdown
24
26
 
25
27
  # Vector Inference: Easy inference on Slurm clusters
26
- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
28
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/models/models.csv) accordingly.
27
29
 
28
30
  ## Installation
29
31
  If you are using the Vector cluster environment and don't need any customization to the inference server environment, run the following to install the package:
@@ -33,16 +35,23 @@ pip install vec-inf
33
35
  Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package.
34
36
 
35
37
  ## Launch an inference server
38
+ ### `launch` command
36
39
  We will use the Llama 3.1 model as an example. To launch an OpenAI-compatible inference server for Meta-Llama-3.1-8B-Instruct, run:
37
40
  ```bash
38
41
  vec-inf launch Meta-Llama-3.1-8B-Instruct
39
42
  ```
40
43
  You should see an output like the following:
41
44
 
42
- <img width="400" alt="launch_img" src="https://github.com/user-attachments/assets/557eb421-47db-4810-bccd-c49c526b1b43">
45
+ <img width="700" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
43
46
 
44
- The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
47
+ The model will be launched using the [default parameters](vec_inf/models/models.csv); you can override these values by providing additional parameters (use `--help` to see the full list). You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
48
+ * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`.
49
+ * Your model weights directory should contain HF format weights.
50
+ * The following launch parameters fall back to their default values if not specified: `--max-num-seqs`, `--partition`, `--data-type`, `--venv`, `--log-dir`, `--model-weights-parent-dir`, `--pipeline-parallelism`, `--enforce-eager`. All other launch parameters need to be specified for custom models.
51
+ * Example for setting the model weights parent directory: `--model-weights-parent-dir /h/user_name/my_weights`.
52
+ * For other model launch parameters, you can reference the default values of similar models using the [`list` command](#list-command); a hypothetical custom launch is sketched below.
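A minimal sketch of a custom-model launch. The model name, paths, resource values, QoS, and time limit are all placeholders, and the exact set of required parameters depends on your model:

```bash
# Hypothetical custom model: weights stored in /h/user_name/my_weights/MyFamily-8b
# (the directory name follows the $MODEL_FAMILY-$MODEL_VARIANT convention).
# Resource values, QoS, and time limit are placeholders for your cluster.
vec-inf launch MyFamily-8b \
    --model-weights-parent-dir /h/user_name/my_weights \
    --max-model-len 8192 \
    --vocab-size 32000 \
    --num-nodes 1 \
    --qos normal \
    --time 08:00:00
```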
45
53
 
54
+ ### `status` command
46
55
  You can check the inference server status by providing the Slurm job ID to the `status` command:
47
56
  ```bash
48
57
  vec-inf status 13014393
@@ -62,6 +71,17 @@ There are 5 possible states:
62
71
 
63
72
  Note that the base URL is only available when the model is in the `READY` state; if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.
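For illustration, once the model is `READY` the server behaves like any OpenAI-compatible endpoint; the host and port below are placeholders for the base URL reported by `status`:

```bash
# Placeholder base URL; copy the one reported by `vec-inf status`.
export BASE_URL=http://gpu001:8080/v1

curl ${BASE_URL}/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "Meta-Llama-3.1-8B-Instruct", "prompt": "The capital of France is", "max_tokens": 10}'
```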
64
73
 
74
+ ### `metrics` command
75
+ Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
76
+ ```bash
77
+ vec-inf metrics 13014393
78
+ ```
79
+
80
+ The performance metrics are then streamed to your console; note that the metrics are updated at a 10-second interval.
81
+
82
+ <img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/e5ff2cd5-659b-4c88-8ebc-d8f3fdc023a4">
83
+
84
+ ### `shutdown` command
65
85
  Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
66
86
  ```bash
67
87
  vec-inf shutdown 13014393
@@ -69,17 +89,19 @@ vec-inf shutdown 13014393
69
89
  > Shutting down model with Slurm Job ID: 13014393
70
90
  ```
71
91
 
92
+ ### `list` command
72
93
  You can view the full list of available models by running the `list` command:
73
94
  ```bash
74
95
  vec-inf list
75
96
  ```
76
- <img width="1200" alt="list_img" src="https://github.com/user-attachments/assets/a4f0d896-989d-43bf-82a2-6a6e5d0d288f">
97
+ <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
98
+
77
99
 
78
100
  You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
79
101
  ```bash
80
102
  vec-inf list Meta-Llama-3.1-70B-Instruct
81
103
  ```
82
- <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/5dec7a33-ba6b-490d-af47-4cf7341d0b42">
104
+ <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/30e42ab7-dde2-4d20-85f0-187adffefc3d">
83
105
 
84
106
  The `launch`, `list`, and `status` commands support `--json-mode`, where the command output is structured as a JSON string.
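For example, a sketch of scripted use; the JSON fields shown are illustrative, not a guaranteed schema:

```bash
# --json-mode prints a JSON string instead of the rich table/panel output.
vec-inf status 13014393 --json-mode
# e.g. {"model_name": "Meta-Llama-3.1-8B-Instruct", "model_status": "READY", ...}
```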
85
107
 
@@ -0,0 +1,16 @@
1
+ vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
5
+ vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
6
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
+ vec_inf/launch_server.sh,sha256=gFovqXuYiQ8bEc6O31WTMDuBoNj7opB5iVfnCDhz2Nw,4165
8
+ vec_inf/models/README.md,sha256=YNEVTWliHehCpJTq2SXAidqgFl6CWL6GUOnAPksDYFE,14844
9
+ vec_inf/models/models.csv,sha256=f_cNeM7L0-4pgZqYfWilQd12-WVec2IVk6dRq5BE4mE,9875
10
+ vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
11
+ vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
12
+ vec_inf-0.4.0.post1.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
13
+ vec_inf-0.4.0.post1.dist-info/METADATA,sha256=Q6KhU-ggnR9FB5YUjWrPwy2MSd_c9GCFXAQqT9YXZOw,7032
14
+ vec_inf-0.4.0.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
15
+ vec_inf-0.4.0.post1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
16
+ vec_inf-0.4.0.post1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,15 +0,0 @@
1
- vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
2
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
5
- vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
6
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
- vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
8
- vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
9
- vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
10
- vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
11
- vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
12
- vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
13
- vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
14
- vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
15
- vec_inf-0.3.3.dist-info/RECORD,,