vec-inf 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ """vec_inf package."""
vec_inf/cli/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ """vec_inf cli package."""
vec_inf/cli/_cli.py CHANGED
@@ -1,9 +1,10 @@
1
+ """Command line interface for Vector Inference."""
2
+
1
3
  import os
2
4
  import time
3
- from typing import Optional, cast
5
+ from typing import Any, Dict, Optional
4
6
 
5
7
  import click
6
-
7
8
  import polars as pl
8
9
  from rich.columns import Columns
9
10
  from rich.console import Console
@@ -12,12 +13,13 @@ from rich.panel import Panel
12
13
 
13
14
  import vec_inf.cli._utils as utils
14
15
 
16
+
15
17
  CONSOLE = Console()
16
18
 
17
19
 
18
20
  @click.group()
19
- def cli():
20
- """Vector Inference CLI"""
21
+ def cli() -> None:
22
+ """Vector Inference CLI."""
21
23
  pass
22
24
 
23
25
 
@@ -122,10 +124,7 @@ def launch(
122
124
  enforce_eager: Optional[str] = None,
123
125
  json_mode: bool = False,
124
126
  ) -> None:
125
- """
126
- Launch a model on the cluster
127
- """
128
-
127
+ """Launch a model on the cluster."""
129
128
  if isinstance(pipeline_parallelism, str):
130
129
  pipeline_parallelism = (
131
130
  "True" if pipeline_parallelism.lower() == "true" else "False"
@@ -138,6 +137,13 @@ def launch(
138
137
 
139
138
  models_df = utils.load_models_df()
140
139
 
140
+ models_df = models_df.with_columns(
141
+ pl.col("model_type").replace("Reward Modeling", "Reward_Modeling")
142
+ )
143
+ models_df = models_df.with_columns(
144
+ pl.col("model_type").replace("Text Embedding", "Text_Embedding")
145
+ )
146
+
141
147
  if model_name in models_df["model_name"].to_list():
142
148
  default_args = utils.load_default_args(models_df, model_name)
143
149
  for arg in default_args:
@@ -148,7 +154,6 @@ def launch(
148
154
  else:
149
155
  model_args = models_df.columns
150
156
  model_args.remove("model_name")
151
- model_args.remove("model_type")
152
157
  for arg in model_args:
153
158
  if locals()[arg] is not None:
154
159
  renamed_arg = arg.replace("_", "-")
@@ -189,79 +194,130 @@ def launch(
189
194
  def status(
190
195
  slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
191
196
  ) -> None:
192
- """
193
- Get the status of a running model on the cluster
194
- """
197
+ """Get the status of a running model on the cluster."""
195
198
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
196
199
  output = utils.run_bash_command(status_cmd)
197
200
 
198
- slurm_job_name = "UNAVAILABLE"
199
- status = "SHUTDOWN"
200
- base_url = "UNAVAILABLE"
201
+ base_data = _get_base_status_data(output)
202
+ status_info = _process_job_state(output, base_data, slurm_job_id, log_dir)
203
+ _display_status(status_info, json_mode)
201
204
 
205
+
206
+ def _get_base_status_data(output: str) -> Dict[str, Any]:
207
+ """Extract basic job status information from scontrol output."""
202
208
  try:
203
- slurm_job_name = output.split(" ")[1].split("=")[1]
204
- slurm_job_state = output.split(" ")[9].split("=")[1]
209
+ job_name = output.split(" ")[1].split("=")[1]
210
+ job_state = output.split(" ")[9].split("=")[1]
205
211
  except IndexError:
206
- # Job ID not found
207
- slurm_job_state = "UNAVAILABLE"
208
-
209
- # If Slurm job is currently PENDING
210
- if slurm_job_state == "PENDING":
211
- slurm_job_pending_reason = output.split(" ")[10].split("=")[1]
212
- status = "PENDING"
213
- # If Slurm job is currently RUNNING
214
- elif slurm_job_state == "RUNNING":
215
- # Check whether the server is ready, if yes, run model health check to further determine status
216
- server_status = utils.is_server_running(slurm_job_name, slurm_job_id, log_dir)
217
- # If server status is a tuple, then server status is "FAILED"
218
- if isinstance(server_status, tuple):
219
- status = server_status[0]
220
- slurm_job_failed_reason = server_status[1]
221
- elif server_status == "RUNNING":
222
- model_status = utils.model_health_check(
223
- slurm_job_name, slurm_job_id, log_dir
224
- )
225
- if model_status == "READY":
226
- # Only set base_url if model is ready to serve requests
227
- base_url = utils.get_base_url(slurm_job_name, slurm_job_id, log_dir)
228
- status = "READY"
229
- else:
230
- # If model is not ready, then status must be "FAILED"
231
- status = model_status[0]
232
- slurm_job_failed_reason = str(model_status[1])
233
- else:
234
- status = server_status
212
+ job_name = "UNAVAILABLE"
213
+ job_state = "UNAVAILABLE"
214
+
215
+ return {
216
+ "model_name": job_name,
217
+ "status": "SHUTDOWN",
218
+ "base_url": "UNAVAILABLE",
219
+ "state": job_state,
220
+ "pending_reason": None,
221
+ "failed_reason": None,
222
+ }
223
+
224
+
225
+ def _process_job_state(
226
+ output: str, status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
227
+ ) -> Dict[str, Any]:
228
+ """Process different job states and update status information."""
229
+ if status_info["state"] == "PENDING":
230
+ _process_pending_state(output, status_info)
231
+ elif status_info["state"] == "RUNNING":
232
+ _handle_running_state(status_info, slurm_job_id, log_dir)
233
+ return status_info
234
+
235
+
236
+ def _process_pending_state(output: str, status_info: Dict[str, Any]) -> None:
237
+ """Handle PENDING job state."""
238
+ try:
239
+ status_info["pending_reason"] = output.split(" ")[10].split("=")[1]
240
+ status_info["status"] = "PENDING"
241
+ except IndexError:
242
+ status_info["pending_reason"] = "Unknown pending reason"
243
+
244
+
245
+ def _handle_running_state(
246
+ status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
247
+ ) -> None:
248
+ """Handle RUNNING job state and check server status."""
249
+ server_status = utils.is_server_running(
250
+ status_info["model_name"], slurm_job_id, log_dir
251
+ )
252
+
253
+ if isinstance(server_status, tuple):
254
+ status_info["status"], status_info["failed_reason"] = server_status
255
+ return
256
+
257
+ if server_status == "RUNNING":
258
+ _check_model_health(status_info, slurm_job_id, log_dir)
259
+ else:
260
+ status_info["status"] = server_status
261
+
235
262
 
263
+ def _check_model_health(
264
+ status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
265
+ ) -> None:
266
+ """Check model health and update status accordingly."""
267
+ model_status = utils.model_health_check(
268
+ status_info["model_name"], slurm_job_id, log_dir
269
+ )
270
+ status, failed_reason = model_status
271
+ if status == "READY":
272
+ status_info["base_url"] = utils.get_base_url(
273
+ status_info["model_name"], slurm_job_id, log_dir
274
+ )
275
+ status_info["status"] = status
276
+ else:
277
+ status_info["status"], status_info["failed_reason"] = status, failed_reason
278
+
279
+
280
+ def _display_status(status_info: Dict[str, Any], json_mode: bool) -> None:
281
+ """Display the status information in appropriate format."""
236
282
  if json_mode:
237
- status_dict = {
238
- "model_name": slurm_job_name,
239
- "model_status": status,
240
- "base_url": base_url,
241
- }
242
- if "slurm_job_pending_reason" in locals():
243
- status_dict["pending_reason"] = slurm_job_pending_reason
244
- if "slurm_job_failed_reason" in locals():
245
- status_dict["failed_reason"] = slurm_job_failed_reason
246
- click.echo(f"{status_dict}")
283
+ _output_json(status_info)
247
284
  else:
248
- table = utils.create_table(key_title="Job Status", value_title="Value")
249
- table.add_row("Model Name", slurm_job_name)
250
- table.add_row("Model Status", status, style="blue")
251
- if "slurm_job_pending_reason" in locals():
252
- table.add_row("Reason", slurm_job_pending_reason)
253
- if "slurm_job_failed_reason" in locals():
254
- table.add_row("Reason", slurm_job_failed_reason)
255
- table.add_row("Base URL", base_url)
256
- CONSOLE.print(table)
285
+ _output_table(status_info)
286
+
287
+
288
+ def _output_json(status_info: Dict[str, Any]) -> None:
289
+ """Format and output JSON data."""
290
+ json_data = {
291
+ "model_name": status_info["model_name"],
292
+ "model_status": status_info["status"],
293
+ "base_url": status_info["base_url"],
294
+ }
295
+ if status_info["pending_reason"]:
296
+ json_data["pending_reason"] = status_info["pending_reason"]
297
+ if status_info["failed_reason"]:
298
+ json_data["failed_reason"] = status_info["failed_reason"]
299
+ click.echo(json_data)
300
+
301
+
302
+ def _output_table(status_info: Dict[str, Any]) -> None:
303
+ """Create and display rich table."""
304
+ table = utils.create_table(key_title="Job Status", value_title="Value")
305
+ table.add_row("Model Name", status_info["model_name"])
306
+ table.add_row("Model Status", status_info["status"], style="blue")
307
+
308
+ if status_info["pending_reason"]:
309
+ table.add_row("Pending Reason", status_info["pending_reason"])
310
+ if status_info["failed_reason"]:
311
+ table.add_row("Failed Reason", status_info["failed_reason"])
312
+
313
+ table.add_row("Base URL", status_info["base_url"])
314
+ CONSOLE.print(table)
257
315
 
258
316
 
259
317
  @cli.command("shutdown")
260
318
  @click.argument("slurm_job_id", type=int, nargs=1)
261
319
  def shutdown(slurm_job_id: int) -> None:
262
- """
263
- Shutdown a running model on the cluster
264
- """
320
+ """Shutdown a running model on the cluster."""
265
321
  shutdown_cmd = f"scancel {slurm_job_id}"
266
322
  utils.run_bash_command(shutdown_cmd)
267
323
  click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
@@ -275,11 +331,9 @@ def shutdown(slurm_job_id: int) -> None:
275
331
  help="Output in JSON string",
276
332
  )
277
333
  def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
278
- """
279
- List all available models, or get default setup of a specific model
280
- """
334
+ """List all available models, or get default setup of a specific model."""
281
335
 
282
- def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
336
+ def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool) -> None:
283
337
  if model_name not in models_df["model_name"].to_list():
284
338
  raise ValueError(f"Model name {model_name} not found in available models")
285
339
 
@@ -297,7 +351,7 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
297
351
  table.add_row(key, str(value))
298
352
  CONSOLE.print(table)
299
353
 
300
- def list_all(models_df: pl.DataFrame, json_mode: bool):
354
+ def list_all(models_df: pl.DataFrame, json_mode: bool) -> None:
301
355
  if json_mode:
302
356
  click.echo(models_df["model_name"].to_list())
303
357
  return
@@ -327,9 +381,12 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
327
381
 
328
382
  for row in models_df.to_dicts():
329
383
  panel_color = model_type_colors.get(row["model_type"], "white")
330
- styled_text = (
331
- f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
332
- )
384
+ if row["model_variant"] == "None":
385
+ styled_text = f"[magenta]{row['model_family']}[/magenta]"
386
+ else:
387
+ styled_text = (
388
+ f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
389
+ )
333
390
  panels.append(Panel(styled_text, expand=True, border_style=panel_color))
334
391
  CONSOLE.print(Columns(panels, equal=True))
335
392
 
@@ -349,9 +406,7 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
349
406
  help="Path to slurm log directory. This is required if --log-dir was set in model launch",
350
407
  )
351
408
  def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
352
- """
353
- Stream performance metrics to the console
354
- """
409
+ """Stream performance metrics to the console."""
355
410
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
356
411
  output = utils.run_bash_command(status_cmd)
357
412
  slurm_job_name = output.split(" ")[1].split("=")[1]
@@ -365,13 +420,11 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
365
420
  if isinstance(out_logs, str):
366
421
  live.update(out_logs)
367
422
  break
368
- out_logs = cast(list, out_logs)
369
423
  latest_metrics = utils.get_latest_metric(out_logs)
370
424
  # if latest_metrics is a string, then it is an error message
371
425
  if isinstance(latest_metrics, str):
372
426
  live.update(latest_metrics)
373
427
  break
374
- latest_metrics = cast(dict, latest_metrics)
375
428
  table = utils.create_table(key_title="Metric", value_title="Value")
376
429
  for key, value in latest_metrics.items():
377
430
  table.add_row(key, value)
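For readers skimming the refactor above: the `status` command now builds a single dictionary in `_get_base_status_data` and threads it through `_process_job_state` and `_display_status` instead of juggling loose locals. A minimal sketch of that dictionary (the keys come from the diff above; the sample values are illustrative only):

```python
# Illustrative shape of the status_info dict shared by the new helpers.
# Only the keys are taken from _get_base_status_data; the values are made up.
status_info = {
    "model_name": "Meta-Llama-3.1-8B-Instruct",  # Slurm job name parsed from scontrol
    "status": "SHUTDOWN",       # later overwritten with PENDING / READY / FAILED / ...
    "base_url": "UNAVAILABLE",  # set only once the model reports READY
    "state": "PENDING",         # raw Slurm JobState
    "pending_reason": None,     # filled in by _process_pending_state
    "failed_reason": None,      # filled in when the server or health check fails
}
```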
vec_inf/cli/_utils.py CHANGED
@@ -1,19 +1,20 @@
1
+ """Utility functions for the CLI."""
2
+
1
3
  import os
2
4
  import subprocess
3
- from typing import Optional, Union, cast
5
+ from typing import Dict, List, Optional, Tuple, Union, cast
4
6
 
5
7
  import polars as pl
6
8
  import requests
7
9
  from rich.table import Table
8
10
 
11
+
9
12
  MODEL_READY_SIGNATURE = "INFO: Application startup complete."
10
13
  SERVER_ADDRESS_SIGNATURE = "Server address: "
11
14
 
12
15
 
13
16
  def run_bash_command(command: str) -> str:
14
- """
15
- Run a bash command and return the output
16
- """
17
+ """Run a bash command and return the output."""
17
18
  process = subprocess.Popen(
18
19
  command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
19
20
  )
@@ -24,15 +25,13 @@ def run_bash_command(command: str) -> str:
24
25
  def read_slurm_log(
25
26
  slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
26
27
  ) -> Union[list[str], str]:
27
- """
28
- Read the slurm log file
29
- """
28
+ """Read the slurm log file."""
30
29
  if not log_dir:
31
30
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
32
31
 
33
- for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
34
- if dir in slurm_job_name:
35
- log_dir = os.path.join(models_dir, dir)
32
+ for directory in sorted(os.listdir(models_dir), key=len, reverse=True):
33
+ if directory in slurm_job_name:
34
+ log_dir = os.path.join(models_dir, directory)
36
35
  break
37
36
 
38
37
  log_dir = cast(str, log_dir)
@@ -53,9 +52,7 @@ def read_slurm_log(
53
52
  def is_server_running(
54
53
  slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
55
54
  ) -> Union[str, tuple[str, str]]:
56
- """
57
- Check if a model is ready to serve requests
58
- """
55
+ """Check if a model is ready to serve requests."""
59
56
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
60
57
  if isinstance(log_content, str):
61
58
  return log_content
@@ -72,9 +69,7 @@ def is_server_running(
72
69
 
73
70
 
74
71
  def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
75
- """
76
- Get the base URL of a model
77
- """
72
+ """Get the base URL of a model."""
78
73
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
79
74
  if isinstance(log_content, str):
80
75
  return log_content
@@ -87,10 +82,8 @@ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str])
87
82
 
88
83
  def model_health_check(
89
84
  slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
90
- ) -> Union[str, tuple[str, Union[str, int]]]:
91
- """
92
- Check the health of a running model on the cluster
93
- """
85
+ ) -> Tuple[str, Union[str, int]]:
86
+ """Check the health of a running model on the cluster."""
94
87
  base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
95
88
  if not base_url.startswith("http"):
96
89
  return ("FAILED", base_url)
@@ -100,9 +93,8 @@ def model_health_check(
100
93
  response = requests.get(health_check_url)
101
94
  # Check if the request was successful
102
95
  if response.status_code == 200:
103
- return "READY"
104
- else:
105
- return ("FAILED", response.status_code)
96
+ return ("READY", response.status_code)
97
+ return ("FAILED", response.status_code)
106
98
  except requests.exceptions.RequestException as e:
107
99
  return ("FAILED", str(e))
108
100
 
@@ -110,9 +102,7 @@ def model_health_check(
110
102
  def create_table(
111
103
  key_title: str = "", value_title: str = "", show_header: bool = True
112
104
  ) -> Table:
113
- """
114
- Create a table for displaying model status
115
- """
105
+ """Create a table for displaying model status."""
116
106
  table = Table(show_header=show_header, header_style="bold magenta")
117
107
  table.add_column(key_title, style="dim")
118
108
  table.add_column(value_title)
@@ -120,30 +110,24 @@ def create_table(
120
110
 
121
111
 
122
112
  def load_models_df() -> pl.DataFrame:
123
- """
124
- Load the models dataframe
125
- """
126
- models_df = pl.read_csv(
113
+ """Load the models dataframe."""
114
+ return pl.read_csv(
127
115
  os.path.join(
128
116
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
129
117
  "models/models.csv",
130
118
  )
131
119
  )
132
- return models_df
133
120
 
134
121
 
135
- def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
136
- """
137
- Load the default arguments for a model
138
- """
122
+ def load_default_args(models_df: pl.DataFrame, model_name: str) -> Dict[str, str]:
123
+ """Load the default arguments for a model."""
139
124
  row_data = models_df.filter(models_df["model_name"] == model_name)
140
125
  default_args = row_data.to_dicts()[0]
141
126
  default_args.pop("model_name", None)
142
- default_args.pop("model_type", None)
143
127
  return default_args
144
128
 
145
129
 
146
- def get_latest_metric(log_lines: list[str]) -> dict | str:
130
+ def get_latest_metric(log_lines: List[str]) -> Union[str, Dict[str, str]]:
147
131
  """Read the latest metric entry from the log file."""
148
132
  latest_metric = {}
149
133
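One behavioural change worth noting from the `_utils.py` hunks above: `model_health_check` now always returns a `(status, detail)` tuple, including on success, so callers no longer need the old string-vs-tuple check. A minimal usage sketch (the job name and Slurm job ID are placeholders, and this assumes the package and a matching log directory are available):

```python
# Minimal sketch: unpack the (status, detail) tuple now returned by model_health_check.
from vec_inf.cli._utils import model_health_check

status, detail = model_health_check("Meta-Llama-3.1-8B-Instruct", 12345, None)
if status == "READY":
    print(f"Model healthy (HTTP {detail})")   # detail is the HTTP status code, e.g. 200
else:
    print(f"Health check failed: {detail}")   # detail is a status code or an error string
```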
 
vec_inf/launch_server.sh CHANGED
@@ -6,6 +6,7 @@ while [[ "$#" -gt 0 ]]; do
6
6
  case $1 in
7
7
  --model-family) model_family="$2"; shift ;;
8
8
  --model-variant) model_variant="$2"; shift ;;
9
+ --model-type) model_type="$2"; shift ;;
9
10
  --partition) partition="$2"; shift ;;
10
11
  --qos) qos="$2"; shift ;;
11
12
  --time) walltime="$2"; shift ;;
@@ -25,7 +26,7 @@ while [[ "$#" -gt 0 ]]; do
25
26
  shift
26
27
  done
27
28
 
28
- required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
29
+ required_vars=(model_family model_variant model_type partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
29
30
 
30
31
  for var in "${required_vars[@]}"; do
31
32
  if [ -z "${!var}" ]; then
@@ -36,6 +37,7 @@ done
36
37
 
37
38
  export MODEL_FAMILY=$model_family
38
39
  export MODEL_VARIANT=$model_variant
40
+ export MODEL_TYPE=$model_type
39
41
  export JOB_PARTITION=$partition
40
42
  export QOS=$qos
41
43
  export WALLTIME=$walltime
@@ -48,9 +50,20 @@ export VENV_BASE=$venv
48
50
  export LOG_DIR=$log_dir
49
51
  export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
50
52
 
53
+ if [[ "$model_type" == "LLM" || "$model_type" == "VLM" ]]; then
54
+ export VLLM_TASK="generate"
55
+ elif [ "$model_type" == "Reward_Modeling" ]; then
56
+ export VLLM_TASK="reward"
57
+ elif [ "$model_type" == "Text_Embedding" ]; then
58
+ export VLLM_TASK="embed"
59
+ else
60
+ echo "Error: Unknown model_type: $model_type"
61
+ exit 1
62
+ fi
63
+
51
64
  if [ -n "$max_num_seqs" ]; then
52
65
  export VLLM_MAX_NUM_SEQS=$max_num_seqs
53
- else
66
+ else
54
67
  export VLLM_MAX_NUM_SEQS=256
55
68
  fi
56
69
 
@@ -69,13 +82,17 @@ fi
69
82
  # ================================= Set default environment variables ======================================
70
83
  # Slurm job configuration
71
84
  export JOB_NAME="$MODEL_FAMILY-$MODEL_VARIANT"
85
+ if [ "$JOB_NAME" == "DeepSeek-R1-None" ]; then
86
+ export JOB_NAME=$MODEL_FAMILY
87
+ fi
88
+
72
89
  if [ "$LOG_DIR" = "default" ]; then
73
90
  export LOG_DIR="$HOME/.vec-inf-logs/$MODEL_FAMILY"
74
91
  fi
75
92
  mkdir -p $LOG_DIR
76
93
 
77
94
  # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
78
- # SLURM job
95
+ # SLURM job
79
96
  export SRC_DIR="$(dirname "$0")"
80
97
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
81
98
 
@@ -101,6 +118,8 @@ echo Num Nodes: $NUM_NODES
101
118
  echo GPUs per Node: $NUM_GPUS
102
119
  echo QOS: $QOS
103
120
  echo Walltime: $WALLTIME
121
+ echo Model Type: $MODEL_TYPE
122
+ echo Task: $VLLM_TASK
104
123
  echo Data Type: $VLLM_DATA_TYPE
105
124
  echo Max Model Length: $VLLM_MAX_MODEL_LEN
106
125
  echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
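To recap the new plumbing in this script: the CLI now forwards `--model-type`, and the block above maps it to the vLLM task that the Slurm scripts pass on via `--task`. A Python restatement of that mapping, for reference only (the dictionary names are mine; the script itself does this in bash):

```python
# Reference sketch of the model_type -> vLLM task mapping added to launch_server.sh,
# together with the label normalization the CLI applies to values read from models.csv.
CSV_LABEL_TO_MODEL_TYPE = {
    "Reward Modeling": "Reward_Modeling",
    "Text Embedding": "Text_Embedding",
}
VLLM_TASK_BY_MODEL_TYPE = {
    "LLM": "generate",
    "VLM": "generate",
    "Reward_Modeling": "reward",
    "Text_Embedding": "embed",
}

csv_label = "Text Embedding"  # as stored in models.csv
model_type = CSV_LABEL_TO_MODEL_TYPE.get(csv_label, csv_label)
print(VLLM_TASK_BY_MODEL_TYPE[model_type])  # -> embed
```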
vec_inf/models/README.md CHANGED
@@ -1,13 +1,17 @@
1
1
  # Available Models
2
2
  More profiling metrics coming soon!
3
3
 
4
- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
4
+ ## Text Generation Models
5
+
6
+ ### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
5
7
 
6
8
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
7
9
  |:----------:|:----------:|:----------:|:----------:|
8
- |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
10
+ | [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
11
+ | [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
12
+ | [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
9
13
 
10
- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
14
+ ### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
11
15
 
12
16
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
13
17
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
20
24
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
21
25
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
22
26
 
23
- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
27
+ ### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
24
28
 
25
29
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
26
30
  |:----------:|:----------:|:----------:|:----------:|
27
- |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
31
+ | [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
28
32
 
29
- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
33
+ ### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
30
34
 
31
35
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
32
36
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
35
39
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
36
40
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
37
41
 
38
- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
39
-
40
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
41
- |:----------:|:----------:|:----------:|:----------:|
42
- |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
43
- |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
44
-
45
- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
46
-
47
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
48
- |:----------:|:----------:|:----------:|:----------:|
49
- |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
50
- |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
51
-
52
- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
42
+ ### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
53
43
 
54
44
  | Variant | Suggested resource allocation |
55
45
  |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
60
50
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
61
51
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
62
52
 
63
- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
53
+ ### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
64
54
 
65
55
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
66
56
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
69
59
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
70
60
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
71
61
 
72
- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
62
+ ### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
73
63
 
74
64
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
65
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,159 @@ More profiling metrics coming soon!
79
69
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
80
70
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
81
71
 
82
- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
72
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
73
+
74
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
+ |:----------:|:----------:|:----------:|:----------:|
76
+ | [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
77
+ | [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
78
+ | [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
79
+ | [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
80
+
81
+ ### [Mistral AI: Mistral](https://huggingface.co/mistralai)
83
82
 
84
83
  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
85
84
  |:----------:|:----------:|:----------:|:----------:|
86
- |[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
87
- |[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
88
- |[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
89
- |[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
90
- |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
91
- |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
85
+ | [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
86
+ | [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
87
+ | [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 1x a40 | - tokens/s | - tokens/s|
88
+ | [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
89
+ | [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
90
+ | [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
91
+ | [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
92
92
 
93
- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
93
+ ### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
94
94
 
95
95
  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
96
96
  |:----------:|:----------:|:----------:|:----------:|
97
- |[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
98
- |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
- |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
97
+ | [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
98
+ | [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
+ | [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
100
100
 
101
- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
101
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
102
102
 
103
103
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
104
104
  |:----------:|:----------:|:----------:|:----------:|
105
105
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
106
+
107
+ ### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
108
+
109
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
110
+ |:----------:|:----------:|:----------:|:----------:|
111
+ | [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
112
+
113
+ ### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
114
+
115
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
116
+ |:----------:|:----------:|:----------:|:----------:|
117
+ | [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
118
+
119
+ ### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
120
+
121
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
122
+ |:----------:|:----------:|:----------:|:----------:|
123
+ | [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
124
+ | [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
125
+ | [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
126
+ | [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
127
+ | [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
128
+ | [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
129
+ | [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
130
+
131
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
132
+
133
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
134
+ |:----------:|:----------:|:----------:|:----------:|
135
+ | [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
136
+ | [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
137
+ | [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
138
+
139
+ ### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
140
+
141
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
142
+ |:----------:|:----------:|:----------:|:----------:|
143
+ | [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
144
+
145
+ ### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
146
+
147
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
148
+ |:----------:|:----------:|:----------:|:----------:|
149
+ | [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
150
+
151
+ ### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
152
+
153
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
154
+ |:----------:|:----------:|:----------:|:----------:|
155
+ | [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
156
+ | [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
157
+ | [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
158
+ | [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
159
+ | [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
160
+ | [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
161
+
162
+
163
+ ## Vision Language Models
164
+
165
+ ### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
166
+
167
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
168
+ |:----------:|:----------:|:----------:|:----------:|
169
+ | [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
170
+ | [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
171
+
172
+ ### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
173
+
174
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
175
+ |:----------:|:----------:|:----------:|:----------:|
176
+ | [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
177
+ | [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
178
+
179
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
180
+
181
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
182
+ |:----------:|:----------:|:----------:|:----------:|
106
183
  | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
184
+
185
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
186
+
187
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
188
+ |:----------:|:----------:|:----------:|:----------:|
189
+ | [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
190
+ | [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
191
+ | [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
192
+ | [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
193
+
194
+ **NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is on.
195
+
196
+ ### [Mistral: Pixtral](https://huggingface.co/mistralai)
197
+
198
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
199
+ |:----------:|:----------:|:----------:|:----------:|
200
+ | [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
201
+
202
+ ## Text Embedding Models
203
+
204
+ ### [Liang Wang: e5](https://huggingface.co/intfloat)
205
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
206
+ |:----------:|:----------:|:----------:|:----------:|
207
+ | [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
208
+
209
+ ### [BAAI: bge](https://huggingface.co/BAAI)
210
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
211
+ |:----------:|:----------:|:----------:|:----------:|
212
+ | [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
213
+
214
+ ### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
215
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
216
+ |:----------:|:----------:|:----------:|:----------:|
217
+ | [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
218
+
219
+
220
+
221
+ ## Reward Modeling Models
222
+
223
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
224
+
225
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
226
+ |:----------:|:----------:|:----------:|:----------:|
227
+ | [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -70,5 +70,16 @@ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,tru
70
70
  Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
71
71
  QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
72
72
  Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
73
- bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
74
73
  e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
74
+ bge-base-en-v1.5,bge,base-en-v1.5,Text Embedding,1,1,30522,512,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
75
+ all-MiniLM-L6-v2,all-MiniLM,L6-v2,Text Embedding,1,1,30522,512,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
76
+ Llama-3.3-70B-Instruct,Llama-3.3,70B-Instruct,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
77
+ InternVL2_5-26B,InternVL2_5,26B,VLM,2,1,92553,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
78
+ InternVL2_5-38B,InternVL2_5,38B,VLM,4,1,92553,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
79
+ Aya-Expanse-32B,Aya-Expanse,32B,LLM,2,1,256000,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
80
+ DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,Distill-Llama-70B,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
81
+ DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,Distill-Llama-8B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
82
+ DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,Distill-Qwen-32B,LLM,4,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
83
+ DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,Distill-Qwen-14B,LLM,2,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
84
+ DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,Distill-Qwen-7B,LLM,1,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
85
+ DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,Distill-Qwen-1.5B,LLM,1,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
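The rows added above can be inspected with the helpers from `vec_inf/cli/_utils.py` shown earlier in this diff. A minimal sketch (assumes vec-inf 0.4.1 is installed so the bundled `models/models.csv` resolves; the model name is one of the new rows):

```python
# Minimal sketch: load the bundled models.csv and inspect the newly added rows.
from vec_inf.cli._utils import load_default_args, load_models_df

models_df = load_models_df()

# Names of the text embedding models (model_type as stored in the CSV).
print(models_df.filter(models_df["model_type"] == "Text Embedding")["model_name"].to_list())

# Default launch arguments for one of the new DeepSeek-R1 distills.
print(load_default_args(models_df, "DeepSeek-R1-Distill-Qwen-7B"))
```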
vec_inf/multinode_vllm.slurm CHANGED
@@ -12,7 +12,7 @@ nvidia-smi
12
12
  source ${SRC_DIR}/find_port.sh
13
13
 
14
14
  if [ "$VENV_BASE" = "singularity" ]; then
15
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
15
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_latest.sif
16
16
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
17
17
  module load singularity-ce/3.8.2
18
18
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -103,6 +103,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
103
103
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
104
104
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
105
105
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
106
+ --task ${VLLM_TASK} \
106
107
  ${ENFORCE_EAGER}
107
108
  else
108
109
  source ${VENV_BASE}/bin/activate
@@ -118,5 +119,6 @@ else
118
119
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
119
120
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
120
121
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
122
+ --task ${VLLM_TASK} \
121
123
  ${ENFORCE_EAGER}
122
124
  fi
vec_inf/vllm.slurm CHANGED
@@ -23,7 +23,7 @@ fi
23
23
 
24
24
  # Activate vllm venv
25
25
  if [ "$VENV_BASE" = "singularity" ]; then
26
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
26
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_latest.sif
27
27
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
28
28
  module load singularity-ce/3.8.2
29
29
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -39,6 +39,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
39
39
  --trust-remote-code \
40
40
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
41
41
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
42
+ --task ${VLLM_TASK} \
42
43
  ${ENFORCE_EAGER}
43
44
  else
44
45
  source ${VENV_BASE}/bin/activate
@@ -53,5 +54,6 @@ else
53
54
  --trust-remote-code \
54
55
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
55
56
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
57
+ --task ${VLLM_TASK} \
56
58
  ${ENFORCE_EAGER}
57
59
  fi
@@ -1,30 +1,32 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: vec-inf
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
- License: MIT
6
- Author: Marshall Wang
7
- Author-email: marshall.wang@vectorinstitute.ai
8
- Requires-Python: >=3.10,<4.0
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.10
12
- Classifier: Programming Language :: Python :: 3.11
13
- Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
5
+ Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: <3.11,>=3.10
9
+ Requires-Dist: click>=8.1.0
10
+ Requires-Dist: numpy>=1.24.0
11
+ Requires-Dist: polars>=1.15.0
12
+ Requires-Dist: requests>=2.31.0
13
+ Requires-Dist: rich>=13.7.0
15
14
  Provides-Extra: dev
16
- Requires-Dist: click (>=8.1.0,<9.0.0)
17
- Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
18
- Requires-Dist: numpy (>=1.24.0,<2.0.0)
19
- Requires-Dist: pandas (>=1.15.0,<2.0.0)
20
- Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
21
- Requires-Dist: requests (>=2.31.0,<3.0.0)
22
- Requires-Dist: rich (>=13.7.0,<14.0.0)
23
- Requires-Dist: vllm (>=0.6.0,<0.7.0) ; extra == "dev"
24
- Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
15
+ Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
16
+ Requires-Dist: ray>=2.40.0; extra == 'dev'
17
+ Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
18
+ Requires-Dist: vllm>=0.7.2; extra == 'dev'
25
19
  Description-Content-Type: text/markdown
26
20
 
27
21
  # Vector Inference: Easy inference on Slurm clusters
22
+
23
+ ----------------------------------------------------
24
+
25
+ [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
26
+ [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_build.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_build.yml)
27
+ [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/graph/badge.svg?token=83MYFZ3UPA)](https://codecov.io/github/VectorInstitute/vector-inference)
28
+ ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
29
+
28
30
  This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/models/models.csv) accordingly.
29
31
 
30
32
  ## Installation
@@ -42,7 +44,7 @@ vec-inf launch Meta-Llama-3.1-8B-Instruct
42
44
  ```
43
45
  You should see an output like the following:
44
46
 
45
- <img width="700" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
47
+ <img width="600" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
46
48
 
47
49
  The model will be launched using the [default parameters](vec_inf/models/models.csv); you can override these values by providing additional parameters (use `--help` to see the full list). You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html); make sure to follow the instructions below:
48
50
  * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`.
@@ -94,7 +96,8 @@ You can view the full list of available models by running the `list` command:
94
96
  ```bash
95
97
  vec-inf list
96
98
  ```
97
- <img width="900" alt="list_img" src="https://github.com/user-attachments/assets/7cb2b2ac-d30c-48a8-b773-f648c27d9de2">
99
+ <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
100
+
98
101
 
99
102
  You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
100
103
  ```bash
@@ -116,4 +119,3 @@ If you want to run inference from your local device, you can open a SSH tunnel t
116
119
  ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
117
120
  ```
118
121
  Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the Vector cluster; change the variables accordingly for your environment.
119
-
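As a quick end-to-end check from the local side of that tunnel, something like the following can be used. This is a minimal sketch: the local port matches the tunnel example above, and the `/health` route is an assumption about the vLLM OpenAI-compatible server rather than something this diff pins down.

```python
# Minimal sketch: poll the tunnelled inference server until it reports healthy.
# Assumes the SSH tunnel above is forwarding localhost:8081 to the GPU node.
import time

import requests

BASE_URL = "http://localhost:8081"  # placeholder: must match your tunnel's local port

for _ in range(20):
    try:
        if requests.get(f"{BASE_URL}/health", timeout=5).status_code == 200:
            print("Server is ready")
            break
    except requests.exceptions.RequestException:
        pass
    time.sleep(15)
else:
    print("Server did not become ready in time")
```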
@@ -0,0 +1,16 @@
1
+ vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
+ vec_inf/__init__.py,sha256=bHwSIz9lebYuxIemni-lP0h3gwJHVbJnwExQKGJWw_Q,23
3
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
4
+ vec_inf/launch_server.sh,sha256=WJ7HyOEbknxe7zjF388qgnTqoapl90cUrjsIJQChidc,4714
5
+ vec_inf/multinode_vllm.slurm,sha256=ymyteZWWspNDL0yBjhPNMZRd18Jepbw28HRw0EDuXYY,4201
6
+ vec_inf/vllm.slurm,sha256=64jg8t9FHp4IH5Jc_Vrk0XwSSIrpN4Xjwko6GO7cDXQ,1894
7
+ vec_inf/cli/__init__.py,sha256=5XIvGQCOnaGl73XMkwetjC-Ul3xuXGrWDXdYJ3aUzvU,27
8
+ vec_inf/cli/_cli.py,sha256=3LZ7RbJsQ3mxHWTtt-34uQNCZ7G9HaJifyfTQw33zuI,14330
9
+ vec_inf/cli/_utils.py,sha256=t_zFDEomSP9eDvad85GlJIFQ7Kl5ZXOxbgbkfMZ3DwA,4802
10
+ vec_inf/models/README.md,sha256=JpQCg5taBuQp4sLmasK7YPjFMZritOAKlfPpEJsOpeQ,16602
11
+ vec_inf/models/models.csv,sha256=xYrNykRu5HabsUjj4bdRI63YuGgCJSZ-ti_nIjuGPCY,11557
12
+ vec_inf-0.4.1.dist-info/METADATA,sha256=yFvkCgCVpYzuZZJmD22BlTYQeTMk8gD6gmYagyTUyog,7375
13
+ vec_inf-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
+ vec_inf-0.4.1.dist-info/entry_points.txt,sha256=uNRXjCuJSR2nveEqD3IeMznI9oVI9YLZh5a24cZg6B0,49
15
+ vec_inf-0.4.1.dist-info/licenses/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
16
+ vec_inf-0.4.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.1
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ vec-inf = vec_inf.cli._cli:cli
@@ -1,16 +0,0 @@
1
- vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
5
- vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
6
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
- vec_inf/launch_server.sh,sha256=3-esdDzfuG0qSOPhrZHgx2nQ9GEiaI2tjTPw7VrdMuQ,4167
8
- vec_inf/models/README.md,sha256=n9I8HsIHCafz0G9k1OFwkraK9J-OY92v6M3z42a-Nho,8146
9
- vec_inf/models/models.csv,sha256=CK2NDHgdkwx5qpaduuYy9KhcHhS0z60quSeV_KtWx9c,10025
10
- vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
11
- vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
12
- vec_inf-0.4.0.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
13
- vec_inf-0.4.0.dist-info/METADATA,sha256=X-zLib_6dTZT9ZvrIBoQThImgpJSkgTFBL12oi-Dt1A,7025
14
- vec_inf-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
15
- vec_inf-0.4.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
16
- vec_inf-0.4.0.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- vec-inf=vec_inf.cli._cli:cli
3
-