vec-inf 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. vec_inf/README.md +5 -0
  2. vec_inf/__init__.py +0 -0
  3. vec_inf/cli/__init__.py +0 -0
  4. vec_inf/cli/_cli.py +262 -0
  5. vec_inf/cli/_utils.py +129 -0
  6. vec_inf/find_port.sh +39 -0
  7. vec_inf/launch_server.sh +104 -0
  8. vec_inf/models/CodeLlama/README.md +12 -0
  9. vec_inf/models/CodeLlama/config.sh +5 -0
  10. vec_inf/models/Llama-2/README.md +10 -0
  11. vec_inf/models/Llama-2/config.sh +5 -0
  12. vec_inf/models/Meta-Llama-3/README.md +8 -0
  13. vec_inf/models/Meta-Llama-3/config.sh +5 -0
  14. vec_inf/models/Meta-Llama-3.1/README.md +8 -0
  15. vec_inf/models/Meta-Llama-3.1/config.sh +6 -0
  16. vec_inf/models/Mistral/README.md +10 -0
  17. vec_inf/models/Mistral/config.sh +5 -0
  18. vec_inf/models/Mixtral/README.md +8 -0
  19. vec_inf/models/Mixtral/config.sh +5 -0
  20. vec_inf/models/Phi-3/README.md +6 -0
  21. vec_inf/models/Phi-3/config.sh +6 -0
  22. vec_inf/models/README.md +31 -0
  23. vec_inf/models/c4ai-command-r/README.md +5 -0
  24. vec_inf/models/c4ai-command-r/config.sh +5 -0
  25. vec_inf/models/dbrx/README.md +5 -0
  26. vec_inf/models/dbrx/config.sh +5 -0
  27. vec_inf/models/gemma-2/README.md +8 -0
  28. vec_inf/models/gemma-2/config.sh +6 -0
  29. vec_inf/models/llava-1.5/README.md +7 -0
  30. vec_inf/models/llava-1.5/chat_template.jinja +23 -0
  31. vec_inf/models/llava-1.5/config.sh +5 -0
  32. vec_inf/models/llava-v1.6/README.md +7 -0
  33. vec_inf/models/llava-v1.6/chat_template.jinja +23 -0
  34. vec_inf/models/llava-v1.6/config.sh +5 -0
  35. vec_inf/models/models.csv +45 -0
  36. vec_inf/multinode_vllm.slurm +114 -0
  37. vec_inf/vllm.slurm +47 -0
  38. vec_inf-0.3.0.dist-info/METADATA +94 -0
  39. vec_inf-0.3.0.dist-info/RECORD +41 -0
  40. vec_inf-0.3.0.dist-info/WHEEL +4 -0
  41. vec_inf-0.3.0.dist-info/entry_points.txt +3 -0
vec_inf/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # `vec-inf` Commands
2
+
3
+ * `launch`: Specify a model family and other optional parameters to launch an OpenAI-compatible inference server; `--json-mode` supported. Check [`here`](./models/README.md) for the complete list of available options; a usage sketch follows this list.
4
+ * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
5
+ * `shutdown`: Shutdown a model by providing its Slurm job ID.
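A minimal usage sketch of these commands, assuming the `vec-inf` entry point is installed; the Slurm job ID is a placeholder and the model name is one of the entries in `models/models.csv`:

```bash
# Launch a model server; the command prints the Slurm job ID and job config
vec-inf launch Meta-Llama-3-8B-Instruct --json-mode

# Check the server status for that job (the base URL appears once the state is READY)
vec-inf status 13014393 --json-mode

# Shut the server down when finished
vec-inf shutdown 13014393
```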
vec_inf/__init__.py ADDED
File without changes
vec_inf/cli/__init__.py ADDED
File without changes
vec_inf/cli/_cli.py ADDED
@@ -0,0 +1,262 @@
1
+ import os
2
+
3
+ import click
4
+ import pandas as pd
5
+ from rich.console import Console
6
+ from rich.columns import Columns
7
+ from rich.panel import Panel
8
+
9
+ from ._utils import *
10
+
11
+
12
+ CONSOLE = Console()
13
+
14
+
15
+ @click.group()
16
+ def cli():
17
+ """Vector Inference CLI"""
18
+ pass
19
+
20
+
21
+ @cli.command("launch")
22
+ @click.argument(
23
+ "model-name",
24
+ type=str,
25
+ nargs=1
26
+ )
27
+ @click.option(
28
+ "--model-family",
29
+ type=str,
30
+ help='The model family name according to the directories in `models`'
31
+ )
32
+ @click.option(
33
+ "--model-variant",
34
+ type=str,
35
+ help='The model variant according to the README in `models/model-family`'
36
+ )
37
+ @click.option(
38
+ "--max-model-len",
39
+ type=int,
40
+ help='Model context length. If unspecified, will be automatically derived from the model config.'
41
+ )
42
+ @click.option(
43
+ "--partition",
44
+ type=str,
45
+ help='Type of compute partition, defaults to a40'
46
+ )
47
+ @click.option(
48
+ "--num-nodes",
49
+ type=int,
50
+ help='Number of nodes to use, defaults to the suggested resource allocation for the model'
51
+ )
52
+ @click.option(
53
+ "--num-gpus",
54
+ type=int,
55
+ help='Number of GPUs/node to use, defaults to the suggested resource allocation for the model'
56
+ )
57
+ @click.option(
58
+ "--qos",
59
+ type=str,
60
+ help='Quality of service, defaults to m3'
61
+ )
62
+ @click.option(
63
+ "--time",
64
+ type=str,
65
+ help='Time limit for the job; this should comply with the QoS, defaults to 4:00:00'
66
+ )
67
+ @click.option(
68
+ "--data-type",
69
+ type=str,
70
+ help='Model data type, defaults to auto'
71
+ )
72
+ @click.option(
73
+ "--venv",
74
+ type=str,
75
+ help='Path to virtual environment'
76
+ )
77
+ @click.option(
78
+ "--log-dir",
79
+ type=str,
80
+ help='Path to slurm log directory'
81
+ )
82
+ @click.option(
83
+ "--json-mode",
84
+ is_flag=True,
85
+ help='Output in JSON string',
86
+ )
87
+ def launch(
88
+ model_name: str,
89
+ model_family: str=None,
90
+ model_variant: str=None,
91
+ max_model_len: int=None,
92
+ partition: str=None,
93
+ num_nodes: int=None,
94
+ num_gpus: int=None,
95
+ qos: str=None,
96
+ time: str=None,
97
+ data_type: str=None,
98
+ venv: str=None,
99
+ log_dir: str=None,
100
+ json_mode: bool=False
101
+ ) -> None:
102
+ """
103
+ Launch a model on the cluster
104
+ """
105
+ launch_script_path = os.path.join(
106
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
107
+ "launch_server.sh"
108
+ )
109
+ launch_cmd = f"bash {launch_script_path}"
110
+
111
+ models_df = load_models_df()
112
+
113
+ if model_name not in models_df['model_name'].values:
114
+ raise ValueError(f"Model name {model_name} not found in available models")
115
+
116
+ default_args = load_default_args(models_df, model_name)
117
+
118
+ for arg in default_args:
119
+ if arg in locals() and locals()[arg] is not None:
120
+ default_args[arg] = locals()[arg]
121
+ renamed_arg = arg.replace("_", "-")
122
+ launch_cmd += f" --{renamed_arg} {default_args[arg]}"
123
+
124
+ output = run_bash_command(launch_cmd)
125
+
126
+ slurm_job_id = output.split(" ")[-1].strip().strip("\n")
127
+ output_lines = output.split("\n")[:-2]
128
+
129
+ table = create_table(key_title="Job Config", value_title="Value")
130
+ table.add_row("Slurm Job ID", slurm_job_id, style="blue")
131
+ output_dict = {"slurm_job_id": slurm_job_id}
132
+
133
+ for line in output_lines:
134
+ key, value = line.split(": ")
135
+ table.add_row(key, value)
136
+ output_dict[key.lower().replace(" ", "_")] = value
137
+
138
+ if json_mode:
139
+ click.echo(output_dict)
140
+ else:
141
+ CONSOLE.print(table)
142
+
143
+
144
+ @cli.command("status")
145
+ @click.argument(
146
+ "slurm_job_id",
147
+ type=int,
148
+ nargs=1
149
+ )
150
+ @click.option(
151
+ "--log-dir",
152
+ type=str,
153
+ help='Path to slurm log directory. This is required if it was set when launching the model'
154
+ )
155
+ @click.option(
156
+ "--json-mode",
157
+ is_flag=True,
158
+ help='Output in JSON string',
159
+ )
160
+ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
161
+ """
162
+ Get the status of a running model on the cluster
163
+ """
164
+ status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
165
+ output = run_bash_command(status_cmd)
166
+
167
+ slurm_job_name = "UNAVAILABLE"
168
+ status = "SHUTDOWN"
169
+ base_url = "UNAVAILABLE"
170
+
171
+ try:
172
+ slurm_job_name = output.split(" ")[1].split("=")[1]
173
+ slurm_job_state = output.split(" ")[9].split("=")[1]
174
+ except IndexError:
175
+ # Job ID not found
176
+ slurm_job_state = "UNAVAILABLE"
177
+
178
+ # If Slurm job is currently PENDING
179
+ if slurm_job_state == "PENDING":
180
+ slurm_job_pending_reason = output.split(" ")[10].split("=")[1]
181
+ status = "PENDING"
182
+ # If Slurm job is currently RUNNING
183
+ elif slurm_job_state == "RUNNING":
184
+ # Check whether the server is ready, if yes, run model health check to further determine status
185
+ server_status = is_server_running(slurm_job_name, slurm_job_id, log_dir)
186
+ # If server status is a tuple, then server status is "FAILED"
187
+ if type(server_status) is tuple:
188
+ status = server_status[0]
189
+ slurm_job_failed_reason = server_status[1]
190
+ elif server_status == "RUNNING":
191
+ status = model_health_check(slurm_job_name, slurm_job_id, log_dir)
192
+ if status == "READY":
193
+ # Only set base_url if model is ready to serve requests
194
+ base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
195
+ else:
196
+ # If the model is not ready, the health check returned a ("FAILED", reason) tuple
197
+ slurm_job_failed_reason = status[1]
198
+ status = status[0]
199
+ else:
200
+ status = server_status
201
+
202
+ if json_mode:
203
+ status_dict = {
204
+ "model_name": slurm_job_name,
205
+ "model_status": status,
206
+ "base_url": base_url
207
+ }
208
+ if "slurm_job_pending_reason" in locals():
209
+ status_dict["pending_reason"] = slurm_job_pending_reason
210
+ if "slurm_job_failed_reason" in locals():
211
+ status_dict["failed_reason"] = slurm_job_failed_reason
212
+ click.echo(f'{status_dict}')
213
+ else:
214
+ table = create_table(key_title="Job Status", value_title="Value")
215
+ table.add_row("Model Name", slurm_job_name)
216
+ table.add_row("Model Status", status, style="blue")
217
+ if "slurm_job_pending_reason" in locals():
218
+ table.add_row("Reason", slurm_job_pending_reason)
219
+ if "slurm_job_failed_reason" in locals():
220
+ table.add_row("Reason", slurm_job_failed_reason)
221
+ table.add_row("Base URL", base_url)
222
+ CONSOLE.print(table)
223
+
224
+
225
+ @cli.command("shutdown")
226
+ @click.argument(
227
+ "slurm_job_id",
228
+ type=int,
229
+ nargs=1
230
+ )
231
+ def shutdown(slurm_job_id: int) -> None:
232
+ """
233
+ Shutdown a running model on the cluster
234
+ """
235
+ shutdown_cmd = f"scancel {slurm_job_id}"
236
+ run_bash_command(shutdown_cmd)
237
+ click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
238
+
239
+
240
+ @cli.command("list")
241
+ @click.option(
242
+ "--json-mode",
243
+ is_flag=True,
244
+ help='Output in JSON string',
245
+ )
246
+ def list(json_mode: bool=False) -> None:
247
+ """
248
+ List all available models
249
+ """
250
+ models_df = load_models_df()
251
+ if json_mode:
252
+ click.echo(models_df['model_name'].to_json(orient='records'))
253
+ return
254
+ panels = []
255
+ for _, row in models_df.iterrows():
256
+ styled_text = f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
257
+ panels.append(Panel(styled_text, expand=True))
258
+ CONSOLE.print(Columns(panels, equal=True))
259
+
260
+
261
+ if __name__ == '__main__':
262
+ cli()
vec_inf/cli/_utils.py ADDED
@@ -0,0 +1,129 @@
1
+ import subprocess
2
+ import os
3
+ from typing import Union
4
+
5
+ import requests
6
+ from rich.table import Table
7
+ import pandas as pd
8
+
9
+
10
+ MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
11
+ SERVER_ADDRESS_SIGNATURE = "Server address: "
12
+
13
+
14
+ def run_bash_command(command: str) -> str:
15
+ """
16
+ Run a bash command and return the output
17
+ """
18
+ process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
19
+ stdout, _ = process.communicate()
20
+ return stdout
21
+
22
+
23
+ def read_slurm_log(
24
+ slurm_job_name: str,
25
+ slurm_job_id: int,
26
+ slurm_log_type: str,
27
+ log_dir: str
28
+ ) -> Union[list, str]:
29
+ """
30
+ Read the Slurm log file of a job and return its lines
31
+ """
32
+ if not log_dir:
33
+ models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
34
+
35
+ for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
36
+ if dir in slurm_job_name:
37
+ log_dir = os.path.join(models_dir, dir)
38
+ break
39
+
40
+ try:
41
+ file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
42
+ with open(file_path, 'r') as file:
43
+ lines = file.readlines()
44
+ except FileNotFoundError:
45
+ print(f"Could not find file: {file_path}")
46
+ return "LOG_FILE_NOT_FOUND"
47
+ return lines
48
+
49
+ def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
50
+ """
51
+ Check if a model is ready to serve requests
52
+ """
53
+ log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
54
+ if type(log_content) is str:
55
+ return log_content
56
+
57
+ for line in log_content:
58
+ if "error" in line.lower():
59
+ return ("FAILED", line.strip("\n"))
60
+ if MODEL_READY_SIGNATURE in line:
61
+ return "RUNNING"
62
+ return "LAUNCHING"
63
+
64
+
65
+ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
66
+ """
67
+ Get the base URL of a model
68
+ """
69
+ log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
70
+ if type(log_content) is str:
71
+ return log_content
72
+
73
+ for line in log_content:
74
+ if SERVER_ADDRESS_SIGNATURE in line:
75
+ return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
76
+ return "URL_NOT_FOUND"
77
+
78
+
79
+ def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
80
+ """
81
+ Check the health of a running model on the cluster
82
+ """
83
+ base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
84
+ if not base_url.startswith("http"):
85
+ return ("FAILED", base_url)
86
+ health_check_url = base_url.replace("v1", "health")
87
+
88
+ try:
89
+ response = requests.get(health_check_url)
90
+ # Check if the request was successful
91
+ if response.status_code == 200:
92
+ return "READY"
93
+ else:
94
+ return ("FAILED", response.status_code)
95
+ except requests.exceptions.RequestException as e:
96
+ return ("FAILED", str(e))
97
+
98
+
99
+ def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
100
+ """
101
+ Create a table for displaying model status
102
+ """
103
+ table = Table(show_header=show_header, header_style="bold magenta")
104
+ table.add_column(key_title, style="dim")
105
+ table.add_column(value_title)
106
+ return table
107
+
108
+
109
+ def load_models_df() -> pd.DataFrame:
110
+ """
111
+ Load the models dataframe
112
+ """
113
+ models_df = pd.read_csv(
114
+ os.path.join(
115
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
116
+ "models/models.csv"
117
+ )
118
+ )
119
+ return models_df
120
+
121
+
122
+ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
123
+ """
124
+ Load the default arguments for a model
125
+ """
126
+ row_data = models_df.loc[models_df["model_name"] == model_name]
127
+ default_args = row_data.iloc[0].to_dict()
128
+ default_args.pop("model_name")
129
+ return default_args
vec_inf/find_port.sh ADDED
@@ -0,0 +1,39 @@
1
+ #!/bin/bash
2
+
3
+ # Function to check if a port is available on the specified IP
4
+ is_port_available() {
5
+ local ip=$1
6
+ local port=$2
7
+ # Attempt to listen on the specified port and IP. Use & to background the process.
8
+ nc -l $ip $port &> /dev/null &
9
+
10
+ # Capture the PID of the background process
11
+ local pid=$!
12
+ # Wait a short moment to ensure nc had time to bind to the port
13
+ sleep 0.1
14
+
15
+ # Check if nc is still running. If so, the port was available.
16
+ if kill -0 $pid &> /dev/null; then
17
+ # Kill the background nc process
18
+ kill $pid &> /dev/null
19
+ return 0 # True, port is available
20
+ else
21
+ return 1 # False, port is not available
22
+ fi
23
+ }
24
+
25
+ # Function to find an available port on the specified IP
26
+ find_available_port() {
27
+ local ip=$1
28
+ local base_port=$2
29
+ local max_port=$3
30
+
31
+ for ((port=base_port; port<=max_port; port++)); do
32
+ if is_port_available $ip $port; then
33
+ echo $port
34
+ return
35
+ fi
36
+ done
37
+ echo "No available port between $base_port and $max_port for $ip." >&2
38
+ return 1
39
+ }
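A brief sketch of how these helpers are intended to be used: the script is sourced and `find_available_port` is called with an IP, a base port, and a max port. The localhost address and port range below are illustrative assumptions:

```bash
#!/bin/bash
# Source the helper functions, then probe for a free port on localhost
source vec_inf/find_port.sh

# find_available_port echoes the first free port in the range, or returns 1 if none is free
port=$(find_available_port 127.0.0.1 8080 8180) || exit 1
echo "Picked free port: $port"
```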
vec_inf/launch_server.sh ADDED
@@ -0,0 +1,104 @@
1
+ #!/bin/bash
2
+
3
+ # ================================= Read Named Args ======================================
4
+
5
+ while [[ "$#" -gt 0 ]]; do
6
+ case $1 in
7
+ --model-family) model_family="$2"; shift ;;
8
+ --model-variant) model_variant="$2"; shift ;;
9
+ --partition) partition="$2"; shift ;;
10
+ --qos) qos="$2"; shift ;;
11
+ --time) walltime="$2"; shift ;;
12
+ --num-nodes) num_nodes="$2"; shift ;;
13
+ --num-gpus) num_gpus="$2"; shift ;;
14
+ --max-model-len) max_model_len="$2"; shift ;;
15
+ --vocab-size) vocab_size="$2"; shift ;;
16
+ --data-type) data_type="$2"; shift ;;
17
+ --venv) virtual_env="$2"; shift ;;
18
+ --log-dir) log_dir="$2"; shift ;;
19
+ --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
20
+ *) echo "Unknown parameter passed: $1"; exit 1 ;;
21
+ esac
22
+ shift
23
+ done
24
+
25
+ required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type virtual_env log_dir pipeline_parallelism)
26
+
27
+ for var in "${required_vars[@]}"; do
28
+ if [ -z "${!var}" ]; then
29
+ echo "Error: Missing required --${var//_/-} argument."
30
+ exit 1
31
+ fi
32
+ done
33
+
34
+ export MODEL_FAMILY=$model_family
35
+ export MODEL_VARIANT=$model_variant
36
+ export JOB_PARTITION=$partition
37
+ export QOS=$qos
38
+ export WALLTIME=$walltime
39
+ export NUM_NODES=$num_nodes
40
+ export NUM_GPUS=$num_gpus
41
+ export VLLM_MAX_MODEL_LEN=$max_model_len
42
+ export VLLM_MAX_LOGPROBS=$vocab_size
43
+ export VLLM_DATA_TYPE=$data_type
44
+ export VENV_BASE=$virtual_env
45
+ export LOG_DIR=$log_dir
46
+ export PIPELINE_PARALLELISM=$pipeline_parallelism
47
+
48
+ # ================================= Set default environment variables ======================================
49
+ # Slurm job configuration
50
+ export JOB_NAME="$MODEL_FAMILY-$MODEL_VARIANT"
51
+ if [ "$LOG_DIR" = "default" ]; then
52
+ export LOG_DIR="$HOME/.vec-inf-logs/$MODEL_FAMILY"
53
+ fi
54
+ mkdir -p $LOG_DIR
55
+
56
+ # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
57
+ # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
58
+ export SRC_DIR="$(dirname "$0")"
59
+ export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
60
+ export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
61
+
62
+ # Variables specific to your working environment, below are examples for the Vector cluster
63
+ export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
64
+ export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
65
+
66
+
67
+ # ================================ Validate Inputs & Launch Server =================================
68
+
69
+ # Set data type to fp16 instead of bf16 for non-Ampere GPUs
70
+ fp16_partitions="t4v1 t4v2"
71
+
72
+ # choose from 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
73
+ if [[ $fp16_partitions =~ $JOB_PARTITION ]]; then
74
+ export VLLM_DATA_TYPE="float16"
75
+ echo "Data type set to $VLLM_DATA_TYPE due to non-Ampere GPUs on the requested partition"
76
+ fi
77
+
78
+ # Create a file to store the API server URL if it doesn't exist
79
+ if [ ! -f $VLLM_BASE_URL_FILENAME ]; then
80
+ touch $VLLM_BASE_URL_FILENAME
81
+ fi
82
+
83
+ echo Job Name: $JOB_NAME
84
+ echo Partition: $JOB_PARTITION
85
+ echo Num Nodes: $NUM_NODES
86
+ echo GPUs per Node: $NUM_GPUS
87
+ echo QOS: $QOS
88
+ echo Walltime: $WALLTIME
89
+ echo Data Type: $VLLM_DATA_TYPE
90
+
91
+ is_special=""
92
+ if [ "$NUM_NODES" -gt 1 ]; then
93
+ is_special="multinode_"
94
+ fi
95
+
96
+ sbatch --job-name $JOB_NAME \
97
+ --partition $JOB_PARTITION \
98
+ --nodes $NUM_NODES \
99
+ --gres gpu:$NUM_GPUS \
100
+ --qos $QOS \
101
+ --time $WALLTIME \
102
+ --output $LOG_DIR/$JOB_NAME.%j.out \
103
+ --error $LOG_DIR/$JOB_NAME.%j.err \
104
+ $SRC_DIR/${is_special}vllm.slurm
vec_inf/models/CodeLlama/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`7b-Instruct-hf`**](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
8
+ | [`13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
9
+ | [`34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
10
+ | [`34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
11
+ | [`70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
12
+ | [`70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/CodeLlama/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="CodeLlama"
2
+ export MODEL_VARIANT="7b-Instruct-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Llama-2/README.md ADDED
@@ -0,0 +1,10 @@
1
+ # [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
2
+
3
+ | Variant | Suggested resource allocation |
4
+ |:----------:|:----------:|
5
+ | [**`7b-hf`**](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
6
+ | [`7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
7
+ | [`13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
8
+ | [`13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
9
+ | [`70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
10
+ | [`70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
vec_inf/models/Llama-2/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Llama-2"
2
+ export MODEL_VARIANT="7b-chat-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Meta-Llama-3/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
6
+ | [**`8B-Instruct`**](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
7
+ | [`70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
8
+ | [`70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
vec_inf/models/Meta-Llama-3/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Meta-Llama-3"
2
+ export MODEL_VARIANT="8B-Instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=128256
vec_inf/models/Meta-Llama-3.1/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`8B-Instruct`**](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
8
+ | [`70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/Meta-Llama-3.1/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="Meta-Llama-3.1"
2
+ export MODEL_VARIANT="8B-Instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=128256
6
+ export MAX_NUM_SEQS=256
vec_inf/models/Mistral/README.md ADDED
@@ -0,0 +1,10 @@
1
+ # [Mistral AI: Mistral](https://huggingface.co/mistralai)
2
+ * Supported model variants:
3
+
4
+ | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[`7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
7
+ |[`7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
8
+ |[`7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
9
+ |[`7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
10
+ |[**`7B-Instruct-v0.3`**](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
vec_inf/models/Mistral/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Mistral"
2
+ export MODEL_VARIANT="7B-Instruct-v0.3"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32768
vec_inf/models/Mixtral/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Mistral AI: Mixtral](https://huggingface.co/mistralai)
2
+ * Supported model variants:
3
+
4
+ | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`8x7B-Instruct-v0.1`**](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
7
+ |[`8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
8
+ |[`8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
vec_inf/models/Mixtral/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Mixtral"
2
+ export MODEL_VARIANT="8x7B-Instruct-v0.1"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Phi-3/README.md ADDED
@@ -0,0 +1,6 @@
1
+ # [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [**`medium-128k-instruct`**](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
6
+ | [**`vision-128k-instruct`**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/Phi-3/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="Phi-3"
2
+ export MODEL_VARIANT="medium-128k-instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=2
5
+ export VLLM_MAX_LOGPROBS=32064
6
+ export MAX_NUM_SEQS=200
vec_inf/models/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Environment Variables
2
+ The following environment variables all have default values that are suitable for the Vector cluster environment. You can use flags to modify certain environment variable values.
3
+
4
+ * **MODEL_FAMILY**: Directory name of the model family.
5
+ * **SRC_DIR**: Relative path to the [`src`](../src/) folder.
6
+ * **CONFIG_FILE**: Config file containing default values for some environment variables in the **MODEL_FAMILY** directory.
7
+ * **MODEL_NAME**: Name of model family according to the actual model weights.
8
+ * **MODEL_VARIANT**: Variant of the model; the available variants are listed in the respective model folders, and the default variant is bolded in the corresponding README.md file.
9
+ * **MODEL_DIR**: Path to the model's directory in the vector-inference repo.
10
+ * **VLLM_BASE_URL_FILENAME**: The file that stores the inference server URL. It is generated after launching an inference server and is located in the corresponding model folder with the name `.{model-family}-{model-variant}_url`.
11
+ * **VENV_BASE**: Location of the virtual environment.
12
+ * **VLLM_MODEL_WEIGHTS**: Location of the model weights.
13
+ * **VLLM_DATA_TYPE**: Model data type.
14
+ * **LD_LIBRARY_PATH**: Include custom locations for dynamically linked library files in a Unix-like operating system. In the script, we tell the dynamic linker to also look at the CUDA and cuDNN directories.
15
+ * **JOB_NAME**: Slurm job name.
16
+ * **NUM_NODES**: Number of nodes scheduled. Defaults to the suggested resource allocation.
17
+ * **NUM_GPUS**: Number of GPUs scheduled. Defaults to the suggested resource allocation.
18
+ * **JOB_PARTITION**: Type of compute partition. Defaults to the suggested resource allocation.
19
+ * **QOS**: Quality of Service.
20
+ * **TIME**: Max Walltime.
21
+
22
+ # Named Arguments
23
+ NOTE: Arguments like `--num-nodes` or `--model-variant` might not be available for certain model families because they are expected to fit inside a single node or because no variant is available in `/model-weights` yet. You can manually add these options to the launch scripts if you need them, or make a request to download weights for other variants. An example launch with overrides follows this list.
24
+ * `--model-variant`: Overrides **MODEL_VARIANT**.
25
+ * `--partition`: Overrides **JOB_PARTITION**.
26
+ * `--num-nodes`: Overrides **NUM_NODES**.
27
+ * `--num-gpus`: Overrides **NUM_GPUS**.
28
+ * `--qos`: Overrides **QOS**.
29
+ * `--time`: Overrides **TIME**.
30
+ * `--data-type`: Overrides **VLLM_DATA_TYPE**.
31
+ * `--venv`: Overrides **VENV_BASE**.
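A hedged sketch of passing these overrides on the command line; the model name comes from `models.csv`, while the specific values are illustrative and should match the QoS and resource limits of your cluster:

```bash
# Each flag overrides the corresponding environment variable above, e.g.
# --num-nodes -> NUM_NODES, --num-gpus -> NUM_GPUS, --qos -> QOS, --time -> TIME
vec-inf launch Mixtral-8x22B-Instruct-v0.1 \
    --num-nodes 2 \
    --num-gpus 4 \
    --qos m2 \
    --time 04:00:00
```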
vec_inf/models/c4ai-command-r/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ |[**`plus`**](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
vec_inf/models/c4ai-command-r/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="c4ai-command-r"
2
+ export MODEL_VARIANT="plus"
3
+ export NUM_NODES=2
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=256000
vec_inf/models/dbrx/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ |[**`dbrx-instruct`**](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
vec_inf/models/dbrx/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="dbrx"
2
+ export MODEL_VARIANT="instruct"
3
+ export NUM_NODES=2
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=100352
vec_inf/models/gemma-2/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`9b-it`**](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
8
+ | [`27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/gemma-2/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="gemma-2"
2
+ export MODEL_VARIANT="9b-it"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=256000
6
+ export VLLM_ATTENTION_BACKEND=FLASHINFER
vec_inf/models/llava-1.5/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
2
+ * Supported model variants:
3
+
4
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`7b-hf`**](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
7
+ |[`13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
vec_inf/models/llava-1.5/chat_template.jinja ADDED
@@ -0,0 +1,23 @@
1
+ {%- if messages[0]['role'] == 'system' -%}
2
+ {%- set system_message = messages[0]['content'] -%}
3
+ {%- set messages = messages[1:] -%}
4
+ {%- else -%}
5
+ {% set system_message = '' -%}
6
+ {%- endif -%}
7
+
8
+ {{ bos_token + system_message }}
9
+ {%- for message in messages -%}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif -%}
13
+
14
+ {%- if message['role'] == 'user' -%}
15
+ {{ 'USER: ' + message['content'] + '\n' }}
16
+ {%- elif message['role'] == 'assistant' -%}
17
+ {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
18
+ {%- endif -%}
19
+ {%- endfor -%}
20
+
21
+ {%- if add_generation_prompt -%}
22
+ {{ 'ASSISTANT:' }}
23
+ {% endif %}
vec_inf/models/llava-1.5/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="llava-1.5"
2
+ export MODEL_VARIANT="7b-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/llava-v1.6/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
2
+ * Supported model variants:
3
+
4
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`mistral-7b-hf`**](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
7
+ |[`34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/llava-v1.6/chat_template.jinja ADDED
@@ -0,0 +1,23 @@
1
+ {%- if messages[0]['role'] == 'system' -%}
2
+ {%- set system_message = messages[0]['content'] -%}
3
+ {%- set messages = messages[1:] -%}
4
+ {%- else -%}
5
+ {% set system_message = '' -%}
6
+ {%- endif -%}
7
+
8
+ {{ bos_token + system_message }}
9
+ {%- for message in messages -%}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif -%}
13
+
14
+ {%- if message['role'] == 'user' -%}
15
+ {{ 'USER: ' + message['content'] + '\n' }}
16
+ {%- elif message['role'] == 'assistant' -%}
17
+ {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
18
+ {%- endif -%}
19
+ {%- endfor -%}
20
+
21
+ {%- if add_generation_prompt -%}
22
+ {{ 'ASSISTANT:' }}
23
+ {% endif %}
vec_inf/models/llava-v1.6/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="llava-v1.6"
2
+ export MODEL_VARIANT="mistral-7b-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32064
vec_inf/models/models.csv ADDED
@@ -0,0 +1,45 @@
1
+ model_name,model_family,model_variant,partition,qos,time,num_gpus,num_nodes,vocab_size,max_model_len,data_type,venv,log_dir,pipeline_parallelism
2
+ c4ai-command-r-plus,c4ai-command-r,plus,a40,m2,08:00:00,4,2,256000,8192,auto,singularity,default,false
3
+ CodeLlama-7b-hf,CodeLlama,7b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
4
+ CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
5
+ CodeLlama-13b-hf,CodeLlama,13b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
6
+ CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
7
+ CodeLlama-34b-hf,CodeLlama,34b-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
8
+ CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
9
+ CodeLlama-70b-hf,CodeLlama,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
10
+ CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
11
+ dbrx-instruct,dbrx,instruct,a40,m2,08:00:00,4,2,100352,32000,auto,singularity,default,false
12
+ gemma-2-9b,gemma-2,9b,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
13
+ gemma-2-9b-it,gemma-2,9b-it,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
14
+ gemma-2-27b,gemma-2,27b,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
15
+ gemma-2-27b-it,gemma-2,27b-it,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
16
+ Llama-2-7b-hf,Llama-2,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
17
+ Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
18
+ Llama-2-13b-hf,Llama-2,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
19
+ Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
20
+ Llama-2-70b-hf,Llama-2,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
21
+ Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
22
+ llava-1.5-7b-hf,llava-1.5,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
23
+ llava-1.5-13b-hf,llava-1.5,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
24
+ llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,a40,m2,08:00:00,1,1,32064,32768,auto,singularity,default,false
25
+ llava-v1.6-34b-hf,llava-v1.6,34b-hf,a40,m2,08:00:00,2,1,64064,4096,auto,singularity,default,false
26
+ Meta-Llama-3-8B,Meta-Llama-3,8B,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
27
+ Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
28
+ Meta-Llama-3-70B,Meta-Llama-3,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
29
+ Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
30
+ Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
31
+ Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
32
+ Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
33
+ Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
34
+ Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,a40,m4,02:00:00,4,8,128256,16384,auto,singularity,default,true
35
+ Mistral-7B-v0.1,Mistral,7B-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
36
+ Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
37
+ Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
38
+ Mistral-7B-v0.3,Mistral,7B-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
39
+ Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
40
+ Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,a40,m2,08:00:00,4,1,32768,131072,auto,singularity,default,false
41
+ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,32768,auto,singularity,default,false
42
+ Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
43
+ Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
44
+ Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
45
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
vec_inf/multinode_vllm.slurm ADDED
@@ -0,0 +1,114 @@
1
+ #!/bin/bash
2
+ #SBATCH --cpus-per-task=16
3
+ #SBATCH --mem=64G
4
+ #SBATCH --exclusive
5
+ #SBATCH --tasks-per-node=1
6
+
7
+ # Load CUDA, change to the cuda version on your environment if different
8
+ module load cuda-12.3
9
+ nvidia-smi
10
+
11
+ source ${SRC_DIR}/find_port.sh
12
+
13
+ if [ "$VENV_BASE" = "singularity" ]; then
14
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
15
+ export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
16
+ module load singularity-ce/3.8.2
17
+ singularity exec $SINGULARITY_IMAGE ray stop
18
+ fi
19
+
20
+ # Getting the node names
21
+ nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
22
+ nodes_array=($nodes)
23
+
24
+ head_node=${nodes_array[0]}
25
+ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
26
+
27
+ # Find port for head node
28
+ head_node_port=$(find_available_port $head_node_ip 8080 65535)
29
+
30
+ # Starting the Ray head node
31
+ ip_head=$head_node_ip:$head_node_port
32
+ export ip_head
33
+ echo "IP Head: $ip_head"
34
+
35
+ echo "Starting HEAD at $head_node"
36
+ if [ "$VENV_BASE" = "singularity" ]; then
37
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
38
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
39
+ ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
40
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
41
+ else
42
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
43
+ ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
44
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
45
+ fi
46
+
47
+ # Starting the Ray worker nodes
48
+ # Optional, though may be useful in certain versions of Ray < 1.0.
49
+ sleep 10
50
+
51
+ # number of nodes other than the head node
52
+ worker_num=$((SLURM_JOB_NUM_NODES - 1))
53
+
54
+ for ((i = 1; i <= worker_num; i++)); do
55
+ node_i=${nodes_array[$i]}
56
+ echo "Starting WORKER $i at $node_i"
57
+ if [ "$VENV_BASE" = "singularity" ]; then
58
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
59
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
60
+ ray start --address "$ip_head" \
61
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
62
+ else
63
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
64
+ ray start --address "$ip_head" \
65
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
66
+ fi
67
+
68
+ sleep 5
69
+ done
70
+
71
+
72
+ vllm_port_number=$(find_available_port $head_node_ip 8080 65535)
73
+
74
+ echo "Server address: http://${head_node_ip}:${vllm_port_number}/v1"
75
+ echo "http://${head_node_ip}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
76
+
77
+ if [ "$PIPELINE_PARALLELISM" = "true" ]; then
78
+ export PIPELINE_PARALLEL_SIZE=$NUM_NODES
79
+ export TENSOR_PARALLEL_SIZE=$NUM_GPUS
80
+ else
81
+ export PIPELINE_PARALLEL_SIZE=1
82
+ export TENSOR_PARALLEL_SIZE=$((NUM_NODES*NUM_GPUS))
83
+ fi
84
+
85
+ # Activate vllm venv
86
+ if [ "$VENV_BASE" = "singularity" ]; then
87
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
88
+ python3.10 -m vllm.entrypoints.openai.api_server \
89
+ --model ${VLLM_MODEL_WEIGHTS} \
90
+ --served-model-name ${JOB_NAME} \
91
+ --host "0.0.0.0" \
92
+ --port ${vllm_port_number} \
93
+ --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
94
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
95
+ --dtype ${VLLM_DATA_TYPE} \
96
+ --load-format safetensors \
97
+ --trust-remote-code \
98
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
99
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
100
+ else
101
+ source ${VENV_BASE}/bin/activate
102
+ python3 -m vllm.entrypoints.openai.api_server \
103
+ --model ${VLLM_MODEL_WEIGHTS} \
104
+ --served-model-name ${JOB_NAME} \
105
+ --host "0.0.0.0" \
106
+ --port ${vllm_port_number} \
107
+ --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
108
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
109
+ --dtype ${VLLM_DATA_TYPE} \
110
+ --load-format safetensors \
111
+ --trust-remote-code \
112
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
113
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
114
+ fi
vec_inf/vllm.slurm ADDED
@@ -0,0 +1,47 @@
1
+ #!/bin/bash
2
+ #SBATCH --cpus-per-task=16
3
+ #SBATCH --mem=64G
4
+
5
+ # Load CUDA, change to the cuda version on your environment if different
6
+ module load cuda-12.3
7
+ nvidia-smi
8
+
9
+ source ${SRC_DIR}/find_port.sh
10
+
11
+ # Write server url to file
12
+ hostname=${SLURMD_NODENAME}
13
+ vllm_port_number=$(find_available_port $hostname 8080 65535)
14
+
15
+ echo "Server address: http://${hostname}:${vllm_port_number}/v1"
16
+ echo "http://${hostname}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
17
+
18
+ # Activate vllm venv
19
+ if [ "$VENV_BASE" = "singularity" ]; then
20
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
21
+ export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
22
+ module load singularity-ce/3.8.2
23
+ singularity exec $SINGULARITY_IMAGE ray stop
24
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
25
+ python3.10 -m vllm.entrypoints.openai.api_server \
26
+ --model ${VLLM_MODEL_WEIGHTS} \
27
+ --served-model-name ${JOB_NAME} \
28
+ --host "0.0.0.0" \
29
+ --port ${vllm_port_number} \
30
+ --tensor-parallel-size ${NUM_GPUS} \
31
+ --dtype ${VLLM_DATA_TYPE} \
32
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
33
+ --trust-remote-code \
34
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
35
+ else
36
+ source ${VENV_BASE}/bin/activate
37
+ python3 -m vllm.entrypoints.openai.api_server \
38
+ --model ${VLLM_MODEL_WEIGHTS} \
39
+ --served-model-name ${JOB_NAME} \
40
+ --host "0.0.0.0" \
41
+ --port ${vllm_port_number} \
42
+ --tensor-parallel-size ${NUM_GPUS} \
43
+ --dtype ${VLLM_DATA_TYPE} \
44
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
45
+ --trust-remote-code \
46
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
47
+ fi
vec_inf-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.1
2
+ Name: vec-inf
3
+ Version: 0.3.0
4
+ Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
+ License: MIT
6
+ Author: Marshall Wang
7
+ Author-email: marshall.wang@vectorinstitute.ai
8
+ Requires-Python: >=3.10,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Provides-Extra: dev
15
+ Requires-Dist: click (>=8.1.0,<9.0.0)
16
+ Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
17
+ Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
18
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
19
+ Requires-Dist: rich (>=13.7.0,<14.0.0)
20
+ Requires-Dist: vllm (>=0.5.0,<0.6.0) ; extra == "dev"
21
+ Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Vector Inference: Easy inference on Slurm clusters
25
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update the config files in the `vec_inf/models` folder and the environment variables in the model launching scripts in `vec_inf` accordingly.
26
+
27
+ ## Installation
28
+ If you are using the Vector cluster environment and you don't need any customization to the inference server environment, run the following to install the package:
29
+ ```bash
30
+ pip install vec-inf
31
+ ```
32
+ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package.
33
+
34
+ ## Launch an inference server
35
+ We will use the Llama 3 model as an example. To launch an inference server for Llama 3 8B, run:
36
+ ```bash
37
+ vec-inf launch llama-3
38
+ ```
39
+ You should see an output like the following:
40
+
41
+ <img src="https://github.com/user-attachments/assets/c50646df-0991-4164-ad8f-6eb7e86b67e0" width="350">
42
+
43
+ There is a default variant for every model family, specified in `vec_inf/models/{MODEL_FAMILY_NAME}/README.md`. You can switch to other variants with the `--model-variant` option; make sure to change the requested resources accordingly. More information about the available options can be found in the [`vec_inf/models`](vec_inf/models) folder. The inference server is compatible with the OpenAI `Completion` and `ChatCompletion` API.
44
+
45
+ You can check the inference server status by providing the Slurm job ID to the `status` command:
46
+ ```bash
47
+ vec-inf status 13014393
48
+ ```
49
+
50
+ You should see an output like the following:
51
+
52
+ <img src="https://github.com/user-attachments/assets/310086fd-82ea-4bfc-8062-5c8e71c5650c" width="400">
53
+
54
+ There are 5 possible states:
55
+
56
+ * **PENDING**: Job submitted to Slurm, but not executed yet.
57
+ * **LAUNCHING**: Job is running but the server is not ready yet.
58
+ * **READY**: Inference server running and ready to take requests.
59
+ * **FAILED**: Inference server in an unhealthy state.
60
+ * **SHUTDOWN**: Inference server is shutdown/cancelled.
61
+
62
+ Note that the base URL is only available when the model is in the `READY` state.
63
+ Both the `launch` and `status` commands support `--json-mode`, where the output is structured as a JSON string.
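For illustration, a sketch of what the `--json-mode` output of `status` can look like; the job ID, host, and port are placeholders, and the keys mirror the fields of the table output:

```bash
vec-inf status 13014393 --json-mode
# {'model_name': 'Meta-Llama-3-8B-Instruct', 'model_status': 'READY', 'base_url': 'http://<node-ip>:<port>/v1'}
```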
64
+
65
+ Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
66
+ ```bash
67
+ vec-inf shutdown 13014393
68
+
69
+ > Shutting down model with Slurm Job ID: 13014393
70
+ ```
71
+
72
+ Here is a more complicated example that launches a model variant across multiple nodes. Say we want to launch Mixtral 8x22B; run:
73
+ ```bash
74
+ vec-inf launch mixtral --model-variant 8x22B-v0.1 --num-nodes 2 --num-gpus 4
75
+ ```
76
+
77
+ And for launching a multimodal model, here is an example that launches LLaVa-NEXT Mistral 7B (the default variant):
78
+ ```bash
79
+ vec-inf launch llava-v1.6 --is-vlm
80
+ ```
81
+
82
+ ## Send inference requests
83
+ Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
84
+ > {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
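Alternatively, a minimal request can be sent directly to the OpenAI-compatible endpoint with `curl`; this is a sketch where the base URL comes from `vec-inf status` and the model name matches the served model name (both values below are placeholders):

```bash
curl http://<base-url-host>:<port>/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Meta-Llama-3-8B-Instruct",
          "prompt": "What is the capital of Canada?",
          "max_tokens": 20
        }'
```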
85
+
86
+ **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
87
+
88
+ ## SSH tunnel from your local device
89
+ If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
90
+ ```bash
91
+ ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
92
+ ```
93
+ The example provided above is for the Vector cluster; change the variables accordingly for your environment.
94
+
vec_inf-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,41 @@
1
+ vec_inf/README.md,sha256=6QAPmd9ccLDHmZMNs4Tjjv0dA28FQIVFJtgmnwgAkPE,389
2
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ vec_inf/cli/_cli.py,sha256=weUeKHz1Hjq4AnJAfw-YpFceKWThrs80FgfWS1Ccq5I,7332
5
+ vec_inf/cli/_utils.py,sha256=2Grz-bX_mGjzxXUBdrX7MbNfXUM7JQ3399GKe-N74FE,3910
6
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
+ vec_inf/launch_server.sh,sha256=dVBVx6udXjb2Vw2rRTddTewDuw0WtV8ne0ImS0brMVk,3577
8
+ vec_inf/models/CodeLlama/README.md,sha256=4R5Vp8mq_Qa9WgwGutihEw3vBc_5Euj-QTgHeq7s_ds,1145
9
+ vec_inf/models/CodeLlama/config.sh,sha256=_DFM1NJibpSmbOIlSKB28m0C5PzM9mb8jVLLigSTgiQ,136
10
+ vec_inf/models/Llama-2/README.md,sha256=0asL53BytxSeilUoyZmy1Z6bJd-UMgTkwu721HVNpn4,656
11
+ vec_inf/models/Llama-2/config.sh,sha256=rAjwo51rczP7VWr9nLsVrlWwRqWA9ncGJzr61LdTBU8,129
12
+ vec_inf/models/Meta-Llama-3/README.md,sha256=FQgpLniE_krQyoTe8ziRFyzyZMkNamCFAhqkd-N0TR8,704
13
+ vec_inf/models/Meta-Llama-3/config.sh,sha256=q-SpdvTIbC4-U8xfaV_Uzzodl5okxq_Z5YNnzGYwdVQ,136
14
+ vec_inf/models/Meta-Llama-3.1/README.md,sha256=yjzIg5xp5XgUzZxJmM3mz6uzqSl_n7tTTi9YyTuudAk,693
15
+ vec_inf/models/Meta-Llama-3.1/config.sh,sha256=XhV-e33tuNJYX32PHx8AxZ5sR_A_z3glcuDfiZooV0o,162
16
+ vec_inf/models/Mistral/README.md,sha256=uv4c_oHr3DAN_3fy0YfcGiIGmMdz1Vswx3wfaAcChlk,788
17
+ vec_inf/models/Mistral/config.sh,sha256=8UWTYouNmctOd_eM0ArmuXhSYRkwkMqLY8WbturH1wY,135
18
+ vec_inf/models/Mixtral/README.md,sha256=Ic94pH0NY-MniVR5b1uRDJrpYx1rVXLYQpjFEw98054,655
19
+ vec_inf/models/Mixtral/config.sh,sha256=AbTfEmzHZ3UX08WAa2zcgdGPDw178xtfCh7l3znZIUQ,137
20
+ vec_inf/models/Phi-3/README.md,sha256=lj8Bx538O0yC8SjID-GyFHDSf6MU6HezPdtqCO6zm1E,507
21
+ vec_inf/models/Phi-3/config.sh,sha256=vX6UWZg7YCtDAO3QKHz7PwvGJ5clp7QYnytNPFx4tZ0,161
22
+ vec_inf/models/README.md,sha256=V5atdrL3y6euM244iBrh6ASstWvr__uvCy3y7Ktg2qU,2390
23
+ vec_inf/models/c4ai-command-r/README.md,sha256=yGCYVzsMpBSYa2eSn-YU2kBFv3qW3acd-mHY7FLIc9M,406
24
+ vec_inf/models/c4ai-command-r/config.sh,sha256=InBRtlAHIxve3xbNN0UomMCh4xlAZlOQu-j4wWWc3Co,132
25
+ vec_inf/models/dbrx/README.md,sha256=MJRyZtqhYqN9_BvTD-lxqf34ytYCP6a1tg4_aWfJhsI,384
26
+ vec_inf/models/dbrx/config.sh,sha256=UjtHdUZ_6TKDGR8c449iVkaBa63a8Z3-IaxDq_KO4Go,126
27
+ vec_inf/models/gemma-2/README.md,sha256=4QMheXAZe0bNNQ2kEZn15I3x83rF9iLQnSUejx8p46o,628
28
+ vec_inf/models/gemma-2/config.sh,sha256=Tl1U774WXoOsAbiGa4tZGg53GWh_niqe5Z_bQ92VX1I,166
29
+ vec_inf/models/llava-1.5/README.md,sha256=YxkN_BWnK4nNf0rXi4_1isJVxlV73YhVABxhCjqNSvY,471
30
+ vec_inf/models/llava-1.5/chat_template.jinja,sha256=qCE9YwTfTa3jwjrB5yAnqVIm1bDkUBc5LjHBM0d9Sso,765
31
+ vec_inf/models/llava-1.5/config.sh,sha256=Yvb6s1mil0vmkVllnC3DjphpSkC2U5KOQG3l5OJawME,127
32
+ vec_inf/models/llava-v1.6/README.md,sha256=5JYjW0XoGJf40wd-oIdO3uAQYPv5XPZQuo5T37pxZcg,491
33
+ vec_inf/models/llava-v1.6/chat_template.jinja,sha256=qCE9YwTfTa3jwjrB5yAnqVIm1bDkUBc5LjHBM0d9Sso,765
34
+ vec_inf/models/llava-v1.6/config.sh,sha256=zuoK2cg5KgKbs9jk_M3R-vALGP1TesMkWEeRjSw209E,136
35
+ vec_inf/models/models.csv,sha256=JFGMhT9o7Pf0tkY-w2GRQG5MxdYK2V5T8s6bk166MpM,4720
36
+ vec_inf/multinode_vllm.slurm,sha256=pedYWIzPN-BKtL6ezoZSKJ3DO7RduDyAR4_cxZD4KyY,3938
37
+ vec_inf/vllm.slurm,sha256=6Nx14qyAwHlbweCbFMUcMV2jaZSv41ghkyx2MiHJY8Y,1608
38
+ vec_inf-0.3.0.dist-info/METADATA,sha256=Vqr7b5pmz4rWK1B4my9a_jG6BT5C_8XvGJtzjy3HVng,5142
39
+ vec_inf-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
40
+ vec_inf-0.3.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
41
+ vec_inf-0.3.0.dist-info/RECORD,,
vec_inf-0.3.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
vec_inf-0.3.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ vec-inf=vec_inf.cli._cli:cli
3
+