vec-inf 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -3,6 +3,6 @@
  * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
  * `list`: List all available model names, `--json-mode` supported.
  * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
- * `shutdown`: Shutdown a model by providing its Slurm job ID.
+ * `shutdown`: Shutdown a model by providing its Slurm job ID.

- Use `--help` to see all available options
+ Use `--help` to see all available options
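Since every command supports `--json-mode`, the CLI can also be driven programmatically. A small illustrative sketch (the model name is an example; the `--json-mode` output is a Python-dict-style string, so it is parsed with `ast.literal_eval` rather than `json.loads`):

```python
# Illustrative: drive the CLI from a script via --json-mode (model name is an example).
import ast
import subprocess

result = subprocess.run(
    ["vec-inf", "launch", "Meta-Llama-3.1-8B-Instruct", "--json-mode"],
    capture_output=True,
    text=True,
    check=True,
)
# The CLI echoes a Python-dict-style string, so parse it with ast.literal_eval.
job_info = ast.literal_eval(result.stdout.strip())
print(job_info["slurm_job_id"])
```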
vec_inf/cli/_cli.py CHANGED
@@ -1,12 +1,12 @@
  import os
+ from typing import Optional

  import click
- from rich.console import Console
  from rich.columns import Columns
+ from rich.console import Console
  from rich.panel import Panel

- from ._utils import *
-
+ import vec_inf.cli._utils as utils

  CONSOLE = Console()

@@ -18,122 +18,107 @@ def cli():


  @cli.command("launch")
- @click.argument(
- "model-name",
- type=str,
- nargs=1
- )
- @click.option(
- "--model-family",
- type=str,
- help='The model family'
- )
- @click.option(
- "--model-variant",
- type=str,
- help='The model variant'
- )
+ @click.argument("model-name", type=str, nargs=1)
+ @click.option("--model-family", type=str, help="The model family")
+ @click.option("--model-variant", type=str, help="The model variant")
  @click.option(
  "--max-model-len",
  type=int,
- help='Model context length. If unspecified, will be automatically derived from the model config.'
- )
- @click.option(
- "--partition",
- type=str,
- help='Type of compute partition, default to a40'
+ help="Model context length. If unspecified, will be automatically derived from the model config.",
  )
+ @click.option("--partition", type=str, help="Type of compute partition, default to a40")
  @click.option(
  "--num-nodes",
  type=int,
- help='Number of nodes to use, default to suggested resource allocation for model'
+ help="Number of nodes to use, default to suggested resource allocation for model",
  )
  @click.option(
  "--num-gpus",
  type=int,
- help='Number of GPUs/node to use, default to suggested resource allocation for model'
+ help="Number of GPUs/node to use, default to suggested resource allocation for model",
  )
  @click.option(
  "--qos",
  type=str,
- help='Quality of service, default depends on suggested resource allocation required for the model'
+ help="Quality of service, default depends on suggested resource allocation required for the model",
  )
  @click.option(
  "--time",
  type=str,
- help='Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS'
+ help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
  )
  @click.option(
- "--data-type",
- type=str,
- help='Model data type, default to auto'
- )
- @click.option(
- "--venv",
- type=str,
- help='Path to virtual environment'
+ "--vocab-size",
+ type=int,
+ help="Vocabulary size, this option is intended for custom models",
  )
+ @click.option("--data-type", type=str, help="Model data type, default to auto")
+ @click.option("--venv", type=str, help="Path to virtual environment")
  @click.option(
  "--log-dir",
  type=str,
- help='Path to slurm log directory, default to .vec-inf-logs in home directory'
+ help="Path to slurm log directory, default to .vec-inf-logs in home directory",
  )
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
  def launch(
  model_name: str,
- model_family: str=None,
- model_variant: str=None,
- max_model_len: int=None,
- partition: str=None,
- num_nodes: int=None,
- num_gpus: int=None,
- qos: str=None,
- time: str=None,
- data_type: str=None,
- venv: str=None,
- log_dir: str=None,
- json_mode: bool=False
+ model_family: Optional[str] = None,
+ model_variant: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ partition: Optional[str] = None,
+ num_nodes: Optional[int] = None,
+ num_gpus: Optional[int] = None,
+ qos: Optional[str] = None,
+ time: Optional[str] = None,
+ vocab_size: Optional[int] = None,
+ data_type: Optional[str] = None,
+ venv: Optional[str] = None,
+ log_dir: Optional[str] = None,
+ json_mode: bool = False,
  ) -> None:
  """
  Launch a model on the cluster
  """
  launch_script_path = os.path.join(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
- "launch_server.sh"
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
  )
- launch_cmd = f"bash {launch_script_path}"
-
- models_df = load_models_df()
+ launch_cmd = f"bash {launch_script_path}"

- if model_name not in models_df['model_name'].values:
- raise ValueError(f"Model name {model_name} not found in available models")
+ models_df = utils.load_models_df()

- default_args = load_default_args(models_df, model_name)
+ if model_name in models_df["model_name"].values:
+ default_args = utils.load_default_args(models_df, model_name)
+ for arg in default_args:
+ if arg in locals() and locals()[arg] is not None:
+ default_args[arg] = locals()[arg]
+ renamed_arg = arg.replace("_", "-")
+ launch_cmd += f" --{renamed_arg} {default_args[arg]}"
+ else:
+ model_args = models_df.columns.tolist()
+ excluded_keys = ["model_name", "pipeline_parallelism"]
+ for arg in model_args:
+ if arg not in excluded_keys and locals()[arg] is not None:
+ renamed_arg = arg.replace("_", "-")
+ launch_cmd += f" --{renamed_arg} {locals()[arg]}"

- for arg in default_args:
- if arg in locals() and locals()[arg] is not None:
- default_args[arg] = locals()[arg]
- renamed_arg = arg.replace("_", "-")
- launch_cmd += f" --{renamed_arg} {default_args[arg]}"
-
- output = run_bash_command(launch_cmd)
+ output = utils.run_bash_command(launch_cmd)

  slurm_job_id = output.split(" ")[-1].strip().strip("\n")
  output_lines = output.split("\n")[:-2]

- table = create_table(key_title="Job Config", value_title="Value")
+ table = utils.create_table(key_title="Job Config", value_title="Value")
  table.add_row("Slurm Job ID", slurm_job_id, style="blue")
  output_dict = {"slurm_job_id": slurm_job_id}
-
+
  for line in output_lines:
  key, value = line.split(": ")
  table.add_row(key, value)
  output_dict[key.lower().replace(" ", "_")] = value
-
+
  if json_mode:
  click.echo(output_dict)
  else:
@@ -141,27 +126,25 @@ def launch(


  @cli.command("status")
- @click.argument(
- "slurm_job_id",
- type=int,
- nargs=1
- )
+ @click.argument("slurm_job_id", type=int, nargs=1)
  @click.option(
  "--log-dir",
  type=str,
- help='Path to slurm log directory. This is required if --log-dir was set in model launch'
+ help="Path to slurm log directory. This is required if --log-dir was set in model launch",
  )
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
- def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
+ def status(
+ slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
+ ) -> None:
  """
  Get the status of a running model on the cluster
  """
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
- output = run_bash_command(status_cmd)
+ output = utils.run_bash_command(status_cmd)

  slurm_job_name = "UNAVAILABLE"
  status = "SHUTDOWN"
@@ -181,36 +164,39 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
  # If Slurm job is currently RUNNING
  elif slurm_job_state == "RUNNING":
  # Check whether the server is ready, if yes, run model health check to further determine status
- server_status = is_server_running(slurm_job_name, slurm_job_id, log_dir)
+ server_status = utils.is_server_running(slurm_job_name, slurm_job_id, log_dir)
  # If server status is a tuple, then server status is "FAILED"
- if type(server_status) is tuple:
+ if isinstance(server_status, tuple):
  status = server_status[0]
  slurm_job_failed_reason = server_status[1]
  elif server_status == "RUNNING":
- status = model_health_check(slurm_job_name, slurm_job_id, log_dir)
- if status == "READY":
+ model_status = utils.model_health_check(
+ slurm_job_name, slurm_job_id, log_dir
+ )
+ if model_status == "READY":
  # Only set base_url if model is ready to serve requests
- base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
+ base_url = utils.get_base_url(slurm_job_name, slurm_job_id, log_dir)
+ status = "READY"
  else:
  # If model is not ready, then status must be "FAILED"
- status = status[0]
- slurm_job_failed_reason = status[1]
+ status = model_status[0]
+ slurm_job_failed_reason = str(model_status[1])
  else:
  status = server_status

  if json_mode:
  status_dict = {
- "model_name": slurm_job_name,
- "model_status": status,
- "base_url": base_url
+ "model_name": slurm_job_name,
+ "model_status": status,
+ "base_url": base_url,
  }
  if "slurm_job_pending_reason" in locals():
  status_dict["pending_reason"] = slurm_job_pending_reason
  if "slurm_job_failed_reason" in locals():
  status_dict["failed_reason"] = slurm_job_failed_reason
- click.echo(f'{status_dict}')
+ click.echo(f"{status_dict}")
  else:
- table = create_table(key_title="Job Status", value_title="Value")
+ table = utils.create_table(key_title="Job Status", value_title="Value")
  table.add_row("Model Name", slurm_job_name)
  table.add_row("Model Status", status, style="blue")
  if "slurm_job_pending_reason" in locals():
@@ -219,60 +205,54 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
  table.add_row("Reason", slurm_job_failed_reason)
  table.add_row("Base URL", base_url)
  CONSOLE.print(table)
-
+

  @cli.command("shutdown")
- @click.argument(
- "slurm_job_id",
- type=int,
- nargs=1
- )
+ @click.argument("slurm_job_id", type=int, nargs=1)
  def shutdown(slurm_job_id: int) -> None:
  """
  Shutdown a running model on the cluster
  """
  shutdown_cmd = f"scancel {slurm_job_id}"
- run_bash_command(shutdown_cmd)
+ utils.run_bash_command(shutdown_cmd)
  click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")


  @cli.command("list")
- @click.argument(
- "model-name",
- required=False)
+ @click.argument("model-name", required=False)
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
- def list(model_name: str=None, json_mode: bool=False) -> None:
+ def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
  """
  List all available models, or get default setup of a specific model
  """
- models_df = load_models_df()
+ models_df = utils.load_models_df()

  if model_name:
- if model_name not in models_df['model_name'].values:
+ if model_name not in models_df["model_name"].values:
  raise ValueError(f"Model name {model_name} not found in available models")
-
- excluded_keys = {'venv', 'log_dir', 'pipeline_parallelism'}
- model_row = models_df.loc[models_df['model_name'] == model_name]
+
+ excluded_keys = {"venv", "log_dir", "pipeline_parallelism"}
+ model_row = models_df.loc[models_df["model_name"] == model_name]

  if json_mode:
  # click.echo(model_row.to_json(orient='records'))
- filtered_model_row = model_row.drop(columns=excluded_keys, errors='ignore')
- click.echo(filtered_model_row.to_json(orient='records'))
+ filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
+ click.echo(filtered_model_row.to_json(orient="records"))
  return
- table = create_table(key_title="Model Config", value_title="Value")
+ table = utils.create_table(key_title="Model Config", value_title="Value")
  for _, row in model_row.iterrows():
  for key, value in row.items():
  if key not in excluded_keys:
  table.add_row(key, str(value))
  CONSOLE.print(table)
  return
-
+
  if json_mode:
- click.echo(models_df['model_name'].to_json(orient='records'))
+ click.echo(models_df["model_name"].to_json(orient="records"))
  return
  panels = []
  for _, row in models_df.iterrows():
@@ -281,5 +261,5 @@ def list(model_name: str=None, json_mode: bool=False) -> None:
  CONSOLE.print(Columns(panels, equal=True))


- if __name__ == '__main__':
- cli()
+ if __name__ == "__main__":
+ cli()
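For readers skimming the diff: the reworked `launch` command now merges user-supplied CLI options over the per-model defaults loaded from `models.csv`, and for models not listed there it forwards only the explicitly provided options. A minimal, standalone sketch of that merge (illustrative names; not the packaged implementation):

```python
# Sketch of the merge the new `launch` command performs (illustrative, standalone;
# not the packaged implementation): per-model defaults come from models.csv,
# and any CLI option the user actually passed wins over the default.
def build_launch_cmd(defaults: dict, overrides: dict, script: str = "launch_server.sh") -> str:
    cmd = f"bash {script}"
    for key, default_value in defaults.items():
        value = overrides.get(key)
        chosen = value if value is not None else default_value
        cmd += f" --{key.replace('_', '-')} {chosen}"
    return cmd


if __name__ == "__main__":
    defaults = {"partition": "a40", "num_nodes": 1, "num_gpus": 1, "max_model_len": 8192}
    overrides = {"num_gpus": 4}  # e.g. --num-gpus 4 on the command line
    print(build_launch_cmd(defaults, overrides))
    # bash launch_server.sh --partition a40 --num-nodes 1 --num-gpus 4 --max-model-len 8192
```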
vec_inf/cli/_utils.py CHANGED
@@ -1,11 +1,10 @@
- import subprocess
  import os
- from typing import Union
+ import subprocess
+ from typing import Optional, Union

+ import pandas as pd
  import requests
  from rich.table import Table
- import pandas as pd
-

  MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
  SERVER_ADDRESS_SIGNATURE = "Server address: "
@@ -15,45 +14,50 @@ def run_bash_command(command: str) -> str:
  """
  Run a bash command and return the output
  """
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ process = subprocess.Popen(
+ command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+ )
  stdout, _ = process.communicate()
  return stdout


  def read_slurm_log(
- slurm_job_name: str,
- slurm_job_id: int,
- slurm_log_type: str,
- log_dir: str
- ) -> Union[list, str]:
+ slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
+ ) -> Union[list[str], str]:
  """
  Get the directory of a model
  """
  if not log_dir:
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
-
+
  for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
  if dir in slurm_job_name:
  log_dir = os.path.join(models_dir, dir)
  break
-
+
  try:
- file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
- with open(file_path, 'r') as file:
+ file_path = os.path.join(
+ log_dir, # type: ignore
+ f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
+ )
+ with open(file_path, "r") as file:
  lines = file.readlines()
  except FileNotFoundError:
  print(f"Could not find file: {file_path}")
  return "LOG_FILE_NOT_FOUND"
  return lines

- def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
+
+ def is_server_running(
+ slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+ ) -> Union[str, tuple[str, str]]:
  """
  Check if a model is ready to serve requests
  """
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
- if type(log_content) is str:
+ if isinstance(log_content, str):
  return log_content
-
+
  for line in log_content:
  if "error" in line.lower():
  return ("FAILED", line.strip("\n"))
@@ -62,21 +66,23 @@ is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> U
  return "LAUNCHING"


- def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
+ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
  """
  Get the base URL of a model
  """
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
- if type(log_content) is str:
+ if isinstance(log_content, str):
  return log_content
-
+
  for line in log_content:
  if SERVER_ADDRESS_SIGNATURE in line:
  return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
  return "URL_NOT_FOUND"


- def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
+ def model_health_check(
+ slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+ ) -> Union[str, tuple[str, Union[str, int]]]:
  """
  Check the health of a running model on the cluster
  """
@@ -94,9 +100,11 @@ def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) ->
  return ("FAILED", response.status_code)
  except requests.exceptions.RequestException as e:
  return ("FAILED", str(e))
-

- def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
+
+ def create_table(
+ key_title: str = "", value_title: str = "", show_header: bool = True
+ ) -> Table:
  """
  Create a table for displaying model status
  """
@@ -113,7 +121,7 @@ def load_models_df() -> pd.DataFrame:
  models_df = pd.read_csv(
  os.path.join(
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
- "models/models.csv"
+ "models/models.csv",
  )
  )
  return models_df
@@ -126,4 +134,4 @@ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
  row_data = models_df.loc[models_df["model_name"] == model_name]
  default_args = row_data.iloc[0].to_dict()
  default_args.pop("model_name")
- return default_args
+ return default_args
vec_inf/launch_server.sh CHANGED
@@ -76,7 +76,7 @@ mkdir -p $LOG_DIR
  export SRC_DIR="$(dirname "$0")"
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
  export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
-
+
  # Variables specific to your working environment, below are examples for the Vector cluster
  export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
  export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
@@ -119,4 +119,4 @@ sbatch --job-name $JOB_NAME \
  --time $WALLTIME \
  --output $LOG_DIR/$JOB_NAME.%j.out \
  --error $LOG_DIR/$JOB_NAME.%j.err \
- $SRC_DIR/${is_special}vllm.slurm
+ $SRC_DIR/${is_special}vllm.slurm
vec_inf/models/README.md CHANGED
@@ -1,13 +1,13 @@
  # Available Models
  More profiling metrics coming soon!

- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
+ ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |

- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+ ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +20,13 @@ More profiling metrics coming soon!
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |

- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
+ ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |

- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
+ ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,32 +35,32 @@ More profiling metrics coming soon!
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |

- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+ ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
  |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |

- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+ ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
  |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |

- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+ ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)

  | Variant | Suggested resource allocation |
  |:----------:|:----------:|
- | [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
- | [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
- | [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
+ | [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
+ | [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
+ | [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
  | [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |

- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
+ ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +69,7 @@ More profiling metrics coming soon!
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |

- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
+ ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,7 +79,7 @@ More profiling metrics coming soon!
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |

- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
+ ## [Mistral AI: Mistral](https://huggingface.co/mistralai)

  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -90,7 +90,7 @@ More profiling metrics coming soon!
  |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
  |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 4x a40 | - tokens/s | - tokens/s|

- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
+ ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)

  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -98,9 +98,9 @@ More profiling metrics coming soon!
  |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
  |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|

- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+ ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
- | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+ | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -42,4 +42,5 @@ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,
  Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
  Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
  Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
- Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+ Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
vec_inf/multinode_vllm.slurm CHANGED
@@ -64,7 +64,7 @@ for ((i = 1; i <= worker_num; i++)); do
  ray start --address "$ip_head" \
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
  fi
-
+
  sleep 5
  done

@@ -83,7 +83,7 @@ else
  fi

  # Activate vllm venv
- if [ "$VENV_BASE" = "singularity" ]; then
+ if [ "$VENV_BASE" = "singularity" ]; then
  singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
  python3.10 -m vllm.entrypoints.openai.api_server \
  --model ${VLLM_MODEL_WEIGHTS} \
@@ -93,7 +93,6 @@ if [ "$VENV_BASE" = "singularity" ]; then
  --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
  --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
  --dtype ${VLLM_DATA_TYPE} \
- --load-format safetensors \
  --trust-remote-code \
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
@@ -107,8 +106,7 @@ else
  --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
  --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
  --dtype ${VLLM_DATA_TYPE} \
- --load-format safetensors \
  --trust-remote-code \
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
- fi
+ fi
vec_inf/vllm.slurm CHANGED
@@ -41,7 +41,7 @@ else
  --port ${vllm_port_number} \
  --tensor-parallel-size ${NUM_GPUS} \
  --dtype ${VLLM_DATA_TYPE} \
- --max-logprobs ${VLLM_MAX_LOGPROBS} \
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --trust-remote-code \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
- fi
+ fi
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vec-inf
- Version: 0.3.1
+ Version: 0.3.3
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
  License: MIT
  Author: Marshall Wang
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.12
  Provides-Extra: dev
  Requires-Dist: click (>=8.1.0,<9.0.0)
  Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
  Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
  Requires-Dist: requests (>=2.31.0,<3.0.0)
  Requires-Dist: rich (>=13.7.0,<14.0.0)
@@ -22,7 +23,7 @@ Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
  Description-Content-Type: text/markdown

  # Vector Inference: Easy inference on Slurm clusters
- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.

  ## Installation
  If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -40,7 +41,7 @@ You should see an output like the following:

  <img width="400" alt="launch_img" src="https://github.com/user-attachments/assets/557eb421-47db-4810-bccd-c49c526b1b43">

- The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
+ The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.

  You can check the inference server status by providing the Slurm job ID to the `status` command:
  ```bash
@@ -55,7 +56,7 @@ There are 5 possible states:

  * **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
  * **LAUNCHING**: Job is running but the server is not ready yet.
- * **READY**: Inference server running and ready to take requests.
+ * **READY**: Inference server running and ready to take requests.
  * **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
  * **SHUTDOWN**: Inference server is shutdown/cancelled.

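For reference, the `status --json-mode` output mirrors the `status_dict` built in `_cli.py`; a hypothetical example of its shape (values are illustrative):

```python
# Hypothetical `vec-inf status <job id> --json-mode` output; the keys mirror
# status_dict in _cli.py, the values here are made up for illustration.
example_status = {
    "model_name": "Meta-Llama-3.1-8B-Instruct",
    "model_status": "READY",
    "base_url": "http://gpu123:8080/v1",  # placeholder; only set once the server is READY
}
# A PENDING job carries an extra "pending_reason" key, and a FAILED one a "failed_reason" key.
```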
84
85
 
85
86
  ## Send inference requests
86
87
  Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
87
- > {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
88
+ > {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
88
89
 
89
90
  **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
90
91
 
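As a complement to the bundled example scripts, a minimal request sketch against the OpenAI-compatible endpoint; the base URL is a placeholder and is assumed here to already include the `/v1` prefix:

```python
# Minimal completion request to a launched server (sketch; the URL and model
# name are placeholders, use the base_url reported by `vec-inf status`).
import requests

BASE_URL = "http://gpu123:8080/v1"  # placeholder, assumed to include the /v1 prefix

response = requests.post(
    f"{BASE_URL}/completions",
    json={
        "model": "Meta-Llama-3.1-8B-Instruct",  # the model name used at launch
        "prompt": "What is the capital of Canada?",
        "max_tokens": 20,
    },
    timeout=60,
)
print(response.json())  # same shape as the example output shown above
```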
@@ -0,0 +1,15 @@
+ vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
+ vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
+ vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
+ vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
+ vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
+ vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
+ vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
+ vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
+ vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
+ vec_inf-0.3.3.dist-info/RECORD,,
@@ -1,15 +0,0 @@
- vec_inf/README.md,sha256=jtvslzw1MjTFFIXwzlrb0NstUyTEDL0S_k27K5bLl34,499
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/_cli.py,sha256=8UHNFitbmq1OTNO1cLM_LVuHFndnvNyQSezGs1oT3tc,8346
- vec_inf/cli/_utils.py,sha256=2Grz-bX_mGjzxXUBdrX7MbNfXUM7JQ3399GKe-N74FE,3910
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
- vec_inf/launch_server.sh,sha256=BW5oK_10OjfHXhIsdf9vPsEBlCXh8j2lOV7qvSlPcZU,3998
- vec_inf/models/README.md,sha256=y_Cr1ZAkqIw1vIEOZMEp4FsyLGVijDoIoqwxn6aeQwo,8138
- vec_inf/models/models.csv,sha256=JFGMhT9o7Pf0tkY-w2GRQG5MxdYK2V5T8s6bk166MpM,4720
- vec_inf/multinode_vllm.slurm,sha256=pedYWIzPN-BKtL6ezoZSKJ3DO7RduDyAR4_cxZD4KyY,3938
- vec_inf/vllm.slurm,sha256=6Nx14qyAwHlbweCbFMUcMV2jaZSv41ghkyx2MiHJY8Y,1608
- vec_inf-0.3.1.dist-info/METADATA,sha256=xRhpXmFmMv5A77xdJaKBo_m7UXC13CkBmzegnQzQnPg,5701
- vec_inf-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- vec_inf-0.3.1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
- vec_inf-0.3.1.dist-info/RECORD,,