vec-inf 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. vec_inf/README.md +5 -0
  2. vec_inf/__init__.py +0 -0
  3. vec_inf/cli/__init__.py +0 -0
  4. vec_inf/cli/_cli.py +262 -0
  5. vec_inf/cli/_utils.py +129 -0
  6. vec_inf/find_port.sh +39 -0
  7. vec_inf/launch_server.sh +104 -0
  8. vec_inf/models/CodeLlama/README.md +12 -0
  9. vec_inf/models/CodeLlama/config.sh +5 -0
  10. vec_inf/models/Llama-2/README.md +10 -0
  11. vec_inf/models/Llama-2/config.sh +5 -0
  12. vec_inf/models/Meta-Llama-3/README.md +8 -0
  13. vec_inf/models/Meta-Llama-3/config.sh +5 -0
  14. vec_inf/models/Meta-Llama-3.1/README.md +8 -0
  15. vec_inf/models/Meta-Llama-3.1/config.sh +6 -0
  16. vec_inf/models/Mistral/README.md +10 -0
  17. vec_inf/models/Mistral/config.sh +5 -0
  18. vec_inf/models/Mixtral/README.md +8 -0
  19. vec_inf/models/Mixtral/config.sh +5 -0
  20. vec_inf/models/Phi-3/README.md +6 -0
  21. vec_inf/models/Phi-3/config.sh +6 -0
  22. vec_inf/models/README.md +31 -0
  23. vec_inf/models/c4ai-command-r/README.md +5 -0
  24. vec_inf/models/c4ai-command-r/config.sh +5 -0
  25. vec_inf/models/dbrx/README.md +5 -0
  26. vec_inf/models/dbrx/config.sh +5 -0
  27. vec_inf/models/gemma-2/README.md +8 -0
  28. vec_inf/models/gemma-2/config.sh +6 -0
  29. vec_inf/models/llava-1.5/README.md +7 -0
  30. vec_inf/models/llava-1.5/chat_template.jinja +23 -0
  31. vec_inf/models/llava-1.5/config.sh +5 -0
  32. vec_inf/models/llava-v1.6/README.md +7 -0
  33. vec_inf/models/llava-v1.6/chat_template.jinja +23 -0
  34. vec_inf/models/llava-v1.6/config.sh +5 -0
  35. vec_inf/models/models.csv +45 -0
  36. vec_inf/multinode_vllm.slurm +114 -0
  37. vec_inf/vllm.slurm +47 -0
  38. vec_inf-0.3.0.dist-info/METADATA +94 -0
  39. vec_inf-0.3.0.dist-info/RECORD +41 -0
  40. vec_inf-0.3.0.dist-info/WHEEL +4 -0
  41. vec_inf-0.3.0.dist-info/entry_points.txt +3 -0
vec_inf/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # `vec-inf` Commands
2
+
3
+ * `launch`: Specify a model family and other optional parameters to launch an OpenAI-compatible inference server; `--json-mode` supported. Check [`here`](./models/README.md) for the complete list of available options; a usage sketch follows this list.
4
+ * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
5
+ * `shutdown`: Shutdown a model by providing its Slurm job ID.
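A minimal usage sketch of these commands, assuming the `vec-inf` entry point is installed; the Slurm job ID is a placeholder and the model name is one of the entries in `models/models.csv`:

```bash
# Launch a model server; the command prints the Slurm job ID and job config
vec-inf launch Meta-Llama-3-8B-Instruct --json-mode

# Check the server status for that job (the base URL appears once the state is READY)
vec-inf status 13014393 --json-mode

# Shut the server down when finished
vec-inf shutdown 13014393
```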
vec_inf/__init__.py ADDED
File without changes
vec_inf/cli/__init__.py ADDED
File without changes
vec_inf/cli/_cli.py ADDED
@@ -0,0 +1,262 @@
1
+ import os
2
+
3
+ import click
4
+ import pandas as pd
5
+ from rich.console import Console
6
+ from rich.columns import Columns
7
+ from rich.panel import Panel
8
+
9
+ from ._utils import *
10
+
11
+
12
+ CONSOLE = Console()
13
+
14
+
15
+ @click.group()
16
+ def cli():
17
+ """Vector Inference CLI"""
18
+ pass
19
+
20
+
21
+ @cli.command("launch")
22
+ @click.argument(
23
+ "model-name",
24
+ type=str,
25
+ nargs=1
26
+ )
27
+ @click.option(
28
+ "--model-family",
29
+ type=str,
30
+ help='The model family name according to the directories in `models`'
31
+ )
32
+ @click.option(
33
+ "--model-variant",
34
+ type=str,
35
+ help='The model variant according to the README in `models/model-family`'
36
+ )
37
+ @click.option(
38
+ "--max-model-len",
39
+ type=int,
40
+ help='Model context length. If unspecified, will be automatically derived from the model config.'
41
+ )
42
+ @click.option(
43
+ "--partition",
44
+ type=str,
45
+ help='Type of compute partition, defaults to a40'
46
+ )
47
+ @click.option(
48
+ "--num-nodes",
49
+ type=int,
50
+ help='Number of nodes to use, defaults to the suggested resource allocation for the model'
51
+ )
52
+ @click.option(
53
+ "--num-gpus",
54
+ type=int,
55
+ help='Number of GPUs/node to use, defaults to the suggested resource allocation for the model'
56
+ )
57
+ @click.option(
58
+ "--qos",
59
+ type=str,
60
+ help='Quality of service, defaults to m3'
61
+ )
62
+ @click.option(
63
+ "--time",
64
+ type=str,
65
+ help='Time limit for the job; this should comply with the QoS, defaults to 4:00:00'
66
+ )
67
+ @click.option(
68
+ "--data-type",
69
+ type=str,
70
+ help='Model data type, defaults to auto'
71
+ )
72
+ @click.option(
73
+ "--venv",
74
+ type=str,
75
+ help='Path to virtual environment'
76
+ )
77
+ @click.option(
78
+ "--log-dir",
79
+ type=str,
80
+ help='Path to slurm log directory'
81
+ )
82
+ @click.option(
83
+ "--json-mode",
84
+ is_flag=True,
85
+ help='Output in JSON string',
86
+ )
87
+ def launch(
88
+ model_name: str,
89
+ model_family: str=None,
90
+ model_variant: str=None,
91
+ max_model_len: int=None,
92
+ partition: str=None,
93
+ num_nodes: int=None,
94
+ num_gpus: int=None,
95
+ qos: str=None,
96
+ time: str=None,
97
+ data_type: str=None,
98
+ venv: str=None,
99
+ log_dir: str=None,
100
+ json_mode: bool=False
101
+ ) -> None:
102
+ """
103
+ Launch a model on the cluster
104
+ """
105
+ launch_script_path = os.path.join(
106
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
107
+ "launch_server.sh"
108
+ )
109
+ launch_cmd = f"bash {launch_script_path}"
110
+
111
+ models_df = load_models_df()
112
+
113
+ if model_name not in models_df['model_name'].values:
114
+ raise ValueError(f"Model name {model_name} not found in available models")
115
+
116
+ default_args = load_default_args(models_df, model_name)
117
+
118
+ for arg in default_args:
119
+ if arg in locals() and locals()[arg] is not None:
120
+ default_args[arg] = locals()[arg]
121
+ renamed_arg = arg.replace("_", "-")
122
+ launch_cmd += f" --{renamed_arg} {default_args[arg]}"
123
+
124
+ output = run_bash_command(launch_cmd)
125
+
126
+ slurm_job_id = output.split(" ")[-1].strip().strip("\n")
127
+ output_lines = output.split("\n")[:-2]
128
+
129
+ table = create_table(key_title="Job Config", value_title="Value")
130
+ table.add_row("Slurm Job ID", slurm_job_id, style="blue")
131
+ output_dict = {"slurm_job_id": slurm_job_id}
132
+
133
+ for line in output_lines:
134
+ key, value = line.split(": ")
135
+ table.add_row(key, value)
136
+ output_dict[key.lower().replace(" ", "_")] = value
137
+
138
+ if json_mode:
139
+ click.echo(output_dict)
140
+ else:
141
+ CONSOLE.print(table)
142
+
143
+
144
+ @cli.command("status")
145
+ @click.argument(
146
+ "slurm_job_id",
147
+ type=int,
148
+ nargs=1
149
+ )
150
+ @click.option(
151
+ "--log-dir",
152
+ type=str,
153
+ help='Path to slurm log directory. This is required if it was set when launching the model'
154
+ )
155
+ @click.option(
156
+ "--json-mode",
157
+ is_flag=True,
158
+ help='Output in JSON string',
159
+ )
160
+ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
161
+ """
162
+ Get the status of a running model on the cluster
163
+ """
164
+ status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
165
+ output = run_bash_command(status_cmd)
166
+
167
+ slurm_job_name = "UNAVAILABLE"
168
+ status = "SHUTDOWN"
169
+ base_url = "UNAVAILABLE"
170
+
171
+ try:
172
+ slurm_job_name = output.split(" ")[1].split("=")[1]
173
+ slurm_job_state = output.split(" ")[9].split("=")[1]
174
+ except IndexError:
175
+ # Job ID not found
176
+ slurm_job_state = "UNAVAILABLE"
177
+
178
+ # If Slurm job is currently PENDING
179
+ if slurm_job_state == "PENDING":
180
+ slurm_job_pending_reason = output.split(" ")[10].split("=")[1]
181
+ status = "PENDING"
182
+ # If Slurm job is currently RUNNING
183
+ elif slurm_job_state == "RUNNING":
184
+ # Check whether the server is ready, if yes, run model health check to further determine status
185
+ server_status = is_server_running(slurm_job_name, slurm_job_id, log_dir)
186
+ # If server status is a tuple, then server status is "FAILED"
187
+ if type(server_status) is tuple:
188
+ status = server_status[0]
189
+ slurm_job_failed_reason = server_status[1]
190
+ elif server_status == "RUNNING":
191
+ status = model_health_check(slurm_job_name, slurm_job_id, log_dir)
192
+ if status == "READY":
193
+ # Only set base_url if model is ready to serve requests
194
+ base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
195
+ else:
196
+ # If the model is not ready, the health check returned a ("FAILED", reason) tuple
197
+ slurm_job_failed_reason = status[1]
198
+ status = status[0]
199
+ else:
200
+ status = server_status
201
+
202
+ if json_mode:
203
+ status_dict = {
204
+ "model_name": slurm_job_name,
205
+ "model_status": status,
206
+ "base_url": base_url
207
+ }
208
+ if "slurm_job_pending_reason" in locals():
209
+ status_dict["pending_reason"] = slurm_job_pending_reason
210
+ if "slurm_job_failed_reason" in locals():
211
+ status_dict["failed_reason"] = slurm_job_failed_reason
212
+ click.echo(f'{status_dict}')
213
+ else:
214
+ table = create_table(key_title="Job Status", value_title="Value")
215
+ table.add_row("Model Name", slurm_job_name)
216
+ table.add_row("Model Status", status, style="blue")
217
+ if "slurm_job_pending_reason" in locals():
218
+ table.add_row("Reason", slurm_job_pending_reason)
219
+ if "slurm_job_failed_reason" in locals():
220
+ table.add_row("Reason", slurm_job_failed_reason)
221
+ table.add_row("Base URL", base_url)
222
+ CONSOLE.print(table)
223
+
224
+
225
+ @cli.command("shutdown")
226
+ @click.argument(
227
+ "slurm_job_id",
228
+ type=int,
229
+ nargs=1
230
+ )
231
+ def shutdown(slurm_job_id: int) -> None:
232
+ """
233
+ Shutdown a running model on the cluster
234
+ """
235
+ shutdown_cmd = f"scancel {slurm_job_id}"
236
+ run_bash_command(shutdown_cmd)
237
+ click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
238
+
239
+
240
+ @cli.command("list")
241
+ @click.option(
242
+ "--json-mode",
243
+ is_flag=True,
244
+ help='Output in JSON string',
245
+ )
246
+ def list(json_mode: bool=False) -> None:
247
+ """
248
+ List all available models
249
+ """
250
+ models_df = load_models_df()
251
+ if json_mode:
252
+ click.echo(models_df['model_name'].to_json(orient='records'))
253
+ return
254
+ panels = []
255
+ for _, row in models_df.iterrows():
256
+ styled_text = f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
257
+ panels.append(Panel(styled_text, expand=True))
258
+ CONSOLE.print(Columns(panels, equal=True))
259
+
260
+
261
+ if __name__ == '__main__':
262
+ cli()
vec_inf/cli/_utils.py ADDED
@@ -0,0 +1,129 @@
1
+ import subprocess
2
+ import os
3
+ from typing import Union
4
+
5
+ import requests
6
+ from rich.table import Table
7
+ import pandas as pd
8
+
9
+
10
+ MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
11
+ SERVER_ADDRESS_SIGNATURE = "Server address: "
12
+
13
+
14
+ def run_bash_command(command: str) -> str:
15
+ """
16
+ Run a bash command and return the output
17
+ """
18
+ process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
19
+ stdout, _ = process.communicate()
20
+ return stdout
21
+
22
+
23
+ def read_slurm_log(
24
+ slurm_job_name: str,
25
+ slurm_job_id: int,
26
+ slurm_log_type: str,
27
+ log_dir: str
28
+ ) -> Union[list, str]:
29
+ """
30
+ Read the Slurm log file of a job and return its lines
31
+ """
32
+ if not log_dir:
33
+ models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
34
+
35
+ for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
36
+ if dir in slurm_job_name:
37
+ log_dir = os.path.join(models_dir, dir)
38
+ break
39
+
40
+ try:
41
+ file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
42
+ with open(file_path, 'r') as file:
43
+ lines = file.readlines()
44
+ except FileNotFoundError:
45
+ print(f"Could not find file: {file_path}")
46
+ return "LOG_FILE_NOT_FOUND"
47
+ return lines
48
+
49
+ def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
50
+ """
51
+ Check if a model is ready to serve requests
52
+ """
53
+ log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
54
+ if type(log_content) is str:
55
+ return log_content
56
+
57
+ for line in log_content:
58
+ if "error" in line.lower():
59
+ return ("FAILED", line.strip("\n"))
60
+ if MODEL_READY_SIGNATURE in line:
61
+ return "RUNNING"
62
+ return "LAUNCHING"
63
+
64
+
65
+ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
66
+ """
67
+ Get the base URL of a model
68
+ """
69
+ log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
70
+ if type(log_content) is str:
71
+ return log_content
72
+
73
+ for line in log_content:
74
+ if SERVER_ADDRESS_SIGNATURE in line:
75
+ return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
76
+ return "URL_NOT_FOUND"
77
+
78
+
79
+ def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
80
+ """
81
+ Check the health of a running model on the cluster
82
+ """
83
+ base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
84
+ if not base_url.startswith("http"):
85
+ return ("FAILED", base_url)
86
+ health_check_url = base_url.replace("v1", "health")
87
+
88
+ try:
89
+ response = requests.get(health_check_url)
90
+ # Check if the request was successful
91
+ if response.status_code == 200:
92
+ return "READY"
93
+ else:
94
+ return ("FAILED", response.status_code)
95
+ except requests.exceptions.RequestException as e:
96
+ return ("FAILED", str(e))
97
+
98
+
99
+ def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
100
+ """
101
+ Create a table for displaying model status
102
+ """
103
+ table = Table(show_header=show_header, header_style="bold magenta")
104
+ table.add_column(key_title, style="dim")
105
+ table.add_column(value_title)
106
+ return table
107
+
108
+
109
+ def load_models_df() -> pd.DataFrame:
110
+ """
111
+ Load the models dataframe
112
+ """
113
+ models_df = pd.read_csv(
114
+ os.path.join(
115
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
116
+ "models/models.csv"
117
+ )
118
+ )
119
+ return models_df
120
+
121
+
122
+ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
123
+ """
124
+ Load the default arguments for a model
125
+ """
126
+ row_data = models_df.loc[models_df["model_name"] == model_name]
127
+ default_args = row_data.iloc[0].to_dict()
128
+ default_args.pop("model_name")
129
+ return default_args
vec_inf/find_port.sh ADDED
@@ -0,0 +1,39 @@
1
+ #!/bin/bash
2
+
3
+ # Function to check if a port is available on the specified IP
4
+ is_port_available() {
5
+ local ip=$1
6
+ local port=$2
7
+ # Attempt to listen on the specified port and IP. Use & to background the process.
8
+ nc -l $ip $port &> /dev/null &
9
+
10
+ # Capture the PID of the background process
11
+ local pid=$!
12
+ # Wait a short moment to ensure nc had time to bind to the port
13
+ sleep 0.1
14
+
15
+ # Check if nc is still running. If so, the port was available.
16
+ if kill -0 $pid &> /dev/null; then
17
+ # Kill the background nc process
18
+ kill $pid &> /dev/null
19
+ return 0 # True, port is available
20
+ else
21
+ return 1 # False, port is not available
22
+ fi
23
+ }
24
+
25
+ # Function to find an available port on the specified IP
26
+ find_available_port() {
27
+ local ip=$1
28
+ local base_port=$2
29
+ local max_port=$3
30
+
31
+ for ((port=base_port; port<=max_port; port++)); do
32
+ if is_port_available $ip $port; then
33
+ echo $port
34
+ return
35
+ fi
36
+ done
37
+ echo "No available port between $base_port and $max_port for $ip." >&2
38
+ return 1
39
+ }
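A brief sketch of how these helpers are intended to be used: the script is sourced and `find_available_port` is called with an IP, a base port, and a max port. The localhost address and port range below are illustrative assumptions:

```bash
#!/bin/bash
# Source the helper functions, then probe for a free port on localhost
source vec_inf/find_port.sh

# find_available_port echoes the first free port in the range, or returns 1 if none is free
port=$(find_available_port 127.0.0.1 8080 8180) || exit 1
echo "Picked free port: $port"
```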
vec_inf/launch_server.sh ADDED
@@ -0,0 +1,104 @@
1
+ #!/bin/bash
2
+
3
+ # ================================= Read Named Args ======================================
4
+
5
+ while [[ "$#" -gt 0 ]]; do
6
+ case $1 in
7
+ --model-family) model_family="$2"; shift ;;
8
+ --model-variant) model_variant="$2"; shift ;;
9
+ --partition) partition="$2"; shift ;;
10
+ --qos) qos="$2"; shift ;;
11
+ --time) walltime="$2"; shift ;;
12
+ --num-nodes) num_nodes="$2"; shift ;;
13
+ --num-gpus) num_gpus="$2"; shift ;;
14
+ --max-model-len) max_model_len="$2"; shift ;;
15
+ --vocab-size) vocab_size="$2"; shift ;;
16
+ --data-type) data_type="$2"; shift ;;
17
+ --venv) virtual_env="$2"; shift ;;
18
+ --log-dir) log_dir="$2"; shift ;;
19
+ --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
20
+ *) echo "Unknown parameter passed: $1"; exit 1 ;;
21
+ esac
22
+ shift
23
+ done
24
+
25
+ required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type virtual_env log_dir pipeline_parallelism)
26
+
27
+ for var in "${required_vars[@]}"; do
28
+ if [ -z "${!var}" ]; then
29
+ echo "Error: Missing required --${var//_/-} argument."
30
+ exit 1
31
+ fi
32
+ done
33
+
34
+ export MODEL_FAMILY=$model_family
35
+ export MODEL_VARIANT=$model_variant
36
+ export JOB_PARTITION=$partition
37
+ export QOS=$qos
38
+ export WALLTIME=$walltime
39
+ export NUM_NODES=$num_nodes
40
+ export NUM_GPUS=$num_gpus
41
+ export VLLM_MAX_MODEL_LEN=$max_model_len
42
+ export VLLM_MAX_LOGPROBS=$vocab_size
43
+ export VLLM_DATA_TYPE=$data_type
44
+ export VENV_BASE=$virtual_env
45
+ export LOG_DIR=$log_dir
46
+ export PIPELINE_PARALLELISM=$pipeline_parallelism
47
+
48
+ # ================================= Set default environment variables ======================================
49
+ # Slurm job configuration
50
+ export JOB_NAME="$MODEL_FAMILY-$MODEL_VARIANT"
51
+ if [ "$LOG_DIR" = "default" ]; then
52
+ export LOG_DIR="$HOME/.vec-inf-logs/$MODEL_FAMILY"
53
+ fi
54
+ mkdir -p $LOG_DIR
55
+
56
+ # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
57
+ # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
58
+ export SRC_DIR="$(dirname "$0")"
59
+ export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
60
+ export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
61
+
62
+ # Variables specific to your working environment, below are examples for the Vector cluster
63
+ export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
64
+ export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
65
+
66
+
67
+ # ================================ Validate Inputs & Launch Server =================================
68
+
69
+ # Set data type to fp16 instead of bf16 for non-Ampere GPUs
70
+ fp16_partitions="t4v1 t4v2"
71
+
72
+ # choose from 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
73
+ if [[ $fp16_partitions =~ $JOB_PARTITION ]]; then
74
+ export VLLM_DATA_TYPE="float16"
75
+ echo "Data type set to $VLLM_DATA_TYPE due to non-Ampere GPUs on the requested partition"
76
+ fi
77
+
78
+ # Create a file to store the API server URL if it doesn't exist
79
+ if [ ! -f $VLLM_BASE_URL_FILENAME ]; then
80
+ touch $VLLM_BASE_URL_FILENAME
81
+ fi
82
+
83
+ echo Job Name: $JOB_NAME
84
+ echo Partition: $JOB_PARTITION
85
+ echo Num Nodes: $NUM_NODES
86
+ echo GPUs per Node: $NUM_GPUS
87
+ echo QOS: $QOS
88
+ echo Walltime: $WALLTIME
89
+ echo Data Type: $VLLM_DATA_TYPE
90
+
91
+ is_special=""
92
+ if [ "$NUM_NODES" -gt 1 ]; then
93
+ is_special="multinode_"
94
+ fi
95
+
96
+ sbatch --job-name $JOB_NAME \
97
+ --partition $JOB_PARTITION \
98
+ --nodes $NUM_NODES \
99
+ --gres gpu:$NUM_GPUS \
100
+ --qos $QOS \
101
+ --time $WALLTIME \
102
+ --output $LOG_DIR/$JOB_NAME.%j.out \
103
+ --error $LOG_DIR/$JOB_NAME.%j.err \
104
+ $SRC_DIR/${is_special}vllm.slurm
vec_inf/models/CodeLlama/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`7b-Instruct-hf`**](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
8
+ | [`13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
9
+ | [`34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
10
+ | [`34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
11
+ | [`70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
12
+ | [`70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/CodeLlama/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="CodeLlama"
2
+ export MODEL_VARIANT="7b-Instruct-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Llama-2/README.md ADDED
@@ -0,0 +1,10 @@
1
+ # [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
2
+
3
+ | Variant | Suggested resource allocation |
4
+ |:----------:|:----------:|
5
+ | [**`7b-hf`**](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
6
+ | [`7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
7
+ | [`13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
8
+ | [`13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
9
+ | [`70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
10
+ | [`70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
vec_inf/models/Llama-2/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Llama-2"
2
+ export MODEL_VARIANT="7b-chat-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Meta-Llama-3/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
6
+ | [**`8B-Instruct`**](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
7
+ | [`70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
8
+ | [`70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
vec_inf/models/Meta-Llama-3/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Meta-Llama-3"
2
+ export MODEL_VARIANT="8B-Instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=128256
vec_inf/models/Meta-Llama-3.1/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`8B-Instruct`**](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
8
+ | [`70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/Meta-Llama-3.1/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="Meta-Llama-3.1"
2
+ export MODEL_VARIANT="8B-Instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=128256
6
+ export MAX_NUM_SEQS=256
vec_inf/models/Mistral/README.md ADDED
@@ -0,0 +1,10 @@
1
+ # [Mistral AI: Mistral](https://huggingface.co/mistralai)
2
+ * Supported model variants:
3
+
4
+ | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[`7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
7
+ |[`7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
8
+ |[`7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
9
+ |[`7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
10
+ |[**`7B-Instruct-v0.3`**](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
vec_inf/models/Mistral/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Mistral"
2
+ export MODEL_VARIANT="7B-Instruct-v0.3"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32768
vec_inf/models/Mixtral/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Mistral AI: Mixtral](https://huggingface.co/mistralai)
2
+ * Supported model variants:
3
+
4
+ | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`8x7B-Instruct-v0.1`**](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
7
+ |[`8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
8
+ |[`8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
vec_inf/models/Mixtral/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="Mixtral"
2
+ export MODEL_VARIANT="8x7B-Instruct-v0.1"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/Phi-3/README.md ADDED
@@ -0,0 +1,6 @@
1
+ # [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [**`medium-128k-instruct`**](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
6
+ | [**`vision-128k-instruct`**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/Phi-3/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="Phi-3"
2
+ export MODEL_VARIANT="medium-128k-instruct"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=2
5
+ export VLLM_MAX_LOGPROBS=32064
6
+ export MAX_NUM_SEQS=200
vec_inf/models/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Environment Variables
2
+ The following environment variables all have default values that are suitable for the Vector cluster environment. You can use flags to modify certain environment variable values.
3
+
4
+ * **MODEL_FAMILY**: Directory name of the model family.
5
+ * **SRC_DIR**: Relative path to the [`src`](../src/) folder.
6
+ * **CONFIG_FILE**: Config file containing default values for some environment variables in the **MODEL_FAMILY** directory.
7
+ * **MODEL_NAME**: Name of model family according to the actual model weights.
8
+ * **MODEL_VARIANT**: Variant of the model; the available variants are listed in the respective model folders, and the default variant is bolded in the corresponding README.md file.
9
+ * **MODEL_DIR**: Path to the model's directory in the vector-inference repo.
10
+ * **VLLM_BASE_URL_FILENAME**: The file that stores the inference server URL. It is generated after launching an inference server and is located in the corresponding model folder with the name `.{model-family}-{model-variant}_url`.
11
+ * **VENV_BASE**: Location of the virtual environment.
12
+ * **VLLM_MODEL_WEIGHTS**: Location of the model weights.
13
+ * **VLLM_DATA_TYPE**: Model data type.
14
+ * **LD_LIBRARY_PATH**: Include custom locations for dynamically linked library files in a Unix-like operating system. In the script, we tell the dynamic linker to also look at the CUDA and cuDNN directories.
15
+ * **JOB_NAME**: Slurm job name.
16
+ * **NUM_NODES**: Number of nodes scheduled. Defaults to the suggested resource allocation.
17
+ * **NUM_GPUS**: Number of GPUs scheduled. Defaults to the suggested resource allocation.
18
+ * **JOB_PARTITION**: Type of compute partition. Defaults to the suggested resource allocation.
19
+ * **QOS**: Quality of Service.
20
+ * **TIME**: Max Walltime.
21
+
22
+ # Named Arguments
23
+ NOTE: Arguments like `--num-nodes` or `--model-variant` might not be available for certain model families because they are expected to fit inside a single node or because no variant is available in `/model-weights` yet. You can manually add these options to the launch scripts if you need them, or make a request to download weights for other variants. An example launch with overrides follows this list.
24
+ * `--model-variant`: Overrides **MODEL_VARIANT**.
25
+ * `--partition`: Overrides **JOB_PARTITION**.
26
+ * `--num-nodes`: Overrides **NUM_NODES**.
27
+ * `--num-gpus`: Overrides **NUM_GPUS**.
28
+ * `--qos`: Overrides **QOS**.
29
+ * `--time`: Overrides **TIME**.
30
+ * `--data-type`: Overrides **VLLM_DATA_TYPE**.
31
+ * `--venv`: Overrides **VENV_BASE**.
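A hedged sketch of passing these overrides on the command line; the model name comes from `models.csv`, while the specific values are illustrative and should match the QoS and resource limits of your cluster:

```bash
# Each flag overrides the corresponding environment variable above, e.g.
# --num-nodes -> NUM_NODES, --num-gpus -> NUM_GPUS, --qos -> QOS, --time -> TIME
vec-inf launch Mixtral-8x22B-Instruct-v0.1 \
    --num-nodes 2 \
    --num-gpus 4 \
    --qos m2 \
    --time 04:00:00
```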
vec_inf/models/c4ai-command-r/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ |[**`plus`**](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
vec_inf/models/c4ai-command-r/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="c4ai-command-r"
2
+ export MODEL_VARIANT="plus"
3
+ export NUM_NODES=2
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=256000
vec_inf/models/dbrx/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ |[**`dbrx-instruct`**](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
vec_inf/models/dbrx/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="dbrx"
2
+ export MODEL_VARIANT="instruct"
3
+ export NUM_NODES=2
4
+ export NUM_GPUS=4
5
+ export VLLM_MAX_LOGPROBS=100352
vec_inf/models/gemma-2/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
2
+
3
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
4
+ |:----------:|:----------:|:----------:|:----------:|
5
+ | [`9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
6
+ | [**`9b-it`**](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
7
+ | [`27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
8
+ | [`27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/gemma-2/config.sh ADDED
@@ -0,0 +1,6 @@
1
+ export MODEL_NAME="gemma-2"
2
+ export MODEL_VARIANT="9b-it"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=256000
6
+ export VLLM_ATTENTION_BACKEND=FLASHINFER
vec_inf/models/llava-1.5/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
2
+ * Supported model variants:
3
+
4
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`7b-hf`**](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
7
+ |[`13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
vec_inf/models/llava-1.5/chat_template.jinja ADDED
@@ -0,0 +1,23 @@
1
+ {%- if messages[0]['role'] == 'system' -%}
2
+ {%- set system_message = messages[0]['content'] -%}
3
+ {%- set messages = messages[1:] -%}
4
+ {%- else -%}
5
+ {% set system_message = '' -%}
6
+ {%- endif -%}
7
+
8
+ {{ bos_token + system_message }}
9
+ {%- for message in messages -%}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif -%}
13
+
14
+ {%- if message['role'] == 'user' -%}
15
+ {{ 'USER: ' + message['content'] + '\n' }}
16
+ {%- elif message['role'] == 'assistant' -%}
17
+ {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
18
+ {%- endif -%}
19
+ {%- endfor -%}
20
+
21
+ {%- if add_generation_prompt -%}
22
+ {{ 'ASSISTANT:' }}
23
+ {% endif %}
vec_inf/models/llava-1.5/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="llava-1.5"
2
+ export MODEL_VARIANT="7b-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32000
vec_inf/models/llava-v1.6/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
2
+ * Supported model variants:
3
+
4
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
5
+ |:----------:|:----------:|:----------:|:----------:|
6
+ |[**`mistral-7b-hf`**](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
7
+ |[`34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/llava-v1.6/chat_template.jinja ADDED
@@ -0,0 +1,23 @@
1
+ {%- if messages[0]['role'] == 'system' -%}
2
+ {%- set system_message = messages[0]['content'] -%}
3
+ {%- set messages = messages[1:] -%}
4
+ {%- else -%}
5
+ {% set system_message = '' -%}
6
+ {%- endif -%}
7
+
8
+ {{ bos_token + system_message }}
9
+ {%- for message in messages -%}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif -%}
13
+
14
+ {%- if message['role'] == 'user' -%}
15
+ {{ 'USER: ' + message['content'] + '\n' }}
16
+ {%- elif message['role'] == 'assistant' -%}
17
+ {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
18
+ {%- endif -%}
19
+ {%- endfor -%}
20
+
21
+ {%- if add_generation_prompt -%}
22
+ {{ 'ASSISTANT:' }}
23
+ {% endif %}
vec_inf/models/llava-v1.6/config.sh ADDED
@@ -0,0 +1,5 @@
1
+ export MODEL_NAME="llava-v1.6"
2
+ export MODEL_VARIANT="mistral-7b-hf"
3
+ export NUM_NODES=1
4
+ export NUM_GPUS=1
5
+ export VLLM_MAX_LOGPROBS=32064
vec_inf/models/models.csv ADDED
@@ -0,0 +1,45 @@
1
+ model_name,model_family,model_variant,partition,qos,time,num_gpus,num_nodes,vocab_size,max_model_len,data_type,venv,log_dir,pipeline_parallelism
2
+ c4ai-command-r-plus,c4ai-command-r,plus,a40,m2,08:00:00,4,2,256000,8192,auto,singularity,default,false
3
+ CodeLlama-7b-hf,CodeLlama,7b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
4
+ CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
5
+ CodeLlama-13b-hf,CodeLlama,13b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
6
+ CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
7
+ CodeLlama-34b-hf,CodeLlama,34b-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
8
+ CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
9
+ CodeLlama-70b-hf,CodeLlama,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
10
+ CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
11
+ dbrx-instruct,dbrx,instruct,a40,m2,08:00:00,4,2,100352,32000,auto,singularity,default,false
12
+ gemma-2-9b,gemma-2,9b,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
13
+ gemma-2-9b-it,gemma-2,9b-it,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
14
+ gemma-2-27b,gemma-2,27b,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
15
+ gemma-2-27b-it,gemma-2,27b-it,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
16
+ Llama-2-7b-hf,Llama-2,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
17
+ Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
18
+ Llama-2-13b-hf,Llama-2,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
19
+ Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
20
+ Llama-2-70b-hf,Llama-2,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
21
+ Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
22
+ llava-1.5-7b-hf,llava-1.5,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
23
+ llava-1.5-13b-hf,llava-1.5,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
24
+ llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,a40,m2,08:00:00,1,1,32064,32768,auto,singularity,default,false
25
+ llava-v1.6-34b-hf,llava-v1.6,34b-hf,a40,m2,08:00:00,2,1,64064,4096,auto,singularity,default,false
26
+ Meta-Llama-3-8B,Meta-Llama-3,8B,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
27
+ Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
28
+ Meta-Llama-3-70B,Meta-Llama-3,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
29
+ Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
30
+ Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
31
+ Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
32
+ Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
33
+ Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
34
+ Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,a40,m4,02:00:00,4,8,128256,16384,auto,singularity,default,true
35
+ Mistral-7B-v0.1,Mistral,7B-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
36
+ Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
37
+ Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
38
+ Mistral-7B-v0.3,Mistral,7B-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
39
+ Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
40
+ Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,a40,m2,08:00:00,4,1,32768,131072,auto,singularity,default,false
41
+ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,32768,auto,singularity,default,false
42
+ Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
43
+ Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
44
+ Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
45
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
vec_inf/multinode_vllm.slurm ADDED
@@ -0,0 +1,114 @@
1
+ #!/bin/bash
2
+ #SBATCH --cpus-per-task=16
3
+ #SBATCH --mem=64G
4
+ #SBATCH --exclusive
5
+ #SBATCH --tasks-per-node=1
6
+
7
+ # Load CUDA, change to the cuda version on your environment if different
8
+ module load cuda-12.3
9
+ nvidia-smi
10
+
11
+ source ${SRC_DIR}/find_port.sh
12
+
13
+ if [ "$VENV_BASE" = "singularity" ]; then
14
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
15
+ export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
16
+ module load singularity-ce/3.8.2
17
+ singularity exec $SINGULARITY_IMAGE ray stop
18
+ fi
19
+
20
+ # Getting the node names
21
+ nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
22
+ nodes_array=($nodes)
23
+
24
+ head_node=${nodes_array[0]}
25
+ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
26
+
27
+ # Find port for head node
28
+ head_node_port=$(find_available_port $head_node_ip 8080 65535)
29
+
30
+ # Starting the Ray head node
31
+ ip_head=$head_node_ip:$head_node_port
32
+ export ip_head
33
+ echo "IP Head: $ip_head"
34
+
35
+ echo "Starting HEAD at $head_node"
36
+ if [ "$VENV_BASE" = "singularity" ]; then
37
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
38
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
39
+ ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
40
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
41
+ else
42
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
43
+ ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
44
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
45
+ fi
46
+
47
+ # Starting the Ray worker nodes
48
+ # Optional, though may be useful in certain versions of Ray < 1.0.
49
+ sleep 10
50
+
51
+ # number of nodes other than the head node
52
+ worker_num=$((SLURM_JOB_NUM_NODES - 1))
53
+
54
+ for ((i = 1; i <= worker_num; i++)); do
55
+ node_i=${nodes_array[$i]}
56
+ echo "Starting WORKER $i at $node_i"
57
+ if [ "$VENV_BASE" = "singularity" ]; then
58
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
59
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
60
+ ray start --address "$ip_head" \
61
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
62
+ else
63
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
64
+ ray start --address "$ip_head" \
65
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
66
+ fi
67
+
68
+ sleep 5
69
+ done
70
+
71
+
72
+ vllm_port_number=$(find_available_port $head_node_ip 8080 65535)
73
+
74
+ echo "Server address: http://${head_node_ip}:${vllm_port_number}/v1"
75
+ echo "http://${head_node_ip}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
76
+
77
+ if [ "$PIPELINE_PARALLELISM" = "true" ]; then
78
+ export PIPELINE_PARALLEL_SIZE=$NUM_NODES
79
+ export TENSOR_PARALLEL_SIZE=$NUM_GPUS
80
+ else
81
+ export PIPELINE_PARALLEL_SIZE=1
82
+ export TENSOR_PARALLEL_SIZE=$((NUM_NODES*NUM_GPUS))
83
+ fi
84
+
85
+ # Activate vllm venv
86
+ if [ "$VENV_BASE" = "singularity" ]; then
87
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
88
+ python3.10 -m vllm.entrypoints.openai.api_server \
89
+ --model ${VLLM_MODEL_WEIGHTS} \
90
+ --served-model-name ${JOB_NAME} \
91
+ --host "0.0.0.0" \
92
+ --port ${vllm_port_number} \
93
+ --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
94
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
95
+ --dtype ${VLLM_DATA_TYPE} \
96
+ --load-format safetensors \
97
+ --trust-remote-code \
98
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
99
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
100
+ else
101
+ source ${VENV_BASE}/bin/activate
102
+ python3 -m vllm.entrypoints.openai.api_server \
103
+ --model ${VLLM_MODEL_WEIGHTS} \
104
+ --served-model-name ${JOB_NAME} \
105
+ --host "0.0.0.0" \
106
+ --port ${vllm_port_number} \
107
+ --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
108
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
109
+ --dtype ${VLLM_DATA_TYPE} \
110
+ --load-format safetensors \
111
+ --trust-remote-code \
112
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
113
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
114
+ fi
vec_inf/vllm.slurm ADDED
@@ -0,0 +1,47 @@
1
+ #!/bin/bash
2
+ #SBATCH --cpus-per-task=16
3
+ #SBATCH --mem=64G
4
+
5
+ # Load CUDA, change to the cuda version on your environment if different
6
+ module load cuda-12.3
7
+ nvidia-smi
8
+
9
+ source ${SRC_DIR}/find_port.sh
10
+
11
+ # Write server url to file
12
+ hostname=${SLURMD_NODENAME}
13
+ vllm_port_number=$(find_available_port $hostname 8080 65535)
14
+
15
+ echo "Server address: http://${hostname}:${vllm_port_number}/v1"
16
+ echo "http://${hostname}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
17
+
18
+ # Activate vllm venv
19
+ if [ "$VENV_BASE" = "singularity" ]; then
20
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
21
+ export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
22
+ module load singularity-ce/3.8.2
23
+ singularity exec $SINGULARITY_IMAGE ray stop
24
+ singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
25
+ python3.10 -m vllm.entrypoints.openai.api_server \
26
+ --model ${VLLM_MODEL_WEIGHTS} \
27
+ --served-model-name ${JOB_NAME} \
28
+ --host "0.0.0.0" \
29
+ --port ${vllm_port_number} \
30
+ --tensor-parallel-size ${NUM_GPUS} \
31
+ --dtype ${VLLM_DATA_TYPE} \
32
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
33
+ --trust-remote-code \
34
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
35
+ else
36
+ source ${VENV_BASE}/bin/activate
37
+ python3 -m vllm.entrypoints.openai.api_server \
38
+ --model ${VLLM_MODEL_WEIGHTS} \
39
+ --served-model-name ${JOB_NAME} \
40
+ --host "0.0.0.0" \
41
+ --port ${vllm_port_number} \
42
+ --tensor-parallel-size ${NUM_GPUS} \
43
+ --dtype ${VLLM_DATA_TYPE} \
44
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
45
+ --trust-remote-code \
46
+ --max-model-len ${VLLM_MAX_MODEL_LEN}
47
+ fi
vec_inf-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.1
2
+ Name: vec-inf
3
+ Version: 0.3.0
4
+ Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
+ License: MIT
6
+ Author: Marshall Wang
7
+ Author-email: marshall.wang@vectorinstitute.ai
8
+ Requires-Python: >=3.10,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Provides-Extra: dev
15
+ Requires-Dist: click (>=8.1.0,<9.0.0)
16
+ Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
17
+ Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
18
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
19
+ Requires-Dist: rich (>=13.7.0,<14.0.0)
20
+ Requires-Dist: vllm (>=0.5.0,<0.6.0) ; extra == "dev"
21
+ Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Vector Inference: Easy inference on Slurm clusters
25
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update the config files in the `vec_inf/models` folder and the environment variables in the model launching scripts in `vec_inf` accordingly.
26
+
27
+ ## Installation
28
+ If you are using the Vector cluster environment and you don't need any customization to the inference server environment, run the following to install the package:
29
+ ```bash
30
+ pip install vec-inf
31
+ ```
32
+ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package.
33
+
34
+ ## Launch an inference server
35
+ We will use the Llama 3 model as an example. To launch an inference server for Llama 3 8B, run:
36
+ ```bash
37
+ vec-inf launch llama-3
38
+ ```
39
+ You should see an output like the following:
40
+
41
+ <img src="https://github.com/user-attachments/assets/c50646df-0991-4164-ad8f-6eb7e86b67e0" width="350">
42
+
43
+ There is a default variant for every model family, specified in `vec_inf/models/{MODEL_FAMILY_NAME}/README.md`. You can switch to other variants with the `--model-variant` option; make sure to change the requested resources accordingly. More information about the available options can be found in the [`vec_inf/models`](vec_inf/models) folder. The inference server is compatible with the OpenAI `Completion` and `ChatCompletion` API.
44
+
45
+ You can check the inference server status by providing the Slurm job ID to the `status` command:
46
+ ```bash
47
+ vec-inf status 13014393
48
+ ```
49
+
50
+ You should see an output like the following:
51
+
52
+ <img src="https://github.com/user-attachments/assets/310086fd-82ea-4bfc-8062-5c8e71c5650c" width="400">
53
+
54
+ There are 5 possible states:
55
+
56
+ * **PENDING**: Job submitted to Slurm, but not executed yet.
57
+ * **LAUNCHING**: Job is running but the server is not ready yet.
58
+ * **READY**: Inference server running and ready to take requests.
59
+ * **FAILED**: Inference server in an unhealthy state.
60
+ * **SHUTDOWN**: Inference server is shutdown/cancelled.
61
+
62
+ Note that the base URL is only available when the model is in the `READY` state.
63
+ Both the `launch` and `status` commands support `--json-mode`, where the output is structured as a JSON string.
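For illustration, a sketch of what the `--json-mode` output of `status` can look like; the job ID, host, and port are placeholders, and the keys mirror the fields of the table output:

```bash
vec-inf status 13014393 --json-mode
# {'model_name': 'Meta-Llama-3-8B-Instruct', 'model_status': 'READY', 'base_url': 'http://<node-ip>:<port>/v1'}
```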
64
+
65
+ Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
66
+ ```bash
67
+ vec-inf shutdown 13014393
68
+
69
+ > Shutting down model with Slurm Job ID: 13014393
70
+ ```
71
+
72
+ Here is a more complicated example that launches a model variant across multiple nodes. Say we want to launch Mixtral 8x22B; run:
73
+ ```bash
74
+ vec-inf launch mixtral --model-variant 8x22B-v0.1 --num-nodes 2 --num-gpus 4
75
+ ```
76
+
77
+ And for launching a multimodal model, here is an example that launches LLaVa-NEXT Mistral 7B (the default variant):
78
+ ```bash
79
+ vec-inf launch llava-v1.6 --is-vlm
80
+ ```
81
+
82
+ ## Send inference requests
83
+ Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
84
+ > {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
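Alternatively, a minimal request can be sent directly to the OpenAI-compatible endpoint with `curl`; this is a sketch where the base URL comes from `vec-inf status` and the model name matches the served model name (both values below are placeholders):

```bash
curl http://<base-url-host>:<port>/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Meta-Llama-3-8B-Instruct",
          "prompt": "What is the capital of Canada?",
          "max_tokens": 20
        }'
```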
85
+
86
+ **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
87
+
88
+ ## SSH tunnel from your local device
89
+ If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
90
+ ```bash
91
+ ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
92
+ ```
93
+ The example provided above is for the Vector cluster; change the variables accordingly for your environment.
94
+
vec_inf-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,41 @@
1
+ vec_inf/README.md,sha256=6QAPmd9ccLDHmZMNs4Tjjv0dA28FQIVFJtgmnwgAkPE,389
2
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ vec_inf/cli/_cli.py,sha256=weUeKHz1Hjq4AnJAfw-YpFceKWThrs80FgfWS1Ccq5I,7332
5
+ vec_inf/cli/_utils.py,sha256=2Grz-bX_mGjzxXUBdrX7MbNfXUM7JQ3399GKe-N74FE,3910
6
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
+ vec_inf/launch_server.sh,sha256=dVBVx6udXjb2Vw2rRTddTewDuw0WtV8ne0ImS0brMVk,3577
8
+ vec_inf/models/CodeLlama/README.md,sha256=4R5Vp8mq_Qa9WgwGutihEw3vBc_5Euj-QTgHeq7s_ds,1145
9
+ vec_inf/models/CodeLlama/config.sh,sha256=_DFM1NJibpSmbOIlSKB28m0C5PzM9mb8jVLLigSTgiQ,136
10
+ vec_inf/models/Llama-2/README.md,sha256=0asL53BytxSeilUoyZmy1Z6bJd-UMgTkwu721HVNpn4,656
11
+ vec_inf/models/Llama-2/config.sh,sha256=rAjwo51rczP7VWr9nLsVrlWwRqWA9ncGJzr61LdTBU8,129
12
+ vec_inf/models/Meta-Llama-3/README.md,sha256=FQgpLniE_krQyoTe8ziRFyzyZMkNamCFAhqkd-N0TR8,704
13
+ vec_inf/models/Meta-Llama-3/config.sh,sha256=q-SpdvTIbC4-U8xfaV_Uzzodl5okxq_Z5YNnzGYwdVQ,136
14
+ vec_inf/models/Meta-Llama-3.1/README.md,sha256=yjzIg5xp5XgUzZxJmM3mz6uzqSl_n7tTTi9YyTuudAk,693
15
+ vec_inf/models/Meta-Llama-3.1/config.sh,sha256=XhV-e33tuNJYX32PHx8AxZ5sR_A_z3glcuDfiZooV0o,162
16
+ vec_inf/models/Mistral/README.md,sha256=uv4c_oHr3DAN_3fy0YfcGiIGmMdz1Vswx3wfaAcChlk,788
17
+ vec_inf/models/Mistral/config.sh,sha256=8UWTYouNmctOd_eM0ArmuXhSYRkwkMqLY8WbturH1wY,135
18
+ vec_inf/models/Mixtral/README.md,sha256=Ic94pH0NY-MniVR5b1uRDJrpYx1rVXLYQpjFEw98054,655
19
+ vec_inf/models/Mixtral/config.sh,sha256=AbTfEmzHZ3UX08WAa2zcgdGPDw178xtfCh7l3znZIUQ,137
20
+ vec_inf/models/Phi-3/README.md,sha256=lj8Bx538O0yC8SjID-GyFHDSf6MU6HezPdtqCO6zm1E,507
21
+ vec_inf/models/Phi-3/config.sh,sha256=vX6UWZg7YCtDAO3QKHz7PwvGJ5clp7QYnytNPFx4tZ0,161
22
+ vec_inf/models/README.md,sha256=V5atdrL3y6euM244iBrh6ASstWvr__uvCy3y7Ktg2qU,2390
23
+ vec_inf/models/c4ai-command-r/README.md,sha256=yGCYVzsMpBSYa2eSn-YU2kBFv3qW3acd-mHY7FLIc9M,406
24
+ vec_inf/models/c4ai-command-r/config.sh,sha256=InBRtlAHIxve3xbNN0UomMCh4xlAZlOQu-j4wWWc3Co,132
25
+ vec_inf/models/dbrx/README.md,sha256=MJRyZtqhYqN9_BvTD-lxqf34ytYCP6a1tg4_aWfJhsI,384
26
+ vec_inf/models/dbrx/config.sh,sha256=UjtHdUZ_6TKDGR8c449iVkaBa63a8Z3-IaxDq_KO4Go,126
27
+ vec_inf/models/gemma-2/README.md,sha256=4QMheXAZe0bNNQ2kEZn15I3x83rF9iLQnSUejx8p46o,628
28
+ vec_inf/models/gemma-2/config.sh,sha256=Tl1U774WXoOsAbiGa4tZGg53GWh_niqe5Z_bQ92VX1I,166
29
+ vec_inf/models/llava-1.5/README.md,sha256=YxkN_BWnK4nNf0rXi4_1isJVxlV73YhVABxhCjqNSvY,471
30
+ vec_inf/models/llava-1.5/chat_template.jinja,sha256=qCE9YwTfTa3jwjrB5yAnqVIm1bDkUBc5LjHBM0d9Sso,765
31
+ vec_inf/models/llava-1.5/config.sh,sha256=Yvb6s1mil0vmkVllnC3DjphpSkC2U5KOQG3l5OJawME,127
32
+ vec_inf/models/llava-v1.6/README.md,sha256=5JYjW0XoGJf40wd-oIdO3uAQYPv5XPZQuo5T37pxZcg,491
33
+ vec_inf/models/llava-v1.6/chat_template.jinja,sha256=qCE9YwTfTa3jwjrB5yAnqVIm1bDkUBc5LjHBM0d9Sso,765
34
+ vec_inf/models/llava-v1.6/config.sh,sha256=zuoK2cg5KgKbs9jk_M3R-vALGP1TesMkWEeRjSw209E,136
35
+ vec_inf/models/models.csv,sha256=JFGMhT9o7Pf0tkY-w2GRQG5MxdYK2V5T8s6bk166MpM,4720
36
+ vec_inf/multinode_vllm.slurm,sha256=pedYWIzPN-BKtL6ezoZSKJ3DO7RduDyAR4_cxZD4KyY,3938
37
+ vec_inf/vllm.slurm,sha256=6Nx14qyAwHlbweCbFMUcMV2jaZSv41ghkyx2MiHJY8Y,1608
38
+ vec_inf-0.3.0.dist-info/METADATA,sha256=Vqr7b5pmz4rWK1B4my9a_jG6BT5C_8XvGJtzjy3HVng,5142
39
+ vec_inf-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
40
+ vec_inf-0.3.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
41
+ vec_inf-0.3.0.dist-info/RECORD,,
vec_inf-0.3.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
vec_inf-0.3.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ vec-inf=vec_inf.cli._cli:cli
3
+