vec-inf 0.3.3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +2 -1
- vec_inf/cli/_cli.py +151 -31
- vec_inf/cli/_utils.py +40 -14
- vec_inf/launch_server.sh +32 -28
- vec_inf/models/README.md +132 -35
- vec_inf/models/models.csv +73 -46
- vec_inf/multinode_vllm.slurm +18 -8
- vec_inf/vllm.slurm +15 -5
- vec_inf-0.4.0.post1.dist-info/LICENSE +21 -0
- {vec_inf-0.3.3.dist-info → vec_inf-0.4.0.post1.dist-info}/METADATA +30 -8
- vec_inf-0.4.0.post1.dist-info/RECORD +16 -0
- {vec_inf-0.3.3.dist-info → vec_inf-0.4.0.post1.dist-info}/WHEEL +1 -1
- vec_inf-0.3.3.dist-info/RECORD +0 -15
- {vec_inf-0.3.3.dist-info → vec_inf-0.4.0.post1.dist-info}/entry_points.txt +0 -0
vec_inf/README.md
CHANGED
@@ -1,7 +1,8 @@
 # `vec-inf` Commands
 
 * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
-* `list`: List all available model names, `--json-mode` supported.
+* `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
 * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 
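As a rough illustration of the commands documented above (this example is not part of the package diff; the model name and Slurm job ID are placeholders, and exact flags should be checked against `vec-inf --help`):

    # launch a supported model; the command reports the Slurm job ID of the server
    vec-inf launch Meta-Llama-3.1-8B-Instruct

    # list everything, or show the default configuration of one model
    vec-inf list --json-mode
    vec-inf list Meta-Llama-3.1-8B-Instruct

    # stream throughput metrics for a running server, then shut it down
    vec-inf metrics 1234567
    vec-inf shutdown 1234567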
vec_inf/cli/_cli.py
CHANGED
@@ -1,9 +1,13 @@
 import os
-
+import time
+from typing import Optional, cast
 
 import click
+
+import polars as pl
 from rich.columns import Columns
 from rich.console import Console
+from rich.live import Live
 from rich.panel import Panel
 
 import vec_inf.cli._utils as utils
@@ -24,9 +28,19 @@ def cli():
 @click.option(
     "--max-model-len",
     type=int,
-    help="Model context length.
+    help="Model context length. Default value set based on suggested resource allocation.",
+)
+@click.option(
+    "--max-num-seqs",
+    type=int,
+    help="Maximum number of sequences to process in a single request",
+)
+@click.option(
+    "--partition",
+    type=str,
+    default="a40",
+    help="Type of compute partition, default to a40",
 )
-@click.option("--partition", type=str, help="Type of compute partition, default to a40")
 @click.option(
     "--num-nodes",
     type=int,
@@ -40,24 +54,48 @@ def cli():
 @click.option(
     "--qos",
     type=str,
-    help="Quality of service
+    help="Quality of service",
 )
 @click.option(
     "--time",
     type=str,
-    help="Time limit for job, this should comply with QoS
+    help="Time limit for job, this should comply with QoS limits",
 )
 @click.option(
     "--vocab-size",
     type=int,
     help="Vocabulary size, this option is intended for custom models",
 )
-@click.option(
-
+@click.option(
+    "--data-type", type=str, default="auto", help="Model data type, default to auto"
+)
+@click.option(
+    "--venv",
+    type=str,
+    default="singularity",
+    help="Path to virtual environment, default to preconfigured singularity container",
+)
 @click.option(
     "--log-dir",
     type=str,
-
+    default="default",
+    help="Path to slurm log directory, default to .vec-inf-logs in user home directory",
+)
+@click.option(
+    "--model-weights-parent-dir",
+    type=str,
+    default="/model-weights",
+    help="Path to parent directory containing model weights, default to '/model-weights' for supported models",
+)
+@click.option(
+    "--pipeline-parallelism",
+    type=str,
+    help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
+)
+@click.option(
+    "--enforce-eager",
+    type=str,
+    help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
 )
 @click.option(
     "--json-mode",
@@ -69,6 +107,7 @@ def launch(
     model_family: Optional[str] = None,
     model_variant: Optional[str] = None,
     max_model_len: Optional[int] = None,
+    max_num_seqs: Optional[int] = None,
     partition: Optional[str] = None,
     num_nodes: Optional[int] = None,
     num_gpus: Optional[int] = None,
@@ -78,11 +117,20 @@ def launch(
     data_type: Optional[str] = None,
     venv: Optional[str] = None,
     log_dir: Optional[str] = None,
+    model_weights_parent_dir: Optional[str] = None,
+    pipeline_parallelism: Optional[str] = None,
+    enforce_eager: Optional[str] = None,
     json_mode: bool = False,
 ) -> None:
     """
     Launch a model on the cluster
     """
+
+    if isinstance(pipeline_parallelism, str):
+        pipeline_parallelism = (
+            "True" if pipeline_parallelism.lower() == "true" else "False"
+        )
+
     launch_script_path = os.path.join(
         os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
     )
@@ -90,7 +138,7 @@ def launch(
 
     models_df = utils.load_models_df()
 
-    if model_name in models_df["model_name"].
+    if model_name in models_df["model_name"].to_list():
         default_args = utils.load_default_args(models_df, model_name)
         for arg in default_args:
             if arg in locals() and locals()[arg] is not None:
@@ -98,10 +146,11 @@ def launch(
                 renamed_arg = arg.replace("_", "-")
                 launch_cmd += f" --{renamed_arg} {default_args[arg]}"
     else:
-        model_args = models_df.columns
-
+        model_args = models_df.columns
+        model_args.remove("model_name")
+        model_args.remove("model_type")
         for arg in model_args:
-            if
+            if locals()[arg] is not None:
                 renamed_arg = arg.replace("_", "-")
                 launch_cmd += f" --{renamed_arg} {locals()[arg]}"
 
@@ -225,40 +274,111 @@ def shutdown(slurm_job_id: int) -> None:
     is_flag=True,
     help="Output in JSON string",
 )
-def
+def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
     """
     List all available models, or get default setup of a specific model
     """
-    models_df = utils.load_models_df()
 
-
-    if model_name not in models_df["model_name"].
+    def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
+        if model_name not in models_df["model_name"].to_list():
             raise ValueError(f"Model name {model_name} not found in available models")
 
-    excluded_keys = {"venv", "log_dir"
-    model_row = models_df.
+        excluded_keys = {"venv", "log_dir"}
+        model_row = models_df.filter(models_df["model_name"] == model_name)
 
         if json_mode:
-
-            filtered_model_row
-            click.echo(filtered_model_row.to_json(orient="records"))
+            filtered_model_row = model_row.drop(excluded_keys, strict=False)
+            click.echo(filtered_model_row.to_dicts()[0])
             return
         table = utils.create_table(key_title="Model Config", value_title="Value")
-        for
+        for row in model_row.to_dicts():
            for key, value in row.items():
                 if key not in excluded_keys:
                     table.add_row(key, str(value))
         CONSOLE.print(table)
-        return
 
-
-
-
-
-
-
-
-
+    def list_all(models_df: pl.DataFrame, json_mode: bool):
+        if json_mode:
+            click.echo(models_df["model_name"].to_list())
+            return
+        panels = []
+        model_type_colors = {
+            "LLM": "cyan",
+            "VLM": "bright_blue",
+            "Text Embedding": "purple",
+            "Reward Modeling": "bright_magenta",
+        }
+
+        models_df = models_df.with_columns(
+            pl.when(pl.col("model_type") == "LLM")
+            .then(0)
+            .when(pl.col("model_type") == "VLM")
+            .then(1)
+            .when(pl.col("model_type") == "Text Embedding")
+            .then(2)
+            .when(pl.col("model_type") == "Reward Modeling")
+            .then(3)
+            .otherwise(-1)
+            .alias("model_type_order")
+        )
+
+        models_df = models_df.sort("model_type_order")
+        models_df = models_df.drop("model_type_order")
+
+        for row in models_df.to_dicts():
+            panel_color = model_type_colors.get(row["model_type"], "white")
+            styled_text = (
+                f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
+            )
+            panels.append(Panel(styled_text, expand=True, border_style=panel_color))
+        CONSOLE.print(Columns(panels, equal=True))
+
+    models_df = utils.load_models_df()
+
+    if model_name:
+        list_model(model_name, models_df, json_mode)
+    else:
+        list_all(models_df, json_mode)
+
+
+@cli.command("metrics")
+@click.argument("slurm_job_id", type=int, nargs=1)
+@click.option(
+    "--log-dir",
+    type=str,
+    help="Path to slurm log directory. This is required if --log-dir was set in model launch",
+)
+def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
+    """
+    Stream performance metrics to the console
+    """
+    status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
+    output = utils.run_bash_command(status_cmd)
+    slurm_job_name = output.split(" ")[1].split("=")[1]
+
+    with Live(refresh_per_second=1, console=CONSOLE) as live:
+        while True:
+            out_logs = utils.read_slurm_log(
+                slurm_job_name, slurm_job_id, "out", log_dir
+            )
+            # if out_logs is a string, then it is an error message
+            if isinstance(out_logs, str):
+                live.update(out_logs)
+                break
+            out_logs = cast(list, out_logs)
+            latest_metrics = utils.get_latest_metric(out_logs)
+            # if latest_metrics is a string, then it is an error message
+            if isinstance(latest_metrics, str):
+                live.update(latest_metrics)
+                break
+            latest_metrics = cast(dict, latest_metrics)
+            table = utils.create_table(key_title="Metric", value_title="Value")
+            for key, value in latest_metrics.items():
+                table.add_row(key, value)
+
+            live.update(table)
+
+            time.sleep(2)
 
 
 if __name__ == "__main__":
vec_inf/cli/_utils.py
CHANGED
@@ -1,12 +1,12 @@
 import os
 import subprocess
-from typing import Optional, Union
+from typing import Optional, Union, cast
 
-import
+import polars as pl
 import requests
 from rich.table import Table
 
-MODEL_READY_SIGNATURE = "INFO:
+MODEL_READY_SIGNATURE = "INFO: Application startup complete."
 SERVER_ADDRESS_SIGNATURE = "Server address: "
 
 
@@ -25,7 +25,7 @@ def read_slurm_log(
     slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
 ) -> Union[list[str], str]:
     """
-
+    Read the slurm log file
     """
     if not log_dir:
         models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
@@ -35,9 +35,11 @@ def read_slurm_log(
                 log_dir = os.path.join(models_dir, dir)
                 break
 
+    log_dir = cast(str, log_dir)
+
     try:
         file_path = os.path.join(
-            log_dir,
+            log_dir,
            f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
         )
        with open(file_path, "r") as file:
@@ -58,12 +60,15 @@ def is_server_running(
     if isinstance(log_content, str):
         return log_content
 
+    status: Union[str, tuple[str, str]] = "LAUNCHING"
+
     for line in log_content:
         if "error" in line.lower():
-
+            status = ("FAILED", line.strip("\n"))
         if MODEL_READY_SIGNATURE in line:
-
-
+            status = "RUNNING"
+
+    return status
 
 
 def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
@@ -114,11 +119,11 @@ def create_table(
     return table
 
 
-def load_models_df() ->
+def load_models_df() -> pl.DataFrame:
     """
     Load the models dataframe
     """
-    models_df =
+    models_df = pl.read_csv(
        os.path.join(
            os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
            "models/models.csv",
@@ -127,11 +132,32 @@ def load_models_df() -> pd.DataFrame:
     return models_df
 
 
-def load_default_args(models_df:
+def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
     """
     Load the default arguments for a model
     """
-    row_data = models_df.
-    default_args = row_data.
-    default_args.pop("model_name")
+    row_data = models_df.filter(models_df["model_name"] == model_name)
+    default_args = row_data.to_dicts()[0]
+    default_args.pop("model_name", None)
+    default_args.pop("model_type", None)
     return default_args
+
+
+def get_latest_metric(log_lines: list[str]) -> dict | str:
+    """Read the latest metric entry from the log file."""
+    latest_metric = {}
+
+    try:
+        for line in reversed(log_lines):
+            if "Avg prompt throughput" in line:
+                # Parse the metric values from the line
+                metrics_str = line.split("] ")[1].strip().strip(".")
+                metrics_list = metrics_str.split(", ")
+                for metric in metrics_list:
+                    key, value = metric.split(": ")
+                    latest_metric[key] = value
+                break
+    except Exception as e:
+        return f"[red]Error reading log file: {e}[/red]"
+
+    return latest_metric
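For context on the `get_latest_metric` helper added above: it walks the `.out` Slurm log backwards, takes the most recent line containing "Avg prompt throughput", keeps the text after "] ", and splits the remainder on ", " and ": " into a metric-name/value mapping. An illustrative line of the shape it expects (the values and the exact field list here are invented for the example, not taken from this diff) would be:

    INFO 12-01 10:00:00 metrics.py:345] Avg prompt throughput: 120.4 tokens/s, Avg generation throughput: 650.2 tokens/s, Running: 3 reqs, Pending: 0 reqs, GPU KV cache usage: 12.5%.

which would yield entries such as "Avg prompt throughput" → "120.4 tokens/s".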
vec_inf/launch_server.sh
CHANGED
@@ -12,21 +12,24 @@ while [[ "$#" -gt 0 ]]; do
         --num-nodes) num_nodes="$2"; shift ;;
         --num-gpus) num_gpus="$2"; shift ;;
         --max-model-len) max_model_len="$2"; shift ;;
+        --max-num-seqs) max_num_seqs="$2"; shift ;;
         --vocab-size) vocab_size="$2"; shift ;;
         --data-type) data_type="$2"; shift ;;
-        --venv)
+        --venv) venv="$2"; shift ;;
         --log-dir) log_dir="$2"; shift ;;
+        --model-weights-parent-dir) model_weights_parent_dir="$2"; shift ;;
         --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
+        --enforce-eager) enforce_eager="$2"; shift ;;
         *) echo "Unknown parameter passed: $1"; exit 1 ;;
     esac
     shift
 done
 
-required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size)
+required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
 
 for var in "${required_vars[@]}"; do
     if [ -z "${!var}" ]; then
-        echo "Error: Missing required --$var
+        echo "Error: Missing required --$var argument."
         exit 1
     fi
 done
@@ -40,27 +43,27 @@ export NUM_NODES=$num_nodes
 export NUM_GPUS=$num_gpus
 export VLLM_MAX_MODEL_LEN=$max_model_len
 export VLLM_MAX_LOGPROBS=$vocab_size
-
-export
-export
-export
-
-
-
-
-export
-fi
-
-if [ -n "$virtual_env" ]; then
-    export VENV_BASE=$virtual_env
-fi
-
-if [ -n "$log_dir" ]; then
-    export LOG_DIR=$log_dir
+export VLLM_DATA_TYPE=$data_type
+export VENV_BASE=$venv
+export LOG_DIR=$log_dir
+export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
+
+if [ -n "$max_num_seqs" ]; then
+    export VLLM_MAX_NUM_SEQS=$max_num_seqs
+else
+    export VLLM_MAX_NUM_SEQS=256
 fi
 
 if [ -n "$pipeline_parallelism" ]; then
     export PIPELINE_PARALLELISM=$pipeline_parallelism
+else
+    export PIPELINE_PARALLELISM="False"
+fi
+
+if [ -n "$enforce_eager" ]; then
+    export ENFORCE_EAGER=$enforce_eager
+else
+    export ENFORCE_EAGER="False"
 fi
 
 # ================================= Set default environment variables ======================================
@@ -72,13 +75,12 @@ fi
 mkdir -p $LOG_DIR
 
 # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
-# SLURM job
+# SLURM job
 export SRC_DIR="$(dirname "$0")"
 export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
-export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
 
 # Variables specific to your working environment, below are examples for the Vector cluster
-export VLLM_MODEL_WEIGHTS="
+export VLLM_MODEL_WEIGHTS="${MODEL_WEIGHTS_PARENT_DIR}/${JOB_NAME}"
 export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
 
 
@@ -93,11 +95,6 @@ if [[ $fp16_partitions =~ $JOB_PARTITION ]]; then
     echo "Data type set to due to non-Ampere GPUs used: $VLLM_DATA_TYPE"
 fi
 
-# Create a file to store the API server URL if it doesn't exist
-if [ -f $VLLM_BASE_URL_FILENAME ]; then
-    touch $VLLM_BASE_URL_FILENAME
-fi
-
 echo Job Name: $JOB_NAME
 echo Partition: $JOB_PARTITION
 echo Num Nodes: $NUM_NODES
@@ -105,6 +102,13 @@ echo GPUs per Node: $NUM_GPUS
 echo QOS: $QOS
 echo Walltime: $WALLTIME
 echo Data Type: $VLLM_DATA_TYPE
+echo Max Model Length: $VLLM_MAX_MODEL_LEN
+echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
+echo Vocabulary Size: $VLLM_MAX_LOGPROBS
+echo Pipeline Parallelism: $PIPELINE_PARALLELISM
+echo Enforce Eager: $ENFORCE_EAGER
+echo Log Directory: $LOG_DIR
+echo Model Weights Parent Directory: $MODEL_WEIGHTS_PARENT_DIR
 
 is_special=""
 if [ "$NUM_NODES" -gt 1 ]; then
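To show how the new arguments parsed above are typically supplied, here is a hypothetical launch through the `vec-inf` CLI, which builds the call into this script; the model name and values are illustrative, and anything left unset falls back to the per-model defaults in `models.csv`:

    vec-inf launch Qwen2.5-7B-Instruct \
        --max-num-seqs 192 \
        --model-weights-parent-dir /model-weights \
        --enforce-eager True \
        --venv singularity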
vec_inf/models/README.md
CHANGED
@@ -1,13 +1,17 @@
 # Available Models
 More profiling metrics coming soon!
 
-##
+## Text Generation Models
+
+### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
+| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
+| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
 
-
+### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
 | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
 | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
 
-
+### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
+| [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
 
-
+### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
 | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
 | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
 
-
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
-
-## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
-
-## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
 
 | Variant | Suggested resource allocation |
 |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
 | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
 | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
 
-
+### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
 | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
 
-
+### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,135 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
 | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
 
-
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Mistral AI: Mistral](https://huggingface.co/mistralai)
 
 | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
-|[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)|
+| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
+| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
 
-
+### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
 
 | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
-|[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
-|[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
+| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
+| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
+| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
 
-
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
+
+## Vision Language Models
+
+### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
+
+### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+
+**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelsim, to save memory, maximum number of requests is reduced and enforce eager mode is on.
+
+### [Mistral: Pixtral](https://huggingface.co/mistralai)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
+
+## Text Embedding Models
+
+### [Liang Wang: e5](https://huggingface.co/intfloat)
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+## Reward Modeling Models
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv
CHANGED
@@ -1,46 +1,73 @@
-model_name,model_family,model_variant,
-c4ai-command-r-plus,c4ai-command-r,plus,
-
-
-CodeLlama-
-CodeLlama-
-CodeLlama-
-CodeLlama-
-CodeLlama-
-CodeLlama-
-
-
-
-gemma-2-
-gemma-2-
-
-
-Llama-2-
-Llama-2-
-Llama-2-
-Llama-2-
-
-
-llava-
-llava-
-
-
-Meta-Llama-3-
-Meta-Llama-3-
-Meta-Llama-3
-Meta-Llama-3
-Meta-Llama-3.1-
-Meta-Llama-3.1-
-Meta-Llama-3.1-
-
-
-Mistral-7B-
-Mistral-7B-v0.
-Mistral-7B-Instruct-v0.
-Mistral-
-
-
-
-
-
-
+model_name,model_family,model_variant,model_type,num_gpus,num_nodes,vocab_size,max_model_len,max_num_seqs,pipeline_parallelism,enforce_eager,qos,time,partition,data_type,venv,log_dir,model_weights_parent_dir
+c4ai-command-r-plus,c4ai-command-r,plus,LLM,4,2,256000,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+c4ai-command-r-plus-08-2024,c4ai-command-r,plus-08-2024,LLM,4,2,256000,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+c4ai-command-r-08-2024,c4ai-command-r,08-2024,LLM,2,1,256000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-7b-hf,CodeLlama,7b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-13b-hf,CodeLlama,13b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-34b-hf,CodeLlama,34b-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-70b-hf,CodeLlama,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+dbrx-instruct,dbrx,instruct,LLM,4,2,100352,32000,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+gemma-2-9b,gemma-2,9b,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+gemma-2-9b-it,gemma-2,9b-it,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+gemma-2-27b,gemma-2,27b,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+gemma-2-27b-it,gemma-2,27b-it,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-7b-hf,Llama-2,7b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-13b-hf,Llama-2,13b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-70b-hf,Llama-2,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+llava-1.5-7b-hf,llava-1.5,7b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+llava-1.5-13b-hf,llava-1.5,13b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,VLM,1,1,32064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+llava-v1.6-34b-hf,llava-v1.6,34b-hf,VLM,2,1,64064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3-8B,Meta-Llama-3,8B,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3-70B,Meta-Llama-3,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,LLM,4,8,128256,16384,256,true,false,m4,02:00:00,a40,auto,singularity,default,/model-weights
+Mistral-7B-v0.1,Mistral,7B-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-7B-v0.3,Mistral,7B-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mistral-Large-Instruct-2411,Mistral,Large-Instruct-2411,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,LLM,4,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,LLM,2,1,32064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,VLM,2,1,32064,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.1-Nemotron-70B-Instruct-HF,Llama-3.1-Nemotron,70B-Instruct-HF,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-1B,Llama-3.2,1B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-1B-Instruct,Llama-3.2,1B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-3B,Llama-3.2,3B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-3B-Instruct,Llama-3.2,3B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-11B-Vision,Llama-3.2,11B-Vision,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-11B-Vision-Instruct,Llama-3.2,11B-Vision-Instruct,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-90B-Vision,Llama-3.2,90B-Vision,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Llama-3.2-90B-Vision-Instruct,Llama-3.2,90B-Vision-Instruct,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-0.5B-Instruct,Qwen2.5,0.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-1.5B-Instruct,Qwen2.5,1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-3B-Instruct,Qwen2.5,3B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-7B-Instruct,Qwen2.5,7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-14B-Instruct,Qwen2.5,14B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-32B-Instruct,Qwen2.5,32B-Instruct,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-72B-Instruct,Qwen2.5,72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-Math-1.5B-Instruct,Qwen2.5,Math-1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-Math-7B-Instruct,Qwen2.5,Math-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-Math-72B-Instruct,Qwen2.5,Math-72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
+e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
vec_inf/multinode_vllm.slurm
CHANGED
@@ -5,13 +5,14 @@
 #SBATCH --tasks-per-node=1
 
 # Load CUDA, change to the cuda version on your environment if different
+source /opt/lmod/lmod/init/profile
 module load cuda-12.3
 nvidia-smi
 
 source ${SRC_DIR}/find_port.sh
 
 if [ "$VENV_BASE" = "singularity" ]; then
-    export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.
+    export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
     export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
     module load singularity-ce/3.8.2
     singularity exec $SINGULARITY_IMAGE ray stop
@@ -35,7 +36,7 @@ echo "IP Head: $ip_head"
 echo "Starting HEAD at $head_node"
 if [ "$VENV_BASE" = "singularity" ]; then
     srun --nodes=1 --ntasks=1 -w "$head_node" \
-        singularity exec --nv --bind
+        singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
         ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
         --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
 else
@@ -56,7 +57,7 @@ for ((i = 1; i <= worker_num; i++)); do
     echo "Starting WORKER $i at $node_i"
     if [ "$VENV_BASE" = "singularity" ]; then
         srun --nodes=1 --ntasks=1 -w "$node_i" \
-            singularity exec --nv --bind
+            singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
             ray start --address "$ip_head" \
             --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
     else
@@ -72,9 +73,8 @@ done
 vllm_port_number=$(find_available_port $head_node_ip 8080 65535)
 
 echo "Server address: http://${head_node_ip}:${vllm_port_number}/v1"
-echo "http://${head_node_ip}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
 
-if [ "$PIPELINE_PARALLELISM" = "
+if [ "$PIPELINE_PARALLELISM" = "True" ]; then
     export PIPELINE_PARALLEL_SIZE=$NUM_NODES
     export TENSOR_PARALLEL_SIZE=$NUM_GPUS
 else
@@ -82,9 +82,15 @@ else
     export TENSOR_PARALLEL_SIZE=$((NUM_NODES*NUM_GPUS))
 fi
 
+if [ "$ENFORCE_EAGER" = "True" ]; then
+    export ENFORCE_EAGER="--enforce-eager"
+else
+    export ENFORCE_EAGER=""
+fi
+
 # Activate vllm venv
 if [ "$VENV_BASE" = "singularity" ]; then
-    singularity exec --nv --bind
+    singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
         python3.10 -m vllm.entrypoints.openai.api_server \
         --model ${VLLM_MODEL_WEIGHTS} \
         --served-model-name ${JOB_NAME} \
@@ -95,7 +101,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
         --dtype ${VLLM_DATA_TYPE} \
         --trust-remote-code \
         --max-logprobs ${VLLM_MAX_LOGPROBS} \
-        --max-model-len ${VLLM_MAX_MODEL_LEN}
+        --max-model-len ${VLLM_MAX_MODEL_LEN} \
+        --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
+        ${ENFORCE_EAGER}
 else
     source ${VENV_BASE}/bin/activate
     python3 -m vllm.entrypoints.openai.api_server \
@@ -108,5 +116,7 @@ else
         --dtype ${VLLM_DATA_TYPE} \
         --trust-remote-code \
         --max-logprobs ${VLLM_MAX_LOGPROBS} \
-        --max-model-len ${VLLM_MAX_MODEL_LEN}
+        --max-model-len ${VLLM_MAX_MODEL_LEN} \
+        --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
+        ${ENFORCE_EAGER}
 fi
vec_inf/vllm.slurm
CHANGED
@@ -3,6 +3,7 @@
 #SBATCH --mem=64G
 
 # Load CUDA, change to the cuda version on your environment if different
+source /opt/lmod/lmod/init/profile
 module load cuda-12.3
 nvidia-smi
 
@@ -13,15 +14,20 @@ hostname=${SLURMD_NODENAME}
 vllm_port_number=$(find_available_port $hostname 8080 65535)
 
 echo "Server address: http://${hostname}:${vllm_port_number}/v1"
-
+
+if [ "$ENFORCE_EAGER" = "True" ]; then
+    export ENFORCE_EAGER="--enforce-eager"
+else
+    export ENFORCE_EAGER=""
+fi
 
 # Activate vllm venv
 if [ "$VENV_BASE" = "singularity" ]; then
-    export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.
+    export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
     export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
     module load singularity-ce/3.8.2
     singularity exec $SINGULARITY_IMAGE ray stop
-    singularity exec --nv --bind
+    singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
     python3.10 -m vllm.entrypoints.openai.api_server \
     --model ${VLLM_MODEL_WEIGHTS} \
     --served-model-name ${JOB_NAME} \
@@ -31,7 +37,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
     --dtype ${VLLM_DATA_TYPE} \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
    --trust-remote-code \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
+    ${ENFORCE_EAGER}
 else
     source ${VENV_BASE}/bin/activate
     python3 -m vllm.entrypoints.openai.api_server \
@@ -43,5 +51,7 @@ else
     --dtype ${VLLM_DATA_TYPE} \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --trust-remote-code \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
+    ${ENFORCE_EAGER}
 fi
vec_inf-0.4.0.post1.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Vector Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
{vec_inf-0.3.3.dist-info → vec_inf-0.4.0.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: vec-inf
- Version: 0.
+ Version: 0.4.0.post1
Summary: Efficient LLM inference on Slurm clusters using vLLM.
License: MIT
Author: Marshall Wang
@@ -11,19 +11,21 @@ Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
Provides-Extra: dev
Requires-Dist: click (>=8.1.0,<9.0.0)
Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
- Requires-Dist:
+ Requires-Dist: numpy (>=1.24.0,<2.0.0)
+ Requires-Dist: polars (>=1.15.0,<2.0.0)
Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
Requires-Dist: requests (>=2.31.0,<3.0.0)
Requires-Dist: rich (>=13.7.0,<14.0.0)
- Requires-Dist: vllm (>=0.
+ Requires-Dist: vllm (>=0.6.0,<0.7.0) ; extra == "dev"
Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
Description-Content-Type: text/markdown

# Vector Inference: Easy inference on Slurm clusters
- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/models/models.csv) accordingly.

## Installation
If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -33,16 +35,23 @@ pip install vec-inf
Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package

## Launch an inference server
+ ### `launch` command
We will use the Llama 3.1 model as example, to launch an OpenAI compatible inference server for Meta-Llama-3.1-8B-Instruct, run:
```bash
vec-inf launch Meta-Llama-3.1-8B-Instruct
```
You should see an output like the following:

- <img width="
+ <img width="700" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">

- The model would be launched using the [default parameters](
+ The model would be launched using the [default parameters](vec_inf/models/models.csv), you can override these values by providing additional parameters, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
+ * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`.
+ * Your model weights directory should contain HF format weights.
+ * The following launch parameters will conform to default value if not specified: `--max-num-seqs`, `--partition`, `--data-type`, `--venv`, `--log-dir`, `--model-weights-parent-dir`, `--pipeline-parallelism`, `--enforce-eager`. All other launch parameters need to be specified for custom models.
+ * Example for setting the model weights parent directory: `--model-weights-parent-dir /h/user_name/my_weights`.
+ * For other model launch parameters you can reference the default values for similar models using the [`list` command ](#list-command).

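To illustrate the bullet points above (the model name, weights path, and parameter values below are hypothetical and not part of this release), a custom-model launch could look like:

```bash
# Hypothetical weights directory: /h/user_name/my_weights/MyFamily-7b-chat
# (name follows $MODEL_FAMILY-$MODEL_VARIANT and contains HF-format weights)
vec-inf launch MyFamily-7b-chat \
    --model-weights-parent-dir /h/user_name/my_weights \
    --max-model-len 8192 \
    --max-num-seqs 256 \
    --vocab-size 32000 \
    --num-nodes 1 \
    --qos m2 \
    --time 04:00:00
```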
+ ### `status` command
You can check the inference server status by providing the Slurm job ID to the `status` command:
```bash
vec-inf status 13014393
@@ -62,6 +71,17 @@ There are 5 possible states:

Note that the base URL is only available when model is in `READY` state, and if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.

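Once the status reports `READY`, the printed base URL behaves like any OpenAI-compatible endpoint. A minimal sketch of a request against it (the hostname and port are placeholders from a hypothetical launch, not values from this diff):

```bash
# Base URL as reported by `vec-inf status <slurm_job_id>` once the model is READY
BASE_URL="http://gpu001:8080/v1"

curl "${BASE_URL}/completions" \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Meta-Llama-3.1-8B-Instruct",
          "prompt": "What is the capital of Canada?",
          "max_tokens": 20
        }'
```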
+ ### `metrics` command
+ Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
+ ```bash
+ vec-inf metrics 13014393
+ ```
+
+ And you will see the performance metrics streamed to your console, note that the metrics are updated with a 10-second interval.
+
+ <img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/e5ff2cd5-659b-4c88-8ebc-d8f3fdc023a4">
+
+ ### `shutdown` command
Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
```bash
vec-inf shutdown 13014393
@@ -69,17 +89,19 @@ vec-inf shutdown 13014393
> Shutting down model with Slurm Job ID: 13014393
```

+ ### `list` command
You call view the full list of available models by running the `list` command:
```bash
vec-inf list
```
- <img width="
+ <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
+

You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
```bash
vec-inf list Meta-Llama-3.1-70B-Instruct
```
- <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/
+ <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/30e42ab7-dde2-4d20-85f0-187adffefc3d">

`launch`, `list`, and `status` command supports `--json-mode`, where the command output would be structured as a JSON string.

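For instance, combining the `status` command with `--json-mode` (same illustrative job ID as above) yields machine-readable output that can be piped into other tools; the exact output fields are not shown in this diff:

```bash
vec-inf status 13014393 --json-mode
```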
vec_inf-0.4.0.post1.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+ vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
+ vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
+ vec_inf/launch_server.sh,sha256=gFovqXuYiQ8bEc6O31WTMDuBoNj7opB5iVfnCDhz2Nw,4165
+ vec_inf/models/README.md,sha256=YNEVTWliHehCpJTq2SXAidqgFl6CWL6GUOnAPksDYFE,14844
+ vec_inf/models/models.csv,sha256=f_cNeM7L0-4pgZqYfWilQd12-WVec2IVk6dRq5BE4mE,9875
+ vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
+ vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
+ vec_inf-0.4.0.post1.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
+ vec_inf-0.4.0.post1.dist-info/METADATA,sha256=Q6KhU-ggnR9FB5YUjWrPwy2MSd_c9GCFXAQqT9YXZOw,7032
+ vec_inf-0.4.0.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ vec_inf-0.4.0.post1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
+ vec_inf-0.4.0.post1.dist-info/RECORD,,
vec_inf-0.3.3.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
- vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
- vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
- vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
- vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
- vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
- vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
- vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
- vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
- vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
- vec_inf-0.3.3.dist-info/RECORD,,
{vec_inf-0.3.3.dist-info → vec_inf-0.4.0.post1.dist-info}/entry_points.txt
File without changes