vec-inf: vec_inf-0.3.1-py3-none-any.whl → vec_inf-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +2 -2
- vec_inf/cli/_cli.py +93 -113
- vec_inf/cli/_utils.py +33 -25
- vec_inf/launch_server.sh +2 -2
- vec_inf/models/README.md +16 -16
- vec_inf/models/models.csv +2 -1
- vec_inf/multinode_vllm.slurm +3 -5
- vec_inf/vllm.slurm +2 -2
- {vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/METADATA +6 -5
- vec_inf-0.3.3.dist-info/RECORD +15 -0
- vec_inf-0.3.1.dist-info/RECORD +0 -15
- {vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/WHEEL +0 -0
- {vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/entry_points.txt +0 -0
vec_inf/README.md
CHANGED
@@ -3,6 +3,6 @@
 * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
 * `list`: List all available model names, `--json-mode` supported.
 * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
-* `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `shutdown`: Shutdown a model by providing its Slurm job ID.
 
-Use `--help` to see all available options
+Use `--help` to see all available options
vec_inf/cli/_cli.py
CHANGED
@@ -1,12 +1,12 @@
 import os
+from typing import Optional
 
 import click
-from rich.console import Console
 from rich.columns import Columns
+from rich.console import Console
 from rich.panel import Panel
 
-
-
+import vec_inf.cli._utils as utils
 
 CONSOLE = Console()
 
@@ -18,122 +18,107 @@ def cli():
 
 
 @cli.command("launch")
-@click.argument(
-
-
-    nargs=1
-)
-@click.option(
-    "--model-family",
-    type=str,
-    help='The model family'
-)
-@click.option(
-    "--model-variant",
-    type=str,
-    help='The model variant'
-)
+@click.argument("model-name", type=str, nargs=1)
+@click.option("--model-family", type=str, help="The model family")
+@click.option("--model-variant", type=str, help="The model variant")
 @click.option(
     "--max-model-len",
     type=int,
-    help=
-)
-@click.option(
-    "--partition",
-    type=str,
-    help='Type of compute partition, default to a40'
+    help="Model context length. If unspecified, will be automatically derived from the model config.",
 )
+@click.option("--partition", type=str, help="Type of compute partition, default to a40")
 @click.option(
     "--num-nodes",
     type=int,
-    help=
+    help="Number of nodes to use, default to suggested resource allocation for model",
 )
 @click.option(
     "--num-gpus",
     type=int,
-    help=
+    help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
 @click.option(
     "--qos",
     type=str,
-    help=
+    help="Quality of service, default depends on suggested resource allocation required for the model",
 )
 @click.option(
     "--time",
     type=str,
-    help=
+    help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
 )
 @click.option(
-    "--
-    type=
-    help=
-)
-@click.option(
-    "--venv",
-    type=str,
-    help='Path to virtual environment'
+    "--vocab-size",
+    type=int,
+    help="Vocabulary size, this option is intended for custom models",
 )
+@click.option("--data-type", type=str, help="Model data type, default to auto")
+@click.option("--venv", type=str, help="Path to virtual environment")
 @click.option(
     "--log-dir",
     type=str,
-    help=
+    help="Path to slurm log directory, default to .vec-inf-logs in home directory",
 )
 @click.option(
     "--json-mode",
     is_flag=True,
-    help=
+    help="Output in JSON string",
 )
 def launch(
     model_name: str,
-    model_family: str=None,
-    model_variant: str=None,
-    max_model_len: int=None,
-    partition: str=None,
-    num_nodes: int=None,
-    num_gpus: int=None,
-    qos: str=None,
-    time: str=None,
-
-
-
-
+    model_family: Optional[str] = None,
+    model_variant: Optional[str] = None,
+    max_model_len: Optional[int] = None,
+    partition: Optional[str] = None,
+    num_nodes: Optional[int] = None,
+    num_gpus: Optional[int] = None,
+    qos: Optional[str] = None,
+    time: Optional[str] = None,
+    vocab_size: Optional[int] = None,
+    data_type: Optional[str] = None,
+    venv: Optional[str] = None,
+    log_dir: Optional[str] = None,
+    json_mode: bool = False,
 ) -> None:
     """
     Launch a model on the cluster
     """
     launch_script_path = os.path.join(
-        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
-        "launch_server.sh"
+        os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
     )
-    launch_cmd = f"bash {launch_script_path}"
-
-    models_df = load_models_df()
+    launch_cmd = f"bash {launch_script_path}"
 
-
-        raise ValueError(f"Model name {model_name} not found in available models")
+    models_df = utils.load_models_df()
 
-
+    if model_name in models_df["model_name"].values:
+        default_args = utils.load_default_args(models_df, model_name)
+        for arg in default_args:
+            if arg in locals() and locals()[arg] is not None:
+                default_args[arg] = locals()[arg]
+            renamed_arg = arg.replace("_", "-")
+            launch_cmd += f" --{renamed_arg} {default_args[arg]}"
+    else:
+        model_args = models_df.columns.tolist()
+        excluded_keys = ["model_name", "pipeline_parallelism"]
+        for arg in model_args:
+            if arg not in excluded_keys and locals()[arg] is not None:
+                renamed_arg = arg.replace("_", "-")
+                launch_cmd += f" --{renamed_arg} {locals()[arg]}"
 
-
-        if arg in locals() and locals()[arg] is not None:
-            default_args[arg] = locals()[arg]
-        renamed_arg = arg.replace("_", "-")
-        launch_cmd += f" --{renamed_arg} {default_args[arg]}"
-
-    output = run_bash_command(launch_cmd)
+    output = utils.run_bash_command(launch_cmd)
 
     slurm_job_id = output.split(" ")[-1].strip().strip("\n")
     output_lines = output.split("\n")[:-2]
 
-    table = create_table(key_title="Job Config", value_title="Value")
+    table = utils.create_table(key_title="Job Config", value_title="Value")
     table.add_row("Slurm Job ID", slurm_job_id, style="blue")
     output_dict = {"slurm_job_id": slurm_job_id}
-
+
     for line in output_lines:
         key, value = line.split(": ")
         table.add_row(key, value)
         output_dict[key.lower().replace(" ", "_")] = value
-
+
     if json_mode:
         click.echo(output_dict)
     else:
@@ -141,27 +126,25 @@ def launch(
 
 
 @cli.command("status")
-@click.argument(
-    "slurm_job_id",
-    type=int,
-    nargs=1
-)
+@click.argument("slurm_job_id", type=int, nargs=1)
 @click.option(
     "--log-dir",
     type=str,
-    help=
+    help="Path to slurm log directory. This is required if --log-dir was set in model launch",
 )
 @click.option(
     "--json-mode",
     is_flag=True,
-    help=
+    help="Output in JSON string",
 )
-def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
+def status(
+    slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
+) -> None:
     """
     Get the status of a running model on the cluster
     """
     status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
-    output = run_bash_command(status_cmd)
+    output = utils.run_bash_command(status_cmd)
 
     slurm_job_name = "UNAVAILABLE"
     status = "SHUTDOWN"
@@ -181,36 +164,39 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
     # If Slurm job is currently RUNNING
     elif slurm_job_state == "RUNNING":
         # Check whether the server is ready, if yes, run model health check to further determine status
-        server_status = is_server_running(slurm_job_name, slurm_job_id, log_dir)
+        server_status = utils.is_server_running(slurm_job_name, slurm_job_id, log_dir)
         # If server status is a tuple, then server status is "FAILED"
-        if
+        if isinstance(server_status, tuple):
             status = server_status[0]
             slurm_job_failed_reason = server_status[1]
         elif server_status == "RUNNING":
-
-
+            model_status = utils.model_health_check(
+                slurm_job_name, slurm_job_id, log_dir
+            )
+            if model_status == "READY":
                 # Only set base_url if model is ready to serve requests
-                base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
+                base_url = utils.get_base_url(slurm_job_name, slurm_job_id, log_dir)
+                status = "READY"
             else:
                 # If model is not ready, then status must be "FAILED"
-                status =
-                slurm_job_failed_reason =
+                status = model_status[0]
+                slurm_job_failed_reason = str(model_status[1])
         else:
             status = server_status
 
     if json_mode:
         status_dict = {
-            "model_name": slurm_job_name,
-            "model_status": status,
-            "base_url": base_url
+            "model_name": slurm_job_name,
+            "model_status": status,
+            "base_url": base_url,
         }
         if "slurm_job_pending_reason" in locals():
            status_dict["pending_reason"] = slurm_job_pending_reason
         if "slurm_job_failed_reason" in locals():
            status_dict["failed_reason"] = slurm_job_failed_reason
-        click.echo(f
+        click.echo(f"{status_dict}")
     else:
-        table = create_table(key_title="Job Status", value_title="Value")
+        table = utils.create_table(key_title="Job Status", value_title="Value")
         table.add_row("Model Name", slurm_job_name)
         table.add_row("Model Status", status, style="blue")
         if "slurm_job_pending_reason" in locals():
@@ -219,60 +205,54 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
            table.add_row("Reason", slurm_job_failed_reason)
         table.add_row("Base URL", base_url)
         CONSOLE.print(table)
-
+
 
 @cli.command("shutdown")
-@click.argument(
-    "slurm_job_id",
-    type=int,
-    nargs=1
-)
+@click.argument("slurm_job_id", type=int, nargs=1)
 def shutdown(slurm_job_id: int) -> None:
     """
     Shutdown a running model on the cluster
     """
     shutdown_cmd = f"scancel {slurm_job_id}"
-    run_bash_command(shutdown_cmd)
+    utils.run_bash_command(shutdown_cmd)
     click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
 
 
 @cli.command("list")
-@click.argument(
-    "model-name",
-    required=False)
+@click.argument("model-name", required=False)
 @click.option(
     "--json-mode",
     is_flag=True,
-    help=
+    help="Output in JSON string",
 )
-def list(model_name: str=None, json_mode: bool=False) -> None:
+def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
     """
     List all available models, or get default setup of a specific model
     """
-    models_df = load_models_df()
+    models_df = utils.load_models_df()
 
     if model_name:
-        if model_name not in models_df[
+        if model_name not in models_df["model_name"].values:
            raise ValueError(f"Model name {model_name} not found in available models")
-
-        excluded_keys = {
-        model_row = models_df.loc[models_df[
+
+        excluded_keys = {"venv", "log_dir", "pipeline_parallelism"}
+        model_row = models_df.loc[models_df["model_name"] == model_name]
 
         if json_mode:
            # click.echo(model_row.to_json(orient='records'))
-            filtered_model_row = model_row.drop(columns=excluded_keys, errors=
-            click.echo(filtered_model_row.to_json(orient=
+            filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
+            click.echo(filtered_model_row.to_json(orient="records"))
            return
-        table = create_table(key_title="Model Config", value_title="Value")
+        table = utils.create_table(key_title="Model Config", value_title="Value")
         for _, row in model_row.iterrows():
            for key, value in row.items():
                if key not in excluded_keys:
                    table.add_row(key, str(value))
         CONSOLE.print(table)
         return
-
+
     if json_mode:
-        click.echo(models_df[
+        click.echo(models_df["model_name"].to_json(orient="records"))
        return
     panels = []
     for _, row in models_df.iterrows():
@@ -281,5 +261,5 @@ def list(model_name: str=None, json_mode: bool=False) -> None:
     CONSOLE.print(Columns(panels, equal=True))
 
 
-if __name__ ==
-    cli()
+if __name__ == "__main__":
+    cli()
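The most substantive change in the `_cli.py` hunks above is the new `launch` body: instead of a single flat override loop, it now branches on whether the requested model exists in `models.csv`, merging defaults with CLI overrides for known models and forwarding only user-supplied flags for custom ones. The sketch below isolates that defaults-plus-overrides pattern; it is a simplified illustration written for this note (the `build_launch_cmd` helper and the example dictionaries are invented), not the package's actual code.

```python
# Minimal sketch of the defaults-plus-overrides pattern used by the new `launch`
# command: known models start from their models.csv defaults and CLI values win;
# unknown models only pass through whatever the user supplied.
from typing import Optional


def build_launch_cmd(
    defaults: Optional[dict], cli_args: dict, launch_script: str = "launch_server.sh"
) -> str:
    launch_cmd = f"bash {launch_script}"
    if defaults is not None:
        # Known model: start from the CSV defaults, let CLI values override them.
        merged = dict(defaults)
        for arg, value in cli_args.items():
            if value is not None:
                merged[arg] = value
        args = merged
    else:
        # Unknown/custom model: forward only what the user explicitly provided.
        args = {arg: value for arg, value in cli_args.items() if value is not None}
    for arg, value in args.items():
        # Same flag renaming as the CLI: underscores become dashes.
        launch_cmd += f" --{arg.replace('_', '-')} {value}"
    return launch_cmd


if __name__ == "__main__":
    defaults = {"num_nodes": 1, "num_gpus": 4, "max_model_len": 8192}  # hypothetical values
    overrides = {"num_nodes": None, "num_gpus": 2, "max_model_len": None}
    print(build_launch_cmd(defaults, overrides))
    # bash launch_server.sh --num-nodes 1 --num-gpus 2 --max-model-len 8192
```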
vec_inf/cli/_utils.py
CHANGED
@@ -1,11 +1,10 @@
-import subprocess
 import os
-
+import subprocess
+from typing import Optional, Union
 
+import pandas as pd
 import requests
 from rich.table import Table
-import pandas as pd
-
 
 MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
 SERVER_ADDRESS_SIGNATURE = "Server address: "
@@ -15,45 +14,50 @@ def run_bash_command(command: str) -> str:
     """
     Run a bash command and return the output
     """
-    process = subprocess.Popen(
+    process = subprocess.Popen(
+        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
     stdout, _ = process.communicate()
     return stdout
 
 
 def read_slurm_log(
-
-
-    slurm_log_type: str,
-    log_dir: str
-) -> Union[list, str]:
+    slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
+) -> Union[list[str], str]:
     """
     Get the directory of a model
     """
     if not log_dir:
         models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
-
+
         for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
             if dir in slurm_job_name:
                 log_dir = os.path.join(models_dir, dir)
                 break
-
+
     try:
-        file_path = os.path.join(
-
+        file_path = os.path.join(
+            log_dir,  # type: ignore
+            f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
+        )
+        with open(file_path, "r") as file:
             lines = file.readlines()
     except FileNotFoundError:
         print(f"Could not find file: {file_path}")
         return "LOG_FILE_NOT_FOUND"
     return lines
 
-
+
+def is_server_running(
+    slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+) -> Union[str, tuple[str, str]]:
     """
     Check if a model is ready to serve requests
     """
     log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
-    if
+    if isinstance(log_content, str):
         return log_content
-
+
     for line in log_content:
         if "error" in line.lower():
             return ("FAILED", line.strip("\n"))
@@ -62,21 +66,23 @@ def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> U
     return "LAUNCHING"
 
 
-def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
+def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
     """
     Get the base URL of a model
     """
     log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
-    if
+    if isinstance(log_content, str):
         return log_content
-
+
     for line in log_content:
         if SERVER_ADDRESS_SIGNATURE in line:
             return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
     return "URL_NOT_FOUND"
 
 
-def model_health_check(
+def model_health_check(
+    slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+) -> Union[str, tuple[str, Union[str, int]]]:
     """
     Check the health of a running model on the cluster
     """
@@ -94,9 +100,11 @@ def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) ->
         return ("FAILED", response.status_code)
     except requests.exceptions.RequestException as e:
         return ("FAILED", str(e))
-
 
-
+
+def create_table(
+    key_title: str = "", value_title: str = "", show_header: bool = True
+) -> Table:
     """
     Create a table for displaying model status
     """
@@ -113,7 +121,7 @@ def load_models_df() -> pd.DataFrame:
     models_df = pd.read_csv(
         os.path.join(
             os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
-            "models/models.csv"
+            "models/models.csv",
        )
     )
     return models_df
@@ -126,4 +134,4 @@ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
     row_data = models_df.loc[models_df["model_name"] == model_name]
     default_args = row_data.iloc[0].to_dict()
     default_args.pop("model_name")
-    return default_args
+    return default_args
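The `_utils.py` changes keep the existing convention that the log/health helpers signal failure by returning a value rather than raising: `is_server_running` and `model_health_check` return either a plain status string or a `("FAILED", reason)` tuple, and the `status` command distinguishes the two with `isinstance` checks. Below is a small hedged sketch of that convention; the `check_server` helper and its fake log lines are invented for illustration and are not the package's functions.

```python
# Sketch of the string-or-tuple status convention used by is_server_running /
# model_health_check and consumed by the `status` command.
from typing import Union


def check_server(log_lines: list[str]) -> Union[str, tuple[str, str]]:
    # Mirror the pattern in the diff: scan stderr-style log lines, report the
    # first error as a ("FAILED", reason) tuple, otherwise return a bare status string.
    for line in log_lines:
        if "error" in line.lower():
            return ("FAILED", line.strip("\n"))
        if "Uvicorn running on" in line:
            return "RUNNING"
    return "LAUNCHING"


def summarize(result: Union[str, tuple[str, str]]) -> str:
    # Callers branch on the type, exactly as the new status command does.
    if isinstance(result, tuple):
        status, reason = result
        return f"{status}: {reason}"
    return result


print(summarize(check_server(["INFO: Uvicorn running on http://0.0.0.0:8080"])))  # RUNNING
print(summarize(check_server(["RuntimeError: CUDA error\n"])))  # FAILED: RuntimeError: CUDA error
```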
vec_inf/launch_server.sh
CHANGED
@@ -76,7 +76,7 @@ mkdir -p $LOG_DIR
 export SRC_DIR="$(dirname "$0")"
 export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
 export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
-
+
 # Variables specific to your working environment, below are examples for the Vector cluster
 export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
 export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
@@ -119,4 +119,4 @@ sbatch --job-name $JOB_NAME \
     --time $WALLTIME \
     --output $LOG_DIR/$JOB_NAME.%j.out \
     --error $LOG_DIR/$JOB_NAME.%j.err \
-    $SRC_DIR/${is_special}vllm.slurm
+    $SRC_DIR/${is_special}vllm.slurm
vec_inf/models/README.md
CHANGED
@@ -1,13 +1,13 @@
 # Available Models
 More profiling metrics coming soon!
 
-## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
+## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
 
-## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +20,13 @@ More profiling metrics coming soon!
 | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
 | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
 
-## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
+## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
 
-## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
+## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -35,32 +35,32 @@ More profiling metrics coming soon!
 | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
 | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
 
-## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
 |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
 
-## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
 |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
 
-## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
 
 | Variant | Suggested resource allocation |
 |:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
+| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
+| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
+| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
 | [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
 | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
 | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
 
-## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
+## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +69,7 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
 | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
 
-## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
+## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -79,7 +79,7 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
 | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
 
-## [Mistral AI: Mistral](https://huggingface.co/mistralai)
+## [Mistral AI: Mistral](https://huggingface.co/mistralai)
 
 | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -90,7 +90,7 @@ More profiling metrics coming soon!
 |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
 |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 4x a40 | - tokens/s | - tokens/s|
 
-## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
+## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
 
 | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -98,9 +98,9 @@ More profiling metrics coming soon!
 |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
 |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
 
-## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv
CHANGED
@@ -42,4 +42,5 @@ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,
 Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
 Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
 Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
-Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
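The one content change here is the new `Llama3-OpenBioLLM-70B` row, which becomes a launchable model because `launch` reads its defaults straight from this CSV via `load_models_df`/`load_default_args` (see the `_utils.py` hunks above). A hedged sketch of that path follows; the CSV header row is not visible in this diff, so the column names used below are assumptions inferred from the CLI options, not the file's actual header.

```python
# Hedged sketch: turn a models.csv row into launch defaults, following the shape
# of load_models_df / load_default_args from the _utils.py hunks. Column names
# are assumed for illustration only.
import io

import pandas as pd

CSV = io.StringIO(
    "model_name,model_family,model_variant,partition,qos,time,"
    "num_gpus,num_nodes,vocab_size,max_model_len,data_type,venv,log_dir,pipeline_parallelism\n"
    "Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,"
    "4,1,128256,8192,auto,singularity,default,false\n"
)

models_df = pd.read_csv(CSV)

# Same shape as load_default_args: pick the row, convert to a dict, drop the name.
row = models_df.loc[models_df["model_name"] == "Llama3-OpenBioLLM-70B"]
default_args = row.iloc[0].to_dict()
default_args.pop("model_name")
print(default_args["num_gpus"], default_args["max_model_len"])  # 4 8192
```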
vec_inf/multinode_vllm.slurm
CHANGED
@@ -64,7 +64,7 @@ for ((i = 1; i <= worker_num; i++)); do
     ray start --address "$ip_head" \
         --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
     fi
-
+
     sleep 5
 done
 
@@ -83,7 +83,7 @@ else
 fi
 
 # Activate vllm venv
-if [ "$VENV_BASE" = "singularity" ]; then
+if [ "$VENV_BASE" = "singularity" ]; then
     singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
     python3.10 -m vllm.entrypoints.openai.api_server \
     --model ${VLLM_MODEL_WEIGHTS} \
@@ -93,7 +93,6 @@ if [ "$VENV_BASE" = "singularity" ]; then
     --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --dtype ${VLLM_DATA_TYPE} \
-    --load-format safetensors \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --max-model-len ${VLLM_MAX_MODEL_LEN}
@@ -107,8 +106,7 @@ else
     --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --dtype ${VLLM_DATA_TYPE} \
-    --load-format safetensors \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --max-model-len ${VLLM_MAX_MODEL_LEN}
-fi
+fi
vec_inf/vllm.slurm
CHANGED
@@ -41,7 +41,7 @@ else
     --port ${vllm_port_number} \
     --tensor-parallel-size ${NUM_GPUS} \
     --dtype ${VLLM_DATA_TYPE} \
-    --max-logprobs ${VLLM_MAX_LOGPROBS} \
+    --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --trust-remote-code \
     --max-model-len ${VLLM_MAX_MODEL_LEN}
-fi
+fi
{vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vec-inf
-Version: 0.3.1
+Version: 0.3.3
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 License: MIT
 Author: Marshall Wang
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: dev
 Requires-Dist: click (>=8.1.0,<9.0.0)
 Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
+Requires-Dist: pandas (>=2.2.2,<3.0.0)
 Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: rich (>=13.7.0,<14.0.0)
@@ -22,7 +23,7 @@ Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
 Description-Content-Type: text/markdown
 
 # Vector Inference: Easy inference on Slurm clusters
-This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
+This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
 
 ## Installation
 If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -40,7 +41,7 @@ You should see an output like the following:
 
 <img width="400" alt="launch_img" src="https://github.com/user-attachments/assets/557eb421-47db-4810-bccd-c49c526b1b43">
 
-The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
+The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
 
 You can check the inference server status by providing the Slurm job ID to the `status` command:
 ```bash
@@ -55,7 +56,7 @@ There are 5 possible states:
 
 * **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
 * **LAUNCHING**: Job is running but the server is not ready yet.
-* **READY**: Inference server running and ready to take requests.
+* **READY**: Inference server running and ready to take requests.
 * **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
 * **SHUTDOWN**: Inference server is shutdown/cancelled.
 
@@ -84,7 +85,7 @@ vec-inf list Meta-Llama-3.1-70B-Instruct
 
 ## Send inference requests
 Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
-> {"id":"cmpl-
+> {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
 
 **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
 
vec_inf-0.3.3.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
+vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
+vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
+vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
+vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
+vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
+vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
+vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
+vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
+vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
+vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
+vec_inf-0.3.3.dist-info/RECORD,,
vec_inf-0.3.1.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-vec_inf/README.md,sha256=jtvslzw1MjTFFIXwzlrb0NstUyTEDL0S_k27K5bLl34,499
-vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vec_inf/cli/_cli.py,sha256=8UHNFitbmq1OTNO1cLM_LVuHFndnvNyQSezGs1oT3tc,8346
-vec_inf/cli/_utils.py,sha256=2Grz-bX_mGjzxXUBdrX7MbNfXUM7JQ3399GKe-N74FE,3910
-vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
-vec_inf/launch_server.sh,sha256=BW5oK_10OjfHXhIsdf9vPsEBlCXh8j2lOV7qvSlPcZU,3998
-vec_inf/models/README.md,sha256=y_Cr1ZAkqIw1vIEOZMEp4FsyLGVijDoIoqwxn6aeQwo,8138
-vec_inf/models/models.csv,sha256=JFGMhT9o7Pf0tkY-w2GRQG5MxdYK2V5T8s6bk166MpM,4720
-vec_inf/multinode_vllm.slurm,sha256=pedYWIzPN-BKtL6ezoZSKJ3DO7RduDyAR4_cxZD4KyY,3938
-vec_inf/vllm.slurm,sha256=6Nx14qyAwHlbweCbFMUcMV2jaZSv41ghkyx2MiHJY8Y,1608
-vec_inf-0.3.1.dist-info/METADATA,sha256=xRhpXmFmMv5A77xdJaKBo_m7UXC13CkBmzegnQzQnPg,5701
-vec_inf-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-vec_inf-0.3.1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
-vec_inf-0.3.1.dist-info/RECORD,,

{vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/WHEEL
File without changes

{vec_inf-0.3.1.dist-info → vec_inf-0.3.3.dist-info}/entry_points.txt
File without changes