vec-inf 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +3 -3
- vec_inf/cli/_cli.py +239 -104
- vec_inf/cli/_helper.py +289 -564
- vec_inf/cli/_utils.py +26 -150
- vec_inf/cli/_vars.py +32 -0
- vec_inf/client/__init__.py +31 -0
- vec_inf/client/_client_vars.py +231 -0
- vec_inf/client/_exceptions.py +37 -0
- vec_inf/client/_helper.py +661 -0
- vec_inf/client/_slurm_script_generator.py +178 -0
- vec_inf/client/_utils.py +287 -0
- vec_inf/client/api.py +302 -0
- vec_inf/client/config.py +138 -0
- vec_inf/client/models.py +234 -0
- vec_inf/client/slurm_vars.py +49 -0
- vec_inf/config/README.md +0 -12
- vec_inf/config/models.yaml +410 -391
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/METADATA +52 -63
- vec_inf-0.6.1.dist-info/RECORD +25 -0
- vec_inf/cli/_config.py +0 -87
- vec_inf/multinode_vllm.slurm +0 -154
- vec_inf/vllm.slurm +0 -90
- vec_inf-0.5.0.dist-info/RECORD +0 -17
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/WHEEL +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/licenses/LICENSE +0 -0
vec_inf/README.md
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# `vec-inf` Commands
|
|
2
2
|
|
|
3
|
-
* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
|
|
4
|
-
* `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
|
|
5
|
-
* `metrics`: Streams performance metrics to the console.
|
|
3
|
+
* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
|
|
6
4
|
* `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
|
|
5
|
+
* `metrics`: Streams performance metrics to the console.
|
|
7
6
|
* `shutdown`: Shutdown a model by providing its Slurm job ID.
|
|
7
|
+
* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
|
|
8
8
|
|
|
9
9
|
Use `--help` to see all available options
|
vec_inf/cli/_cli.py
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
|
-
"""Command line interface for Vector Inference.
|
|
1
|
+
"""Command line interface for Vector Inference.
|
|
2
2
|
|
|
3
|
+
This module provides the command-line interface for interacting with Vector
|
|
4
|
+
Inference services, including model launching, status checking, metrics
|
|
5
|
+
monitoring, and shutdown operations.
|
|
6
|
+
|
|
7
|
+
Commands
|
|
8
|
+
--------
|
|
9
|
+
launch
|
|
10
|
+
Launch a model on the cluster
|
|
11
|
+
status
|
|
12
|
+
Check the status of a running model
|
|
13
|
+
shutdown
|
|
14
|
+
Stop a running model
|
|
15
|
+
list
|
|
16
|
+
List available models or get specific model configuration
|
|
17
|
+
metrics
|
|
18
|
+
Stream real-time performance metrics
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import json
|
|
3
22
|
import time
|
|
4
23
|
from typing import Optional, Union
|
|
5
24
|
|
|
@@ -7,8 +26,13 @@ import click
|
|
|
7
26
|
from rich.console import Console
|
|
8
27
|
from rich.live import Live
|
|
9
28
|
|
|
10
|
-
|
|
11
|
-
|
|
29
|
+
from vec_inf.cli._helper import (
|
|
30
|
+
LaunchResponseFormatter,
|
|
31
|
+
ListCmdDisplay,
|
|
32
|
+
MetricsResponseFormatter,
|
|
33
|
+
StatusResponseFormatter,
|
|
34
|
+
)
|
|
35
|
+
from vec_inf.client import LaunchOptions, VecInfClient
|
|
12
36
|
|
|
13
37
|
|
|
14
38
|
CONSOLE = Console()
|
|
@@ -24,36 +48,6 @@ def cli() -> None:
|
|
|
24
48
|
@click.argument("model-name", type=str, nargs=1)
|
|
25
49
|
@click.option("--model-family", type=str, help="The model family")
|
|
26
50
|
@click.option("--model-variant", type=str, help="The model variant")
|
|
27
|
-
@click.option(
|
|
28
|
-
"--max-model-len",
|
|
29
|
-
type=int,
|
|
30
|
-
help="Model context length. Default value set based on suggested resource allocation.",
|
|
31
|
-
)
|
|
32
|
-
@click.option(
|
|
33
|
-
"--max-num-seqs",
|
|
34
|
-
type=int,
|
|
35
|
-
help="Maximum number of sequences to process in a single request",
|
|
36
|
-
)
|
|
37
|
-
@click.option(
|
|
38
|
-
"--gpu-memory-utilization",
|
|
39
|
-
type=float,
|
|
40
|
-
help="GPU memory utilization, default to 0.9",
|
|
41
|
-
)
|
|
42
|
-
@click.option(
|
|
43
|
-
"--enable-prefix-caching",
|
|
44
|
-
is_flag=True,
|
|
45
|
-
help="Enables automatic prefix caching",
|
|
46
|
-
)
|
|
47
|
-
@click.option(
|
|
48
|
-
"--enable-chunked-prefill",
|
|
49
|
-
is_flag=True,
|
|
50
|
-
help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
|
|
51
|
-
)
|
|
52
|
-
@click.option(
|
|
53
|
-
"--max-num-batched-tokens",
|
|
54
|
-
type=int,
|
|
55
|
-
help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
|
|
56
|
-
)
|
|
57
51
|
@click.option(
|
|
58
52
|
"--partition",
|
|
59
53
|
type=str,
|
|
@@ -69,22 +63,36 @@ def cli() -> None:
|
|
|
69
63
|
type=int,
|
|
70
64
|
help="Number of GPUs/node to use, default to suggested resource allocation for model",
|
|
71
65
|
)
|
|
66
|
+
@click.option(
|
|
67
|
+
"--account",
|
|
68
|
+
type=str,
|
|
69
|
+
help="Charge resources used by this job to specified account.",
|
|
70
|
+
)
|
|
72
71
|
@click.option(
|
|
73
72
|
"--qos",
|
|
74
73
|
type=str,
|
|
75
74
|
help="Quality of service",
|
|
76
75
|
)
|
|
77
76
|
@click.option(
|
|
78
|
-
"--
|
|
77
|
+
"--exclude",
|
|
79
78
|
type=str,
|
|
80
|
-
help="
|
|
79
|
+
help="Exclude certain nodes from the resources granted to the job",
|
|
81
80
|
)
|
|
82
81
|
@click.option(
|
|
83
|
-
"--
|
|
84
|
-
type=
|
|
85
|
-
help="
|
|
82
|
+
"--node-list",
|
|
83
|
+
type=str,
|
|
84
|
+
help="Request a specific list of nodes for deployment",
|
|
85
|
+
)
|
|
86
|
+
@click.option(
|
|
87
|
+
"--bind",
|
|
88
|
+
type=str,
|
|
89
|
+
help="Additional binds for the singularity container as a comma separated list of bind paths",
|
|
90
|
+
)
|
|
91
|
+
@click.option(
|
|
92
|
+
"--time",
|
|
93
|
+
type=str,
|
|
94
|
+
help="Time limit for job, this should comply with QoS limits",
|
|
86
95
|
)
|
|
87
|
-
@click.option("--data-type", type=str, help="Model data type")
|
|
88
96
|
@click.option(
|
|
89
97
|
"--venv",
|
|
90
98
|
type=str,
|
|
@@ -101,19 +109,9 @@ def cli() -> None:
|
|
|
101
109
|
help="Path to parent directory containing model weights",
|
|
102
110
|
)
|
|
103
111
|
@click.option(
|
|
104
|
-
"--
|
|
105
|
-
|
|
106
|
-
help="
|
|
107
|
-
)
|
|
108
|
-
@click.option(
|
|
109
|
-
"--compilation-config",
|
|
110
|
-
type=click.Choice(["0", "3"]),
|
|
111
|
-
help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
|
|
112
|
-
)
|
|
113
|
-
@click.option(
|
|
114
|
-
"--enforce-eager",
|
|
115
|
-
is_flag=True,
|
|
116
|
-
help="Always use eager-mode PyTorch",
|
|
112
|
+
"--vllm-args",
|
|
113
|
+
type=str,
|
|
114
|
+
help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
|
|
117
115
|
)
|
|
118
116
|
@click.option(
|
|
119
117
|
"--json-mode",
|
|
@@ -122,18 +120,73 @@ def cli() -> None:
|
|
|
122
120
|
)
|
|
123
121
|
def launch(
|
|
124
122
|
model_name: str,
|
|
125
|
-
**cli_kwargs: Optional[Union[str, int, bool]],
|
|
123
|
+
**cli_kwargs: Optional[Union[str, int, float, bool]],
|
|
126
124
|
) -> None:
|
|
127
|
-
"""Launch a model on the cluster.
|
|
125
|
+
"""Launch a model on the cluster.
|
|
126
|
+
|
|
127
|
+
Parameters
|
|
128
|
+
----------
|
|
129
|
+
model_name : str
|
|
130
|
+
Name of the model to launch
|
|
131
|
+
**cli_kwargs : dict
|
|
132
|
+
Additional launch options including:
|
|
133
|
+
- model_family : str, optional
|
|
134
|
+
Family/architecture of the model
|
|
135
|
+
- model_variant : str, optional
|
|
136
|
+
Specific variant of the model
|
|
137
|
+
- partition : str, optional
|
|
138
|
+
Type of compute partition
|
|
139
|
+
- num_nodes : int, optional
|
|
140
|
+
Number of nodes to use
|
|
141
|
+
- gpus_per_node : int, optional
|
|
142
|
+
Number of GPUs per node
|
|
143
|
+
- account : str, optional
|
|
144
|
+
Charge resources used by this job to specified account
|
|
145
|
+
- qos : str, optional
|
|
146
|
+
Quality of service tier
|
|
147
|
+
- exclude : str, optional
|
|
148
|
+
Exclude certain nodes from the resources granted to the job
|
|
149
|
+
- node_list : str, optional
|
|
150
|
+
Request a specific list of nodes for deployment
|
|
151
|
+
- bind : str, optional
|
|
152
|
+
Additional binds for the singularity container
|
|
153
|
+
- time : str, optional
|
|
154
|
+
Time limit for job
|
|
155
|
+
- venv : str, optional
|
|
156
|
+
Path to virtual environment
|
|
157
|
+
- log_dir : str, optional
|
|
158
|
+
Path to SLURM log directory
|
|
159
|
+
- model_weights_parent_dir : str, optional
|
|
160
|
+
Path to model weights directory
|
|
161
|
+
- vllm_args : str, optional
|
|
162
|
+
vLLM engine arguments
|
|
163
|
+
- json_mode : bool, optional
|
|
164
|
+
Output in JSON format
|
|
165
|
+
|
|
166
|
+
Raises
|
|
167
|
+
------
|
|
168
|
+
click.ClickException
|
|
169
|
+
If launch fails for any reason
|
|
170
|
+
"""
|
|
128
171
|
try:
|
|
129
|
-
|
|
172
|
+
# Convert cli_kwargs to LaunchOptions
|
|
173
|
+
json_mode = cli_kwargs["json_mode"]
|
|
174
|
+
del cli_kwargs["json_mode"]
|
|
175
|
+
|
|
176
|
+
launch_options = LaunchOptions(**cli_kwargs) # type: ignore
|
|
177
|
+
|
|
178
|
+
# Start the client and launch model inference server
|
|
179
|
+
client = VecInfClient()
|
|
180
|
+
launch_response = client.launch_model(model_name, launch_options)
|
|
130
181
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if
|
|
135
|
-
|
|
136
|
-
|
|
182
|
+
# Display launch information
|
|
183
|
+
launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
|
|
184
|
+
|
|
185
|
+
if json_mode:
|
|
186
|
+
click.echo(json.dumps(launch_response.config))
|
|
187
|
+
else:
|
|
188
|
+
launch_info_table = launch_formatter.format_table_output()
|
|
189
|
+
CONSOLE.print(launch_info_table)
|
|
137
190
|
|
|
138
191
|
except click.ClickException as e:
|
|
139
192
|
raise e
|
|
@@ -156,28 +209,61 @@ def launch(
|
|
|
156
209
|
def status(
|
|
157
210
|
slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
|
|
158
211
|
) -> None:
|
|
159
|
-
"""Get the status of a running model on the cluster.
|
|
160
|
-
status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
|
|
161
|
-
output, stderr = utils.run_bash_command(status_cmd)
|
|
162
|
-
if stderr:
|
|
163
|
-
raise click.ClickException(f"Error: {stderr}")
|
|
212
|
+
"""Get the status of a running model on the cluster.
|
|
164
213
|
|
|
165
|
-
|
|
214
|
+
Parameters
|
|
215
|
+
----------
|
|
216
|
+
slurm_job_id : int
|
|
217
|
+
ID of the SLURM job to check
|
|
218
|
+
log_dir : str, optional
|
|
219
|
+
Path to SLURM log directory
|
|
220
|
+
json_mode : bool, default=False
|
|
221
|
+
Whether to output in JSON format
|
|
166
222
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
223
|
+
Raises
|
|
224
|
+
------
|
|
225
|
+
click.ClickException
|
|
226
|
+
If status check fails
|
|
227
|
+
"""
|
|
228
|
+
try:
|
|
229
|
+
# Start the client and get model inference server status
|
|
230
|
+
client = VecInfClient()
|
|
231
|
+
status_response = client.get_status(slurm_job_id, log_dir)
|
|
232
|
+
# Display status information
|
|
233
|
+
status_formatter = StatusResponseFormatter(status_response)
|
|
234
|
+
if json_mode:
|
|
235
|
+
status_formatter.output_json()
|
|
236
|
+
else:
|
|
237
|
+
status_info_table = status_formatter.output_table()
|
|
238
|
+
CONSOLE.print(status_info_table)
|
|
239
|
+
|
|
240
|
+
except click.ClickException as e:
|
|
241
|
+
raise e
|
|
242
|
+
except Exception as e:
|
|
243
|
+
raise click.ClickException(f"Status check failed: {str(e)}") from e
|
|
172
244
|
|
|
173
245
|
|
|
174
246
|
@cli.command("shutdown")
|
|
175
247
|
@click.argument("slurm_job_id", type=int, nargs=1)
|
|
176
248
|
def shutdown(slurm_job_id: int) -> None:
|
|
177
|
-
"""Shutdown a running model on the cluster.
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
249
|
+
"""Shutdown a running model on the cluster.
|
|
250
|
+
|
|
251
|
+
Parameters
|
|
252
|
+
----------
|
|
253
|
+
slurm_job_id : int
|
|
254
|
+
ID of the SLURM job to shut down
|
|
255
|
+
|
|
256
|
+
Raises
|
|
257
|
+
------
|
|
258
|
+
click.ClickException
|
|
259
|
+
If shutdown operation fails
|
|
260
|
+
"""
|
|
261
|
+
try:
|
|
262
|
+
client = VecInfClient()
|
|
263
|
+
client.shutdown_model(slurm_job_id)
|
|
264
|
+
click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
|
|
265
|
+
except Exception as e:
|
|
266
|
+
raise click.ClickException(f"Shutdown failed: {str(e)}") from e
|
|
181
267
|
|
|
182
268
|
|
|
183
269
|
@cli.command("list")
|
|
@@ -188,9 +274,34 @@ def shutdown(slurm_job_id: int) -> None:
|
|
|
188
274
|
help="Output in JSON string",
|
|
189
275
|
)
|
|
190
276
|
def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
|
|
191
|
-
"""List all available models, or get default setup of a specific model.
|
|
192
|
-
|
|
193
|
-
|
|
277
|
+
"""List all available models, or get default setup of a specific model.
|
|
278
|
+
|
|
279
|
+
Parameters
|
|
280
|
+
----------
|
|
281
|
+
model_name : str, optional
|
|
282
|
+
Name of specific model to get information for
|
|
283
|
+
json_mode : bool, default=False
|
|
284
|
+
Whether to output in JSON format
|
|
285
|
+
|
|
286
|
+
Raises
|
|
287
|
+
------
|
|
288
|
+
click.ClickException
|
|
289
|
+
If list operation fails
|
|
290
|
+
"""
|
|
291
|
+
try:
|
|
292
|
+
# Start the client
|
|
293
|
+
client = VecInfClient()
|
|
294
|
+
list_display = ListCmdDisplay(CONSOLE, json_mode)
|
|
295
|
+
if model_name:
|
|
296
|
+
model_config = client.get_model_config(model_name)
|
|
297
|
+
list_display.display_single_model_output(model_config)
|
|
298
|
+
else:
|
|
299
|
+
model_infos = client.list_models()
|
|
300
|
+
list_display.display_all_models_output(model_infos)
|
|
301
|
+
except click.ClickException as e:
|
|
302
|
+
raise e
|
|
303
|
+
except Exception as e:
|
|
304
|
+
raise click.ClickException(f"List models failed: {str(e)}") from e
|
|
194
305
|
|
|
195
306
|
|
|
196
307
|
@cli.command("metrics")
|
|
@@ -199,31 +310,55 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
|
|
|
199
310
|
"--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
|
|
200
311
|
)
|
|
201
312
|
def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
|
|
202
|
-
"""Stream real-time performance metrics from the model endpoint.
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
313
|
+
"""Stream real-time performance metrics from the model endpoint.
|
|
314
|
+
|
|
315
|
+
Parameters
|
|
316
|
+
----------
|
|
317
|
+
slurm_job_id : int
|
|
318
|
+
ID of the SLURM job to monitor
|
|
319
|
+
log_dir : str, optional
|
|
320
|
+
Path to SLURM log directory
|
|
321
|
+
|
|
322
|
+
Raises
|
|
323
|
+
------
|
|
324
|
+
click.ClickException
|
|
325
|
+
If metrics collection fails
|
|
326
|
+
|
|
327
|
+
Notes
|
|
328
|
+
-----
|
|
329
|
+
This command continuously streams metrics with a 2-second refresh interval
|
|
330
|
+
until interrupted. If metrics are not available, it will display status
|
|
331
|
+
information instead.
|
|
332
|
+
"""
|
|
333
|
+
try:
|
|
334
|
+
# Start the client and get inference server metrics
|
|
335
|
+
client = VecInfClient()
|
|
336
|
+
metrics_response = client.get_metrics(slurm_job_id, log_dir)
|
|
337
|
+
metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
|
|
338
|
+
|
|
339
|
+
# Check if metrics response is ready
|
|
340
|
+
if isinstance(metrics_response.metrics, str):
|
|
341
|
+
metrics_formatter.format_failed_metrics(metrics_response.metrics)
|
|
342
|
+
CONSOLE.print(metrics_formatter.table)
|
|
343
|
+
return
|
|
344
|
+
|
|
345
|
+
with Live(refresh_per_second=1, console=CONSOLE) as live:
|
|
346
|
+
while True:
|
|
347
|
+
metrics_response = client.get_metrics(slurm_job_id, log_dir)
|
|
348
|
+
metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
|
|
349
|
+
|
|
350
|
+
if isinstance(metrics_response.metrics, str):
|
|
351
|
+
# Show status information if metrics aren't available
|
|
352
|
+
metrics_formatter.format_failed_metrics(metrics_response.metrics)
|
|
353
|
+
else:
|
|
354
|
+
metrics_formatter.format_metrics()
|
|
355
|
+
|
|
356
|
+
live.update(metrics_formatter.table)
|
|
357
|
+
time.sleep(2)
|
|
358
|
+
except click.ClickException as e:
|
|
359
|
+
raise e
|
|
360
|
+
except Exception as e:
|
|
361
|
+
raise click.ClickException(f"Metrics check failed: {str(e)}") from e
|
|
227
362
|
|
|
228
363
|
|
|
229
364
|
if __name__ == "__main__":
|