vec_inf-0.5.0-py3-none-any.whl → vec_inf-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +3 -3
- vec_inf/cli/_cli.py +214 -104
- vec_inf/cli/_helper.py +289 -564
- vec_inf/cli/_utils.py +26 -150
- vec_inf/cli/_vars.py +32 -0
- vec_inf/client/__init__.py +31 -0
- vec_inf/client/_client_vars.py +213 -0
- vec_inf/client/_exceptions.py +37 -0
- vec_inf/client/_helper.py +674 -0
- vec_inf/client/_slurm_script_generator.py +179 -0
- vec_inf/client/_utils.py +287 -0
- vec_inf/client/api.py +302 -0
- vec_inf/client/config.py +128 -0
- vec_inf/client/models.py +225 -0
- vec_inf/client/slurm_vars.py +49 -0
- vec_inf/config/README.md +0 -12
- vec_inf/config/models.yaml +417 -391
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/METADATA +44 -61
- vec_inf-0.6.0.dist-info/RECORD +25 -0
- vec_inf/cli/_config.py +0 -87
- vec_inf/multinode_vllm.slurm +0 -154
- vec_inf/vllm.slurm +0 -90
- vec_inf-0.5.0.dist-info/RECORD +0 -17
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/licenses/LICENSE +0 -0
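The headline change in 0.6.0 is the new `vec_inf/client` package: the Slurm and vLLM plumbing moves out of `vec_inf/cli` (note the deleted `_config.py`, `vllm.slurm`, and `multinode_vllm.slurm`) into a programmatic API, and the CLI becomes a thin presentation layer over it. A minimal sketch of the client lifecycle as the rewritten CLI below uses it; the method names come straight from the diff, while the exact signatures and response fields live in `vec_inf/client/api.py` and `vec_inf/client/models.py`, which this diff view does not expand:

```python
from vec_inf.client import LaunchOptions, VecInfClient

client = VecInfClient()

# Launch an inference server (placeholder model name, default options).
response = client.launch_model("Meta-Llama-3.1-8B-Instruct", LaunchOptions())
print(response.config)

# Check on the job, then shut it down. The slurm_job_id attribute on the
# launch response is an assumption; only the id-taking get_status /
# shutdown_model calls are visible in the diff below.
job_id = response.slurm_job_id
print(client.get_status(job_id))
client.shutdown_model(job_id)
```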
vec_inf/README.md (CHANGED)

```diff
@@ -1,9 +1,9 @@
 # `vec-inf` Commands
 
-* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
-* `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
-* `metrics`: Streams performance metrics to the console.
+* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
 * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
 
 Use `--help` to see all available options
```
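All five commands are ordinary `click` commands registered on the `cli` group in `vec_inf/cli/_cli.py`, so they can also be driven from Python (for example in tests) with click's `CliRunner`. A small sketch:

```python
from click.testing import CliRunner

from vec_inf.cli._cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["list", "--json-mode"])
print(result.exit_code)
print(result.output)
```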
vec_inf/cli/_cli.py (CHANGED)

```diff
@@ -1,4 +1,22 @@
-"""Command line interface for Vector Inference."""
+"""Command line interface for Vector Inference.
+
+This module provides the command-line interface for interacting with Vector
+Inference services, including model launching, status checking, metrics
+monitoring, and shutdown operations.
+
+Commands
+--------
+launch
+    Launch a model on the cluster
+status
+    Check the status of a running model
+shutdown
+    Stop a running model
+list
+    List available models or get specific model configuration
+metrics
+    Stream real-time performance metrics
+"""
 
 import time
 from typing import Optional, Union
```
```diff
@@ -7,8 +25,13 @@ import click
 from rich.console import Console
 from rich.live import Live
 
-
-
+from vec_inf.cli._helper import (
+    LaunchResponseFormatter,
+    ListCmdDisplay,
+    MetricsResponseFormatter,
+    StatusResponseFormatter,
+)
+from vec_inf.client import LaunchOptions, VecInfClient
 
 
 CONSOLE = Console()
```
```diff
@@ -24,36 +47,6 @@ def cli() -> None:
 @click.argument("model-name", type=str, nargs=1)
 @click.option("--model-family", type=str, help="The model family")
 @click.option("--model-variant", type=str, help="The model variant")
-@click.option(
-    "--max-model-len",
-    type=int,
-    help="Model context length. Default value set based on suggested resource allocation.",
-)
-@click.option(
-    "--max-num-seqs",
-    type=int,
-    help="Maximum number of sequences to process in a single request",
-)
-@click.option(
-    "--gpu-memory-utilization",
-    type=float,
-    help="GPU memory utilization, default to 0.9",
-)
-@click.option(
-    "--enable-prefix-caching",
-    is_flag=True,
-    help="Enables automatic prefix caching",
-)
-@click.option(
-    "--enable-chunked-prefill",
-    is_flag=True,
-    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
-)
-@click.option(
-    "--max-num-batched-tokens",
-    type=int,
-    help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
-)
 @click.option(
     "--partition",
     type=str,
```
```diff
@@ -69,6 +62,11 @@ def cli() -> None:
     type=int,
     help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
+@click.option(
+    "--account",
+    type=str,
+    help="Charge resources used by this job to specified account.",
+)
 @click.option(
     "--qos",
     type=str,
```
```diff
@@ -79,12 +77,6 @@
     type=str,
     help="Time limit for job, this should comply with QoS limits",
 )
-@click.option(
-    "--vocab-size",
-    type=int,
-    help="Vocabulary size, this option is intended for custom models",
-)
-@click.option("--data-type", type=str, help="Model data type")
 @click.option(
     "--venv",
     type=str,
```
```diff
@@ -101,19 +93,9 @@
     help="Path to parent directory containing model weights",
 )
 @click.option(
-    "--
-
-    help="
-)
-@click.option(
-    "--compilation-config",
-    type=click.Choice(["0", "3"]),
-    help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
-)
-@click.option(
-    "--enforce-eager",
-    is_flag=True,
-    help="Always use eager-mode PyTorch",
+    "--vllm-args",
+    type=str,
+    help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
 )
 @click.option(
     "--json-mode",
```
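This hunk is the heart of the CLI change: the per-engine flags (`--max-model-len`, `--max-num-seqs`, `--gpu-memory-utilization`, the prefix caching and chunked prefill switches, `--vocab-size`, `--data-type`, `--compilation-config`, `--enforce-eager`) are all gone, replaced by a single pass-through `--vllm-args` string of comma-separated vLLM arguments. A rough sketch of how such a string could be split into key/value pairs and bare flags; this is illustrative only, the real parsing lives in the new `vec_inf.client` helpers, which are not expanded in this diff:

```python
from typing import Dict, Union


def parse_vllm_args(raw: str) -> Dict[str, Union[str, bool]]:
    """Illustrative parser for the --vllm-args format (not vec-inf's actual code)."""
    parsed: Dict[str, Union[str, bool]] = {}
    for token in raw.split(","):
        token = token.strip().lstrip("-")
        if "=" in token:
            key, value = token.split("=", 1)
            parsed[key] = value
        else:
            parsed[token] = True  # bare flag, e.g. enable-prefix-caching
    return parsed


print(parse_vllm_args("--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching"))
# {'max-model-len': '8192', 'max-num-seqs': '256', 'enable-prefix-caching': True}
```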
```diff
@@ -122,18 +104,64 @@
 )
 def launch(
     model_name: str,
-    **cli_kwargs: Optional[Union[str, int, bool]],
+    **cli_kwargs: Optional[Union[str, int, float, bool]],
 ) -> None:
-    """Launch a model on the cluster.
+    """Launch a model on the cluster.
+
+    Parameters
+    ----------
+    model_name : str
+        Name of the model to launch
+    **cli_kwargs : dict
+        Additional launch options including:
+        - model_family : str, optional
+            Family/architecture of the model
+        - model_variant : str, optional
+            Specific variant of the model
+        - partition : str, optional
+            Type of compute partition
+        - num_nodes : int, optional
+            Number of nodes to use
+        - gpus_per_node : int, optional
+            Number of GPUs per node
+        - qos : str, optional
+            Quality of service tier
+        - time : str, optional
+            Time limit for job
+        - venv : str, optional
+            Path to virtual environment
+        - log_dir : str, optional
+            Path to SLURM log directory
+        - model_weights_parent_dir : str, optional
+            Path to model weights directory
+        - vllm_args : str, optional
+            vLLM engine arguments
+        - json_mode : bool, optional
+            Output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If launch fails for any reason
+    """
     try:
-
+        # Convert cli_kwargs to LaunchOptions
+        json_mode = cli_kwargs["json_mode"]
+        del cli_kwargs["json_mode"]
+
+        launch_options = LaunchOptions(**cli_kwargs)  # type: ignore
 
-
-
-
-
-
-
+        # Start the client and launch model inference server
+        client = VecInfClient()
+        launch_response = client.launch_model(model_name, launch_options)
+
+        # Display launch information
+        launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
+        if json_mode:
+            click.echo(launch_response.config)
+        else:
+            launch_info_table = launch_formatter.format_table_output()
+            CONSOLE.print(launch_info_table)
 
     except click.ClickException as e:
         raise e
```
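`launch` is now a thin wrapper: it pops `json_mode` out of the CLI kwargs, forwards everything else into a `LaunchOptions`, and delegates the actual work to `VecInfClient.launch_model`. The same conversion step outside of click, with keys taken from the docstring above and placeholder values:

```python
from vec_inf.client import LaunchOptions, VecInfClient

# Mirror of the CLI's conversion step; every key except json_mode
# is a LaunchOptions field (placeholder values throughout).
cli_kwargs = {
    "json_mode": False,
    "model_family": "llama",
    "partition": "a100",
    "num_nodes": 1,
    "gpus_per_node": 4,
    "time": "08:00:00",
    "vllm_args": "--max-model-len=8192,--enable-prefix-caching",
}
json_mode = cli_kwargs.pop("json_mode")
launch_options = LaunchOptions(**cli_kwargs)

# Placeholder model name.
launch_response = VecInfClient().launch_model("Meta-Llama-3.1-8B-Instruct", launch_options)
print(launch_response.config)  # what the CLI echoes in --json-mode
```

(`dict.pop` folds the diff's two-line `json_mode = ...` / `del ...` into one call with the same behavior.)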
```diff
@@ -156,28 +184,61 @@ def launch(
 def status(
     slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
 ) -> None:
-    """Get the status of a running model on the cluster.
-
-
-
-
+    """Get the status of a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to check
+    log_dir : str, optional
+        Path to SLURM log directory
+    json_mode : bool, default=False
+        Whether to output in JSON format
 
-
+    Raises
+    ------
+    click.ClickException
+        If status check fails
+    """
+    try:
+        # Start the client and get model inference server status
+        client = VecInfClient()
+        status_response = client.get_status(slurm_job_id, log_dir)
+        # Display status information
+        status_formatter = StatusResponseFormatter(status_response)
+        if json_mode:
+            status_formatter.output_json()
+        else:
+            status_info_table = status_formatter.output_table()
+            CONSOLE.print(status_info_table)
 
-
-
-
-
-        status_helper.output_table(CONSOLE)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Status check failed: {str(e)}") from e
 
 
 @cli.command("shutdown")
 @click.argument("slurm_job_id", type=int, nargs=1)
 def shutdown(slurm_job_id: int) -> None:
-    """Shutdown a running model on the cluster.
-
-
-
+    """Shutdown a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to shut down
+
+    Raises
+    ------
+    click.ClickException
+        If shutdown operation fails
+    """
+    try:
+        client = VecInfClient()
+        client.shutdown_model(slurm_job_id)
+        click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+    except Exception as e:
+        raise click.ClickException(f"Shutdown failed: {str(e)}") from e
 
 
 @cli.command("list")
```
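`status` and `shutdown` follow the same shape: construct a `VecInfClient`, call the matching method, render, and wrap any non-Click failure in a `click.ClickException`. The data side without the formatting layer (the job id is a placeholder, and `StatusResponse`'s fields are defined in `vec_inf/client/models.py`, which is not expanded in this diff):

```python
from vec_inf.client import VecInfClient

client = VecInfClient()
job_id = 1234567  # placeholder Slurm job id

status_response = client.get_status(job_id)
print(status_response)  # StatusResponse; field names live in vec_inf/client/models.py

client.shutdown_model(job_id)
```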
```diff
@@ -188,9 +249,34 @@ def shutdown(slurm_job_id: int) -> None:
     help="Output in JSON string",
 )
 def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
-    """List all available models, or get default setup of a specific model.
-
-
+    """List all available models, or get default setup of a specific model.
+
+    Parameters
+    ----------
+    model_name : str, optional
+        Name of specific model to get information for
+    json_mode : bool, default=False
+        Whether to output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If list operation fails
+    """
+    try:
+        # Start the client
+        client = VecInfClient()
+        list_display = ListCmdDisplay(CONSOLE, json_mode)
+        if model_name:
+            model_config = client.get_model_config(model_name)
+            list_display.display_single_model_output(model_config)
+        else:
+            model_infos = client.list_models()
+            list_display.display_all_models_output(model_infos)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"List models failed: {str(e)}") from e
 
 
 @cli.command("metrics")
```
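`list` splits cleanly into data and presentation: `VecInfClient.list_models()` returns one info object per model, `get_model_config(name)` returns a single model's configuration, and `ListCmdDisplay` only does the rendering. The data side on its own:

```python
from vec_inf.client import VecInfClient

client = VecInfClient()

# All known models (backed by vec_inf/config/models.yaml).
for model_info in client.list_models():
    print(model_info)

# Default/cached configuration for one model (placeholder name).
model_config = client.get_model_config("Meta-Llama-3.1-8B-Instruct")
print(model_config)
```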
```diff
@@ -199,31 +285,55 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
     "--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
 )
 def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
-    """Stream real-time performance metrics from the model endpoint.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Stream real-time performance metrics from the model endpoint.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to monitor
+    log_dir : str, optional
+        Path to SLURM log directory
+
+    Raises
+    ------
+    click.ClickException
+        If metrics collection fails
+
+    Notes
+    -----
+    This command continuously streams metrics with a 2-second refresh interval
+    until interrupted. If metrics are not available, it will display status
+    information instead.
+    """
+    try:
+        # Start the client and get inference server metrics
+        client = VecInfClient()
+        metrics_response = client.get_metrics(slurm_job_id, log_dir)
+        metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+        # Check if metrics response is ready
+        if isinstance(metrics_response.metrics, str):
+            metrics_formatter.format_failed_metrics(metrics_response.metrics)
+            CONSOLE.print(metrics_formatter.table)
+            return
+
+        with Live(refresh_per_second=1, console=CONSOLE) as live:
+            while True:
+                metrics_response = client.get_metrics(slurm_job_id, log_dir)
+                metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+                if isinstance(metrics_response.metrics, str):
+                    # Show status information if metrics aren't available
+                    metrics_formatter.format_failed_metrics(metrics_response.metrics)
+                else:
+                    metrics_formatter.format_metrics()
+
+                live.update(metrics_formatter.table)
+                time.sleep(2)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Metrics check failed: {str(e)}") from e
 
 
 if __name__ == "__main__":
```
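The metrics loop above doubles as documentation of the API contract: `get_metrics(...).metrics` is a plain string when metrics are not available (the error or status text) and structured metrics otherwise, which is why both branches test `isinstance(..., str)`. The same polling loop without the Rich rendering, as a headless sketch:

```python
import time

from vec_inf.client import VecInfClient

client = VecInfClient()
job_id = 1234567  # placeholder Slurm job id

while True:
    response = client.get_metrics(job_id)
    if isinstance(response.metrics, str):
        # Same convention as the CLI: a str payload means "not ready" or error text.
        print(f"Metrics unavailable: {response.metrics}")
    else:
        print(response.metrics)
    time.sleep(2)  # matches the CLI's 2-second refresh interval
```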