vec-inf 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +2 -1
- vec_inf/cli/_cli.py +43 -12
- vec_inf/cli/_helper.py +79 -12
- vec_inf/cli/_vars.py +37 -22
- vec_inf/client/_client_vars.py +31 -1
- vec_inf/client/_helper.py +154 -49
- vec_inf/client/_slurm_script_generator.py +109 -43
- vec_inf/client/_slurm_templates.py +110 -48
- vec_inf/client/_slurm_vars.py +13 -4
- vec_inf/client/_utils.py +13 -7
- vec_inf/client/api.py +47 -0
- vec_inf/client/config.py +17 -7
- vec_inf/client/models.py +25 -19
- vec_inf/config/README.md +1 -1
- vec_inf/config/environment.yaml +9 -2
- vec_inf/config/models.yaml +184 -368
- vec_inf/find_port.sh +10 -1
- {vec_inf-0.7.2.dist-info → vec_inf-0.8.0.dist-info}/METADATA +17 -16
- vec_inf-0.8.0.dist-info/RECORD +27 -0
- {vec_inf-0.7.2.dist-info → vec_inf-0.8.0.dist-info}/WHEEL +1 -1
- vec_inf-0.7.2.dist-info/RECORD +0 -27
- {vec_inf-0.7.2.dist-info → vec_inf-0.8.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.7.2.dist-info → vec_inf-0.8.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server.
|
|
4
4
|
* `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time.
|
|
5
|
-
* `status`: Check the
|
|
5
|
+
* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
|
|
6
6
|
* `metrics`: Streams performance metrics to the console.
|
|
7
7
|
* `shutdown`: Shutdown a model by providing its Slurm job ID.
|
|
8
8
|
* `list`: List all available model names, or view the default/cached configuration of a specific model.
|
|
@@ -14,6 +14,7 @@ Use `--help` to see all available options
|
|
|
14
14
|
|
|
15
15
|
* `launch_model`: Launch an OpenAI compatible inference server.
|
|
16
16
|
* `batch_launch_models`: Launch multiple OpenAI compatible inference servers.
|
|
17
|
+
* `fetch_running_jobs`: Get the running `vec-inf` job IDs.
|
|
17
18
|
* `get_status`: Get the status of a running model.
|
|
18
19
|
* `get_metrics`: Get the performance metrics of a running model.
|
|
19
20
|
* `shutdown_model`: Shutdown a running model.
|
vec_inf/cli/_cli.py
CHANGED
|
@@ -30,6 +30,7 @@ from vec_inf.cli._helper import (
|
|
|
30
30
|
BatchLaunchResponseFormatter,
|
|
31
31
|
LaunchResponseFormatter,
|
|
32
32
|
ListCmdDisplay,
|
|
33
|
+
ListStatusDisplay,
|
|
33
34
|
MetricsResponseFormatter,
|
|
34
35
|
StatusResponseFormatter,
|
|
35
36
|
)
|
|
@@ -131,10 +132,20 @@ def cli() -> None:
|
|
|
131
132
|
type=str,
|
|
132
133
|
help="Path to parent directory containing model weights",
|
|
133
134
|
)
|
|
135
|
+
@click.option(
|
|
136
|
+
"--engine",
|
|
137
|
+
type=str,
|
|
138
|
+
help="Inference engine to use, supports 'vllm' and 'sglang'",
|
|
139
|
+
)
|
|
134
140
|
@click.option(
|
|
135
141
|
"--vllm-args",
|
|
136
142
|
type=str,
|
|
137
|
-
help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
|
|
143
|
+
help="vLLM engine arguments to be set, use the format as specified in vLLM serve documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
|
|
144
|
+
)
|
|
145
|
+
@click.option(
|
|
146
|
+
"--sglang-args",
|
|
147
|
+
type=str,
|
|
148
|
+
help="SGLang engine arguments to be set, use the format as specified in SGLang Server Arguments documentation and separate arguments with commas, e.g. --sglang-args '--context-length=8192,--mem-fraction-static=0.85'",
|
|
138
149
|
)
|
|
139
150
|
@click.option(
|
|
140
151
|
"--json-mode",
|
|
@@ -149,7 +160,7 @@ def cli() -> None:
|
|
|
149
160
|
@click.option(
|
|
150
161
|
"--config",
|
|
151
162
|
type=str,
|
|
152
|
-
help="Path to a model config yaml file to use in place of the default",
|
|
163
|
+
help="Path to a model config yaml file to use in place of the default, you can also set VEC_INF_MODEL_CONFIG to the path to the model config file",
|
|
153
164
|
)
|
|
154
165
|
def launch(
|
|
155
166
|
model_name: str,
|
|
@@ -200,7 +211,9 @@ def launch(
|
|
|
200
211
|
- model_weights_parent_dir : str, optional
|
|
201
212
|
Path to model weights directory
|
|
202
213
|
- vllm_args : str, optional
|
|
203
|
-
|
|
214
|
+
vllm engine arguments
|
|
215
|
+
- sglang_args : str, optional
|
|
216
|
+
sglang engine arguments
|
|
204
217
|
- env : str, optional
|
|
205
218
|
Environment variables
|
|
206
219
|
- config : str, optional
|
|
@@ -228,6 +241,10 @@ def launch(
|
|
|
228
241
|
if json_mode:
|
|
229
242
|
click.echo(json.dumps(launch_response.config))
|
|
230
243
|
else:
|
|
244
|
+
if launch_response.config.get("engine_inferred"):
|
|
245
|
+
CONSOLE.print(
|
|
246
|
+
"Warning: Inference engine inferred from engine-specific args"
|
|
247
|
+
)
|
|
231
248
|
launch_formatter = LaunchResponseFormatter(
|
|
232
249
|
model_name, launch_response.config
|
|
233
250
|
)
|
|
@@ -313,14 +330,14 @@ def batch_launch(
|
|
|
313
330
|
raise click.ClickException(f"Batch launch failed: {str(e)}") from e
|
|
314
331
|
|
|
315
332
|
|
|
316
|
-
@cli.command("status", help="Check the status of
|
|
317
|
-
@click.argument("slurm_job_id",
|
|
333
|
+
@cli.command("status", help="Check the status of running vec-inf jobs on the cluster.")
|
|
334
|
+
@click.argument("slurm_job_id", required=False)
|
|
318
335
|
@click.option(
|
|
319
336
|
"--json-mode",
|
|
320
337
|
is_flag=True,
|
|
321
338
|
help="Output in JSON string",
|
|
322
339
|
)
|
|
323
|
-
def status(slurm_job_id: str, json_mode: bool = False) -> None:
|
|
340
|
+
def status(slurm_job_id: Optional[str] = None, json_mode: bool = False) -> None:
|
|
324
341
|
"""Get the status of a running model on the cluster.
|
|
325
342
|
|
|
326
343
|
Parameters
|
|
@@ -338,14 +355,28 @@ def status(slurm_job_id: str, json_mode: bool = False) -> None:
|
|
|
338
355
|
try:
|
|
339
356
|
# Start the client and get model inference server status
|
|
340
357
|
client = VecInfClient()
|
|
341
|
-
|
|
358
|
+
if not slurm_job_id:
|
|
359
|
+
slurm_job_ids = client.fetch_running_jobs()
|
|
360
|
+
if not slurm_job_ids:
|
|
361
|
+
click.echo("No running jobs found.")
|
|
362
|
+
return
|
|
363
|
+
else:
|
|
364
|
+
slurm_job_ids = [slurm_job_id]
|
|
365
|
+
responses = []
|
|
366
|
+
for job_id in slurm_job_ids:
|
|
367
|
+
responses.append(client.get_status(job_id))
|
|
368
|
+
|
|
342
369
|
# Display status information
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
370
|
+
if slurm_job_id:
|
|
371
|
+
status_formatter = StatusResponseFormatter(responses[0])
|
|
372
|
+
if json_mode:
|
|
373
|
+
status_formatter.output_json()
|
|
374
|
+
else:
|
|
375
|
+
status_info_table = status_formatter.output_table()
|
|
376
|
+
CONSOLE.print(status_info_table)
|
|
346
377
|
else:
|
|
347
|
-
|
|
348
|
-
|
|
378
|
+
list_status_display = ListStatusDisplay(slurm_job_ids, responses, json_mode)
|
|
379
|
+
list_status_display.display_multiple_status_output(CONSOLE)
|
|
349
380
|
|
|
350
381
|
except click.ClickException as e:
|
|
351
382
|
raise e
|
vec_inf/cli/_helper.py
CHANGED
|
@@ -15,7 +15,7 @@ from rich.panel import Panel
|
|
|
15
15
|
from rich.table import Table
|
|
16
16
|
|
|
17
17
|
from vec_inf.cli._utils import create_table
|
|
18
|
-
from vec_inf.cli._vars import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
|
|
18
|
+
from vec_inf.cli._vars import ENGINE_NAME_MAP, MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
|
|
19
19
|
from vec_inf.client import ModelConfig, ModelInfo, StatusResponse
|
|
20
20
|
|
|
21
21
|
|
|
@@ -49,11 +49,12 @@ class LaunchResponseFormatter:
|
|
|
49
49
|
if self.params.get(key):
|
|
50
50
|
table.add_row(label, self.params[key])
|
|
51
51
|
|
|
52
|
-
def
|
|
53
|
-
"""Add
|
|
54
|
-
if self.params.get("
|
|
55
|
-
|
|
56
|
-
|
|
52
|
+
def _add_engine_config(self, table: Table) -> None:
    """Append the inference engine's arguments to the launch table.

    Nothing is added when ``self.params`` has no (or empty) ``engine_args``.
    Otherwise a magenta heading row named after the engine is added, followed
    by one row per engine argument.

    Parameters
    ----------
    table : Table
        Table the rows are appended to
    """
    # Guard clause: skip the whole section when no engine args were supplied.
    if not self.params.get("engine_args"):
        return

    # Display name comes from ENGINE_NAME_MAP, keyed by the "engine" param.
    display_name = ENGINE_NAME_MAP[self.params["engine"]]
    table.add_row(f"{display_name} Arguments:", style="magenta")

    for name, setting in self.params["engine_args"].items():
        table.add_row(f" {name}:", str(setting))
|
|
58
59
|
|
|
59
60
|
def _add_env_vars(self, table: Table) -> None:
|
|
@@ -111,9 +112,10 @@ class LaunchResponseFormatter:
|
|
|
111
112
|
str(Path(self.params["model_weights_parent_dir"], self.model_name)),
|
|
112
113
|
)
|
|
113
114
|
table.add_row("Log Directory", self.params["log_dir"])
|
|
115
|
+
table.add_row("Inference Engine", ENGINE_NAME_MAP[self.params["engine"]])
|
|
114
116
|
|
|
115
117
|
# Add configuration details
|
|
116
|
-
self.
|
|
118
|
+
self._add_engine_config(table)
|
|
117
119
|
self._add_env_vars(table)
|
|
118
120
|
self._add_bind_paths(table)
|
|
119
121
|
|
|
@@ -185,6 +187,10 @@ class BatchLaunchResponseFormatter:
|
|
|
185
187
|
table.add_row(
|
|
186
188
|
"Memory/Node", f" {self.params['models'][model_name]['mem_per_node']}"
|
|
187
189
|
)
|
|
190
|
+
table.add_row(
|
|
191
|
+
"Inference Engine",
|
|
192
|
+
f" {ENGINE_NAME_MAP[self.params['models'][model_name]['engine']]}",
|
|
193
|
+
)
|
|
188
194
|
|
|
189
195
|
return table
|
|
190
196
|
|
|
@@ -251,6 +257,62 @@ class StatusResponseFormatter:
|
|
|
251
257
|
return table
|
|
252
258
|
|
|
253
259
|
|
|
260
|
+
class ListStatusDisplay:
    """CLI Helper class for formatting a list of StatusResponse.

    A formatter class that handles the presentation of multiple job statuses,
    either as a table or as a JSON string.

    Parameters
    ----------
    job_ids : list[str]
        Slurm job IDs, in the same order as ``statuses``
    statuses : list[StatusResponse]
        List of model status information
    json_mode : bool, default=False
        Whether to output a JSON string instead of a table
    """

    def __init__(
        self,
        job_ids: list[str],
        statuses: list[StatusResponse],
        json_mode: bool = False,
    ):
        self.job_ids = job_ids
        self.statuses = statuses
        self.json_mode = json_mode

        self.table = Table(show_header=True, header_style="bold magenta")
        self.table.add_column("Job ID")
        self.table.add_column("Model Name")
        self.table.add_column("Status", style="blue")
        self.table.add_column("Base URL")

    def display_multiple_status_output(self, console: Console) -> None:
        """Format and display all model statuses.

        In JSON mode, echoes a JSON list with one object per job. Otherwise
        adds one row per job to the table and prints it to the console.

        Parameters
        ----------
        console : Console
            Console the table is printed to (unused in JSON mode)
        """
        # Pair every status with its job ID once so that both output modes
        # report the same ID for the same status.
        job_statuses = list(zip(self.job_ids, self.statuses))

        if self.json_mode:
            json_data = [
                {
                    # Report the Slurm job ID here, not the model name.
                    "job_id": job_id,
                    "model_name": status.model_name,
                    "model_status": status.server_status,
                    "base_url": status.base_url,
                }
                for job_id, status in job_statuses
            ]
            click.echo(json.dumps(json_data, indent=4))
            return

        for job_id, status in job_statuses:
            self.table.add_row(
                job_id,
                status.model_name,
                status.server_status,
                status.base_url,
            )
        console.print(self.table)
|
|
314
|
+
|
|
315
|
+
|
|
254
316
|
class MetricsResponseFormatter:
|
|
255
317
|
"""CLI Helper class for formatting MetricsResponse.
|
|
256
318
|
|
|
@@ -423,14 +485,19 @@ class ListCmdDisplay:
|
|
|
423
485
|
)
|
|
424
486
|
return json.dumps(config_dict, indent=4)
|
|
425
487
|
|
|
488
|
+
excluded_list = ["venv", "log_dir"]
|
|
489
|
+
|
|
426
490
|
table = create_table(key_title="Model Config", value_title="Value")
|
|
427
491
|
for field, value in config.model_dump().items():
|
|
428
|
-
if
|
|
492
|
+
if "args" in field:
|
|
493
|
+
if not value:
|
|
494
|
+
continue
|
|
495
|
+
engine_name = ENGINE_NAME_MAP[field.split("_")[0]]
|
|
496
|
+
table.add_row(f"{engine_name} Arguments:", style="magenta")
|
|
497
|
+
for engine_arg, engine_value in value.items():
|
|
498
|
+
table.add_row(f" {engine_arg}:", str(engine_value))
|
|
499
|
+
elif field not in excluded_list and value:
|
|
429
500
|
table.add_row(field, str(value))
|
|
430
|
-
if field == "vllm_args":
|
|
431
|
-
table.add_row("vLLM Arguments:", style="magenta")
|
|
432
|
-
for vllm_arg, vllm_value in value.items():
|
|
433
|
-
table.add_row(f" {vllm_arg}:", str(vllm_value))
|
|
434
501
|
return table
|
|
435
502
|
|
|
436
503
|
def _format_all_models_output(
|
vec_inf/cli/_vars.py
CHANGED
|
@@ -1,32 +1,47 @@
|
|
|
1
1
|
"""Constants for CLI rendering.
|
|
2
2
|
|
|
3
|
-
This module defines
|
|
3
|
+
This module defines mappings for model type priorities, model type colors, and engine display names
|
|
4
4
|
used in the CLI display formatting.
|
|
5
|
+
"""
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
---------
|
|
8
|
-
MODEL_TYPE_PRIORITY : dict
|
|
9
|
-
Mapping of model types to their display priority (lower numbers shown first)
|
|
7
|
+
from typing import get_args
|
|
10
8
|
|
|
11
|
-
|
|
12
|
-
Mapping of model types to their display colors in Rich
|
|
9
|
+
from vec_inf.client._slurm_vars import MODEL_TYPES
|
|
13
10
|
|
|
14
|
-
Notes
|
|
15
|
-
-----
|
|
16
|
-
These constants are used primarily by the ListCmdDisplay class to ensure
|
|
17
|
-
consistent sorting and color coding of different model types in the CLI output.
|
|
18
|
-
"""
|
|
19
11
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
12
|
+
# Extract model type values from the Literal type
|
|
13
|
+
_MODEL_TYPES = get_args(MODEL_TYPES)
|
|
14
|
+
|
|
15
|
+
# Rich color options (prioritizing current colors, with fallbacks for additional types)
|
|
16
|
+
_RICH_COLORS = [
|
|
17
|
+
"cyan",
|
|
18
|
+
"bright_blue",
|
|
19
|
+
"purple",
|
|
20
|
+
"bright_magenta",
|
|
21
|
+
"green",
|
|
22
|
+
"yellow",
|
|
23
|
+
"bright_green",
|
|
24
|
+
"bright_yellow",
|
|
25
|
+
"red",
|
|
26
|
+
"bright_red",
|
|
27
|
+
"blue",
|
|
28
|
+
"magenta",
|
|
29
|
+
"bright_cyan",
|
|
30
|
+
"white",
|
|
31
|
+
"bright_white",
|
|
32
|
+
]
|
|
26
33
|
|
|
34
|
+
# Mapping of model types to their display priority (lower numbers shown first)
|
|
35
|
+
MODEL_TYPE_PRIORITY = {model_type: idx for idx, model_type in enumerate(_MODEL_TYPES)}
|
|
36
|
+
|
|
37
|
+
# Mapping of model types to their display colors in Rich
|
|
27
38
|
MODEL_TYPE_COLORS = {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
39
|
+
model_type: _RICH_COLORS[idx % len(_RICH_COLORS)]
|
|
40
|
+
for idx, model_type in enumerate(_MODEL_TYPES)
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
# Inference engine choice and name mapping
|
|
44
|
+
ENGINE_NAME_MAP = {
|
|
45
|
+
"vllm": "vLLM",
|
|
46
|
+
"sglang": "SGLang",
|
|
32
47
|
}
|
vec_inf/client/_client_vars.py
CHANGED
|
@@ -49,7 +49,7 @@ SLURM_JOB_CONFIG_ARGS = {
|
|
|
49
49
|
"time": "time",
|
|
50
50
|
"nodes": "num_nodes",
|
|
51
51
|
"exclude": "exclude",
|
|
52
|
-
"nodelist": "
|
|
52
|
+
"nodelist": "nodelist",
|
|
53
53
|
"gres": "gres",
|
|
54
54
|
"cpus-per-task": "cpus_per_task",
|
|
55
55
|
"mem": "mem_per_node",
|
|
@@ -61,13 +61,43 @@ SLURM_JOB_CONFIG_ARGS = {
|
|
|
61
61
|
VLLM_SHORT_TO_LONG_MAP = {
|
|
62
62
|
"-tp": "--tensor-parallel-size",
|
|
63
63
|
"-pp": "--pipeline-parallel-size",
|
|
64
|
+
"-n": "--nnodes",
|
|
65
|
+
"-r": "--node-rank",
|
|
66
|
+
"-dcp": "--decode-context-parallel-size",
|
|
67
|
+
"-pcp": "--prefill-context-parallel-size",
|
|
64
68
|
"-dp": "--data-parallel-size",
|
|
69
|
+
"-dpn": "--data-parallel-rank",
|
|
70
|
+
"-dpr": "--data-parallel-start-rank",
|
|
65
71
|
"-dpl": "--data-parallel-size-local",
|
|
66
72
|
"-dpa": "--data-parallel-address",
|
|
67
73
|
"-dpp": "--data-parallel-rpc-port",
|
|
74
|
+
"-dpb": "--data-parallel-backend",
|
|
75
|
+
"-dph": "--data-parallel-hybrid-lb",
|
|
76
|
+
"-dpe": "--data-parallel-external-lb",
|
|
68
77
|
"-O": "--compilation-config",
|
|
69
78
|
"-q": "--quantization",
|
|
70
79
|
}
|
|
71
80
|
|
|
81
|
+
# SGLang engine args mapping between short and long names
|
|
82
|
+
SGLANG_SHORT_TO_LONG_MAP = {
    # Each pair of aliases (``--xx`` and ``--xx-size``) maps to the same
    # canonical long flag.
    "--tp": "--tensor-parallel-size",
    "--tp-size": "--tensor-parallel-size",
    "--pp": "--pipeline-parallel-size",
    "--pp-size": "--pipeline-parallel-size",
    "--dp": "--data-parallel-size",
    "--dp-size": "--data-parallel-size",
    "--ep": "--expert-parallel-size",
    # Was "--expert-parallel-expert-size", which is not an SGLang flag.
    "--ep-size": "--expert-parallel-size",
}
|
|
92
|
+
|
|
93
|
+
# Mapping of engine short names to their argument mappings
|
|
94
|
+
ENGINE_SHORT_TO_LONG_MAP = {
|
|
95
|
+
"vllm": VLLM_SHORT_TO_LONG_MAP,
|
|
96
|
+
"sglang": SGLANG_SHORT_TO_LONG_MAP,
|
|
97
|
+
}
|
|
98
|
+
|
|
72
99
|
# Required matching arguments for batch mode
|
|
73
100
|
BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]
|
|
101
|
+
|
|
102
|
+
# Supported engines
|
|
103
|
+
SUPPORTED_ENGINES = ["vllm", "sglang"]
|