vec-inf 0.6.1-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +191 -34
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +7 -165
- vec_inf/client/_helper.py +386 -40
- vec_inf/client/_slurm_script_generator.py +204 -36
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +86 -0
- vec_inf/client/_utils.py +189 -70
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +40 -19
- vec_inf/client/models.py +44 -4
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +35 -0
- vec_inf/config/models.yaml +102 -274
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/METADATA +43 -73
- vec_inf-0.7.1.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.1.dist-info/RECORD +0 -25
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/licenses/LICENSE +0 -0
vec_inf/README.md
CHANGED
@@ -1,9 +1,23 @@
-
+## `vec-inf` CLI Commands
 
-* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server
-* `
+* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server.
+* `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time.
+* `status`: Check the model status by providing its Slurm job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
-* `list`: List all available model names, or view the default/cached configuration of a specific model
+* `list`: List all available model names, or view the default/cached configuration of a specific model.
+* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.
 
 Use `--help` to see all available options
+
+## `VecInfClient` API
+
+* `launch_model`: Launch an OpenAI compatible inference server.
+* `batch_launch_models`: Launch multiple OpenAI compatible inference servers.
+* `get_status`: Get the status of a running model.
+* `get_metrics`: Get the performance metrics of a running model.
+* `shutdown_model`: Shutdown a running model.
+* `list_models`: List all available models.
+* `get_model_config`: Get the configuration for a specific model.
+* `wait_until_ready`: Wait until a model is ready or fails.
+* `cleanup_logs`: Remove logs from the log directory.
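The `VecInfClient` methods listed above cover the same lifecycle as the CLI commands. Below is a minimal sketch that strings them together. The import path, the call signatures, the example model name, and the `slurm_job_id` key are assumptions; only the method names and the `.config` and `.metrics` response attributes are confirmed by this diff.

```python
# Minimal sketch of the programmatic flow, assuming VecInfClient is exposed
# from vec_inf.client.api and that launch_model's options argument is optional.
from vec_inf.client.api import VecInfClient

client = VecInfClient()

# Launch one server; the model name here is a placeholder.
launch_response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
job_id = launch_response.config["slurm_job_id"]  # assumed key

# Block until the server reports ready (or the Slurm job fails).
client.wait_until_ready(job_id)

# Poll once for performance metrics, then shut the server down.
metrics_response = client.get_metrics(job_id)
print(metrics_response.metrics)
client.shutdown_model(job_id)
```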
vec_inf/cli/_cli.py
CHANGED
@@ -27,6 +27,7 @@ from rich.console import Console
 from rich.live import Live
 
 from vec_inf.cli._helper import (
+    BatchLaunchResponseFormatter,
     LaunchResponseFormatter,
     ListCmdDisplay,
     MetricsResponseFormatter,
@@ -44,14 +45,19 @@ def cli() -> None:
     pass
 
 
-@cli.command("launch")
+@cli.command("launch", help="Launch a model on the cluster.")
 @click.argument("model-name", type=str, nargs=1)
 @click.option("--model-family", type=str, help="The model family")
 @click.option("--model-variant", type=str, help="The model variant")
 @click.option(
     "--partition",
     type=str,
-    help="Type of
+    help="Type of Slurm partition",
+)
+@click.option(
+    "--resource-type",
+    type=str,
+    help="Type of resource to request for the job",
 )
 @click.option(
     "--num-nodes",
@@ -65,9 +71,16 @@ def cli() -> None:
 )
 @click.option(
     "--account",
+    "-A",
     type=str,
     help="Charge resources used by this job to specified account.",
 )
+@click.option(
+    "--work-dir",
+    "-D",
+    type=str,
+    help="Set working directory for the batch job",
+)
 @click.option(
     "--qos",
     type=str,
@@ -79,14 +92,14 @@ def cli() -> None:
     help="Exclude certain nodes from the resources granted to the job",
 )
 @click.option(
-    "--
+    "--nodelist",
     type=str,
     help="Request a specific list of nodes for deployment",
 )
 @click.option(
     "--bind",
     type=str,
-    help="Additional binds for the
+    help="Additional binds for the container as a comma separated list of bind paths",
 )
 @click.option(
     "--time",
@@ -118,6 +131,16 @@ def cli() -> None:
     is_flag=True,
     help="Output in JSON string",
 )
+@click.option(
+    "--env",
+    type=str,
+    help="Environment variables to be set. Seperate variables with commas. Can also include path to a file containing environment variables seperated by newlines. e.g. --env 'TRITON_CACHE_DIR=/scratch/.cache/triton,my_custom_vars_file.env'",
+)
+@click.option(
+    "--config",
+    type=str,
+    help="Path to a model config yaml file to use in place of the default",
+)
 def launch(
     model_name: str,
     **cli_kwargs: Optional[Union[str, int, float, bool]],
@@ -135,21 +158,25 @@ def launch(
     - model_variant : str, optional
         Specific variant of the model
     - partition : str, optional
-        Type of
+        Type of Slurm partition
+    - resource_type : str, optional
+        Type of resource to request for the job
     - num_nodes : int, optional
         Number of nodes to use
     - gpus_per_node : int, optional
         Number of GPUs per node
     - account : str, optional
         Charge resources used by this job to specified account
+    - work_dir : str, optional
+        Set working directory for the batch job
     - qos : str, optional
         Quality of service tier
     - exclude : str, optional
         Exclude certain nodes from the resources granted to the job
-    -
+    - nodelist : str, optional
         Request a specific list of nodes for deployment
     - bind : str, optional
-        Additional binds for the
+        Additional binds for the container as a comma separated list of bind paths
     - time : str, optional
         Time limit for job
     - venv : str, optional
@@ -160,6 +187,10 @@ def launch(
         Path to model weights directory
     - vllm_args : str, optional
         vLLM engine arguments
+    - env : str, optional
+        Environment variables
+    - config : str, optional
+        Path to custom model config yaml file
     - json_mode : bool, optional
         Output in JSON format
 
@@ -180,11 +211,12 @@ def launch(
         launch_response = client.launch_model(model_name, launch_options)
 
         # Display launch information
-        launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
-
         if json_mode:
             click.echo(json.dumps(launch_response.config))
         else:
+            launch_formatter = LaunchResponseFormatter(
+                model_name, launch_response.config
+            )
             launch_info_table = launch_formatter.format_table_output()
             CONSOLE.print(launch_info_table)
 
@@ -194,29 +226,93 @@ def launch(
         raise click.ClickException(f"Launch failed: {str(e)}") from e
 
 
-@cli.command(
-
+@cli.command(
+    "batch-launch",
+    help="Launch multiple models in a batch, separate model names with spaces.",
+)
+@click.argument("model-names", type=str, nargs=-1)
 @click.option(
-    "--
+    "--batch-config",
+    type=str,
+    help="Model configuration for batch launch",
+)
+@click.option(
+    "--account",
+    "-A",
     type=str,
-    help="
+    help="Charge resources used by this job to specified account.",
+)
+@click.option(
+    "--work-dir",
+    "-D",
+    type=str,
+    help="Set working directory for the batch job",
 )
 @click.option(
     "--json-mode",
     is_flag=True,
     help="Output in JSON string",
 )
-def
-
+def batch_launch(
+    model_names: tuple[str, ...],
+    batch_config: Optional[str] = None,
+    account: Optional[str] = None,
+    work_dir: Optional[str] = None,
+    json_mode: Optional[bool] = False,
 ) -> None:
+    """Launch multiple models in a batch.
+
+    Parameters
+    ----------
+    model_names : tuple[str, ...]
+        Names of the models to launch
+    batch_config : str
+        Model configuration for batch launch
+    json_mode : bool, default=False
+        Whether to output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If batch launch fails
+    """
+    try:
+        # Start the client and launch models in batch mode
+        client = VecInfClient()
+        batch_launch_response = client.batch_launch_models(
+            list(model_names), batch_config, account, work_dir
+        )
+
+        # Display batch launch information
+        if json_mode:
+            click.echo(json.dumps(batch_launch_response.config, indent=4))
+        else:
+            batch_launch_formatter = BatchLaunchResponseFormatter(
+                batch_launch_response.config
+            )
+            batch_launch_info_table = batch_launch_formatter.format_table_output()
+            CONSOLE.print(batch_launch_info_table)
+
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Batch launch failed: {str(e)}") from e
+
+
+@cli.command("status", help="Check the status of a running model on the cluster.")
+@click.argument("slurm_job_id", type=str, nargs=1)
+@click.option(
+    "--json-mode",
+    is_flag=True,
+    help="Output in JSON string",
+)
+def status(slurm_job_id: str, json_mode: bool = False) -> None:
     """Get the status of a running model on the cluster.
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to check
-    log_dir : str, optional
-        Path to SLURM log directory
     json_mode : bool, default=False
         Whether to output in JSON format
 
@@ -228,7 +324,7 @@ def status(
     try:
         # Start the client and get model inference server status
         client = VecInfClient()
-        status_response = client.get_status(slurm_job_id
+        status_response = client.get_status(slurm_job_id)
         # Display status information
         status_formatter = StatusResponseFormatter(status_response)
         if json_mode:
@@ -243,14 +339,14 @@ def status(
         raise click.ClickException(f"Status check failed: {str(e)}") from e
 
 
-@cli.command("shutdown")
-@click.argument("slurm_job_id", type=
-def shutdown(slurm_job_id:
+@cli.command("shutdown", help="Shutdown a running model on the cluster.")
+@click.argument("slurm_job_id", type=str, nargs=1)
+def shutdown(slurm_job_id: str) -> None:
     """Shutdown a running model on the cluster.
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to shut down
 
     Raises
@@ -266,7 +362,7 @@ def shutdown(slurm_job_id: int) -> None:
         raise click.ClickException(f"Shutdown failed: {str(e)}") from e
 
 
-@cli.command("list")
+@cli.command("list", help="List available models or get specific model configuration.")
 @click.argument("model-name", required=False)
 @click.option(
     "--json-mode",
@@ -304,20 +400,17 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
         raise click.ClickException(f"List models failed: {str(e)}") from e
 
 
-@cli.command(
-
-@click.option(
-    "--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
+@cli.command(
+    "metrics", help="Stream real-time performance metrics from the model endpoint."
 )
-
+@click.argument("slurm_job_id", type=str, nargs=1)
+def metrics(slurm_job_id: str) -> None:
     """Stream real-time performance metrics from the model endpoint.
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to monitor
-    log_dir : str, optional
-        Path to SLURM log directory
 
     Raises
     ------
@@ -333,7 +426,7 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
     try:
         # Start the client and get inference server metrics
         client = VecInfClient()
-        metrics_response = client.get_metrics(slurm_job_id
+        metrics_response = client.get_metrics(slurm_job_id)
         metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
 
         # Check if metrics response is ready
@@ -344,7 +437,7 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
 
         with Live(refresh_per_second=1, console=CONSOLE) as live:
             while True:
-                metrics_response = client.get_metrics(slurm_job_id
+                metrics_response = client.get_metrics(slurm_job_id)
                 metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
 
                 if isinstance(metrics_response.metrics, str):
@@ -361,5 +454,69 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
         raise click.ClickException(f"Metrics check failed: {str(e)}") from e
 
 
+@cli.command("cleanup", help="Clean up log files based on optional filters.")
+@click.option("--log-dir", type=str, help="Path to SLURM log directory")
+@click.option("--model-family", type=str, help="Filter by model family")
+@click.option("--model-name", type=str, help="Filter by model name")
+@click.option(
+    "--job-id", type=int, help="Only remove logs with this exact SLURM job ID"
+)
+@click.option(
+    "--before-job-id",
+    type=int,
+    help="Remove logs with job ID less than this value",
+)
+@click.option("--dry-run", is_flag=True, help="List matching logs without deleting")
+def cleanup_logs_cli(
+    log_dir: Optional[str],
+    model_family: Optional[str],
+    model_name: Optional[str],
+    job_id: Optional[int],
+    before_job_id: Optional[int],
+    dry_run: bool,
+) -> None:
+    """Clean up log files based on optional filters.
+
+    Parameters
+    ----------
+    log_dir : str or Path, optional
+        Root directory containing log files. Defaults to ~/.vec-inf-logs.
+    model_family : str, optional
+        Only delete logs for this model family.
+    model_name : str, optional
+        Only delete logs for this model name.
+    job_id : int, optional
+        If provided, only match directories with this exact SLURM job ID.
+    before_job_id : int, optional
+        If provided, only delete logs with job ID less than this value.
+    dry_run : bool
+        If True, return matching files without deleting them.
+    """
+    try:
+        client = VecInfClient()
+        matched = client.cleanup_logs(
+            log_dir=log_dir,
+            model_family=model_family,
+            model_name=model_name,
+            job_id=job_id,
+            before_job_id=before_job_id,
+            dry_run=dry_run,
+        )
+
+        if not matched:
+            if dry_run:
+                click.echo("Dry run: no matching log directories found.")
+            else:
+                click.echo("No matching log directories were deleted.")
+        elif dry_run:
+            click.echo(f"Dry run: {len(matched)} directories would be deleted:")
+            for f in matched:
+                click.echo(f"  - {f}")
+        else:
+            click.echo(f"Deleted {len(matched)} log directory(ies).")
+    except Exception as e:
+        raise click.ClickException(f"Cleanup failed: {str(e)}") from e
+
+
 if __name__ == "__main__":
     cli()
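The new `cleanup` command shown above is a thin wrapper over `VecInfClient.cleanup_logs`, which returns the list of matched log directories and only deletes them when `dry_run` is false. The sketch below mirrors that call directly; the filter values are hypothetical and the import path for `VecInfClient` is an assumption.

```python
# Mirrors the keyword arguments used in cleanup_logs_cli above; the filter
# values are invented, and the VecInfClient import path is assumed.
from vec_inf.client.api import VecInfClient

client = VecInfClient()
matched = client.cleanup_logs(
    log_dir=None,            # per the docstring, defaults to ~/.vec-inf-logs
    model_family="llama",    # hypothetical filters
    model_name=None,
    job_id=None,
    before_job_id=1234567,
    dry_run=True,            # report matches without deleting anything
)
for path in matched:
    print(path)
```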
vec_inf/cli/_helper.py
CHANGED
@@ -4,6 +4,7 @@ This module provides formatting and display classes for the command-line interfa
 handling the presentation of model information, status updates, and metrics.
 """
 
+import json
 from pathlib import Path
 from typing import Any, Union
 
@@ -27,9 +28,8 @@ class LaunchResponseFormatter:
     Parameters
     ----------
     model_name : str
-        Name of the launched model
-
-        Launch parameters and configuration
+        Name of the launched model params : dict[str, Any] Launch parameters and
+        configuration
     """
 
     def __init__(self, model_name: str, params: dict[str, Any]):
@@ -59,8 +59,16 @@ class LaunchResponseFormatter:
         table.add_row("Vocabulary Size", self.params["vocab_size"])
 
         # Add resource allocation details
-
-
+        if self.params.get("account"):
+            table.add_row("Account", self.params["account"])
+        if self.params.get("work_dir"):
+            table.add_row("Working Directory", self.params["work_dir"])
+        if self.params.get("resource_type"):
+            table.add_row("Resource Type", self.params["resource_type"])
+        if self.params.get("partition"):
+            table.add_row("Partition", self.params["partition"])
+        if self.params.get("qos"):
+            table.add_row("QoS", self.params["qos"])
         table.add_row("Time Limit", self.params["time"])
         table.add_row("Num Nodes", self.params["num_nodes"])
         table.add_row("GPUs/Node", self.params["gpus_per_node"])
@@ -79,6 +87,80 @@ class LaunchResponseFormatter:
         for arg, value in self.params["vllm_args"].items():
             table.add_row(f" {arg}:", str(value))
 
+        # Add Environment Variable Configuration Details
+        table.add_row("Environment Variables", style="magenta")
+        for arg, value in self.params["env"].items():
+            table.add_row(f" {arg}:", str(value))
+
+        return table
+
+
+class BatchLaunchResponseFormatter:
+    """CLI Helper class for formatting BatchLaunchResponse.
+
+    A formatter class that handles the presentation of batch launch information
+    in both table and JSON formats.
+
+    Parameters
+    ----------
+    params : dict[str, Any]
+        Configuration for the batch launch
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        self.params = params
+
+    def format_table_output(self) -> Table:
+        """Format output as rich Table.
+
+        Returns
+        -------
+        Table
+            Rich table containing formatted batch launch information including:
+            - Job configuration
+            - Model details
+            - Resource allocation
+            - vLLM configuration
+        """
+        table = create_table(key_title="Job Config", value_title="Value")
+        # Add key information with consistent styling
+        table.add_row("Slurm Job ID", self.params["slurm_job_id"], style="blue")
+        table.add_row("Slurm Job Name", self.params["slurm_job_name"], style="blue")
+        if self.params.get("account"):
+            table.add_row("Account", self.params["account"], style="blue")
+        if self.params.get("work_dir"):
+            table.add_row("Working Directory", self.params["work_dir"], style="blue")
+        table.add_row("Log Directory", self.params["log_dir"], style="blue")
+        for model_name in self.params["models"]:
+            table.add_row("Model Name", model_name, style="magenta")
+            # Add resource allocation details
+            if self.params["models"][model_name].get("resource_type"):
+                table.add_row(
+                    "Resource Type",
+                    f" {self.params['models'][model_name]['resource_type']}",
+                )
+            if self.params["models"][model_name].get("partition"):
+                table.add_row(
+                    "Partition", f" {self.params['models'][model_name]['partition']}"
+                )
+            if self.params["models"][model_name].get("qos"):
+                table.add_row("QoS", f" {self.params['models'][model_name]['qos']}")
+            table.add_row(
+                "Time Limit", f" {self.params['models'][model_name]['time']}"
+            )
+            table.add_row(
+                "Num Nodes", f" {self.params['models'][model_name]['num_nodes']}"
+            )
+            table.add_row(
+                "GPUs/Node", f" {self.params['models'][model_name]['gpus_per_node']}"
+            )
+            table.add_row(
+                "CPUs/Task", f" {self.params['models'][model_name]['cpus_per_task']}"
+            )
+            table.add_row(
+                "Memory/Node", f" {self.params['models'][model_name]['mem_per_node']}"
+            )
+
         return table
 
 
@@ -116,7 +198,8 @@ class StatusResponseFormatter:
             json_data["pending_reason"] = self.status_info.pending_reason
         if self.status_info.failed_reason:
             json_data["failed_reason"] = self.status_info.failed_reason
-
+
+        click.echo(json.dumps(json_data, indent=4))
 
     def output_table(self) -> Table:
         """Create and display rich table.
@@ -292,9 +375,7 @@ class ListCmdDisplay:
         self.model_config = None
        self.model_names: list[str] = []
 
-    def _format_single_model_output(
-        self, config: ModelConfig
-    ) -> Union[dict[str, Any], Table]:
+    def _format_single_model_output(self, config: ModelConfig) -> Union[str, Table]:
         """Format output table for a single model.
 
         Parameters
@@ -304,8 +385,8 @@ class ListCmdDisplay:
 
         Returns
         -------
-        Union[
-            Either a
+        Union[str, Table]
+            Either a JSON string for JSON output or a Rich table
         """
         if self.json_mode:
             # Exclude non-essential fields from JSON output
@@ -315,11 +396,11 @@ class ListCmdDisplay:
             config_dict["model_weights_parent_dir"] = str(
                 config_dict["model_weights_parent_dir"]
             )
-            return config_dict
+            return json.dumps(config_dict, indent=4)
 
         table = create_table(key_title="Model Config", value_title="Value")
         for field, value in config.model_dump().items():
-            if field not in {"venv", "log_dir", "vllm_args"}:
+            if field not in {"venv", "log_dir", "vllm_args"} and value:
                 table.add_row(field, str(value))
             if field == "vllm_args":
                 table.add_row("vLLM Arguments:", style="magenta")
@@ -394,7 +475,7 @@ class ListCmdDisplay:
         """
         if self.json_mode:
             model_names = [info.name for info in model_infos]
-            click.echo(model_names)
+            click.echo(json.dumps(model_names, indent=4))
         else:
             panels = self._format_all_models_output(model_infos)
             self.console.print(Columns(panels, equal=True))