vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +212 -30
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +19 -152
- vec_inf/client/_helper.py +386 -53
- vec_inf/client/_slurm_script_generator.py +210 -43
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +82 -0
- vec_inf/client/_utils.py +190 -71
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +46 -15
- vec_inf/client/models.py +51 -2
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +31 -0
- vec_inf/config/models.yaml +102 -281
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/METADATA +25 -67
- vec_inf-0.7.0.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.0.dist-info/RECORD +0 -25
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/README.md
CHANGED
|
@@ -1,9 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
## `vec-inf` CLI Commands
|
|
2
2
|
|
|
3
|
-
* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server
|
|
4
|
-
* `
|
|
3
|
+
* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server.
|
|
4
|
+
* `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time.
|
|
5
|
+
* `status`: Check the model status by providing its Slurm job ID.
|
|
5
6
|
* `metrics`: Streams performance metrics to the console.
|
|
6
7
|
* `shutdown`: Shutdown a model by providing its Slurm job ID.
|
|
7
|
-
* `list`: List all available model names, or view the default/cached configuration of a specific model
|
|
8
|
+
* `list`: List all available model names, or view the default/cached configuration of a specific model.
|
|
9
|
+
* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.
|
|
8
10
|
|
|
9
11
|
Use `--help` to see all available options
|
|
12
|
+
|
|
13
|
+
## `VecInfClient` API
|
|
14
|
+
|
|
15
|
+
* `launch_model`: Launch an OpenAI compatible inference server.
|
|
16
|
+
* `batch_launch_models`: Launch multiple OpenAI compatible inference servers.
|
|
17
|
+
* `get_status`: Get the status of a running model.
|
|
18
|
+
* `get_metrics`: Get the performance metrics of a running model.
|
|
19
|
+
* `shutdown_model`: Shutdown a running model.
|
|
20
|
+
* `list_models`: List all available models.
|
|
21
|
+
* `get_model_config`: Get the configuration for a specific model.
|
|
22
|
+
* `wait_until_ready`: Wait until a model is ready or fails.
|
|
23
|
+
* `cleanup_logs`: Remove logs from the log directory.
|
vec_inf/cli/_cli.py
CHANGED
|
@@ -18,6 +18,7 @@ metrics
|
|
|
18
18
|
Stream real-time performance metrics
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
+
import json
|
|
21
22
|
import time
|
|
22
23
|
from typing import Optional, Union
|
|
23
24
|
|
|
@@ -26,6 +27,7 @@ from rich.console import Console
|
|
|
26
27
|
from rich.live import Live
|
|
27
28
|
|
|
28
29
|
from vec_inf.cli._helper import (
|
|
30
|
+
BatchLaunchResponseFormatter,
|
|
29
31
|
LaunchResponseFormatter,
|
|
30
32
|
ListCmdDisplay,
|
|
31
33
|
MetricsResponseFormatter,
|
|
@@ -43,14 +45,19 @@ def cli() -> None:
|
|
|
43
45
|
pass
|
|
44
46
|
|
|
45
47
|
|
|
46
|
-
@cli.command("launch")
|
|
48
|
+
@cli.command("launch", help="Launch a model on the cluster.")
|
|
47
49
|
@click.argument("model-name", type=str, nargs=1)
|
|
48
50
|
@click.option("--model-family", type=str, help="The model family")
|
|
49
51
|
@click.option("--model-variant", type=str, help="The model variant")
|
|
50
52
|
@click.option(
|
|
51
53
|
"--partition",
|
|
52
54
|
type=str,
|
|
53
|
-
help="Type of
|
|
55
|
+
help="Type of Slurm partition",
|
|
56
|
+
)
|
|
57
|
+
@click.option(
|
|
58
|
+
"--resource-type",
|
|
59
|
+
type=str,
|
|
60
|
+
help="Type of resource to request for the job",
|
|
54
61
|
)
|
|
55
62
|
@click.option(
|
|
56
63
|
"--num-nodes",
|
|
@@ -64,14 +71,36 @@ def cli() -> None:
|
|
|
64
71
|
)
|
|
65
72
|
@click.option(
|
|
66
73
|
"--account",
|
|
74
|
+
"-A",
|
|
67
75
|
type=str,
|
|
68
76
|
help="Charge resources used by this job to specified account.",
|
|
69
77
|
)
|
|
78
|
+
@click.option(
|
|
79
|
+
"--work-dir",
|
|
80
|
+
"-D",
|
|
81
|
+
type=str,
|
|
82
|
+
help="Set working directory for the batch job",
|
|
83
|
+
)
|
|
70
84
|
@click.option(
|
|
71
85
|
"--qos",
|
|
72
86
|
type=str,
|
|
73
87
|
help="Quality of service",
|
|
74
88
|
)
|
|
89
|
+
@click.option(
|
|
90
|
+
"--exclude",
|
|
91
|
+
type=str,
|
|
92
|
+
help="Exclude certain nodes from the resources granted to the job",
|
|
93
|
+
)
|
|
94
|
+
@click.option(
|
|
95
|
+
"--nodelist",
|
|
96
|
+
type=str,
|
|
97
|
+
help="Request a specific list of nodes for deployment",
|
|
98
|
+
)
|
|
99
|
+
@click.option(
|
|
100
|
+
"--bind",
|
|
101
|
+
type=str,
|
|
102
|
+
help="Additional binds for the container as a comma separated list of bind paths",
|
|
103
|
+
)
|
|
75
104
|
@click.option(
|
|
76
105
|
"--time",
|
|
77
106
|
type=str,
|
|
@@ -102,6 +131,16 @@ def cli() -> None:
|
|
|
102
131
|
is_flag=True,
|
|
103
132
|
help="Output in JSON string",
|
|
104
133
|
)
|
|
134
|
+
@click.option(
|
|
135
|
+
"--env",
|
|
136
|
+
type=str,
|
|
137
|
+
help="Environment variables to be set. Separate variables with commas. Can also include path to a file containing environment variables separated by newlines. e.g. --env 'TRITON_CACHE_DIR=/scratch/.cache/triton,my_custom_vars_file.env'",
|
|
138
|
+
)
|
|
139
|
+
@click.option(
|
|
140
|
+
"--config",
|
|
141
|
+
type=str,
|
|
142
|
+
help="Path to a model config yaml file to use in place of the default",
|
|
143
|
+
)
|
|
105
144
|
def launch(
|
|
106
145
|
model_name: str,
|
|
107
146
|
**cli_kwargs: Optional[Union[str, int, float, bool]],
|
|
@@ -119,13 +158,25 @@ def launch(
|
|
|
119
158
|
- model_variant : str, optional
|
|
120
159
|
Specific variant of the model
|
|
121
160
|
- partition : str, optional
|
|
122
|
-
Type of
|
|
161
|
+
Type of Slurm partition
|
|
162
|
+
- resource_type : str, optional
|
|
163
|
+
Type of resource to request for the job
|
|
123
164
|
- num_nodes : int, optional
|
|
124
165
|
Number of nodes to use
|
|
125
166
|
- gpus_per_node : int, optional
|
|
126
167
|
Number of GPUs per node
|
|
168
|
+
- account : str, optional
|
|
169
|
+
Charge resources used by this job to specified account
|
|
170
|
+
- work_dir : str, optional
|
|
171
|
+
Set working directory for the batch job
|
|
127
172
|
- qos : str, optional
|
|
128
173
|
Quality of service tier
|
|
174
|
+
- exclude : str, optional
|
|
175
|
+
Exclude certain nodes from the resources granted to the job
|
|
176
|
+
- nodelist : str, optional
|
|
177
|
+
Request a specific list of nodes for deployment
|
|
178
|
+
- bind : str, optional
|
|
179
|
+
Additional binds for the container as a comma separated list of bind paths
|
|
129
180
|
- time : str, optional
|
|
130
181
|
Time limit for job
|
|
131
182
|
- venv : str, optional
|
|
@@ -136,6 +187,10 @@ def launch(
|
|
|
136
187
|
Path to model weights directory
|
|
137
188
|
- vllm_args : str, optional
|
|
138
189
|
vLLM engine arguments
|
|
190
|
+
- env : str, optional
|
|
191
|
+
Environment variables
|
|
192
|
+
- config : str, optional
|
|
193
|
+
Path to custom model config yaml file
|
|
139
194
|
- json_mode : bool, optional
|
|
140
195
|
Output in JSON format
|
|
141
196
|
|
|
@@ -156,10 +211,12 @@ def launch(
|
|
|
156
211
|
launch_response = client.launch_model(model_name, launch_options)
|
|
157
212
|
|
|
158
213
|
# Display launch information
|
|
159
|
-
launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
|
|
160
214
|
if json_mode:
|
|
161
|
-
click.echo(launch_response.config)
|
|
215
|
+
click.echo(json.dumps(launch_response.config))
|
|
162
216
|
else:
|
|
217
|
+
launch_formatter = LaunchResponseFormatter(
|
|
218
|
+
model_name, launch_response.config
|
|
219
|
+
)
|
|
163
220
|
launch_info_table = launch_formatter.format_table_output()
|
|
164
221
|
CONSOLE.print(launch_info_table)
|
|
165
222
|
|
|
@@ -169,29 +226,93 @@ def launch(
|
|
|
169
226
|
raise click.ClickException(f"Launch failed: {str(e)}") from e
|
|
170
227
|
|
|
171
228
|
|
|
172
|
-
@cli.command(
|
|
173
|
-
|
|
229
|
+
@cli.command(
|
|
230
|
+
"batch-launch",
|
|
231
|
+
help="Launch multiple models in a batch, separate model names with spaces.",
|
|
232
|
+
)
|
|
233
|
+
@click.argument("model-names", type=str, nargs=-1)
|
|
174
234
|
@click.option(
|
|
175
|
-
"--
|
|
235
|
+
"--batch-config",
|
|
236
|
+
type=str,
|
|
237
|
+
help="Model configuration for batch launch",
|
|
238
|
+
)
|
|
239
|
+
@click.option(
|
|
240
|
+
"--account",
|
|
241
|
+
"-A",
|
|
176
242
|
type=str,
|
|
177
|
-
help="
|
|
243
|
+
help="Charge resources used by this job to specified account.",
|
|
244
|
+
)
|
|
245
|
+
@click.option(
|
|
246
|
+
"--work-dir",
|
|
247
|
+
"-D",
|
|
248
|
+
type=str,
|
|
249
|
+
help="Set working directory for the batch job",
|
|
178
250
|
)
|
|
179
251
|
@click.option(
|
|
180
252
|
"--json-mode",
|
|
181
253
|
is_flag=True,
|
|
182
254
|
help="Output in JSON string",
|
|
183
255
|
)
|
|
184
|
-
def
|
|
185
|
-
|
|
256
|
+
def batch_launch(
|
|
257
|
+
model_names: tuple[str, ...],
|
|
258
|
+
batch_config: Optional[str] = None,
|
|
259
|
+
account: Optional[str] = None,
|
|
260
|
+
work_dir: Optional[str] = None,
|
|
261
|
+
json_mode: Optional[bool] = False,
|
|
186
262
|
) -> None:
|
|
263
|
+
"""Launch multiple models in a batch.
|
|
264
|
+
|
|
265
|
+
Parameters
|
|
266
|
+
----------
|
|
267
|
+
model_names : tuple[str, ...]
|
|
268
|
+
Names of the models to launch
|
|
269
|
+
batch_config : str
|
|
270
|
+
Model configuration for batch launch
|
|
271
|
+
json_mode : bool, default=False
|
|
272
|
+
Whether to output in JSON format
|
|
273
|
+
|
|
274
|
+
Raises
|
|
275
|
+
------
|
|
276
|
+
click.ClickException
|
|
277
|
+
If batch launch fails
|
|
278
|
+
"""
|
|
279
|
+
try:
|
|
280
|
+
# Start the client and launch models in batch mode
|
|
281
|
+
client = VecInfClient()
|
|
282
|
+
batch_launch_response = client.batch_launch_models(
|
|
283
|
+
list(model_names), batch_config, account, work_dir
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
# Display batch launch information
|
|
287
|
+
if json_mode:
|
|
288
|
+
click.echo(json.dumps(batch_launch_response.config, indent=4))
|
|
289
|
+
else:
|
|
290
|
+
batch_launch_formatter = BatchLaunchResponseFormatter(
|
|
291
|
+
batch_launch_response.config
|
|
292
|
+
)
|
|
293
|
+
batch_launch_info_table = batch_launch_formatter.format_table_output()
|
|
294
|
+
CONSOLE.print(batch_launch_info_table)
|
|
295
|
+
|
|
296
|
+
except click.ClickException as e:
|
|
297
|
+
raise e
|
|
298
|
+
except Exception as e:
|
|
299
|
+
raise click.ClickException(f"Batch launch failed: {str(e)}") from e
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
@cli.command("status", help="Check the status of a running model on the cluster.")
|
|
303
|
+
@click.argument("slurm_job_id", type=str, nargs=1)
|
|
304
|
+
@click.option(
|
|
305
|
+
"--json-mode",
|
|
306
|
+
is_flag=True,
|
|
307
|
+
help="Output in JSON string",
|
|
308
|
+
)
|
|
309
|
+
def status(slurm_job_id: str, json_mode: bool = False) -> None:
|
|
187
310
|
"""Get the status of a running model on the cluster.
|
|
188
311
|
|
|
189
312
|
Parameters
|
|
190
313
|
----------
|
|
191
|
-
slurm_job_id :
|
|
314
|
+
slurm_job_id : str
|
|
192
315
|
ID of the SLURM job to check
|
|
193
|
-
log_dir : str, optional
|
|
194
|
-
Path to SLURM log directory
|
|
195
316
|
json_mode : bool, default=False
|
|
196
317
|
Whether to output in JSON format
|
|
197
318
|
|
|
@@ -203,7 +324,7 @@ def status(
|
|
|
203
324
|
try:
|
|
204
325
|
# Start the client and get model inference server status
|
|
205
326
|
client = VecInfClient()
|
|
206
|
-
status_response = client.get_status(slurm_job_id
|
|
327
|
+
status_response = client.get_status(slurm_job_id)
|
|
207
328
|
# Display status information
|
|
208
329
|
status_formatter = StatusResponseFormatter(status_response)
|
|
209
330
|
if json_mode:
|
|
@@ -218,14 +339,14 @@ def status(
|
|
|
218
339
|
raise click.ClickException(f"Status check failed: {str(e)}") from e
|
|
219
340
|
|
|
220
341
|
|
|
221
|
-
@cli.command("shutdown")
|
|
222
|
-
@click.argument("slurm_job_id", type=
|
|
223
|
-
def shutdown(slurm_job_id:
|
|
342
|
+
@cli.command("shutdown", help="Shutdown a running model on the cluster.")
|
|
343
|
+
@click.argument("slurm_job_id", type=str, nargs=1)
|
|
344
|
+
def shutdown(slurm_job_id: str) -> None:
|
|
224
345
|
"""Shutdown a running model on the cluster.
|
|
225
346
|
|
|
226
347
|
Parameters
|
|
227
348
|
----------
|
|
228
|
-
slurm_job_id :
|
|
349
|
+
slurm_job_id : str
|
|
229
350
|
ID of the SLURM job to shut down
|
|
230
351
|
|
|
231
352
|
Raises
|
|
@@ -241,7 +362,7 @@ def shutdown(slurm_job_id: int) -> None:
|
|
|
241
362
|
raise click.ClickException(f"Shutdown failed: {str(e)}") from e
|
|
242
363
|
|
|
243
364
|
|
|
244
|
-
@cli.command("list")
|
|
365
|
+
@cli.command("list", help="List available models or get specific model configuration.")
|
|
245
366
|
@click.argument("model-name", required=False)
|
|
246
367
|
@click.option(
|
|
247
368
|
"--json-mode",
|
|
@@ -279,20 +400,17 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
|
|
|
279
400
|
raise click.ClickException(f"List models failed: {str(e)}") from e
|
|
280
401
|
|
|
281
402
|
|
|
282
|
-
@cli.command(
|
|
283
|
-
|
|
284
|
-
@click.option(
|
|
285
|
-
"--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
|
|
403
|
+
@cli.command(
|
|
404
|
+
"metrics", help="Stream real-time performance metrics from the model endpoint."
|
|
286
405
|
)
|
|
287
|
-
|
|
406
|
+
@click.argument("slurm_job_id", type=str, nargs=1)
|
|
407
|
+
def metrics(slurm_job_id: str) -> None:
|
|
288
408
|
"""Stream real-time performance metrics from the model endpoint.
|
|
289
409
|
|
|
290
410
|
Parameters
|
|
291
411
|
----------
|
|
292
|
-
slurm_job_id :
|
|
412
|
+
slurm_job_id : str
|
|
293
413
|
ID of the SLURM job to monitor
|
|
294
|
-
log_dir : str, optional
|
|
295
|
-
Path to SLURM log directory
|
|
296
414
|
|
|
297
415
|
Raises
|
|
298
416
|
------
|
|
@@ -308,7 +426,7 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
|
|
|
308
426
|
try:
|
|
309
427
|
# Start the client and get inference server metrics
|
|
310
428
|
client = VecInfClient()
|
|
311
|
-
metrics_response = client.get_metrics(slurm_job_id
|
|
429
|
+
metrics_response = client.get_metrics(slurm_job_id)
|
|
312
430
|
metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
|
|
313
431
|
|
|
314
432
|
# Check if metrics response is ready
|
|
@@ -319,7 +437,7 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
|
|
|
319
437
|
|
|
320
438
|
with Live(refresh_per_second=1, console=CONSOLE) as live:
|
|
321
439
|
while True:
|
|
322
|
-
metrics_response = client.get_metrics(slurm_job_id
|
|
440
|
+
metrics_response = client.get_metrics(slurm_job_id)
|
|
323
441
|
metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
|
|
324
442
|
|
|
325
443
|
if isinstance(metrics_response.metrics, str):
|
|
@@ -336,5 +454,69 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
|
|
|
336
454
|
raise click.ClickException(f"Metrics check failed: {str(e)}") from e
|
|
337
455
|
|
|
338
456
|
|
|
457
|
+
@cli.command("cleanup", help="Clean up log files based on optional filters.")
|
|
458
|
+
@click.option("--log-dir", type=str, help="Path to SLURM log directory")
|
|
459
|
+
@click.option("--model-family", type=str, help="Filter by model family")
|
|
460
|
+
@click.option("--model-name", type=str, help="Filter by model name")
|
|
461
|
+
@click.option(
|
|
462
|
+
"--job-id", type=int, help="Only remove logs with this exact SLURM job ID"
|
|
463
|
+
)
|
|
464
|
+
@click.option(
|
|
465
|
+
"--before-job-id",
|
|
466
|
+
type=int,
|
|
467
|
+
help="Remove logs with job ID less than this value",
|
|
468
|
+
)
|
|
469
|
+
@click.option("--dry-run", is_flag=True, help="List matching logs without deleting")
|
|
470
|
+
def cleanup_logs_cli(
|
|
471
|
+
log_dir: Optional[str],
|
|
472
|
+
model_family: Optional[str],
|
|
473
|
+
model_name: Optional[str],
|
|
474
|
+
job_id: Optional[int],
|
|
475
|
+
before_job_id: Optional[int],
|
|
476
|
+
dry_run: bool,
|
|
477
|
+
) -> None:
|
|
478
|
+
"""Clean up log files based on optional filters.
|
|
479
|
+
|
|
480
|
+
Parameters
|
|
481
|
+
----------
|
|
482
|
+
log_dir : str or Path, optional
|
|
483
|
+
Root directory containing log files. Defaults to ~/.vec-inf-logs.
|
|
484
|
+
model_family : str, optional
|
|
485
|
+
Only delete logs for this model family.
|
|
486
|
+
model_name : str, optional
|
|
487
|
+
Only delete logs for this model name.
|
|
488
|
+
job_id : int, optional
|
|
489
|
+
If provided, only match directories with this exact SLURM job ID.
|
|
490
|
+
before_job_id : int, optional
|
|
491
|
+
If provided, only delete logs with job ID less than this value.
|
|
492
|
+
dry_run : bool
|
|
493
|
+
If True, return matching files without deleting them.
|
|
494
|
+
"""
|
|
495
|
+
try:
|
|
496
|
+
client = VecInfClient()
|
|
497
|
+
matched = client.cleanup_logs(
|
|
498
|
+
log_dir=log_dir,
|
|
499
|
+
model_family=model_family,
|
|
500
|
+
model_name=model_name,
|
|
501
|
+
job_id=job_id,
|
|
502
|
+
before_job_id=before_job_id,
|
|
503
|
+
dry_run=dry_run,
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
if not matched:
|
|
507
|
+
if dry_run:
|
|
508
|
+
click.echo("Dry run: no matching log directories found.")
|
|
509
|
+
else:
|
|
510
|
+
click.echo("No matching log directories were deleted.")
|
|
511
|
+
elif dry_run:
|
|
512
|
+
click.echo(f"Dry run: {len(matched)} directories would be deleted:")
|
|
513
|
+
for f in matched:
|
|
514
|
+
click.echo(f" - {f}")
|
|
515
|
+
else:
|
|
516
|
+
click.echo(f"Deleted {len(matched)} log directory(ies).")
|
|
517
|
+
except Exception as e:
|
|
518
|
+
raise click.ClickException(f"Cleanup failed: {str(e)}") from e
|
|
519
|
+
|
|
520
|
+
|
|
339
521
|
if __name__ == "__main__":
|
|
340
522
|
cli()
|
vec_inf/cli/_helper.py
CHANGED
|
@@ -4,6 +4,7 @@ This module provides formatting and display classes for the command-line interfa
|
|
|
4
4
|
handling the presentation of model information, status updates, and metrics.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import json
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any, Union
|
|
9
10
|
|
|
@@ -27,9 +28,8 @@ class LaunchResponseFormatter:
|
|
|
27
28
|
Parameters
|
|
28
29
|
----------
|
|
29
30
|
model_name : str
|
|
30
|
-
Name of the launched model
|
|
31
|
-
|
|
32
|
-
Launch parameters and configuration
|
|
31
|
+
Name of the launched model params : dict[str, Any] Launch parameters and
|
|
32
|
+
configuration
|
|
33
33
|
"""
|
|
34
34
|
|
|
35
35
|
def __init__(self, model_name: str, params: dict[str, Any]):
|
|
@@ -59,8 +59,16 @@ class LaunchResponseFormatter:
|
|
|
59
59
|
table.add_row("Vocabulary Size", self.params["vocab_size"])
|
|
60
60
|
|
|
61
61
|
# Add resource allocation details
|
|
62
|
-
|
|
63
|
-
|
|
62
|
+
if self.params.get("account"):
|
|
63
|
+
table.add_row("Account", self.params["account"])
|
|
64
|
+
if self.params.get("work_dir"):
|
|
65
|
+
table.add_row("Working Directory", self.params["work_dir"])
|
|
66
|
+
if self.params.get("resource_type"):
|
|
67
|
+
table.add_row("Resource Type", self.params["resource_type"])
|
|
68
|
+
if self.params.get("partition"):
|
|
69
|
+
table.add_row("Partition", self.params["partition"])
|
|
70
|
+
if self.params.get("qos"):
|
|
71
|
+
table.add_row("QoS", self.params["qos"])
|
|
64
72
|
table.add_row("Time Limit", self.params["time"])
|
|
65
73
|
table.add_row("Num Nodes", self.params["num_nodes"])
|
|
66
74
|
table.add_row("GPUs/Node", self.params["gpus_per_node"])
|
|
@@ -79,6 +87,80 @@ class LaunchResponseFormatter:
|
|
|
79
87
|
for arg, value in self.params["vllm_args"].items():
|
|
80
88
|
table.add_row(f" {arg}:", str(value))
|
|
81
89
|
|
|
90
|
+
# Add Environment Variable Configuration Details
|
|
91
|
+
table.add_row("Environment Variables", style="magenta")
|
|
92
|
+
for arg, value in self.params["env"].items():
|
|
93
|
+
table.add_row(f" {arg}:", str(value))
|
|
94
|
+
|
|
95
|
+
return table
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class BatchLaunchResponseFormatter:
|
|
99
|
+
"""CLI Helper class for formatting BatchLaunchResponse.
|
|
100
|
+
|
|
101
|
+
A formatter class that handles the presentation of batch launch information
|
|
102
|
+
in both table and JSON formats.
|
|
103
|
+
|
|
104
|
+
Parameters
|
|
105
|
+
----------
|
|
106
|
+
params : dict[str, Any]
|
|
107
|
+
Configuration for the batch launch
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
def __init__(self, params: dict[str, Any]):
|
|
111
|
+
self.params = params
|
|
112
|
+
|
|
113
|
+
def format_table_output(self) -> Table:
|
|
114
|
+
"""Format output as rich Table.
|
|
115
|
+
|
|
116
|
+
Returns
|
|
117
|
+
-------
|
|
118
|
+
Table
|
|
119
|
+
Rich table containing formatted batch launch information including:
|
|
120
|
+
- Job configuration
|
|
121
|
+
- Model details
|
|
122
|
+
- Resource allocation
|
|
123
|
+
- vLLM configuration
|
|
124
|
+
"""
|
|
125
|
+
table = create_table(key_title="Job Config", value_title="Value")
|
|
126
|
+
# Add key information with consistent styling
|
|
127
|
+
table.add_row("Slurm Job ID", self.params["slurm_job_id"], style="blue")
|
|
128
|
+
table.add_row("Slurm Job Name", self.params["slurm_job_name"], style="blue")
|
|
129
|
+
if self.params.get("account"):
|
|
130
|
+
table.add_row("Account", self.params["account"], style="blue")
|
|
131
|
+
if self.params.get("work_dir"):
|
|
132
|
+
table.add_row("Working Directory", self.params["work_dir"], style="blue")
|
|
133
|
+
table.add_row("Log Directory", self.params["log_dir"], style="blue")
|
|
134
|
+
for model_name in self.params["models"]:
|
|
135
|
+
table.add_row("Model Name", model_name, style="magenta")
|
|
136
|
+
# Add resource allocation details
|
|
137
|
+
if self.params["models"][model_name].get("resource_type"):
|
|
138
|
+
table.add_row(
|
|
139
|
+
"Resource Type",
|
|
140
|
+
f" {self.params['models'][model_name]['resource_type']}",
|
|
141
|
+
)
|
|
142
|
+
if self.params["models"][model_name].get("partition"):
|
|
143
|
+
table.add_row(
|
|
144
|
+
"Partition", f" {self.params['models'][model_name]['partition']}"
|
|
145
|
+
)
|
|
146
|
+
if self.params["models"][model_name].get("qos"):
|
|
147
|
+
table.add_row("QoS", f" {self.params['models'][model_name]['qos']}")
|
|
148
|
+
table.add_row(
|
|
149
|
+
"Time Limit", f" {self.params['models'][model_name]['time']}"
|
|
150
|
+
)
|
|
151
|
+
table.add_row(
|
|
152
|
+
"Num Nodes", f" {self.params['models'][model_name]['num_nodes']}"
|
|
153
|
+
)
|
|
154
|
+
table.add_row(
|
|
155
|
+
"GPUs/Node", f" {self.params['models'][model_name]['gpus_per_node']}"
|
|
156
|
+
)
|
|
157
|
+
table.add_row(
|
|
158
|
+
"CPUs/Task", f" {self.params['models'][model_name]['cpus_per_task']}"
|
|
159
|
+
)
|
|
160
|
+
table.add_row(
|
|
161
|
+
"Memory/Node", f" {self.params['models'][model_name]['mem_per_node']}"
|
|
162
|
+
)
|
|
163
|
+
|
|
82
164
|
return table
|
|
83
165
|
|
|
84
166
|
|
|
@@ -116,7 +198,8 @@ class StatusResponseFormatter:
|
|
|
116
198
|
json_data["pending_reason"] = self.status_info.pending_reason
|
|
117
199
|
if self.status_info.failed_reason:
|
|
118
200
|
json_data["failed_reason"] = self.status_info.failed_reason
|
|
119
|
-
|
|
201
|
+
|
|
202
|
+
click.echo(json.dumps(json_data, indent=4))
|
|
120
203
|
|
|
121
204
|
def output_table(self) -> Table:
|
|
122
205
|
"""Create and display rich table.
|
|
@@ -292,9 +375,7 @@ class ListCmdDisplay:
|
|
|
292
375
|
self.model_config = None
|
|
293
376
|
self.model_names: list[str] = []
|
|
294
377
|
|
|
295
|
-
def _format_single_model_output(
|
|
296
|
-
self, config: ModelConfig
|
|
297
|
-
) -> Union[dict[str, Any], Table]:
|
|
378
|
+
def _format_single_model_output(self, config: ModelConfig) -> Union[str, Table]:
|
|
298
379
|
"""Format output table for a single model.
|
|
299
380
|
|
|
300
381
|
Parameters
|
|
@@ -304,8 +385,8 @@ class ListCmdDisplay:
|
|
|
304
385
|
|
|
305
386
|
Returns
|
|
306
387
|
-------
|
|
307
|
-
Union[
|
|
308
|
-
Either a
|
|
388
|
+
Union[str, Table]
|
|
389
|
+
Either a JSON string for JSON output or a Rich table
|
|
309
390
|
"""
|
|
310
391
|
if self.json_mode:
|
|
311
392
|
# Exclude non-essential fields from JSON output
|
|
@@ -315,11 +396,11 @@ class ListCmdDisplay:
|
|
|
315
396
|
config_dict["model_weights_parent_dir"] = str(
|
|
316
397
|
config_dict["model_weights_parent_dir"]
|
|
317
398
|
)
|
|
318
|
-
return config_dict
|
|
399
|
+
return json.dumps(config_dict, indent=4)
|
|
319
400
|
|
|
320
401
|
table = create_table(key_title="Model Config", value_title="Value")
|
|
321
402
|
for field, value in config.model_dump().items():
|
|
322
|
-
if field not in {"venv", "log_dir", "vllm_args"}:
|
|
403
|
+
if field not in {"venv", "log_dir", "vllm_args"} and value:
|
|
323
404
|
table.add_row(field, str(value))
|
|
324
405
|
if field == "vllm_args":
|
|
325
406
|
table.add_row("vLLM Arguments:", style="magenta")
|
|
@@ -394,7 +475,7 @@ class ListCmdDisplay:
|
|
|
394
475
|
"""
|
|
395
476
|
if self.json_mode:
|
|
396
477
|
model_names = [info.name for info in model_infos]
|
|
397
|
-
click.echo(model_names)
|
|
478
|
+
click.echo(json.dumps(model_names, indent=4))
|
|
398
479
|
else:
|
|
399
480
|
panels = self._format_all_models_output(model_infos)
|
|
400
481
|
self.console.print(Columns(panels, equal=True))
|