vec-inf 0.6.0-py3-none-any.whl → 0.7.0-py3-none-any.whl
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +212 -30
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +19 -152
- vec_inf/client/_helper.py +386 -53
- vec_inf/client/_slurm_script_generator.py +210 -43
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +82 -0
- vec_inf/client/_utils.py +190 -71
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +46 -15
- vec_inf/client/models.py +51 -2
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +31 -0
- vec_inf/config/models.yaml +102 -281
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/METADATA +25 -67
- vec_inf-0.7.0.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.0.dist-info/RECORD +0 -25
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/client/_helper.py
CHANGED
@@ -5,10 +5,10 @@ metrics collection, and model registry operations.
 """
 
 import json
-import os
 import time
 import warnings
 from pathlib import Path
+from shutil import copy2
 from typing import Any, Optional, Union, cast
 from urllib.parse import urlparse, urlunparse
 
@@ -16,8 +16,8 @@ import requests
 
 import vec_inf.client._utils as utils
 from vec_inf.client._client_vars import (
+    BATCH_MODE_REQUIRED_MATCHING_ARGS,
     KEY_METRICS,
-    REQUIRED_FIELDS,
     SRC_DIR,
     VLLM_SHORT_TO_LONG_MAP,
 )
@@ -27,19 +27,19 @@ from vec_inf.client._exceptions import (
     ModelNotFoundError,
     SlurmJobError,
 )
-from vec_inf.client._slurm_script_generator import SlurmScriptGenerator
+from vec_inf.client._slurm_script_generator import (
+    BatchSlurmScriptGenerator,
+    SlurmScriptGenerator,
+)
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
+    BatchLaunchResponse,
     LaunchResponse,
     ModelInfo,
     ModelStatus,
     ModelType,
     StatusResponse,
 )
-from vec_inf.client.slurm_vars import (
-    LD_LIBRARY_PATH,
-    VLLM_NCCL_SO_PATH,
-)
 
 
 class ModelLauncher:
@@ -50,27 +50,18 @@ class ModelLauncher:
 
     Parameters
     ----------
-    model_name
+    model_name: str
         Name of the model to launch
-    kwargs
+    kwargs: Optional[dict[str, Any]]
         Optional launch keyword arguments to override default configuration
     """
 
     def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]):
-        """Initialize the model launcher.
-
-        Parameters
-        ----------
-        model_name: str
-            Name of the model to launch
-        kwargs: Optional[dict[str, Any]]
-            Optional launch keyword arguments to override default configuration
-        """
         self.model_name = model_name
         self.kwargs = kwargs or {}
         self.slurm_job_id = ""
         self.slurm_script_path = Path("")
-        self.model_config = self._get_model_configuration()
+        self.model_config = self._get_model_configuration(self.kwargs.get("config"))
         self.params = self._get_launch_params()
 
     def _warn(self, message: str) -> None:
@@ -83,9 +74,14 @@ class ModelLauncher:
         """
         warnings.warn(message, UserWarning, stacklevel=2)
 
-    def _get_model_configuration(self) -> ModelConfig:
+    def _get_model_configuration(self, config_path: str | None = None) -> ModelConfig:
         """Load and validate model configuration.
 
+        Parameters
+        ----------
+        config_path : str | None, optional
+            Path to a yaml file with custom model config to use in place of the default
+
         Returns
         -------
         ModelConfig
@@ -98,7 +94,7 @@ class ModelLauncher:
         ModelConfigurationError
             If model configuration is not found and weights don't exist
         """
-        model_configs = utils.load_config()
+        model_configs = utils.load_config(config_path=config_path)
         config = next(
             (m for m in model_configs if m.model_name == self.model_name), None
         )
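The `config` kwarg is how a user-supplied registry file reaches `utils.load_config`. A minimal usage sketch, assuming a hypothetical model name and YAML path (neither appears in the diff):

```python
from vec_inf.client._helper import ModelLauncher

# Hypothetical model name and config path. __init__ reads the "config"
# kwarg and hands it to _get_model_configuration(), which forwards it
# as utils.load_config(config_path=...), so this YAML is consulted in
# place of the bundled models.yaml.
launcher = ModelLauncher(
    "my-model",
    {"config": "/path/to/custom_models.yaml"},
)
response = launcher.launch()
```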
@@ -167,6 +163,38 @@ class ModelLauncher:
                 vllm_args[arg.strip()] = True
         return vllm_args
 
+    def _process_env_vars(self, env_arg: str) -> dict[str, str]:
+        """Process the env string into a dictionary of environment variables.
+
+        Parameters
+        ----------
+        env_arg : str
+            String containing comma separated list of environment variable definitions
+            (eg. MY_VAR=1), file paths containing environment variable definitions
+            (separated by newlines), or a combination of both
+            (eg. 'MY_VAR=5,my_env.env')
+
+        Returns
+        -------
+        dict[str, str]
+            Processed environment variables as key-value pairs.
+        """
+        env_vars: dict[str, str] = {}
+        for arg in env_arg.split(","):
+            if "=" in arg:  # Arg is an env var definition
+                key, value = arg.split("=")
+                env_vars[key.strip()] = value.strip()
+            else:  # Arg is a path to a file
+                with open(arg, "r") as file:
+                    lines = [line.rstrip() for line in file]
+                    for line in lines:
+                        if "=" in line:
+                            key, value = line.split("=")
+                            env_vars[key.strip()] = value.strip()
+                        else:
+                            print(f"WARNING: Could not parse env var: {line}")
+        return env_vars
+
     def _get_launch_params(self) -> dict[str, Any]:
         """Prepare launch parameters, set log dir, and validate required fields.
 
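The parser accepts inline `KEY=VALUE` entries and `.env` file paths in one comma-separated string. A standalone sketch of the same parsing rules; the file contents and names here are fabricated for illustration:

```python
import tempfile

# Fabricated .env file with one good and one malformed line.
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("HF_HOME=/scratch/hf_cache\nnot-a-definition\n")
    env_file = f.name

def parse_env(env_arg: str) -> dict[str, str]:
    """Mirror of _process_env_vars' rules: comma-separated KEY=VALUE
    entries are taken as-is, anything else is treated as a file path."""
    env_vars: dict[str, str] = {}
    for arg in env_arg.split(","):
        if "=" in arg:
            key, value = arg.split("=")
            env_vars[key.strip()] = value.strip()
        else:
            with open(arg) as fh:
                for line in fh:
                    line = line.rstrip()
                    if "=" in line:
                        key, value = line.split("=")
                        env_vars[key.strip()] = value.strip()
                    else:
                        print(f"WARNING: Could not parse env var: {line}")
    return env_vars

print(parse_env(f"MY_VAR=5,{env_file}"))
# -> {'MY_VAR': '5', 'HF_HOME': '/scratch/hf_cache'}, plus one warning
```

Note that the two-value unpacking of `split("=")` assumes exactly one `=` per definition; a value containing its own `=` would raise, a constraint the sketch shares with the diffed implementation.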
@@ -190,14 +218,19 @@ class ModelLauncher:
                 params["vllm_args"][key] = value
             del self.kwargs["vllm_args"]
 
+        if self.kwargs.get("env"):
+            env_vars = self._process_env_vars(self.kwargs["env"])
+            for key, value in env_vars.items():
+                params["env"][key] = str(value)
+            del self.kwargs["env"]
+
         for key, value in self.kwargs.items():
             params[key] = value
 
-        #
-
-
-
-        )
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        # Validate resource allocation and parallelization settings
         if (
             int(params["gpus_per_node"]) > 1
             and params["vllm_args"].get("--tensor-parallel-size") is None
@@ -206,6 +239,25 @@ class ModelLauncher:
                 "--tensor-parallel-size is required when gpus_per_node > 1"
             )
 
+        total_gpus_requested = int(params["gpus_per_node"]) * int(params["num_nodes"])
+        if not utils.is_power_of_two(total_gpus_requested):
+            raise ValueError("Total number of GPUs requested must be a power of two")
+
+        total_parallel_sizes = int(
+            params["vllm_args"].get("--tensor-parallel-size", "1")
+        ) * int(params["vllm_args"].get("--pipeline-parallel-size", "1"))
+        if total_gpus_requested != total_parallel_sizes:
+            raise ValueError(
+                "Mismatch between total number of GPUs requested and parallelization settings"
+            )
+
+        # Convert gpus_per_node and resource_type to gres
+        resource_type = params.get("resource_type")
+        if resource_type:
+            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+        else:
+            params["gres"] = f"gpu:{params['gpus_per_node']}"
+
         # Create log directory
         params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
         params["log_dir"].mkdir(parents=True, exist_ok=True)
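The three checks reduce to simple arithmetic over the requested topology. A sketch of what they enforce; the body of `is_power_of_two` is an assumption here, since the diff only shows the call through `vec_inf.client._utils`:

```python
def is_power_of_two(n: int) -> bool:
    # Assumed implementation; only the helper's name appears in the diff.
    return n > 0 and (n & (n - 1)) == 0

# Example topology: 2 nodes x 4 GPUs per node.
gpus_per_node, num_nodes = 4, 2
total_gpus = gpus_per_node * num_nodes          # 8 -> a power of two, OK
assert is_power_of_two(total_gpus)

# tensor-parallel x pipeline-parallel must cover every requested GPU:
tensor_parallel, pipeline_parallel = 4, 2
assert tensor_parallel * pipeline_parallel == total_gpus

# The resulting SLURM gres strings:
print(f"gpu:a100:{gpus_per_node}")  # resource_type set (here: "a100")
print(f"gpu:{gpus_per_node}")       # no resource_type configured
```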
@@ -224,17 +276,12 @@ class ModelLauncher:
 
         # Convert path to string for JSON serialization
         for field in params:
-            if field
+            if field in ["vllm_args", "env"]:
                 continue
             params[field] = str(params[field])
 
         return params
 
-    def _set_env_vars(self) -> None:
-        """Set environment variables for the launch command."""
-        os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH
-        os.environ["VLLM_NCCL_SO_PATH"] = VLLM_NCCL_SO_PATH
-
     def _build_launch_command(self) -> str:
         """Generate the slurm script and construct the launch command.
 
@@ -259,9 +306,6 @@ class ModelLauncher:
         SlurmJobError
             If SLURM job submission fails
         """
-        # Set environment variables
-        self._set_env_vars()
-
         # Build and execute the launch command
         command_output, stderr = utils.run_bash_command(self._build_launch_command())
 
@@ -285,20 +329,288 @@ class ModelLauncher:
         job_json.touch(exist_ok=True)
 
         self.slurm_script_path.rename(
-            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.
+            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.sbatch"
         )
 
         with job_json.open("w") as file:
             json.dump(self.params, file, indent=4)
 
         return LaunchResponse(
-            slurm_job_id=
+            slurm_job_id=self.slurm_job_id,
             model_name=self.model_name,
             config=self.params,
             raw_output=command_output,
         )
 
 
+class BatchModelLauncher:
+    """Helper class for handling batch inference server launch.
+
+    A class that manages the launch process of multiple inference servers, including
+    configuration validation, and SLURM job submission.
+
+    Parameters
+    ----------
+    model_names : list[str]
+        List of model names to launch
+    """
+
+    def __init__(
+        self,
+        model_names: list[str],
+        batch_config: Optional[str] = None,
+        account: Optional[str] = None,
+        work_dir: Optional[str] = None,
+    ):
+        self.model_names = model_names
+        self.batch_config = batch_config
+        self.slurm_job_id = ""
+        self.slurm_job_name = self._get_slurm_job_name()
+        self.batch_script_path = Path("")
+        self.launch_script_paths: list[Path] = []
+        self.model_configs = self._get_model_configurations()
+        self.params = self._get_launch_params(account, work_dir)
+
+    def _get_slurm_job_name(self) -> str:
+        """Get the SLURM job name from the model names.
+
+        Returns
+        -------
+        str
+            SLURM job name
+        """
+        return "BATCH-" + "-".join(self.model_names)
+
+    def _get_model_configurations(self) -> dict[str, ModelConfig]:
+        """Load and validate model configurations.
+
+        Returns
+        -------
+        dict[str, ModelConfig]
+            Dictionary of validated model configurations
+
+        Raises
+        ------
+        ModelNotFoundError
+            If model weights parent directory cannot be determined
+        ModelConfigurationError
+            If model configuration is not found and weights don't exist
+        """
+        model_configs = utils.load_config(self.batch_config)
+
+        model_configs_dict = {}
+        for model_name in self.model_names:
+            config = next(
+                (m for m in model_configs if m.model_name == model_name), None
+            )
+
+            if config:
+                model_configs_dict[model_name] = config
+            else:
+                raise ModelConfigurationError(
+                    f"'{model_name}' not found in configuration, batch launch requires all models to be present in the configuration file"
+                )
+
+        return model_configs_dict
+
+    def _get_launch_params(
+        self, account: Optional[str] = None, work_dir: Optional[str] = None
+    ) -> dict[str, Any]:
+        """Prepare launch parameters, set log dir, and validate required fields.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary of prepared launch parameters
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If required fields are missing or tensor parallel size is not specified
+            when using multiple GPUs
+        """
+        params: dict[str, Any] = {
+            "models": {},
+            "slurm_job_name": self.slurm_job_name,
+            "src_dir": str(SRC_DIR),
+            "account": account,
+            "work_dir": work_dir,
+        }
+
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        for i, (model_name, config) in enumerate(self.model_configs.items()):
+            params["models"][model_name] = config.model_dump(exclude_none=True)
+            params["models"][model_name]["het_group_id"] = i
+
+            # Validate resource allocation and parallelization settings
+            if (
+                int(config.gpus_per_node) > 1
+                and (config.vllm_args or {}).get("--tensor-parallel-size") is None
+            ):
+                raise MissingRequiredFieldsError(
+                    f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
+                )
+
+            total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
+            if not utils.is_power_of_two(total_gpus_requested):
+                raise ValueError(
+                    f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
+                )
+
+            total_parallel_sizes = int(
+                (config.vllm_args or {}).get("--tensor-parallel-size", "1")
+            ) * int((config.vllm_args or {}).get("--pipeline-parallel-size", "1"))
+            if total_gpus_requested != total_parallel_sizes:
+                raise ValueError(
+                    f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
+                )
+
+            # Convert gpus_per_node and resource_type to gres
+            params["models"][model_name]["gres"] = (
+                f"gpu:{config.resource_type}:{config.gpus_per_node}"
+            )
+
+            # Create log directory
+            log_dir = Path(
+                params["models"][model_name]["log_dir"], self.slurm_job_name
+            ).expanduser()
+            log_dir.mkdir(parents=True, exist_ok=True)
+            params["models"][model_name]["log_dir"] = str(log_dir)
+
+            # Convert model_weights_parent_dir to string for JSON serialization
+            params["models"][model_name]["model_weights_parent_dir"] = str(
+                params["models"][model_name]["model_weights_parent_dir"]
+            )
+
+            # Construct slurm log file paths
+            params["models"][model_name]["out_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.out"
+            )
+            params["models"][model_name]["err_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.err"
+            )
+            params["models"][model_name]["json_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"
+            )
+
+            # Create top level log files using the first model's log directory
+            if not params.get("out_file"):
+                params["out_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.out"
+                )
+            if not params.get("err_file"):
+                params["err_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.err"
+                )
+
+            # Check if required matching arguments are matched
+            for arg in BATCH_MODE_REQUIRED_MATCHING_ARGS:
+                if not params.get(arg):
+                    params[arg] = params["models"][model_name][arg]
+                elif params[arg] != params["models"][model_name][arg]:
+                    # Remove the created directory since we found a mismatch
+                    log_dir.rmdir()
+                    raise ValueError(
+                        f"Mismatch found for {arg}: {params[arg]} != {params['models'][model_name][arg]}, check your configuration"
+                    )
+
+        return params
+
+    def _build_launch_command(self) -> str:
+        """Generate the slurm script and construct the launch command.
+
+        Returns
+        -------
+        str
+            Complete SLURM launch command
+        """
+        batch_script_generator = BatchSlurmScriptGenerator(self.params)
+        self.batch_script_path = batch_script_generator.generate_batch_slurm_script()
+        self.launch_script_paths = batch_script_generator.script_paths
+        return f"sbatch {str(self.batch_script_path)}"
+
+    def launch(self) -> BatchLaunchResponse:
+        """Launch models in batch mode.
+
+        Returns
+        -------
+        BatchLaunchResponse
+            Response object containing launch details and status
+
+        Raises
+        ------
+        SlurmJobError
+            If SLURM job submission fails
+        """
+        # Build and execute the launch command
+        command_output, stderr = utils.run_bash_command(self._build_launch_command())
+
+        if stderr:
+            raise SlurmJobError(f"Error: {stderr}")
+
+        # Extract slurm job id from command output
+        self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n")
+        self.params["slurm_job_id"] = self.slurm_job_id
+
+        # Create log directory and job json file, move slurm script to job log directory
+        main_job_log_dir = Path("")
+
+        for model_name in self.model_names:
+            model_job_id = int(self.slurm_job_id) + int(
+                self.params["models"][model_name]["het_group_id"]
+            )
+
+            job_log_dir = Path(
+                self.params["log_dir"], f"{self.slurm_job_name}.{model_job_id}"
+            )
+            job_log_dir.mkdir(parents=True, exist_ok=True)
+
+            if main_job_log_dir == Path(""):
+                main_job_log_dir = job_log_dir
+
+            job_json = Path(
+                job_log_dir,
+                f"{model_name}.{model_job_id}.json",
+            )
+            job_json.touch(exist_ok=True)
+
+            with job_json.open("w") as file:
+                json.dump(self.params["models"][model_name], file, indent=4)
+
+        # Copy the launch scripts to the job log directory, the original scripts
+        # cannot be deleted otherwise slurm will not be able to find them
+        script_path_mapper = {}
+        for script_path in self.launch_script_paths:
+            old_path = script_path.name
+            file_name = old_path.split("/")[-1]
+            copy2(script_path, main_job_log_dir / file_name)
+            new_path = script_path.name
+            script_path_mapper[old_path] = new_path
+
+        # Replace old launch script paths with new paths in batch slurm script
+        with self.batch_script_path.open("r") as f:
+            script_content = f.read()
+        for old_path, new_path in script_path_mapper.items():
+            script_content = script_content.replace(old_path, new_path)
+        with self.batch_script_path.open("w") as f:
+            f.write(script_content)
+
+        # Move the batch script to the job log directory
+        self.batch_script_path.rename(
+            main_job_log_dir / f"{self.slurm_job_name}.{self.slurm_job_id}.sbatch"
+        )
+
+        return BatchLaunchResponse(
+            slurm_job_id=self.slurm_job_id,
+            slurm_job_name=self.slurm_job_name,
+            model_names=self.model_names,
+            config=self.params,
+            raw_output=command_output,
+        )
+
+
 class ModelStatusMonitor:
     """Class for handling server status information and monitoring.
 
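The per-model job IDs in `launch()` rely on SLURM heterogeneous jobs, whose components receive consecutive IDs offset from the base job ID; `het_group_id` records each model's offset. A toy illustration of the arithmetic, with a fabricated sbatch output:

```python
# Fabricated sbatch output for a two-model heterogeneous batch job.
command_output = "Submitted batch job 123456"
slurm_job_id = command_output.split(" ")[-1].strip()

models = {"model-a": {"het_group_id": 0}, "model-b": {"het_group_id": 1}}
for name, cfg in models.items():
    # Component job ID = base ID + het group offset, which is how
    # launch() derives each model's log directory and JSON file name.
    model_job_id = int(slurm_job_id) + cfg["het_group_id"]
    print(f"BATCH-model-a-model-b.{model_job_id}/{name}.{model_job_id}.json")
```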
@@ -307,16 +619,17 @@ class ModelStatusMonitor:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to monitor
-    log_dir : str, optional
-        Base directory containing log files
     """
 
-    def __init__(self, slurm_job_id:
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
         self.output = self._get_raw_status_output()
-        self.
+        self.job_status = dict(
+            field.split("=", 1) for field in self.output.split() if "=" in field
+        )
+        self.log_dir = self._get_log_dir()
         self.status_info = self._get_base_status_data()
 
     def _get_raw_status_output(self) -> str:
@@ -334,10 +647,28 @@ class ModelStatusMonitor:
         """
         status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner"
         output, stderr = utils.run_bash_command(status_cmd)
+
         if stderr:
             raise SlurmJobError(f"Error: {stderr}")
         return output
 
+    def _get_log_dir(self) -> str:
+        """Get the log directory for the job.
+
+        Returns
+        -------
+        str
+            Log directory for the job
+        """
+        try:
+            outfile_path = self.job_status["StdOut"]
+            directory = Path(outfile_path).parent
+            return str(directory)
+        except KeyError as err:
+            raise FileNotFoundError(
+                f"Output file not found for job {self.slurm_job_id}"
+            ) from err
+
     def _get_base_status_data(self) -> StatusResponse:
         """Extract basic job status information from scontrol output.
 
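Status fields are now taken from a single parse of `scontrol show job <id> --oneliner` rather than matched individually. A minimal sketch against a fabricated scontrol line:

```python
from pathlib import Path

# Fabricated one-line scontrol output, trimmed to a few fields.
output = (
    "JobId=123456 JobName=my-model JobState=PENDING Reason=Resources "
    "StdOut=/scratch/logs/family/my-model.123456/my-model.123456.out"
)

# Same rule as ModelStatusMonitor: whitespace-separated fields, split
# once on "=" so values may themselves contain "=".
job_status = dict(
    field.split("=", 1) for field in output.split() if "=" in field
)
print(job_status["JobState"])             # PENDING
print(job_status["Reason"])               # Resources
print(Path(job_status["StdOut"]).parent)  # log dir, as in _get_log_dir()
```

Because fields are split on whitespace, a value containing spaces would be fragmented; the sketch mirrors the implementation's assumption that the fields it reads (JobName, JobState, Reason, StdOut) are space-free.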
@@ -347,14 +678,15 @@ class ModelStatusMonitor:
             Basic status information for the job
         """
         try:
-            job_name = self.
-            job_state = self.
-        except
+            job_name = self.job_status["JobName"]
+            job_state = self.job_status["JobState"]
+        except KeyError:
             job_name = "UNAVAILABLE"
             job_state = ModelStatus.UNAVAILABLE
 
         return StatusResponse(
             model_name=job_name,
+            log_dir=self.log_dir,
             server_status=ModelStatus.UNAVAILABLE,
             job_state=job_state,
             raw_output=self.output,
@@ -399,9 +731,9 @@ class ModelStatusMonitor:
     def _process_pending_state(self) -> None:
         """Process PENDING job state and update status information."""
         try:
-            self.status_info.pending_reason = self.
+            self.status_info.pending_reason = self.job_status["Reason"]
             self.status_info.server_status = ModelStatus.PENDING
-        except
+        except KeyError:
             self.status_info.pending_reason = "Unknown pending reason"
 
     def process_model_status(self) -> StatusResponse:
@@ -428,16 +760,16 @@ class PerformanceMetricsCollector:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to collect metrics from
     log_dir : str, optional
         Directory containing log files
     """
 
-    def __init__(self, slurm_job_id:
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
-        self.log_dir = log_dir
         self.status_info = self._get_status_info()
+        self.log_dir = self.status_info.log_dir
         self.metrics_url = self._build_metrics_url()
         self.enabled_prefix_caching = self._check_prefix_caching()
 
@@ -454,7 +786,7 @@ class PerformanceMetricsCollector:
         StatusResponse
             Current status information for the model
         """
-        status_helper = ModelStatusMonitor(self.slurm_job_id
+        status_helper = ModelStatusMonitor(self.slurm_job_id)
         return status_helper.process_model_status()
 
     def _build_metrics_url(self) -> str:
@@ -646,7 +978,7 @@ class ModelRegistry:
                 config=config.model_dump(exclude={"model_name", "venv", "log_dir"}),
             )
            available_models.append(info)
-        return available_models
+        return sorted(available_models, key=lambda x: x.name)
 
     def get_single_model_config(self, model_name: str) -> ModelConfig:
         """Get configuration for a specific model.
@@ -667,7 +999,8 @@ class ModelRegistry:
             If the specified model is not found in configuration
         """
         config = next(
-            (c for c in self.model_configs if c.model_name == model_name),
+            (c for c in self.model_configs if c.model_name == model_name),
+            None,
        )
         if not config:
             raise ModelNotFoundError(f"Model '{model_name}' not found in configuration")
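Putting the pieces together, a batch launch through the new helper might look like the following sketch; the model names, account, and config path are hypothetical:

```python
from vec_inf.client._helper import BatchModelLauncher

# Both models must exist in the (optionally custom) config file,
# otherwise _get_model_configurations() raises ModelConfigurationError.
launcher = BatchModelLauncher(
    ["model-a", "model-b"],
    batch_config="/path/to/batch_models.yaml",
    account="my-slurm-account",
)
response = launcher.launch()  # submits a single heterogeneous sbatch job
print(response.slurm_job_id, response.slurm_job_name)
```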