vec-inf 0.6.1-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +191 -34
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +7 -165
- vec_inf/client/_helper.py +386 -40
- vec_inf/client/_slurm_script_generator.py +204 -36
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +86 -0
- vec_inf/client/_utils.py +189 -70
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +40 -19
- vec_inf/client/models.py +44 -4
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +35 -0
- vec_inf/config/models.yaml +102 -274
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/METADATA +43 -73
- vec_inf-0.7.1.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.1.dist-info/RECORD +0 -25
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/licenses/LICENSE +0 -0
vec_inf/client/_helper.py
CHANGED
```diff
@@ -8,6 +8,7 @@ import json
 import time
 import warnings
 from pathlib import Path
+from shutil import copy2
 from typing import Any, Optional, Union, cast
 from urllib.parse import urlparse, urlunparse
 
```
```diff
@@ -15,8 +16,8 @@ import requests
 
 import vec_inf.client._utils as utils
 from vec_inf.client._client_vars import (
+    BATCH_MODE_REQUIRED_MATCHING_ARGS,
     KEY_METRICS,
-    REQUIRED_FIELDS,
     SRC_DIR,
     VLLM_SHORT_TO_LONG_MAP,
 )
```
```diff
@@ -26,9 +27,13 @@ from vec_inf.client._exceptions import (
     ModelNotFoundError,
     SlurmJobError,
 )
-from vec_inf.client._slurm_script_generator import SlurmScriptGenerator
+from vec_inf.client._slurm_script_generator import (
+    BatchSlurmScriptGenerator,
+    SlurmScriptGenerator,
+)
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
+    BatchLaunchResponse,
     LaunchResponse,
     ModelInfo,
     ModelStatus,
```
```diff
@@ -45,27 +50,18 @@ class ModelLauncher:
 
     Parameters
     ----------
-    model_name
+    model_name: str
         Name of the model to launch
-    kwargs
+    kwargs: Optional[dict[str, Any]]
         Optional launch keyword arguments to override default configuration
     """
 
     def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]):
-        """Initialize the model launcher.
-
-        Parameters
-        ----------
-        model_name: str
-            Name of the model to launch
-        kwargs: Optional[dict[str, Any]]
-            Optional launch keyword arguments to override default configuration
-        """
         self.model_name = model_name
         self.kwargs = kwargs or {}
         self.slurm_job_id = ""
         self.slurm_script_path = Path("")
-        self.model_config = self._get_model_configuration()
+        self.model_config = self._get_model_configuration(self.kwargs.get("config"))
         self.params = self._get_launch_params()
 
     def _warn(self, message: str) -> None:
```
```diff
@@ -78,9 +74,14 @@ class ModelLauncher:
         """
         warnings.warn(message, UserWarning, stacklevel=2)
 
-    def _get_model_configuration(self) -> ModelConfig:
+    def _get_model_configuration(self, config_path: str | None = None) -> ModelConfig:
         """Load and validate model configuration.
 
+        Parameters
+        ----------
+        config_path : str | None, optional
+            Path to a yaml file with custom model config to use in place of the default
+
         Returns
         -------
         ModelConfig
```
```diff
@@ -93,7 +94,7 @@ class ModelLauncher:
         ModelConfigurationError
             If model configuration is not found and weights don't exist
         """
-        model_configs = utils.load_config()
+        model_configs = utils.load_config(config_path=config_path)
         config = next(
             (m for m in model_configs if m.model_name == self.model_name), None
         )
```
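The `config` kwarg threaded through `_get_model_configuration` above means a launch can now read model definitions from a user-supplied YAML file instead of the bundled one. A minimal sketch of how that flows through `ModelLauncher` (the model name and config path are hypothetical, and actually running this requires a SLURM cluster with vec-inf installed):

```python
from vec_inf.client._helper import ModelLauncher

# Hypothetical model name and custom config path, for illustration only;
# the "config" kwarg is forwarded to utils.load_config(config_path=...).
launcher = ModelLauncher(
    "Meta-Llama-3.1-8B-Instruct",
    {"config": "/home/user/custom_models.yaml"},
)
response = launcher.launch()  # submits the generated sbatch script
print(response.slurm_job_id)
```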
```diff
@@ -162,6 +163,38 @@ class ModelLauncher:
                 vllm_args[arg.strip()] = True
         return vllm_args
 
+    def _process_env_vars(self, env_arg: str) -> dict[str, str]:
+        """Process the env string into a dictionary of environment variables.
+
+        Parameters
+        ----------
+        env_arg : str
+            String containing comma separated list of environment variable definitions
+            (eg. MY_VAR=1), file paths containing environment variable definitions
+            (separated by newlines), or a combination of both
+            (eg. 'MY_VAR=5,my_env.env')
+
+        Returns
+        -------
+        dict[str, str]
+            Processed environment variables as key-value pairs.
+        """
+        env_vars: dict[str, str] = {}
+        for arg in env_arg.split(","):
+            if "=" in arg:  # Arg is an env var definition
+                key, value = arg.split("=")
+                env_vars[key.strip()] = value.strip()
+            else:  # Arg is a path to a file
+                with open(arg, "r") as file:
+                    lines = [line.rstrip() for line in file]
+                for line in lines:
+                    if "=" in line:
+                        key, value = line.split("=")
+                        env_vars[key.strip()] = value.strip()
+                    else:
+                        print(f"WARNING: Could not parse env var: {line}")
+        return env_vars
+
     def _get_launch_params(self) -> dict[str, Any]:
         """Prepare launch parameters, set log dir, and validate required fields.
 
```
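The new `_process_env_vars` accepts a mix of inline `KEY=VALUE` pairs and paths to env files in one comma-separated string. A self-contained sketch of the same parsing logic (the file and variable values are made up for illustration):

```python
import tempfile

def process_env_vars(env_arg: str) -> dict[str, str]:
    """Mirror of _process_env_vars above: split on commas, treat entries
    containing '=' as inline definitions and everything else as an env file."""
    env_vars: dict[str, str] = {}
    for arg in env_arg.split(","):
        if "=" in arg:
            key, value = arg.split("=")
            env_vars[key.strip()] = value.strip()
        else:
            with open(arg) as file:
                for line in file:
                    if "=" in line:
                        key, value = line.split("=")
                        env_vars[key.strip()] = value.strip()
    return env_vars

# Write a throwaway env file, then parse an inline definition plus the file.
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("HF_HUB_OFFLINE=1\n")
    env_file = f.name

print(process_env_vars(f"MY_VAR=5,{env_file}"))
# {'MY_VAR': '5', 'HF_HUB_OFFLINE': '1'}
```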
```diff
@@ -185,14 +218,19 @@ class ModelLauncher:
                 params["vllm_args"][key] = value
             del self.kwargs["vllm_args"]
 
+        if self.kwargs.get("env"):
+            env_vars = self._process_env_vars(self.kwargs["env"])
+            for key, value in env_vars.items():
+                params["env"][key] = str(value)
+            del self.kwargs["env"]
+
         for key, value in self.kwargs.items():
             params[key] = value
 
-        # Check for required fields, raise an error if any are missing
-        if not REQUIRED_FIELDS.issubset(set(params.keys())):
-            raise MissingRequiredFieldsError(
-                f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}"
-            )
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        # Validate resource allocation and parallelization settings
         if (
             int(params["gpus_per_node"]) > 1
             and params["vllm_args"].get("--tensor-parallel-size") is None
```
```diff
@@ -201,6 +239,25 @@ class ModelLauncher:
                 "--tensor-parallel-size is required when gpus_per_node > 1"
             )
 
+        total_gpus_requested = int(params["gpus_per_node"]) * int(params["num_nodes"])
+        if not utils.is_power_of_two(total_gpus_requested):
+            raise ValueError("Total number of GPUs requested must be a power of two")
+
+        total_parallel_sizes = int(
+            params["vllm_args"].get("--tensor-parallel-size", "1")
+        ) * int(params["vllm_args"].get("--pipeline-parallel-size", "1"))
+        if total_gpus_requested != total_parallel_sizes:
+            raise ValueError(
+                "Mismatch between total number of GPUs requested and parallelization settings"
+            )
+
+        # Convert gpus_per_node and resource_type to gres
+        resource_type = params.get("resource_type")
+        if resource_type:
+            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+        else:
+            params["gres"] = f"gpu:{params['gpus_per_node']}"
+
         # Create log directory
         params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
         params["log_dir"].mkdir(parents=True, exist_ok=True)
```
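The two new checks encode a simple invariant: the GPUs requested from SLURM must equal the GPUs vLLM will actually shard across (tensor parallel size times pipeline parallel size), and the total must be a power of two. A standalone sketch of the arithmetic; the `is_power_of_two` body is an assumed implementation of `utils.is_power_of_two` using the usual bit trick:

```python
def is_power_of_two(n: int) -> bool:
    # Assumed implementation: n > 0 and n has exactly one set bit.
    return n > 0 and (n & (n - 1)) == 0

gpus_per_node, num_nodes = 4, 2
vllm_args = {"--tensor-parallel-size": "4", "--pipeline-parallel-size": "2"}

total_gpus_requested = gpus_per_node * num_nodes  # 8
assert is_power_of_two(total_gpus_requested)

total_parallel_sizes = int(vllm_args.get("--tensor-parallel-size", "1")) * int(
    vllm_args.get("--pipeline-parallel-size", "1")
)  # 4 * 2 = 8
assert total_gpus_requested == total_parallel_sizes  # passes both checks

# The derived gres string, e.g. with an illustrative resource_type of "a100":
print(f"gpu:a100:{gpus_per_node}")  # gpu:a100:4
```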
```diff
@@ -219,7 +276,7 @@ class ModelLauncher:
 
         # Convert path to string for JSON serialization
         for field in params:
-            if field == "vllm_args":
+            if field in ["vllm_args", "env"]:
                 continue
             params[field] = str(params[field])
 
```
```diff
@@ -272,20 +329,288 @@ class ModelLauncher:
         job_json.touch(exist_ok=True)
 
         self.slurm_script_path.rename(
-            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.slurm"
+            job_log_dir / f"{self.model_name}.{self.slurm_job_id}.sbatch"
         )
 
         with job_json.open("w") as file:
             json.dump(self.params, file, indent=4)
 
         return LaunchResponse(
-            slurm_job_id=int(self.slurm_job_id),
+            slurm_job_id=self.slurm_job_id,
             model_name=self.model_name,
             config=self.params,
             raw_output=command_output,
         )
 
 
+class BatchModelLauncher:
+    """Helper class for handling batch inference server launch.
+
+    A class that manages the launch process of multiple inference servers, including
+    configuration validation, and SLURM job submission.
+
+    Parameters
+    ----------
+    model_names : list[str]
+        List of model names to launch
+    """
+
+    def __init__(
+        self,
+        model_names: list[str],
+        batch_config: Optional[str] = None,
+        account: Optional[str] = None,
+        work_dir: Optional[str] = None,
+    ):
+        self.model_names = model_names
+        self.batch_config = batch_config
+        self.slurm_job_id = ""
+        self.slurm_job_name = self._get_slurm_job_name()
+        self.batch_script_path = Path("")
+        self.launch_script_paths: list[Path] = []
+        self.model_configs = self._get_model_configurations()
+        self.params = self._get_launch_params(account, work_dir)
+
+    def _get_slurm_job_name(self) -> str:
+        """Get the SLURM job name from the model names.
+
+        Returns
+        -------
+        str
+            SLURM job name
+        """
+        return "BATCH-" + "-".join(self.model_names)
+
+    def _get_model_configurations(self) -> dict[str, ModelConfig]:
+        """Load and validate model configurations.
+
+        Returns
+        -------
+        dict[str, ModelConfig]
+            Dictionary of validated model configurations
+
+        Raises
+        ------
+        ModelNotFoundError
+            If model weights parent directory cannot be determined
+        ModelConfigurationError
+            If model configuration is not found and weights don't exist
+        """
+        model_configs = utils.load_config(self.batch_config)
+
+        model_configs_dict = {}
+        for model_name in self.model_names:
+            config = next(
+                (m for m in model_configs if m.model_name == model_name), None
+            )
+
+            if config:
+                model_configs_dict[model_name] = config
+            else:
+                raise ModelConfigurationError(
+                    f"'{model_name}' not found in configuration, batch launch requires all models to be present in the configuration file"
+                )
+
+        return model_configs_dict
+
+    def _get_launch_params(
+        self, account: Optional[str] = None, work_dir: Optional[str] = None
+    ) -> dict[str, Any]:
+        """Prepare launch parameters, set log dir, and validate required fields.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary of prepared launch parameters
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If required fields are missing or tensor parallel size is not specified
+            when using multiple GPUs
+        """
+        params: dict[str, Any] = {
+            "models": {},
+            "slurm_job_name": self.slurm_job_name,
+            "src_dir": str(SRC_DIR),
+            "account": account,
+            "work_dir": work_dir,
+        }
+
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        for i, (model_name, config) in enumerate(self.model_configs.items()):
+            params["models"][model_name] = config.model_dump(exclude_none=True)
+            params["models"][model_name]["het_group_id"] = i
+
+            # Validate resource allocation and parallelization settings
+            if (
+                int(config.gpus_per_node) > 1
+                and (config.vllm_args or {}).get("--tensor-parallel-size") is None
+            ):
+                raise MissingRequiredFieldsError(
+                    f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
+                )
+
+            total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
+            if not utils.is_power_of_two(total_gpus_requested):
+                raise ValueError(
+                    f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
+                )
+
+            total_parallel_sizes = int(
+                (config.vllm_args or {}).get("--tensor-parallel-size", "1")
+            ) * int((config.vllm_args or {}).get("--pipeline-parallel-size", "1"))
+            if total_gpus_requested != total_parallel_sizes:
+                raise ValueError(
+                    f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
+                )
+
+            # Convert gpus_per_node and resource_type to gres
+            params["models"][model_name]["gres"] = (
+                f"gpu:{config.resource_type}:{config.gpus_per_node}"
+            )
+
+            # Create log directory
+            log_dir = Path(
+                params["models"][model_name]["log_dir"], self.slurm_job_name
+            ).expanduser()
+            log_dir.mkdir(parents=True, exist_ok=True)
+            params["models"][model_name]["log_dir"] = str(log_dir)
+
+            # Convert model_weights_parent_dir to string for JSON serialization
+            params["models"][model_name]["model_weights_parent_dir"] = str(
+                params["models"][model_name]["model_weights_parent_dir"]
+            )
+
+            # Construct slurm log file paths
+            params["models"][model_name]["out_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.out"
+            )
+            params["models"][model_name]["err_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.err"
+            )
+            params["models"][model_name]["json_file"] = (
+                f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"
+            )
+
+            # Create top level log files using the first model's log directory
+            if not params.get("out_file"):
+                params["out_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.out"
+                )
+            if not params.get("err_file"):
+                params["err_file"] = (
+                    f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.err"
+                )
+
+            # Check if required matching arguments are matched
+            for arg in BATCH_MODE_REQUIRED_MATCHING_ARGS:
+                if not params.get(arg):
+                    params[arg] = params["models"][model_name][arg]
+                elif params[arg] != params["models"][model_name][arg]:
+                    # Remove the created directory since we found a mismatch
+                    log_dir.rmdir()
+                    raise ValueError(
+                        f"Mismatch found for {arg}: {params[arg]} != {params['models'][model_name][arg]}, check your configuration"
+                    )
+
+        return params
+
+    def _build_launch_command(self) -> str:
+        """Generate the slurm script and construct the launch command.
+
+        Returns
+        -------
+        str
+            Complete SLURM launch command
+        """
+        batch_script_generator = BatchSlurmScriptGenerator(self.params)
+        self.batch_script_path = batch_script_generator.generate_batch_slurm_script()
+        self.launch_script_paths = batch_script_generator.script_paths
+        return f"sbatch {str(self.batch_script_path)}"
+
+    def launch(self) -> BatchLaunchResponse:
+        """Launch models in batch mode.
+
+        Returns
+        -------
+        BatchLaunchResponse
+            Response object containing launch details and status
+
+        Raises
+        ------
+        SlurmJobError
+            If SLURM job submission fails
+        """
+        # Build and execute the launch command
+        command_output, stderr = utils.run_bash_command(self._build_launch_command())
+
+        if stderr:
+            raise SlurmJobError(f"Error: {stderr}")
+
+        # Extract slurm job id from command output
+        self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n")
+        self.params["slurm_job_id"] = self.slurm_job_id
+
+        # Create log directory and job json file, move slurm script to job log directory
+        main_job_log_dir = Path("")
+
+        for model_name in self.model_names:
+            model_job_id = int(self.slurm_job_id) + int(
+                self.params["models"][model_name]["het_group_id"]
+            )
+
+            job_log_dir = Path(
+                self.params["log_dir"], f"{self.slurm_job_name}.{model_job_id}"
+            )
+            job_log_dir.mkdir(parents=True, exist_ok=True)
+
+            if main_job_log_dir == Path(""):
+                main_job_log_dir = job_log_dir
+
+            job_json = Path(
+                job_log_dir,
+                f"{model_name}.{model_job_id}.json",
+            )
+            job_json.touch(exist_ok=True)
+
+            with job_json.open("w") as file:
+                json.dump(self.params["models"][model_name], file, indent=4)
+
+        # Copy the launch scripts to the job log directory, the original scripts
+        # cannot be deleted otherwise slurm will not be able to find them
+        script_path_mapper = {}
+        for script_path in self.launch_script_paths:
+            old_path = script_path.name
+            file_name = old_path.split("/")[-1]
+            copy2(script_path, main_job_log_dir / file_name)
+            new_path = script_path.name
+            script_path_mapper[old_path] = new_path
+
+        # Replace old launch script paths with new paths in batch slurm script
+        with self.batch_script_path.open("r") as f:
+            script_content = f.read()
+        for old_path, new_path in script_path_mapper.items():
+            script_content = script_content.replace(old_path, new_path)
+        with self.batch_script_path.open("w") as f:
+            f.write(script_content)
+
+        # Move the batch script to the job log directory
+        self.batch_script_path.rename(
+            main_job_log_dir / f"{self.slurm_job_name}.{self.slurm_job_id}.sbatch"
+        )
+
+        return BatchLaunchResponse(
+            slurm_job_id=self.slurm_job_id,
+            slurm_job_name=self.slurm_job_name,
+            model_names=self.model_names,
+            config=self.params,
+            raw_output=command_output,
+        )
+
+
 class ModelStatusMonitor:
     """Class for handling server status information and monitoring.
 
```
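Batch launch submits all models as one heterogeneous SLURM job, and the `launch` method above relies on SLURM numbering the component jobs consecutively from the base job id. A small illustration of the `het_group_id` bookkeeping (the job id and model names are made up):

```python
slurm_job_id = "12345"  # hypothetical id parsed from the sbatch output
models = {"model-a": {"het_group_id": 0}, "model-b": {"het_group_id": 1}}

# Each het group's component job id is the base id plus its group index.
for model_name, cfg in models.items():
    model_job_id = int(slurm_job_id) + int(cfg["het_group_id"])
    print(f"{model_name} -> {model_job_id}")
# model-a -> 12345
# model-b -> 12346
```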
```diff
@@ -294,16 +619,17 @@ class ModelStatusMonitor:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to monitor
-    log_dir : str, optional
-        Base directory containing log files
     """
 
-    def __init__(self, slurm_job_id: str, log_dir: Optional[str] = None):
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
         self.output = self._get_raw_status_output()
-        self.log_dir = log_dir
+        self.job_status = dict(
+            field.split("=", 1) for field in self.output.split() if "=" in field
+        )
+        self.log_dir = self._get_log_dir()
         self.status_info = self._get_base_status_data()
 
     def _get_raw_status_output(self) -> str:
```
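The monitor now parses the whole `scontrol show job --oneliner` record into a dictionary up front instead of slicing fields positionally. A quick demonstration on a fabricated record (real scontrol output carries many more fields):

```python
# Fabricated scontrol "--oneliner" output, for illustration only.
record = (
    "JobId=12345 JobName=Meta-Llama-3.1-8B JobState=PENDING "
    "Reason=Resources StdOut=/scratch/logs/llama.12345.out"
)
# Split on whitespace; each KEY=VALUE field becomes one dict entry.
job_status = dict(field.split("=", 1) for field in record.split() if "=" in field)

print(job_status["JobState"])  # PENDING
print(job_status["Reason"])    # Resources
```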
```diff
@@ -321,10 +647,28 @@ class ModelStatusMonitor:
         """
         status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner"
         output, stderr = utils.run_bash_command(status_cmd)
+
         if stderr:
             raise SlurmJobError(f"Error: {stderr}")
         return output
 
+    def _get_log_dir(self) -> str:
+        """Get the log directory for the job.
+
+        Returns
+        -------
+        str
+            Log directory for the job
+        """
+        try:
+            outfile_path = self.job_status["StdOut"]
+            directory = Path(outfile_path).parent
+            return str(directory)
+        except KeyError as err:
+            raise FileNotFoundError(
+                f"Output file not found for job {self.slurm_job_id}"
+            ) from err
+
     def _get_base_status_data(self) -> StatusResponse:
         """Extract basic job status information from scontrol output.
 
```
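`_get_log_dir` recovers the job's log directory from the `StdOut` path that scontrol reports, which is what lets the old `log_dir` constructor argument go away. A sketch with an illustrative path:

```python
from pathlib import Path

# The StdOut value is illustrative; in practice it comes from job_status above.
job_status = {"StdOut": "/scratch/logs/Meta-Llama-3.1-8B.12345/llama.12345.out"}
log_dir = str(Path(job_status["StdOut"]).parent)
print(log_dir)  # /scratch/logs/Meta-Llama-3.1-8B.12345
```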
```diff
@@ -334,14 +678,15 @@ class ModelStatusMonitor:
             Basic status information for the job
         """
         try:
-            job_name = self.output.split(" ")[1].split("=")[1]
-            job_state = self.output.split(" ")[9].split("=")[1]
-        except IndexError:
+            job_name = self.job_status["JobName"]
+            job_state = self.job_status["JobState"]
+        except KeyError:
             job_name = "UNAVAILABLE"
             job_state = ModelStatus.UNAVAILABLE
 
         return StatusResponse(
             model_name=job_name,
+            log_dir=self.log_dir,
             server_status=ModelStatus.UNAVAILABLE,
             job_state=job_state,
             raw_output=self.output,
```
```diff
@@ -386,9 +731,9 @@ class ModelStatusMonitor:
     def _process_pending_state(self) -> None:
         """Process PENDING job state and update status information."""
         try:
-            self.status_info.pending_reason = self.output.split(" ")[10].split("=")[1]
+            self.status_info.pending_reason = self.job_status["Reason"]
             self.status_info.server_status = ModelStatus.PENDING
-        except IndexError:
+        except KeyError:
             self.status_info.pending_reason = "Unknown pending reason"
 
     def process_model_status(self) -> StatusResponse:
```
```diff
@@ -415,16 +760,16 @@ class PerformanceMetricsCollector:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the SLURM job to collect metrics from
     log_dir : str, optional
         Directory containing log files
     """
 
-    def __init__(self, slurm_job_id: str, log_dir: Optional[str] = None):
+    def __init__(self, slurm_job_id: str):
         self.slurm_job_id = slurm_job_id
-        self.log_dir = log_dir
         self.status_info = self._get_status_info()
+        self.log_dir = self.status_info.log_dir
         self.metrics_url = self._build_metrics_url()
         self.enabled_prefix_caching = self._check_prefix_caching()
 
```
```diff
@@ -441,7 +786,7 @@ class PerformanceMetricsCollector:
         StatusResponse
             Current status information for the model
         """
-        status_helper = ModelStatusMonitor(self.slurm_job_id, self.log_dir)
+        status_helper = ModelStatusMonitor(self.slurm_job_id)
         return status_helper.process_model_status()
 
     def _build_metrics_url(self) -> str:
```
```diff
@@ -633,7 +978,7 @@ class ModelRegistry:
                 config=config.model_dump(exclude={"model_name", "venv", "log_dir"}),
             )
             available_models.append(info)
-        return available_models
+        return sorted(available_models, key=lambda x: x.name)
 
     def get_single_model_config(self, model_name: str) -> ModelConfig:
         """Get configuration for a specific model.
```
```diff
@@ -654,7 +999,8 @@ class ModelRegistry:
             If the specified model is not found in configuration
         """
         config = next(
-            (c for c in self.model_configs if c.model_name == model_name), None
+            (c for c in self.model_configs if c.model_name == model_name),
+            None,
         )
         if not config:
             raise ModelNotFoundError(f"Model '{model_name}' not found in configuration")
```