vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/client/_helper.py CHANGED
@@ -5,10 +5,10 @@ metrics collection, and model registry operations.
  """
 
  import json
- import os
  import time
  import warnings
  from pathlib import Path
+ from shutil import copy2
  from typing import Any, Optional, Union, cast
  from urllib.parse import urlparse, urlunparse
 
@@ -16,8 +16,8 @@ import requests
 
  import vec_inf.client._utils as utils
  from vec_inf.client._client_vars import (
+     BATCH_MODE_REQUIRED_MATCHING_ARGS,
      KEY_METRICS,
-     REQUIRED_FIELDS,
      SRC_DIR,
      VLLM_SHORT_TO_LONG_MAP,
  )
@@ -27,19 +27,19 @@ from vec_inf.client._exceptions import (
      ModelNotFoundError,
      SlurmJobError,
  )
- from vec_inf.client._slurm_script_generator import SlurmScriptGenerator
+ from vec_inf.client._slurm_script_generator import (
+     BatchSlurmScriptGenerator,
+     SlurmScriptGenerator,
+ )
  from vec_inf.client.config import ModelConfig
  from vec_inf.client.models import (
+     BatchLaunchResponse,
      LaunchResponse,
      ModelInfo,
      ModelStatus,
      ModelType,
      StatusResponse,
  )
- from vec_inf.client.slurm_vars import (
-     LD_LIBRARY_PATH,
-     VLLM_NCCL_SO_PATH,
- )
 
 
  class ModelLauncher:
@@ -50,27 +50,18 @@ class ModelLauncher:
 
      Parameters
      ----------
-     model_name : str
+     model_name: str
          Name of the model to launch
-     kwargs : dict[str, Any], optional
+     kwargs: Optional[dict[str, Any]]
          Optional launch keyword arguments to override default configuration
      """
 
      def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]):
-         """Initialize the model launcher.
-
-         Parameters
-         ----------
-         model_name: str
-             Name of the model to launch
-         kwargs: Optional[dict[str, Any]]
-             Optional launch keyword arguments to override default configuration
-         """
          self.model_name = model_name
          self.kwargs = kwargs or {}
         self.slurm_job_id = ""
          self.slurm_script_path = Path("")
-         self.model_config = self._get_model_configuration()
+         self.model_config = self._get_model_configuration(self.kwargs.get("config"))
          self.params = self._get_launch_params()
 
      def _warn(self, message: str) -> None:
@@ -83,9 +74,14 @@ class ModelLauncher:
          """
          warnings.warn(message, UserWarning, stacklevel=2)
 
-     def _get_model_configuration(self) -> ModelConfig:
+     def _get_model_configuration(self, config_path: str | None = None) -> ModelConfig:
          """Load and validate model configuration.
 
+         Parameters
+         ----------
+         config_path : str | None, optional
+             Path to a yaml file with custom model config to use in place of the default
+
          Returns
          -------
          ModelConfig
@@ -98,7 +94,7 @@
          ModelConfigurationError
              If model configuration is not found and weights don't exist
          """
-         model_configs = utils.load_config()
+         model_configs = utils.load_config(config_path=config_path)
          config = next(
              (m for m in model_configs if m.model_name == self.model_name), None
          )
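
Taken together with the constructor change above, the new `config` launch kwarg lets a caller point `utils.load_config` at a custom model registry file instead of the bundled one. A hedged sketch of how that might look from the client side (the model name and YAML path are placeholders, not values shipped with this release):

    # Hypothetical usage of the new "config" launch kwarg.
    from vec_inf.client._helper import ModelLauncher

    launcher = ModelLauncher(
        "Meta-Llama-3.1-8B",                        # placeholder model name
        {"config": "/path/to/custom-models.yaml"},  # passed through to load_config
    )
    response = launcher.launch()
    print(response.slurm_job_id)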
@@ -167,6 +163,38 @@
                  vllm_args[arg.strip()] = True
          return vllm_args
 
+     def _process_env_vars(self, env_arg: str) -> dict[str, str]:
+         """Process the env string into a dictionary of environment variables.
+
+         Parameters
+         ----------
+         env_arg : str
+             String containing comma separated list of environment variable definitions
+             (eg. MY_VAR=1), file paths containing environment variable definitions
+             (separated by newlines), or a combination of both
+             (eg. 'MY_VAR=5,my_env.env')
+
+         Returns
+         -------
+         dict[str, str]
+             Processed environment variables as key-value pairs.
+         """
+         env_vars: dict[str, str] = {}
+         for arg in env_arg.split(","):
+             if "=" in arg:  # Arg is an env var definition
+                 key, value = arg.split("=")
+                 env_vars[key.strip()] = value.strip()
+             else:  # Arg is a path to a file
+                 with open(arg, "r") as file:
+                     lines = [line.rstrip() for line in file]
+                 for line in lines:
+                     if "=" in line:
+                         key, value = line.split("=")
+                         env_vars[key.strip()] = value.strip()
+                     else:
+                         print(f"WARNING: Could not parse env var: {line}")
+         return env_vars
+
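
The env string accepted by `_process_env_vars` mixes inline `KEY=VALUE` definitions with paths to files of such definitions, as the docstring describes; the result feeds the `env` launch parameter handled in the next hunk. A small sketch of the expected parse under assumed file contents:

    # Illustrative only: assuming my_env.env contains the single line "HF_HOME=/scratch/hf",
    # the parser described above would return:
    env_arg = "MY_VAR=5,my_env.env"
    expected = {"MY_VAR": "5", "HF_HOME": "/scratch/hf"}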
      def _get_launch_params(self) -> dict[str, Any]:
          """Prepare launch parameters, set log dir, and validate required fields.
 
@@ -190,14 +218,19 @@
                  params["vllm_args"][key] = value
              del self.kwargs["vllm_args"]
 
+         if self.kwargs.get("env"):
+             env_vars = self._process_env_vars(self.kwargs["env"])
+             for key, value in env_vars.items():
+                 params["env"][key] = str(value)
+             del self.kwargs["env"]
+
          for key, value in self.kwargs.items():
              params[key] = value
 
-         # Validate required fields and vllm args
-         if not REQUIRED_FIELDS.issubset(set(params.keys())):
-             raise MissingRequiredFieldsError(
-                 f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}"
-             )
+         # Check for required fields without default vals, will raise an error if missing
+         utils.check_required_fields(params)
+
+         # Validate resource allocation and parallelization settings
          if (
              int(params["gpus_per_node"]) > 1
              and params["vllm_args"].get("--tensor-parallel-size") is None
@@ -206,6 +239,25 @@
                  "--tensor-parallel-size is required when gpus_per_node > 1"
              )
 
+         total_gpus_requested = int(params["gpus_per_node"]) * int(params["num_nodes"])
+         if not utils.is_power_of_two(total_gpus_requested):
+             raise ValueError("Total number of GPUs requested must be a power of two")
+
+         total_parallel_sizes = int(
+             params["vllm_args"].get("--tensor-parallel-size", "1")
+         ) * int(params["vllm_args"].get("--pipeline-parallel-size", "1"))
+         if total_gpus_requested != total_parallel_sizes:
+             raise ValueError(
+                 "Mismatch between total number of GPUs requested and parallelization settings"
+             )
+
+         # Convert gpus_per_node and resource_type to gres
+         resource_type = params.get("resource_type")
+         if resource_type:
+             params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+         else:
+             params["gres"] = f"gpu:{params['gpus_per_node']}"
+
          # Create log directory
          params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
          params["log_dir"].mkdir(parents=True, exist_ok=True)
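
These checks tie the Slurm allocation to vLLM's parallelism: the total GPU count must be a power of two and must equal tensor-parallel size times pipeline-parallel size, and the allocation is then expressed as a Slurm `gres` string. A worked example with assumed values (not taken from any shipped configuration):

    gpus_per_node, num_nodes = 4, 2                # 8 GPUs total -> power of two, passes
    tensor_parallel, pipeline_parallel = 4, 2      # 4 * 2 == 8 -> matches the allocation
    resource_type = "a100"                         # optional; without it the gres is "gpu:4"
    gres = f"gpu:{resource_type}:{gpus_per_node}"  # "gpu:a100:4"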
@@ -224,17 +276,12 @@
 
          # Convert path to string for JSON serialization
          for field in params:
-             if field == "vllm_args":
+             if field in ["vllm_args", "env"]:
                  continue
              params[field] = str(params[field])
 
          return params
 
-     def _set_env_vars(self) -> None:
-         """Set environment variables for the launch command."""
-         os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH
-         os.environ["VLLM_NCCL_SO_PATH"] = VLLM_NCCL_SO_PATH
-
 
      def _build_launch_command(self) -> str:
          """Generate the slurm script and construct the launch command.
@@ -259,9 +306,6 @@
          SlurmJobError
              If SLURM job submission fails
          """
-         # Set environment variables
-         self._set_env_vars()
-
          # Build and execute the launch command
          command_output, stderr = utils.run_bash_command(self._build_launch_command())
 
@@ -285,20 +329,288 @@
          job_json.touch(exist_ok=True)
 
          self.slurm_script_path.rename(
-             job_log_dir / f"{self.model_name}.{self.slurm_job_id}.slurm"
+             job_log_dir / f"{self.model_name}.{self.slurm_job_id}.sbatch"
          )
 
          with job_json.open("w") as file:
              json.dump(self.params, file, indent=4)
 
          return LaunchResponse(
-             slurm_job_id=int(self.slurm_job_id),
+             slurm_job_id=self.slurm_job_id,
              model_name=self.model_name,
              config=self.params,
              raw_output=command_output,
          )
 
 
+ class BatchModelLauncher:
+     """Helper class for handling batch inference server launch.
+
+     A class that manages the launch process of multiple inference servers, including
+     configuration validation, and SLURM job submission.
+
+     Parameters
+     ----------
+     model_names : list[str]
+         List of model names to launch
+     """
+
+     def __init__(
+         self,
+         model_names: list[str],
+         batch_config: Optional[str] = None,
+         account: Optional[str] = None,
+         work_dir: Optional[str] = None,
+     ):
+         self.model_names = model_names
+         self.batch_config = batch_config
+         self.slurm_job_id = ""
+         self.slurm_job_name = self._get_slurm_job_name()
+         self.batch_script_path = Path("")
+         self.launch_script_paths: list[Path] = []
+         self.model_configs = self._get_model_configurations()
+         self.params = self._get_launch_params(account, work_dir)
+
+     def _get_slurm_job_name(self) -> str:
+         """Get the SLURM job name from the model names.
+
+         Returns
+         -------
+         str
+             SLURM job name
+         """
+         return "BATCH-" + "-".join(self.model_names)
+
+     def _get_model_configurations(self) -> dict[str, ModelConfig]:
+         """Load and validate model configurations.
+
+         Returns
+         -------
+         dict[str, ModelConfig]
+             Dictionary of validated model configurations
+
+         Raises
+         ------
+         ModelNotFoundError
+             If model weights parent directory cannot be determined
+         ModelConfigurationError
+             If model configuration is not found and weights don't exist
+         """
+         model_configs = utils.load_config(self.batch_config)
+
+         model_configs_dict = {}
+         for model_name in self.model_names:
+             config = next(
+                 (m for m in model_configs if m.model_name == model_name), None
+             )
+
+             if config:
+                 model_configs_dict[model_name] = config
+             else:
+                 raise ModelConfigurationError(
+                     f"'{model_name}' not found in configuration, batch launch requires all models to be present in the configuration file"
+                 )
+
+         return model_configs_dict
+
+     def _get_launch_params(
+         self, account: Optional[str] = None, work_dir: Optional[str] = None
+     ) -> dict[str, Any]:
+         """Prepare launch parameters, set log dir, and validate required fields.
+
+         Returns
+         -------
+         dict[str, Any]
+             Dictionary of prepared launch parameters
+
+         Raises
+         ------
+         MissingRequiredFieldsError
+             If required fields are missing or tensor parallel size is not specified
+             when using multiple GPUs
+         """
+         params: dict[str, Any] = {
+             "models": {},
+             "slurm_job_name": self.slurm_job_name,
+             "src_dir": str(SRC_DIR),
+             "account": account,
+             "work_dir": work_dir,
+         }
+
+         # Check for required fields without default vals, will raise an error if missing
+         utils.check_required_fields(params)
+
+         for i, (model_name, config) in enumerate(self.model_configs.items()):
+             params["models"][model_name] = config.model_dump(exclude_none=True)
+             params["models"][model_name]["het_group_id"] = i
+
+             # Validate resource allocation and parallelization settings
+             if (
+                 int(config.gpus_per_node) > 1
+                 and (config.vllm_args or {}).get("--tensor-parallel-size") is None
+             ):
+                 raise MissingRequiredFieldsError(
+                     f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
+                 )
+
+             total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
+             if not utils.is_power_of_two(total_gpus_requested):
+                 raise ValueError(
+                     f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
+                 )
+
+             total_parallel_sizes = int(
+                 (config.vllm_args or {}).get("--tensor-parallel-size", "1")
+             ) * int((config.vllm_args or {}).get("--pipeline-parallel-size", "1"))
+             if total_gpus_requested != total_parallel_sizes:
+                 raise ValueError(
+                     f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
+                 )
+
+             # Convert gpus_per_node and resource_type to gres
+             params["models"][model_name]["gres"] = (
+                 f"gpu:{config.resource_type}:{config.gpus_per_node}"
+             )
+
+             # Create log directory
+             log_dir = Path(
+                 params["models"][model_name]["log_dir"], self.slurm_job_name
+             ).expanduser()
+             log_dir.mkdir(parents=True, exist_ok=True)
+             params["models"][model_name]["log_dir"] = str(log_dir)
+
+             # Convert model_weights_parent_dir to string for JSON serialization
+             params["models"][model_name]["model_weights_parent_dir"] = str(
+                 params["models"][model_name]["model_weights_parent_dir"]
+             )
+
+             # Construct slurm log file paths
+             params["models"][model_name]["out_file"] = (
+                 f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.out"
+             )
+             params["models"][model_name]["err_file"] = (
+                 f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{model_name}.%j.err"
+             )
+             params["models"][model_name]["json_file"] = (
+                 f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"
+             )
+
+             # Create top level log files using the first model's log directory
+             if not params.get("out_file"):
+                 params["out_file"] = (
+                     f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.out"
+                 )
+             if not params.get("err_file"):
+                 params["err_file"] = (
+                     f"{params['models'][model_name]['log_dir']}/{self.slurm_job_name}.%j/{self.slurm_job_name}.%j.err"
+                 )
+
+             # Check if required matching arguments are matched
+             for arg in BATCH_MODE_REQUIRED_MATCHING_ARGS:
+                 if not params.get(arg):
+                     params[arg] = params["models"][model_name][arg]
+                 elif params[arg] != params["models"][model_name][arg]:
+                     # Remove the created directory since we found a mismatch
+                     log_dir.rmdir()
+                     raise ValueError(
+                         f"Mismatch found for {arg}: {params[arg]} != {params['models'][model_name][arg]}, check your configuration"
+                     )
+
+         return params
+
+     def _build_launch_command(self) -> str:
+         """Generate the slurm script and construct the launch command.
+
+         Returns
+         -------
+         str
+             Complete SLURM launch command
+         """
+         batch_script_generator = BatchSlurmScriptGenerator(self.params)
+         self.batch_script_path = batch_script_generator.generate_batch_slurm_script()
+         self.launch_script_paths = batch_script_generator.script_paths
+         return f"sbatch {str(self.batch_script_path)}"
+
+     def launch(self) -> BatchLaunchResponse:
+         """Launch models in batch mode.
+
+         Returns
+         -------
+         BatchLaunchResponse
+             Response object containing launch details and status
+
+         Raises
+         ------
+         SlurmJobError
+             If SLURM job submission fails
+         """
+         # Build and execute the launch command
+         command_output, stderr = utils.run_bash_command(self._build_launch_command())
+
+         if stderr:
+             raise SlurmJobError(f"Error: {stderr}")
+
+         # Extract slurm job id from command output
+         self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n")
+         self.params["slurm_job_id"] = self.slurm_job_id
+
+         # Create log directory and job json file, move slurm script to job log directory
+         main_job_log_dir = Path("")
+
+         for model_name in self.model_names:
+             model_job_id = int(self.slurm_job_id) + int(
+                 self.params["models"][model_name]["het_group_id"]
+             )
+
+             job_log_dir = Path(
+                 self.params["log_dir"], f"{self.slurm_job_name}.{model_job_id}"
+             )
+             job_log_dir.mkdir(parents=True, exist_ok=True)
+
+             if main_job_log_dir == Path(""):
+                 main_job_log_dir = job_log_dir
+
+             job_json = Path(
+                 job_log_dir,
+                 f"{model_name}.{model_job_id}.json",
+             )
+             job_json.touch(exist_ok=True)
+
+             with job_json.open("w") as file:
+                 json.dump(self.params["models"][model_name], file, indent=4)
+
+         # Copy the launch scripts to the job log directory, the original scripts
+         # cannot be deleted otherwise slurm will not be able to find them
+         script_path_mapper = {}
+         for script_path in self.launch_script_paths:
+             old_path = script_path.name
+             file_name = old_path.split("/")[-1]
+             copy2(script_path, main_job_log_dir / file_name)
+             new_path = script_path.name
+             script_path_mapper[old_path] = new_path
+
+         # Replace old launch script paths with new paths in batch slurm script
+         with self.batch_script_path.open("r") as f:
+             script_content = f.read()
+         for old_path, new_path in script_path_mapper.items():
+             script_content = script_content.replace(old_path, new_path)
+         with self.batch_script_path.open("w") as f:
+             f.write(script_content)
+
+         # Move the batch script to the job log directory
+         self.batch_script_path.rename(
+             main_job_log_dir / f"{self.slurm_job_name}.{self.slurm_job_id}.sbatch"
+         )
+
+         return BatchLaunchResponse(
+             slurm_job_id=self.slurm_job_id,
+             slurm_job_name=self.slurm_job_name,
+             model_names=self.model_names,
+             config=self.params,
+             raw_output=command_output,
+         )
+
+
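
The new `BatchModelLauncher` mirrors `ModelLauncher` but submits several models as a single heterogeneous Slurm job, mapping each model to a `het_group_id`. A hedged usage sketch (model names and the batch config path are placeholders; batch launch requires every name to exist in the configuration file):

    # Hypothetical batch launch of two configured models.
    from vec_inf.client._helper import BatchModelLauncher

    batch = BatchModelLauncher(
        model_names=["Meta-Llama-3.1-8B", "Qwen2.5-7B-Instruct"],  # placeholders
        batch_config="/path/to/custom-models.yaml",                # optional
    )
    response = batch.launch()
    # Each model's component runs under slurm_job_id + het_group_id.
    print(response.slurm_job_id, response.model_names)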
 
  class ModelStatusMonitor:
      """Class for handling server status information and monitoring.
@@ -307,16 +619,17 @@ class ModelStatusMonitor:
 
      Parameters
      ----------
-     slurm_job_id : int
+     slurm_job_id : str
          ID of the SLURM job to monitor
-     log_dir : str, optional
-         Base directory containing log files
      """
 
-     def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None):
+     def __init__(self, slurm_job_id: str):
          self.slurm_job_id = slurm_job_id
          self.output = self._get_raw_status_output()
-         self.log_dir = log_dir
+         self.job_status = dict(
+             field.split("=", 1) for field in self.output.split() if "=" in field
+         )
+         self.log_dir = self._get_log_dir()
          self.status_info = self._get_base_status_data()
 
      def _get_raw_status_output(self) -> str:
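
Status parsing now builds a key/value map from the `scontrol show job <id> --oneliner` output instead of indexing whitespace-separated fields by position, which is what the `job_status` comprehension above does. The same rule applied to a made-up, truncated scontrol line:

    # Fabricated output used only to illustrate the parsing rule.
    output = "JobId=123 JobName=Meta-Llama-3.1-8B JobState=RUNNING Reason=None StdOut=/logs/x.123.out"
    job_status = dict(field.split("=", 1) for field in output.split() if "=" in field)
    assert job_status["JobState"] == "RUNNING" and job_status["StdOut"].endswith(".out")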
@@ -334,10 +647,28 @@
          """
          status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner"
          output, stderr = utils.run_bash_command(status_cmd)
+
          if stderr:
              raise SlurmJobError(f"Error: {stderr}")
          return output
 
+     def _get_log_dir(self) -> str:
+         """Get the log directory for the job.
+
+         Returns
+         -------
+         str
+             Log directory for the job
+         """
+         try:
+             outfile_path = self.job_status["StdOut"]
+             directory = Path(outfile_path).parent
+             return str(directory)
+         except KeyError as err:
+             raise FileNotFoundError(
+                 f"Output file not found for job {self.slurm_job_id}"
+             ) from err
+
      def _get_base_status_data(self) -> StatusResponse:
          """Extract basic job status information from scontrol output.
 
@@ -347,14 +678,15 @@
              Basic status information for the job
          """
          try:
-             job_name = self.output.split(" ")[1].split("=")[1]
-             job_state = self.output.split(" ")[9].split("=")[1]
-         except IndexError:
+             job_name = self.job_status["JobName"]
+             job_state = self.job_status["JobState"]
+         except KeyError:
              job_name = "UNAVAILABLE"
              job_state = ModelStatus.UNAVAILABLE
 
          return StatusResponse(
              model_name=job_name,
+             log_dir=self.log_dir,
              server_status=ModelStatus.UNAVAILABLE,
              job_state=job_state,
              raw_output=self.output,
@@ -399,9 +731,9 @@
      def _process_pending_state(self) -> None:
          """Process PENDING job state and update status information."""
          try:
-             self.status_info.pending_reason = self.output.split(" ")[10].split("=")[1]
+             self.status_info.pending_reason = self.job_status["Reason"]
              self.status_info.server_status = ModelStatus.PENDING
-         except IndexError:
+         except KeyError:
              self.status_info.pending_reason = "Unknown pending reason"
 
      def process_model_status(self) -> StatusResponse:
@@ -428,16 +760,16 @@ class PerformanceMetricsCollector:
 
      Parameters
      ----------
-     slurm_job_id : int
+     slurm_job_id : str
          ID of the SLURM job to collect metrics from
      log_dir : str, optional
          Directory containing log files
      """
 
-     def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None):
+     def __init__(self, slurm_job_id: str):
          self.slurm_job_id = slurm_job_id
-         self.log_dir = log_dir
          self.status_info = self._get_status_info()
+         self.log_dir = self.status_info.log_dir
          self.metrics_url = self._build_metrics_url()
          self.enabled_prefix_caching = self._check_prefix_caching()
 
@@ -454,7 +786,7 @@
          StatusResponse
              Current status information for the model
          """
-         status_helper = ModelStatusMonitor(self.slurm_job_id, self.log_dir)
+         status_helper = ModelStatusMonitor(self.slurm_job_id)
          return status_helper.process_model_status()
 
      def _build_metrics_url(self) -> str:
@@ -646,7 +978,7 @@ class ModelRegistry:
                  config=config.model_dump(exclude={"model_name", "venv", "log_dir"}),
              )
              available_models.append(info)
-         return available_models
+         return sorted(available_models, key=lambda x: x.name)
 
      def get_single_model_config(self, model_name: str) -> ModelConfig:
          """Get configuration for a specific model.
@@ -667,7 +999,8 @@
              If the specified model is not found in configuration
          """
          config = next(
-             (c for c in self.model_configs if c.model_name == model_name), None
+             (c for c in self.model_configs if c.model_name == model_name),
+             None,
          )
          if not config:
              raise ModelNotFoundError(f"Model '{model_name}' not found in configuration")