nemo-evaluator-launcher 0.1.56__py3-none-any.whl → 0.1.67__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -18,7 +18,7 @@
  This module defines data structures and helpers for configuration and type safety in the API layer.
  """

- import os
+ import pathlib
  import warnings
  from dataclasses import dataclass
  from typing import cast
@@ -42,33 +42,40 @@ from nemo_evaluator_launcher.common.logging_utils import logger
  class RunConfig(DictConfig):
  @staticmethod
  def from_hydra(
- config_name: str = "default",
- config_dir: str | None = None,
- hydra_overrides: list[str] = [],
- dict_overrides: dict = {},
+ config: str | None = None,
+ hydra_overrides: list[str] | None = None,
+ dict_overrides: dict | None = None,
  ) -> "RunConfig":
  """Load configuration from Hydra and merge with dictionary overrides.

  Args:
- config_name: Name of the Hydra configuration to load.
+ config: Optional full path to a config file (e.g. /path/to/my_config.yaml).
+ If omitted, loads the internal default config from
+ `nemo_evaluator_launcher.configs`.
  hydra_overrides: List of Hydra command-line style overrides.
  dict_overrides: Dictionary of configuration overrides to merge.
- config_dir: Optional path to user config directory. If provided, Hydra will
- search in this directory first, then fall back to internal configs.

  Returns:
  RunConfig: Merged configuration object.
  """
- overrides = hydra_overrides.copy()
+ overrides = list(hydra_overrides or [])
+ dict_overrides = dict_overrides or {}
+
+ resolved_config_path: str | None = None
+ config_name = "default"
+
  # Check if a GlobalHydra instance is already initialized and clear it
  if GlobalHydra.instance().is_initialized():
  GlobalHydra.instance().clear()

- if config_dir:
- # Convert relative path to absolute path if needed
- if not os.path.isabs(config_dir):
- config_dir = os.path.abspath(config_dir)
+ if config:
+ config_path = pathlib.Path(config).expanduser()
+ if not config_path.is_absolute():
+ config_path = (pathlib.Path.cwd() / config_path).resolve()
+ resolved_config_path = str(config_path)

+ config_dir = str(config_path.parent)
+ config_name = str(config_path.stem)
  hydra.initialize_config_dir(
  config_dir=config_dir,
  version_base=None,
@@ -90,7 +97,7 @@ class RunConfig(DictConfig):
  logger.debug(
  "Loaded run config from hydra",
  config_name=config_name,
- config_dir=config_dir,
+ config=resolved_config_path,
  overrides=hydra_overrides,
  dict_overrides=dict_overrides,
  result=cfg,
@@ -27,7 +27,6 @@ from nemo_evaluator_launcher.common.printing_utils import (
  green,
  magenta,
  red,
- yellow,
  )


@@ -42,20 +41,6 @@ class Cmd:
  "help": "Full path to config file. Uses Hydra by default (--config-mode=hydra). Use --config-mode=raw to load directly (bypasses Hydra)."
  },
  )
- config_name: str = field(
- default="default",
- alias=["-c", "--config-name"],
- metadata={
- "help": "Config name to use. Consult `nemo_evaluator_launcher.configs`"
- },
- )
- config_dir: str | None = field(
- default=None,
- alias=["-d", "--config-dir"],
- metadata={
- "help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
- },
- )
  config_mode: Literal["hydra", "raw"] = field(
  default="hydra",
  alias=["--config-mode"],
@@ -138,14 +123,6 @@ class Cmd:
  # Load configuration either from Hydra or directly from a config file
  if self.config_mode == "raw" and self.config:
  # Validate that raw config loading is not used with other config options
- if self.config_name != "default":
- raise ValueError(
- "Cannot use --config-mode=raw with --config-name. Raw mode only works with --config."
- )
- if self.config_dir is not None:
- raise ValueError(
- "Cannot use --config-mode=raw with --config-dir. Raw mode only works with --config."
- )
  if self.override:
  raise ValueError(
  "Cannot use --config-mode=raw with --override. Raw mode only works with --config."
@@ -158,23 +135,9 @@ class Cmd:
  # Create RunConfig from the loaded data
  config = OmegaConf.create(config_dict)
  else:
- # Handle --config parameter: split path into config_dir and config_name for Hydra
- if self.config:
- if self.config_name != "default":
- raise ValueError("Cannot use --config with --config-name")
- if self.config_dir is not None:
- raise ValueError("Cannot use --config with --config-dir")
- config_path = pathlib.Path(self.config)
- config_dir = str(config_path.parent)
- config_name = str(config_path.stem)
- else:
- config_dir = self.config_dir
- config_name = self.config_name
-
  # Load the complete Hydra configuration
  config = RunConfig.from_hydra(
- config_dir=config_dir,
- config_name=config_name,
+ config=self.config,
  hydra_overrides=self.override,
  )

@@ -283,16 +246,4 @@ class Cmd:
  )
  )

- # Warn if both config_dir and config_name are provided (and config_name is not default)
- if (
- self.config is None
- and self.config_dir is not None
- and self.config_name != "default"
- ):
- joint_path = pathlib.Path(self.config_dir) / f"{self.config_name}.yaml"
- print(
- yellow(
- f"Warning: Using --config-dir and --config-name together is deprecated. "
- f"Please use --config {joint_path} instead."
- )
- )
+ # Done.
@@ -16,8 +16,6 @@
  import pathlib
  from typing import Any

- import yaml
-
  from nemo_evaluator_launcher.common.container_metadata import (
  TaskIntermediateRepresentation,
  load_tasks_from_tasks_file,
@@ -25,201 +23,6 @@ from nemo_evaluator_launcher.common.container_metadata import (
  from nemo_evaluator_launcher.common.logging_utils import logger


- def _load_packaged_resource(*_args: Any, **_kwargs: Any) -> dict[str, Any]:
- """Deprecated: mapping.toml support was removed in favor of packaged IRs."""
- raise RuntimeError(
- "mapping.toml is no longer supported. Use packaged IRs (all_tasks_irs.yaml) instead."
- )
-
-
- def _process_mapping(mapping_toml: dict) -> dict:
- """Process the raw mapping TOML into the expected format.
-
- Args:
- mapping_toml: Raw mapping TOML data.
- Returns:
- dict: Processed mapping in the expected format.
- """
- mapping = {}
- for harness_name, harness_data in mapping_toml.items():
- # Skip entries that don't have the expected structure
- if not isinstance(harness_data, dict):
- logger.warning(
- "Skipping invalid harness entry",
- harness_name=harness_name,
- reason="harness_data is not a dict",
- )
- continue
-
- # Check if tasks field exists
- if "tasks" not in harness_data:
- logger.warning(
- "Skipping harness entry without tasks",
- harness_name=harness_name,
- )
- continue
-
- if not isinstance(harness_data["tasks"], dict):
- logger.warning(
- "Skipping invalid harness entry",
- harness_name=harness_name,
- reason="tasks is not a dict",
- )
- continue
-
- # Get container, which may be optional
- container = harness_data.get("container")
- if not container:
- logger.debug(
- "Harness entry without container",
- harness_name=harness_name,
- )
-
- for endpoint_type, harness_tasks in harness_data["tasks"].items():
- if not isinstance(harness_tasks, dict):
- logger.warning(
- "Skipping invalid endpoint type",
- harness_name=harness_name,
- endpoint_type=endpoint_type,
- reason="harness_tasks is not a dict",
- )
- continue
-
- for task_name, task_data in harness_tasks.items():
- if not isinstance(task_data, dict):
- logger.warning(
- "Skipping invalid task entry",
- harness_name=harness_name,
- task_name=task_name,
- reason="task_data is not a dict",
- )
- continue
-
- key = (harness_name, task_name)
- if key in mapping:
- raise KeyError(
- f"(harness,task)-tuple key {repr(key)} already exists in the mapping"
- )
-
- # Validate required fields exist in task_data
- # task_name and harness_name are already validated above
- # endpoint_type is validated as a key in harness_tasks
- # task_data must be a dict (validated above)
-
- mapping[key] = {
- "task": task_name,
- "harness": harness_name,
- "endpoint_type": endpoint_type,
- }
- # Only add container if it exists
- if container:
- mapping[key]["container"] = container
-
- # Validate task_data keys before updating
- for task_data_key in task_data.keys():
- if task_data_key in mapping[key]:
- raise KeyError(
- f"{repr(task_data_key)} is not allowed as key under {repr(key)} in the mapping"
- )
- # Validate that task_data values are valid types (basic check)
- if task_data_key not in ("description", "type") and not isinstance(
- task_data[task_data_key],
- (str, int, float, bool, dict, list, type(None)),
- ):
- logger.warning(
- "Unexpected value type in task_data",
- harness_name=harness_name,
- task_name=task_name,
- key=task_data_key,
- value_type=type(task_data[task_data_key]).__name__,
- )
-
- mapping[key].update(task_data)
- return mapping
-
-
- def _extract_tasks_from_framework_yml(
- framework_yml_content: str, harness_name: str, container: str
- ) -> dict[tuple[str, str], dict]:
- """Extract tasks from framework.yml content and return as mapping entries.
-
- Args:
- framework_yml_content: YAML content from framework.yml file
- harness_name: Name of the harness
- container: Container image string
-
- Returns:
- Dictionary mapping (harness_name, task_name) to task configuration
- """
- tasks = {}
- try:
- framework_data = yaml.safe_load(framework_yml_content)
- if not framework_data or "evaluations" not in framework_data:
- logger.warning(
- "No evaluations found in framework.yml",
- harness=harness_name,
- container=container,
- )
- return tasks
-
- evaluations = framework_data.get("evaluations", [])
- for eval_config in evaluations:
- task_name = eval_config.get("name")
- description = eval_config.get("description", "")
-
- if not task_name:
- continue
-
- # Extract endpoint types from the evaluation config
- defaults = eval_config.get("defaults", {})
- config = defaults.get("config", {})
- supported_endpoint_types = config.get("supported_endpoint_types", ["chat"])
- task_type = config.get("type", "") # Extract type from defaults.config.type
-
- # Use first endpoint type (mapping key is (harness, task), so one entry per task)
- endpoint_type = (
- supported_endpoint_types[0] if supported_endpoint_types else "chat"
- )
-
- key = (harness_name, task_name)
- # Only add if not already in mapping (don't override existing entries)
- if key not in tasks:
- tasks[key] = {
- "task": task_name,
- "harness": harness_name,
- "container": container,
- "endpoint_type": endpoint_type,
- "description": description,
- "type": task_type, # Store type from defaults.config.type
- }
- # Merge any additional config from defaults
- if defaults:
- tasks[key].update(defaults)
-
- logger.info(
- "Extracted tasks from framework.yml",
- harness=harness_name,
- container=container,
- num_tasks=len(tasks),
- )
- except yaml.YAMLError as e:
- logger.warning(
- "Failed to parse framework.yml",
- harness=harness_name,
- container=container,
- error=str(e),
- )
- except Exception as e:
- logger.warning(
- "Error extracting tasks from framework.yml",
- harness=harness_name,
- container=container,
- error=str(e),
- )
-
- return tasks
-
-
  def _convert_irs_to_mapping_format(
  tasks: list[TaskIntermediateRepresentation],
  ) -> dict[tuple[str, str], dict]:
@@ -18,13 +18,15 @@ image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
  served_model_name: ???
  port: 8000

+ command: /opt/nim/start_server.sh
+
  # NIM containers use default entrypoint - no custom command needed
  # Configuration is done via environment variables in lepton_config

  endpoints:
  chat: /v1/chat/completions
  completions: /v1/completions
- health: /health
+ health: /v1/health/ready
  # Note: Environment variables should be configured in lepton_config.envs
  # Auto-derived environment variables from deployment config:
  # - SERVED_MODEL_NAME (from served_model_name)
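With this change the NIM deployment config declares an explicit start command and points the health check at NIM's versioned readiness route instead of the generic /health path. A quick way to probe the new endpoint against a NIM serving on the port configured above (a standard-library sketch for illustration only, not launcher code; host and timeout are arbitrary):

    import urllib.request

    # Expect HTTP 200 once the NIM server reports ready.
    with urllib.request.urlopen("http://localhost:8000/v1/health/ready", timeout=5) as resp:
        print(resp.status)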
@@ -408,10 +408,10 @@ class SlurmExecutor(BaseExecutor):
  )
  statuses = []
  for i, slurm_job_id in enumerate(slurm_job_ids):
- slurm_status = slurm_jobs_status[slurm_job_id]
+ slurm_status = slurm_jobs_status[slurm_job_id][0]
  if slurm_job_id in latest_slurm_job_ids:
  latest_slurm_job_id = latest_slurm_job_ids[slurm_job_id]
- slurm_status = latest_slurm_jobs_status[latest_slurm_job_id]
+ slurm_status = latest_slurm_jobs_status[latest_slurm_job_id][0]
  progress = progress_list[i]
  progress = progress if progress is not None else "unknown"
  execution_state = SlurmExecutor._map_slurm_state_to_execution_state(
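This indexing change follows from the new return shape of the status-query helpers further down: each dict now maps a submitted SLURM job ID to a (state, current_job_id) tuple instead of a bare state string, so callers read element [0] for the state. A purely illustrative shape (IDs and states invented for the example):

    # original job ID -> (SLURM state, job ID currently carrying the work)
    slurm_jobs_status = {
        "123456": ("COMPLETED", "123456"),  # finished under its original ID
        "123460": ("PENDING", "124001"),    # an autoresume follow-up job took over
    }
    state = slurm_jobs_status["123460"][0]  # "PENDING"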
@@ -644,7 +644,7 @@ def _create_slurm_sbatch_script(
  s += deployment_srun_cmd

  # wait for the server to initialize
- health_path = cfg.deployment.get("health_check_path", "/health")
+ health_path = cfg.deployment.endpoints.get("health", "/health")
  # For multi-instance check all node IPs, for single instance check localhost
  if cfg.deployment.get("multiple_instances", False):
  ip_list = '"${NODES_IPS_ARRAY[@]}"'
@@ -710,7 +710,7 @@ def _create_slurm_sbatch_script(

  s += "# evaluation client\n"
  s += "srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 " # Client always runs on single node
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += "--container-image {} ".format(eval_image)
  evaluation_env_var_names = list(
  cfg.execution.get("env_vars", {}).get("evaluation", {})
@@ -835,7 +835,7 @@ def _generate_auto_export_section(

  s += " # export\n"
  s += " srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 " # Client always runs on single node
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += "--container-image {} ".format(export_image)
  if export_env:
  s += "--container-env {} ".format(",".join(export_env))
@@ -1009,8 +1009,55 @@ def _query_slurm_jobs_status(
  username: str,
  hostname: str,
  socket: str | None,
- ) -> Dict[str, str]:
- """Query SLURM for job statuses using sacct command.
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for job statuses using squeue (for active jobs) and sacct (fallback).
+
+ This function first tries squeue which is more accurate for currently running jobs,
+ then falls back to sacct for completed/historical jobs that squeue doesn't show.
+ It also finds follow-up jobs (from autoresume) that depend on our known jobs.
+
+ Args:
+ slurm_job_ids: List of SLURM job IDs to query.
+ username: SSH username.
+ hostname: SSH hostname.
+ socket: control socket location or None
+
+ Returns:
+ Dict mapping from slurm_job_id to tuple of status, current_job_id.
+ """
+ if len(slurm_job_ids) == 0:
+ return {}
+
+ # First, try squeue for active jobs (more accurate for running jobs)
+ squeue_statuses = _query_squeue_for_jobs(slurm_job_ids, username, hostname, socket)
+
+ # For jobs not found in squeue, fall back to sacct
+ missing_jobs = [job_id for job_id in slurm_job_ids if job_id not in squeue_statuses]
+ sacct_statuses = {}
+
+ if missing_jobs:
+ sacct_statuses = _query_sacct_for_jobs(missing_jobs, username, hostname, socket)
+
+ # Combine results, preferring squeue data
+ combined_statuses = {**sacct_statuses, **squeue_statuses}
+
+ return combined_statuses
+
+
+ def _query_squeue_for_jobs(
+ slurm_job_ids: List[str],
+ username: str,
+ hostname: str,
+ socket: str | None,
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for active job statuses using squeue command.
+
+ This function finds:
+ 1. Jobs that directly match our known job IDs
+ 2. Follow-up jobs that depend on our known job IDs (from autoresume mechanism)
+
+ For follow-up jobs, returns the status mapped to the original job ID, along with
+ the actual current SLURM job ID.

  Args:
  slurm_job_ids: List of SLURM job IDs to query.
@@ -1019,10 +1066,77 @@ def _query_slurm_jobs_status(
  socket: control socket location or None

  Returns:
- Dict mapping from slurm_job_id to returned slurm status.
+ Dict mapping from original slurm_job_id to tuple of status, current_job_id.
  """
  if len(slurm_job_ids) == 0:
  return {}
+
+ # Use squeue to get active jobs - more accurate than sacct for running jobs
+ squeue_command = "squeue -u {} -h -o '%i|%T|%E'".format(username)
+
+ ssh_command = ["ssh"]
+ if socket is not None:
+ ssh_command.append(f"-S {socket}")
+ ssh_command.append(f"{username}@{hostname}")
+ ssh_command.append(squeue_command)
+ ssh_command = " ".join(ssh_command)
+
+ completed_process = subprocess.run(
+ args=shlex.split(ssh_command),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ squeue_statuses = {}
+ dependent_jobs = []
+ if completed_process.returncode == 0:
+ squeue_output = completed_process.stdout.decode("utf-8")
+ squeue_output_lines = squeue_output.strip().split("\n")
+
+ for line in squeue_output_lines:
+ if not line.strip():
+ continue
+ parts = line.split("|")
+ if len(parts) >= 3:
+ job_id = parts[0].strip()
+ status = parts[1].strip()
+ dependency = parts[2].strip()
+ # Extract base job ID (handle array jobs like 123456_0 -> 123456)
+ base_job_id = job_id.split("_")[0].split("[")[0]
+ if base_job_id in slurm_job_ids:
+ squeue_statuses[base_job_id] = status, base_job_id
+ elif dependency and dependency != "(null)":
+ dependent_jobs.append((base_job_id, status, dependency))
+
+ for dep_job_id, dep_status, dependency in dependent_jobs:
+ for known_job_id in slurm_job_ids:
+ if known_job_id in dependency and known_job_id not in squeue_statuses:
+ squeue_statuses[known_job_id] = dep_status, dep_job_id
+ break
+
+ return squeue_statuses
+
+
+ def _query_sacct_for_jobs(
+ slurm_job_ids: List[str],
+ username: str,
+ hostname: str,
+ socket: str | None,
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for job statuses using sacct command (for completed/historical jobs).
+
+ Args:
+ slurm_job_ids: List of SLURM job IDs to query.
+ username: SSH username.
+ hostname: SSH hostname.
+ socket: control socket location or None
+
+ Returns:
+ Dict mapping from slurm_job_id to tuple of status, job_id.
+ """
+ if len(slurm_job_ids) == 0:
+ return {}
+
  sacct_command = "sacct -j {} --format='JobID,State%32' --noheader -P".format(
  ",".join(slurm_job_ids)
  )
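To make the new squeue branch concrete: the %i|%T|%E output format yields one job_id|state|dependency line per job, and an autoresume follow-up job is attributed back to the original job because the original ID appears in the follow-up's dependency expression. A self-contained sketch of that attribution logic under invented sample input (the real code obtains these lines by running squeue over SSH, as in the hunk above):

    def map_squeue_lines(lines: list[str], known_job_ids: list[str]) -> dict[str, tuple[str, str]]:
        """Map each known job ID to (state, job ID currently carrying the work)."""
        statuses: dict[str, tuple[str, str]] = {}
        dependents: list[tuple[str, str, str]] = []
        for line in lines:
            parts = [p.strip() for p in line.split("|")]
            if len(parts) < 3:
                continue
            job_id, state, dependency = parts[0], parts[1], parts[2]
            base_id = job_id.split("_")[0].split("[")[0]  # normalize array jobs like 123456_0
            if base_id in known_job_ids:
                statuses[base_id] = (state, base_id)
            elif dependency and dependency != "(null)":
                dependents.append((base_id, state, dependency))
        for dep_id, dep_state, dependency in dependents:
            for known in known_job_ids:
                if known in dependency and known not in statuses:
                    statuses[known] = (dep_state, dep_id)  # report follow-up under the original ID
                    break
        return statuses

    # Example: 124001 was resubmitted with a dependency on the already-finished 123456.
    print(map_squeue_lines(["124001|PENDING|afterany:123456"], ["123456"]))
    # -> {'123456': ('PENDING', '124001')}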
@@ -1049,7 +1163,7 @@ def _query_slurm_jobs_status(
  slurm_jobs_status = {}
  for slurm_job_id in slurm_job_ids:
  slurm_job_status = _parse_slurm_job_status(slurm_job_id, sacct_output_lines)
- slurm_jobs_status[slurm_job_id] = slurm_job_status
+ slurm_jobs_status[slurm_job_id] = slurm_job_status, slurm_job_id
  return slurm_jobs_status


@@ -1264,9 +1378,11 @@ def _generate_haproxy_config_with_placeholders(cfg):
  for i in range(num_nodes):
  nodes.append({"ip": f"{{IP_{i}}}", "port": cfg.deployment.port})

- # Get health check parameters from execution config
+ # Get health check parameters - prefer proxy config, fallback to deployment.endpoints.health
  proxy_config = cfg.execution.get("proxy", {}).get("config", {})
- health_check_path = proxy_config.get("health_check_path", "/health")
+ health_check_path = proxy_config.get(
+ "health_check_path", cfg.deployment.endpoints.get("health", "/health")
+ )
  health_check_status = proxy_config.get("health_check_status", 200)
  haproxy_port = proxy_config.get("haproxy_port", 5009)

@@ -1301,7 +1417,7 @@ def _generate_haproxy_config(cfg, nodes_ips):
  )

  # Get health check parameters from deployment config
- health_check_path = cfg.deployment.get("health_check_path", "/health")
+ health_check_path = cfg.deployment.endpoints.get("health", "/health")
  health_check_status = cfg.deployment.get("health_check_status", 200)
  haproxy_port = cfg.deployment.get("haproxy_port", 5009)

@@ -1461,7 +1577,7 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
  s += "done\n"
  s += "\n"
  s += "srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 "
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
  s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
  s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
@@ -16,7 +16,7 @@
  # Below is the _next_ version that will be published, not the currently published one.
  MAJOR = 0
  MINOR = 1
- PATCH = 56
+ PATCH = 67
  PRE_RELEASE = ""

  # Use the following formatting: (major, minor, patch, pre-release)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nemo-evaluator-launcher
- Version: 0.1.56
+ Version: 0.1.67
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
  Author: NVIDIA
  Author-email: nemo-toolkit@nvidia.com
@@ -1,8 +1,8 @@
  nemo_evaluator_launcher/__init__.py,sha256=GT38zGwbvBOSeU52WCRx-n9N49LvLGEV1PItgKC8orA,2320
- nemo_evaluator_launcher/package_info.py,sha256=zHKOOOIbhAWOrU7pN-acMaSYwI2zdqC5syv_zy4x21A,1586
+ nemo_evaluator_launcher/package_info.py,sha256=HOfV93vn4SKitHYaUmZ6OXBBSz64WCDVCQPm6oliO6A,1586
  nemo_evaluator_launcher/api/__init__.py,sha256=U9q_MJK2vRsFaymanhyy0nD1SNAZQZC8oY45RXPX7ac,1024
  nemo_evaluator_launcher/api/functional.py,sha256=di5az6OkMSGBr5bWTT6JLgtqNBU8-fwAb_eZETyVulI,33641
- nemo_evaluator_launcher/api/types.py,sha256=W7ZQ9ZTPR6YxInxxsKE6NxuuQAg4pVYz6SRmFCFxY0A,3635
+ nemo_evaluator_launcher/api/types.py,sha256=OVAI_mHYTqGc6IlXmCLRcQ1rbgwG7Oe4qpnbM0kCrfc,3884
  nemo_evaluator_launcher/api/utils.py,sha256=q5HArRj7PKgBfeH3bOX8q1U97yMyQQp72yRRA5JP9PE,818
  nemo_evaluator_launcher/cli/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
  nemo_evaluator_launcher/cli/export.py,sha256=GRXxusKDq_1qjMKN6MKOIjZ8x4u5ERgXwHSAGrvsGCY,11211
@@ -13,14 +13,14 @@ nemo_evaluator_launcher/cli/ls_runs.py,sha256=vJTwRdhVKLolnJuP8AnnQdJBE-BKZfCcCy
  nemo_evaluator_launcher/cli/ls_task.py,sha256=sChh9UQJK0Zh1g3RitVtm8o_6qJQuS75JGrHXf3Y0y4,10954
  nemo_evaluator_launcher/cli/ls_tasks.py,sha256=aMzuAiUUeTCeSN3XCtoneOm-BBSnOVfhgIzgxZ8Y8wI,11167
  nemo_evaluator_launcher/cli/main.py,sha256=3BwJXuB9WHmiTKu4FR2dyrVonwpo8YWcPfw3MSmqOu8,8148
- nemo_evaluator_launcher/cli/run.py,sha256=KGBIXiDfT5OUv09FaTl-GyOxDhjxWEIE1rGlUkccB2o,11336
+ nemo_evaluator_launcher/cli/run.py,sha256=2ITc5ZIdwzmjZsQct7D0_SF8UccOb2xqzBQscj1vWEg,9210
  nemo_evaluator_launcher/cli/status.py,sha256=ANdu0JYnfKNvd1gXmdu_0FrbPG-g0A_R4leOuNXzenQ,5947
  nemo_evaluator_launcher/cli/version.py,sha256=GgMNTAd4S0bu3t-uVfVedAf7p6pymWDDwOaNm4WHOxQ,1998
  nemo_evaluator_launcher/common/__init__.py,sha256=6-xb4KpG8-lZbWBI42c_Gax-Sq0kMSW8UG0Vn8dOBlo,744
  nemo_evaluator_launcher/common/execdb.py,sha256=WPzg5Iu2ojvFpBuYahSt3voP_iEUpoO8NgqMLUBwFxA,9767
  nemo_evaluator_launcher/common/helpers.py,sha256=pWYd1P5tbBSAK1JPv7dk_d_Sq92gJ4NIHXiiOEyNiZY,13218
  nemo_evaluator_launcher/common/logging_utils.py,sha256=7QkWlpA80QN5ipTUFJ198IiAsPRS36C6ISAAtNverbA,12338
- nemo_evaluator_launcher/common/mapping.py,sha256=O4Xg_0hgnQOOxAEGFlFJjHh_sTXvbIEUCgNXYxvfnDU,18035
+ nemo_evaluator_launcher/common/mapping.py,sha256=cGjyjvPzOL6T1pshGCbaa1QLkOsqRRZJVcU0_XszY9M,10657
  nemo_evaluator_launcher/common/printing_utils.py,sha256=K9_-ENUIGHcL9i-fmGscfCJoV-uWJlaMKNToNx1Ypmg,2631
  nemo_evaluator_launcher/common/container_metadata/__init__.py,sha256=QxPpIUTWprJybsfsaNoHnpLNIGxIVERW0mAtZ-u-F3c,2145
  nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py,sha256=aa9J0tdvdVYpGPu85vkdpBnFOpSReLdA0SNb45jAC0Y,17525
@@ -30,7 +30,7 @@ nemo_evaluator_launcher/common/container_metadata/utils.py,sha256=kaINUugS7Jhydm
  nemo_evaluator_launcher/configs/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
  nemo_evaluator_launcher/configs/default.yaml,sha256=JHFjSl3KByhggRMTTo9nesQATVoz18PJIV6KM5Wng64,974
  nemo_evaluator_launcher/configs/deployment/generic.yaml,sha256=8_Z0fcjZuH6GfV9jJkY_8CS18Tbsn0gV-nK1LfGr_Vg,1262
- nemo_evaluator_launcher/configs/deployment/nim.yaml,sha256=hRJGwCR2XIS3bUFWkXzmroLep81KaHa3Sn2_edfWbkU,1266
+ nemo_evaluator_launcher/configs/deployment/nim.yaml,sha256=6YGGOyb-JIJqk2fOoo8NalUppbFaml7_lDBQXC9AVmw,1310
  nemo_evaluator_launcher/configs/deployment/none.yaml,sha256=buPada1yMz9ClPDbq63vPDzLGr_IubTLia91DG3i5Lo,684
  nemo_evaluator_launcher/configs/deployment/sglang.yaml,sha256=Yy2uOko7-HeJ1fE0LHK3AHUOAzaeVSfnDp6f7fRBA9s,1358
  nemo_evaluator_launcher/configs/deployment/trtllm.yaml,sha256=1-001_ylJOiQQiqhcIkX_8SFTkOGx47odB_4Myug_9A,568
@@ -49,7 +49,7 @@ nemo_evaluator_launcher/executors/local/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQn
  nemo_evaluator_launcher/executors/local/executor.py,sha256=lLBEv0UoB7PRKdVsYdcodW0klCn90t0ua3Sp41IZr4U,37012
  nemo_evaluator_launcher/executors/local/run.template.sh,sha256=S6qLqnqA3B4P_-ngklCU5dO9rKV0Qom5NWDwDHf5i0g,6103
  nemo_evaluator_launcher/executors/slurm/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
- nemo_evaluator_launcher/executors/slurm/executor.py,sha256=zumFTbnKQzKNv5g7Y_PxsAXAsHIIvXKvIIOPLS8hoiA,59726
+ nemo_evaluator_launcher/executors/slurm/executor.py,sha256=YtTNMKJX-6HJelZk4PI0XPqlVRJsHnOtEt_FszZ45t0,63919
  nemo_evaluator_launcher/executors/slurm/proxy.cfg.template,sha256=nPg_JyBHPcjI1RSxDvl-lBgcbakrbZLl4IAXNBc8LM0,594
  nemo_evaluator_launcher/exporters/__init__.py,sha256=mBXG9FG48FeYrs8sF0zA2mgo1eqBmRgoml7zjJrqDso,1323
  nemo_evaluator_launcher/exporters/base.py,sha256=0BEqS-Zjez-KsrGE9yfo8S5w2uwMW3btBZve3SiiUp0,4307
@@ -61,9 +61,9 @@ nemo_evaluator_launcher/exporters/utils.py,sha256=5HGBKoFTSCng-GhAJpvfAa7N52_r9U
  nemo_evaluator_launcher/exporters/wandb.py,sha256=1qRUV_YE1Ury7rH7KH65AabR7gmEQ38kXBh2XrfiEpE,18082
  nemo_evaluator_launcher/resources/all_tasks_irs.yaml,sha256=g2T5Jb_2zkJFZd4gqHmOLKiKR52ioz9adjOYcaSEnyE,978316
  nemo_evaluator_launcher/resources/mapping.toml,sha256=iB5tHeMYJMnnej4TwKmrmI1rq3zJ-rWymO8eWxvfGvk,3619
- nemo_evaluator_launcher-0.1.56.dist-info/licenses/LICENSE,sha256=DyGb0fqHPZAsd_uXHA0DGcOCqsvrNsImuLC0Ts4s1zI,23413
- nemo_evaluator_launcher-0.1.56.dist-info/METADATA,sha256=uKG_JgnSQcHFxj2Nxs-Qmq7jc39BCksOdQ-qMfevGUE,28760
- nemo_evaluator_launcher-0.1.56.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- nemo_evaluator_launcher-0.1.56.dist-info/entry_points.txt,sha256=HPmLybw-y1y4NxjK7slFS4mtTB7p4pnjRvCtJLvSiZs,174
- nemo_evaluator_launcher-0.1.56.dist-info/top_level.txt,sha256=5PvawNm9TXKqPRjZita1xPOtFiMOipcoRf50FI1iY3s,24
- nemo_evaluator_launcher-0.1.56.dist-info/RECORD,,
+ nemo_evaluator_launcher-0.1.67.dist-info/licenses/LICENSE,sha256=DyGb0fqHPZAsd_uXHA0DGcOCqsvrNsImuLC0Ts4s1zI,23413
+ nemo_evaluator_launcher-0.1.67.dist-info/METADATA,sha256=u7EwjsAfTzBcYcn9Ye8jQNPWpziL7ThNtJWz3CHNeLM,28760
+ nemo_evaluator_launcher-0.1.67.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ nemo_evaluator_launcher-0.1.67.dist-info/entry_points.txt,sha256=HPmLybw-y1y4NxjK7slFS4mtTB7p4pnjRvCtJLvSiZs,174
+ nemo_evaluator_launcher-0.1.67.dist-info/top_level.txt,sha256=5PvawNm9TXKqPRjZita1xPOtFiMOipcoRf50FI1iY3s,24
+ nemo_evaluator_launcher-0.1.67.dist-info/RECORD,,