nemo-evaluator-launcher 0.1.56__py3-none-any.whl → 0.1.67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/api/types.py +21 -14
- nemo_evaluator_launcher/cli/run.py +2 -51
- nemo_evaluator_launcher/common/mapping.py +0 -197
- nemo_evaluator_launcher/configs/deployment/nim.yaml +3 -1
- nemo_evaluator_launcher/executors/slurm/executor.py +129 -13
- nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/METADATA +1 -1
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/RECORD +12 -12
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/top_level.txt +0 -0
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
This module defines data structures and helpers for configuration and type safety in the API layer.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
-
import
|
|
21
|
+
import pathlib
|
|
22
22
|
import warnings
|
|
23
23
|
from dataclasses import dataclass
|
|
24
24
|
from typing import cast
|
|
@@ -42,33 +42,40 @@ from nemo_evaluator_launcher.common.logging_utils import logger
|
|
|
42
42
|
class RunConfig(DictConfig):
|
|
43
43
|
@staticmethod
|
|
44
44
|
def from_hydra(
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
dict_overrides: dict = {},
|
|
45
|
+
config: str | None = None,
|
|
46
|
+
hydra_overrides: list[str] | None = None,
|
|
47
|
+
dict_overrides: dict | None = None,
|
|
49
48
|
) -> "RunConfig":
|
|
50
49
|
"""Load configuration from Hydra and merge with dictionary overrides.
|
|
51
50
|
|
|
52
51
|
Args:
|
|
53
|
-
|
|
52
|
+
config: Optional full path to a config file (e.g. /path/to/my_config.yaml).
|
|
53
|
+
If omitted, loads the internal default config from
|
|
54
|
+
`nemo_evaluator_launcher.configs`.
|
|
54
55
|
hydra_overrides: List of Hydra command-line style overrides.
|
|
55
56
|
dict_overrides: Dictionary of configuration overrides to merge.
|
|
56
|
-
config_dir: Optional path to user config directory. If provided, Hydra will
|
|
57
|
-
search in this directory first, then fall back to internal configs.
|
|
58
57
|
|
|
59
58
|
Returns:
|
|
60
59
|
RunConfig: Merged configuration object.
|
|
61
60
|
"""
|
|
62
|
-
overrides = hydra_overrides
|
|
61
|
+
overrides = list(hydra_overrides or [])
|
|
62
|
+
dict_overrides = dict_overrides or {}
|
|
63
|
+
|
|
64
|
+
resolved_config_path: str | None = None
|
|
65
|
+
config_name = "default"
|
|
66
|
+
|
|
63
67
|
# Check if a GlobalHydra instance is already initialized and clear it
|
|
64
68
|
if GlobalHydra.instance().is_initialized():
|
|
65
69
|
GlobalHydra.instance().clear()
|
|
66
70
|
|
|
67
|
-
if
|
|
68
|
-
|
|
69
|
-
if not
|
|
70
|
-
|
|
71
|
+
if config:
|
|
72
|
+
config_path = pathlib.Path(config).expanduser()
|
|
73
|
+
if not config_path.is_absolute():
|
|
74
|
+
config_path = (pathlib.Path.cwd() / config_path).resolve()
|
|
75
|
+
resolved_config_path = str(config_path)
|
|
71
76
|
|
|
77
|
+
config_dir = str(config_path.parent)
|
|
78
|
+
config_name = str(config_path.stem)
|
|
72
79
|
hydra.initialize_config_dir(
|
|
73
80
|
config_dir=config_dir,
|
|
74
81
|
version_base=None,
|
|
@@ -90,7 +97,7 @@ class RunConfig(DictConfig):
|
|
|
90
97
|
logger.debug(
|
|
91
98
|
"Loaded run config from hydra",
|
|
92
99
|
config_name=config_name,
|
|
93
|
-
|
|
100
|
+
config=resolved_config_path,
|
|
94
101
|
overrides=hydra_overrides,
|
|
95
102
|
dict_overrides=dict_overrides,
|
|
96
103
|
result=cfg,
|
|
@@ -27,7 +27,6 @@ from nemo_evaluator_launcher.common.printing_utils import (
|
|
|
27
27
|
green,
|
|
28
28
|
magenta,
|
|
29
29
|
red,
|
|
30
|
-
yellow,
|
|
31
30
|
)
|
|
32
31
|
|
|
33
32
|
|
|
@@ -42,20 +41,6 @@ class Cmd:
|
|
|
42
41
|
"help": "Full path to config file. Uses Hydra by default (--config-mode=hydra). Use --config-mode=raw to load directly (bypasses Hydra)."
|
|
43
42
|
},
|
|
44
43
|
)
|
|
45
|
-
config_name: str = field(
|
|
46
|
-
default="default",
|
|
47
|
-
alias=["-c", "--config-name"],
|
|
48
|
-
metadata={
|
|
49
|
-
"help": "Config name to use. Consult `nemo_evaluator_launcher.configs`"
|
|
50
|
-
},
|
|
51
|
-
)
|
|
52
|
-
config_dir: str | None = field(
|
|
53
|
-
default=None,
|
|
54
|
-
alias=["-d", "--config-dir"],
|
|
55
|
-
metadata={
|
|
56
|
-
"help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
|
|
57
|
-
},
|
|
58
|
-
)
|
|
59
44
|
config_mode: Literal["hydra", "raw"] = field(
|
|
60
45
|
default="hydra",
|
|
61
46
|
alias=["--config-mode"],
|
|
@@ -138,14 +123,6 @@ class Cmd:
|
|
|
138
123
|
# Load configuration either from Hydra or directly from a config file
|
|
139
124
|
if self.config_mode == "raw" and self.config:
|
|
140
125
|
# Validate that raw config loading is not used with other config options
|
|
141
|
-
if self.config_name != "default":
|
|
142
|
-
raise ValueError(
|
|
143
|
-
"Cannot use --config-mode=raw with --config-name. Raw mode only works with --config."
|
|
144
|
-
)
|
|
145
|
-
if self.config_dir is not None:
|
|
146
|
-
raise ValueError(
|
|
147
|
-
"Cannot use --config-mode=raw with --config-dir. Raw mode only works with --config."
|
|
148
|
-
)
|
|
149
126
|
if self.override:
|
|
150
127
|
raise ValueError(
|
|
151
128
|
"Cannot use --config-mode=raw with --override. Raw mode only works with --config."
|
|
@@ -158,23 +135,9 @@ class Cmd:
|
|
|
158
135
|
# Create RunConfig from the loaded data
|
|
159
136
|
config = OmegaConf.create(config_dict)
|
|
160
137
|
else:
|
|
161
|
-
# Handle --config parameter: split path into config_dir and config_name for Hydra
|
|
162
|
-
if self.config:
|
|
163
|
-
if self.config_name != "default":
|
|
164
|
-
raise ValueError("Cannot use --config with --config-name")
|
|
165
|
-
if self.config_dir is not None:
|
|
166
|
-
raise ValueError("Cannot use --config with --config-dir")
|
|
167
|
-
config_path = pathlib.Path(self.config)
|
|
168
|
-
config_dir = str(config_path.parent)
|
|
169
|
-
config_name = str(config_path.stem)
|
|
170
|
-
else:
|
|
171
|
-
config_dir = self.config_dir
|
|
172
|
-
config_name = self.config_name
|
|
173
|
-
|
|
174
138
|
# Load the complete Hydra configuration
|
|
175
139
|
config = RunConfig.from_hydra(
|
|
176
|
-
|
|
177
|
-
config_name=config_name,
|
|
140
|
+
config=self.config,
|
|
178
141
|
hydra_overrides=self.override,
|
|
179
142
|
)
|
|
180
143
|
|
|
@@ -283,16 +246,4 @@ class Cmd:
|
|
|
283
246
|
)
|
|
284
247
|
)
|
|
285
248
|
|
|
286
|
-
#
|
|
287
|
-
if (
|
|
288
|
-
self.config is None
|
|
289
|
-
and self.config_dir is not None
|
|
290
|
-
and self.config_name != "default"
|
|
291
|
-
):
|
|
292
|
-
joint_path = pathlib.Path(self.config_dir) / f"{self.config_name}.yaml"
|
|
293
|
-
print(
|
|
294
|
-
yellow(
|
|
295
|
-
f"Warning: Using --config-dir and --config-name together is deprecated. "
|
|
296
|
-
f"Please use --config {joint_path} instead."
|
|
297
|
-
)
|
|
298
|
-
)
|
|
249
|
+
# Done.
|
|
@@ -16,8 +16,6 @@
|
|
|
16
16
|
import pathlib
|
|
17
17
|
from typing import Any
|
|
18
18
|
|
|
19
|
-
import yaml
|
|
20
|
-
|
|
21
19
|
from nemo_evaluator_launcher.common.container_metadata import (
|
|
22
20
|
TaskIntermediateRepresentation,
|
|
23
21
|
load_tasks_from_tasks_file,
|
|
@@ -25,201 +23,6 @@ from nemo_evaluator_launcher.common.container_metadata import (
|
|
|
25
23
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
26
24
|
|
|
27
25
|
|
|
28
|
-
def _load_packaged_resource(*_args: Any, **_kwargs: Any) -> dict[str, Any]:
|
|
29
|
-
"""Deprecated: mapping.toml support was removed in favor of packaged IRs."""
|
|
30
|
-
raise RuntimeError(
|
|
31
|
-
"mapping.toml is no longer supported. Use packaged IRs (all_tasks_irs.yaml) instead."
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _process_mapping(mapping_toml: dict) -> dict:
|
|
36
|
-
"""Process the raw mapping TOML into the expected format.
|
|
37
|
-
|
|
38
|
-
Args:
|
|
39
|
-
mapping_toml: Raw mapping TOML data.
|
|
40
|
-
Returns:
|
|
41
|
-
dict: Processed mapping in the expected format.
|
|
42
|
-
"""
|
|
43
|
-
mapping = {}
|
|
44
|
-
for harness_name, harness_data in mapping_toml.items():
|
|
45
|
-
# Skip entries that don't have the expected structure
|
|
46
|
-
if not isinstance(harness_data, dict):
|
|
47
|
-
logger.warning(
|
|
48
|
-
"Skipping invalid harness entry",
|
|
49
|
-
harness_name=harness_name,
|
|
50
|
-
reason="harness_data is not a dict",
|
|
51
|
-
)
|
|
52
|
-
continue
|
|
53
|
-
|
|
54
|
-
# Check if tasks field exists
|
|
55
|
-
if "tasks" not in harness_data:
|
|
56
|
-
logger.warning(
|
|
57
|
-
"Skipping harness entry without tasks",
|
|
58
|
-
harness_name=harness_name,
|
|
59
|
-
)
|
|
60
|
-
continue
|
|
61
|
-
|
|
62
|
-
if not isinstance(harness_data["tasks"], dict):
|
|
63
|
-
logger.warning(
|
|
64
|
-
"Skipping invalid harness entry",
|
|
65
|
-
harness_name=harness_name,
|
|
66
|
-
reason="tasks is not a dict",
|
|
67
|
-
)
|
|
68
|
-
continue
|
|
69
|
-
|
|
70
|
-
# Get container, which may be optional
|
|
71
|
-
container = harness_data.get("container")
|
|
72
|
-
if not container:
|
|
73
|
-
logger.debug(
|
|
74
|
-
"Harness entry without container",
|
|
75
|
-
harness_name=harness_name,
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
for endpoint_type, harness_tasks in harness_data["tasks"].items():
|
|
79
|
-
if not isinstance(harness_tasks, dict):
|
|
80
|
-
logger.warning(
|
|
81
|
-
"Skipping invalid endpoint type",
|
|
82
|
-
harness_name=harness_name,
|
|
83
|
-
endpoint_type=endpoint_type,
|
|
84
|
-
reason="harness_tasks is not a dict",
|
|
85
|
-
)
|
|
86
|
-
continue
|
|
87
|
-
|
|
88
|
-
for task_name, task_data in harness_tasks.items():
|
|
89
|
-
if not isinstance(task_data, dict):
|
|
90
|
-
logger.warning(
|
|
91
|
-
"Skipping invalid task entry",
|
|
92
|
-
harness_name=harness_name,
|
|
93
|
-
task_name=task_name,
|
|
94
|
-
reason="task_data is not a dict",
|
|
95
|
-
)
|
|
96
|
-
continue
|
|
97
|
-
|
|
98
|
-
key = (harness_name, task_name)
|
|
99
|
-
if key in mapping:
|
|
100
|
-
raise KeyError(
|
|
101
|
-
f"(harness,task)-tuple key {repr(key)} already exists in the mapping"
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
# Validate required fields exist in task_data
|
|
105
|
-
# task_name and harness_name are already validated above
|
|
106
|
-
# endpoint_type is validated as a key in harness_tasks
|
|
107
|
-
# task_data must be a dict (validated above)
|
|
108
|
-
|
|
109
|
-
mapping[key] = {
|
|
110
|
-
"task": task_name,
|
|
111
|
-
"harness": harness_name,
|
|
112
|
-
"endpoint_type": endpoint_type,
|
|
113
|
-
}
|
|
114
|
-
# Only add container if it exists
|
|
115
|
-
if container:
|
|
116
|
-
mapping[key]["container"] = container
|
|
117
|
-
|
|
118
|
-
# Validate task_data keys before updating
|
|
119
|
-
for task_data_key in task_data.keys():
|
|
120
|
-
if task_data_key in mapping[key]:
|
|
121
|
-
raise KeyError(
|
|
122
|
-
f"{repr(task_data_key)} is not allowed as key under {repr(key)} in the mapping"
|
|
123
|
-
)
|
|
124
|
-
# Validate that task_data values are valid types (basic check)
|
|
125
|
-
if task_data_key not in ("description", "type") and not isinstance(
|
|
126
|
-
task_data[task_data_key],
|
|
127
|
-
(str, int, float, bool, dict, list, type(None)),
|
|
128
|
-
):
|
|
129
|
-
logger.warning(
|
|
130
|
-
"Unexpected value type in task_data",
|
|
131
|
-
harness_name=harness_name,
|
|
132
|
-
task_name=task_name,
|
|
133
|
-
key=task_data_key,
|
|
134
|
-
value_type=type(task_data[task_data_key]).__name__,
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
mapping[key].update(task_data)
|
|
138
|
-
return mapping
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def _extract_tasks_from_framework_yml(
|
|
142
|
-
framework_yml_content: str, harness_name: str, container: str
|
|
143
|
-
) -> dict[tuple[str, str], dict]:
|
|
144
|
-
"""Extract tasks from framework.yml content and return as mapping entries.
|
|
145
|
-
|
|
146
|
-
Args:
|
|
147
|
-
framework_yml_content: YAML content from framework.yml file
|
|
148
|
-
harness_name: Name of the harness
|
|
149
|
-
container: Container image string
|
|
150
|
-
|
|
151
|
-
Returns:
|
|
152
|
-
Dictionary mapping (harness_name, task_name) to task configuration
|
|
153
|
-
"""
|
|
154
|
-
tasks = {}
|
|
155
|
-
try:
|
|
156
|
-
framework_data = yaml.safe_load(framework_yml_content)
|
|
157
|
-
if not framework_data or "evaluations" not in framework_data:
|
|
158
|
-
logger.warning(
|
|
159
|
-
"No evaluations found in framework.yml",
|
|
160
|
-
harness=harness_name,
|
|
161
|
-
container=container,
|
|
162
|
-
)
|
|
163
|
-
return tasks
|
|
164
|
-
|
|
165
|
-
evaluations = framework_data.get("evaluations", [])
|
|
166
|
-
for eval_config in evaluations:
|
|
167
|
-
task_name = eval_config.get("name")
|
|
168
|
-
description = eval_config.get("description", "")
|
|
169
|
-
|
|
170
|
-
if not task_name:
|
|
171
|
-
continue
|
|
172
|
-
|
|
173
|
-
# Extract endpoint types from the evaluation config
|
|
174
|
-
defaults = eval_config.get("defaults", {})
|
|
175
|
-
config = defaults.get("config", {})
|
|
176
|
-
supported_endpoint_types = config.get("supported_endpoint_types", ["chat"])
|
|
177
|
-
task_type = config.get("type", "") # Extract type from defaults.config.type
|
|
178
|
-
|
|
179
|
-
# Use first endpoint type (mapping key is (harness, task), so one entry per task)
|
|
180
|
-
endpoint_type = (
|
|
181
|
-
supported_endpoint_types[0] if supported_endpoint_types else "chat"
|
|
182
|
-
)
|
|
183
|
-
|
|
184
|
-
key = (harness_name, task_name)
|
|
185
|
-
# Only add if not already in mapping (don't override existing entries)
|
|
186
|
-
if key not in tasks:
|
|
187
|
-
tasks[key] = {
|
|
188
|
-
"task": task_name,
|
|
189
|
-
"harness": harness_name,
|
|
190
|
-
"container": container,
|
|
191
|
-
"endpoint_type": endpoint_type,
|
|
192
|
-
"description": description,
|
|
193
|
-
"type": task_type, # Store type from defaults.config.type
|
|
194
|
-
}
|
|
195
|
-
# Merge any additional config from defaults
|
|
196
|
-
if defaults:
|
|
197
|
-
tasks[key].update(defaults)
|
|
198
|
-
|
|
199
|
-
logger.info(
|
|
200
|
-
"Extracted tasks from framework.yml",
|
|
201
|
-
harness=harness_name,
|
|
202
|
-
container=container,
|
|
203
|
-
num_tasks=len(tasks),
|
|
204
|
-
)
|
|
205
|
-
except yaml.YAMLError as e:
|
|
206
|
-
logger.warning(
|
|
207
|
-
"Failed to parse framework.yml",
|
|
208
|
-
harness=harness_name,
|
|
209
|
-
container=container,
|
|
210
|
-
error=str(e),
|
|
211
|
-
)
|
|
212
|
-
except Exception as e:
|
|
213
|
-
logger.warning(
|
|
214
|
-
"Error extracting tasks from framework.yml",
|
|
215
|
-
harness=harness_name,
|
|
216
|
-
container=container,
|
|
217
|
-
error=str(e),
|
|
218
|
-
)
|
|
219
|
-
|
|
220
|
-
return tasks
|
|
221
|
-
|
|
222
|
-
|
|
223
26
|
def _convert_irs_to_mapping_format(
|
|
224
27
|
tasks: list[TaskIntermediateRepresentation],
|
|
225
28
|
) -> dict[tuple[str, str], dict]:
|
|
@@ -18,13 +18,15 @@ image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
|
|
|
18
18
|
served_model_name: ???
|
|
19
19
|
port: 8000
|
|
20
20
|
|
|
21
|
+
command: /opt/nim/start_server.sh
|
|
22
|
+
|
|
21
23
|
# NIM containers use default entrypoint - no custom command needed
|
|
22
24
|
# Configuration is done via environment variables in lepton_config
|
|
23
25
|
|
|
24
26
|
endpoints:
|
|
25
27
|
chat: /v1/chat/completions
|
|
26
28
|
completions: /v1/completions
|
|
27
|
-
health: /health
|
|
29
|
+
health: /v1/health/ready
|
|
28
30
|
# Note: Environment variables should be configured in lepton_config.envs
|
|
29
31
|
# Auto-derived environment variables from deployment config:
|
|
30
32
|
# - SERVED_MODEL_NAME (from served_model_name)
|
|
@@ -408,10 +408,10 @@ class SlurmExecutor(BaseExecutor):
|
|
|
408
408
|
)
|
|
409
409
|
statuses = []
|
|
410
410
|
for i, slurm_job_id in enumerate(slurm_job_ids):
|
|
411
|
-
slurm_status = slurm_jobs_status[slurm_job_id]
|
|
411
|
+
slurm_status = slurm_jobs_status[slurm_job_id][0]
|
|
412
412
|
if slurm_job_id in latest_slurm_job_ids:
|
|
413
413
|
latest_slurm_job_id = latest_slurm_job_ids[slurm_job_id]
|
|
414
|
-
slurm_status = latest_slurm_jobs_status[latest_slurm_job_id]
|
|
414
|
+
slurm_status = latest_slurm_jobs_status[latest_slurm_job_id][0]
|
|
415
415
|
progress = progress_list[i]
|
|
416
416
|
progress = progress if progress is not None else "unknown"
|
|
417
417
|
execution_state = SlurmExecutor._map_slurm_state_to_execution_state(
|
|
@@ -644,7 +644,7 @@ def _create_slurm_sbatch_script(
|
|
|
644
644
|
s += deployment_srun_cmd
|
|
645
645
|
|
|
646
646
|
# wait for the server to initialize
|
|
647
|
-
health_path = cfg.deployment.get("
|
|
647
|
+
health_path = cfg.deployment.endpoints.get("health", "/health")
|
|
648
648
|
# For multi-instance check all node IPs, for single instance check localhost
|
|
649
649
|
if cfg.deployment.get("multiple_instances", False):
|
|
650
650
|
ip_list = '"${NODES_IPS_ARRAY[@]}"'
|
|
@@ -710,7 +710,7 @@ def _create_slurm_sbatch_script(
|
|
|
710
710
|
|
|
711
711
|
s += "# evaluation client\n"
|
|
712
712
|
s += "srun --mpi pmix --overlap "
|
|
713
|
-
s += "--nodes 1 --ntasks 1 "
|
|
713
|
+
s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
|
|
714
714
|
s += "--container-image {} ".format(eval_image)
|
|
715
715
|
evaluation_env_var_names = list(
|
|
716
716
|
cfg.execution.get("env_vars", {}).get("evaluation", {})
|
|
@@ -835,7 +835,7 @@ def _generate_auto_export_section(
|
|
|
835
835
|
|
|
836
836
|
s += " # export\n"
|
|
837
837
|
s += " srun --mpi pmix --overlap "
|
|
838
|
-
s += "--nodes 1 --ntasks 1 "
|
|
838
|
+
s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
|
|
839
839
|
s += "--container-image {} ".format(export_image)
|
|
840
840
|
if export_env:
|
|
841
841
|
s += "--container-env {} ".format(",".join(export_env))
|
|
@@ -1009,8 +1009,55 @@ def _query_slurm_jobs_status(
|
|
|
1009
1009
|
username: str,
|
|
1010
1010
|
hostname: str,
|
|
1011
1011
|
socket: str | None,
|
|
1012
|
-
) -> Dict[str, str]:
|
|
1013
|
-
"""Query SLURM for job statuses using sacct
|
|
1012
|
+
) -> Dict[str, tuple[str, str]]:
|
|
1013
|
+
"""Query SLURM for job statuses using squeue (for active jobs) and sacct (fallback).
|
|
1014
|
+
|
|
1015
|
+
This function first tries squeue which is more accurate for currently running jobs,
|
|
1016
|
+
then falls back to sacct for completed/historical jobs that squeue doesn't show.
|
|
1017
|
+
It also finds follow-up jobs (from autoresume) that depend on our known jobs.
|
|
1018
|
+
|
|
1019
|
+
Args:
|
|
1020
|
+
slurm_job_ids: List of SLURM job IDs to query.
|
|
1021
|
+
username: SSH username.
|
|
1022
|
+
hostname: SSH hostname.
|
|
1023
|
+
socket: control socket location or None
|
|
1024
|
+
|
|
1025
|
+
Returns:
|
|
1026
|
+
Dict mapping from slurm_job_id to tuple of status, current_job_id.
|
|
1027
|
+
"""
|
|
1028
|
+
if len(slurm_job_ids) == 0:
|
|
1029
|
+
return {}
|
|
1030
|
+
|
|
1031
|
+
# First, try squeue for active jobs (more accurate for running jobs)
|
|
1032
|
+
squeue_statuses = _query_squeue_for_jobs(slurm_job_ids, username, hostname, socket)
|
|
1033
|
+
|
|
1034
|
+
# For jobs not found in squeue, fall back to sacct
|
|
1035
|
+
missing_jobs = [job_id for job_id in slurm_job_ids if job_id not in squeue_statuses]
|
|
1036
|
+
sacct_statuses = {}
|
|
1037
|
+
|
|
1038
|
+
if missing_jobs:
|
|
1039
|
+
sacct_statuses = _query_sacct_for_jobs(missing_jobs, username, hostname, socket)
|
|
1040
|
+
|
|
1041
|
+
# Combine results, preferring squeue data
|
|
1042
|
+
combined_statuses = {**sacct_statuses, **squeue_statuses}
|
|
1043
|
+
|
|
1044
|
+
return combined_statuses
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def _query_squeue_for_jobs(
|
|
1048
|
+
slurm_job_ids: List[str],
|
|
1049
|
+
username: str,
|
|
1050
|
+
hostname: str,
|
|
1051
|
+
socket: str | None,
|
|
1052
|
+
) -> Dict[str, tuple[str, str]]:
|
|
1053
|
+
"""Query SLURM for active job statuses using squeue command.
|
|
1054
|
+
|
|
1055
|
+
This function finds:
|
|
1056
|
+
1. Jobs that directly match our known job IDs
|
|
1057
|
+
2. Follow-up jobs that depend on our known job IDs (from autoresume mechanism)
|
|
1058
|
+
|
|
1059
|
+
For follow-up jobs, returns the status mapped to the original job ID, along with
|
|
1060
|
+
the actual current SLURM job ID.
|
|
1014
1061
|
|
|
1015
1062
|
Args:
|
|
1016
1063
|
slurm_job_ids: List of SLURM job IDs to query.
|
|
@@ -1019,10 +1066,77 @@ def _query_slurm_jobs_status(
|
|
|
1019
1066
|
socket: control socket location or None
|
|
1020
1067
|
|
|
1021
1068
|
Returns:
|
|
1022
|
-
Dict mapping from slurm_job_id to
|
|
1069
|
+
Dict mapping from original slurm_job_id to tuple of status, current_job_id.
|
|
1023
1070
|
"""
|
|
1024
1071
|
if len(slurm_job_ids) == 0:
|
|
1025
1072
|
return {}
|
|
1073
|
+
|
|
1074
|
+
# Use squeue to get active jobs - more accurate than sacct for running jobs
|
|
1075
|
+
squeue_command = "squeue -u {} -h -o '%i|%T|%E'".format(username)
|
|
1076
|
+
|
|
1077
|
+
ssh_command = ["ssh"]
|
|
1078
|
+
if socket is not None:
|
|
1079
|
+
ssh_command.append(f"-S {socket}")
|
|
1080
|
+
ssh_command.append(f"{username}@{hostname}")
|
|
1081
|
+
ssh_command.append(squeue_command)
|
|
1082
|
+
ssh_command = " ".join(ssh_command)
|
|
1083
|
+
|
|
1084
|
+
completed_process = subprocess.run(
|
|
1085
|
+
args=shlex.split(ssh_command),
|
|
1086
|
+
stdout=subprocess.PIPE,
|
|
1087
|
+
stderr=subprocess.PIPE,
|
|
1088
|
+
)
|
|
1089
|
+
|
|
1090
|
+
squeue_statuses = {}
|
|
1091
|
+
dependent_jobs = []
|
|
1092
|
+
if completed_process.returncode == 0:
|
|
1093
|
+
squeue_output = completed_process.stdout.decode("utf-8")
|
|
1094
|
+
squeue_output_lines = squeue_output.strip().split("\n")
|
|
1095
|
+
|
|
1096
|
+
for line in squeue_output_lines:
|
|
1097
|
+
if not line.strip():
|
|
1098
|
+
continue
|
|
1099
|
+
parts = line.split("|")
|
|
1100
|
+
if len(parts) >= 3:
|
|
1101
|
+
job_id = parts[0].strip()
|
|
1102
|
+
status = parts[1].strip()
|
|
1103
|
+
dependency = parts[2].strip()
|
|
1104
|
+
# Extract base job ID (handle array jobs like 123456_0 -> 123456)
|
|
1105
|
+
base_job_id = job_id.split("_")[0].split("[")[0]
|
|
1106
|
+
if base_job_id in slurm_job_ids:
|
|
1107
|
+
squeue_statuses[base_job_id] = status, base_job_id
|
|
1108
|
+
elif dependency and dependency != "(null)":
|
|
1109
|
+
dependent_jobs.append((base_job_id, status, dependency))
|
|
1110
|
+
|
|
1111
|
+
for dep_job_id, dep_status, dependency in dependent_jobs:
|
|
1112
|
+
for known_job_id in slurm_job_ids:
|
|
1113
|
+
if known_job_id in dependency and known_job_id not in squeue_statuses:
|
|
1114
|
+
squeue_statuses[known_job_id] = dep_status, dep_job_id
|
|
1115
|
+
break
|
|
1116
|
+
|
|
1117
|
+
return squeue_statuses
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
def _query_sacct_for_jobs(
|
|
1121
|
+
slurm_job_ids: List[str],
|
|
1122
|
+
username: str,
|
|
1123
|
+
hostname: str,
|
|
1124
|
+
socket: str | None,
|
|
1125
|
+
) -> Dict[str, tuple[str, str]]:
|
|
1126
|
+
"""Query SLURM for job statuses using sacct command (for completed/historical jobs).
|
|
1127
|
+
|
|
1128
|
+
Args:
|
|
1129
|
+
slurm_job_ids: List of SLURM job IDs to query.
|
|
1130
|
+
username: SSH username.
|
|
1131
|
+
hostname: SSH hostname.
|
|
1132
|
+
socket: control socket location or None
|
|
1133
|
+
|
|
1134
|
+
Returns:
|
|
1135
|
+
Dict mapping from slurm_job_id to tuple of status, job_id.
|
|
1136
|
+
"""
|
|
1137
|
+
if len(slurm_job_ids) == 0:
|
|
1138
|
+
return {}
|
|
1139
|
+
|
|
1026
1140
|
sacct_command = "sacct -j {} --format='JobID,State%32' --noheader -P".format(
|
|
1027
1141
|
",".join(slurm_job_ids)
|
|
1028
1142
|
)
|
|
@@ -1049,7 +1163,7 @@ def _query_slurm_jobs_status(
|
|
|
1049
1163
|
slurm_jobs_status = {}
|
|
1050
1164
|
for slurm_job_id in slurm_job_ids:
|
|
1051
1165
|
slurm_job_status = _parse_slurm_job_status(slurm_job_id, sacct_output_lines)
|
|
1052
|
-
slurm_jobs_status[slurm_job_id] = slurm_job_status
|
|
1166
|
+
slurm_jobs_status[slurm_job_id] = slurm_job_status, slurm_job_id
|
|
1053
1167
|
return slurm_jobs_status
|
|
1054
1168
|
|
|
1055
1169
|
|
|
@@ -1264,9 +1378,11 @@ def _generate_haproxy_config_with_placeholders(cfg):
|
|
|
1264
1378
|
for i in range(num_nodes):
|
|
1265
1379
|
nodes.append({"ip": f"{{IP_{i}}}", "port": cfg.deployment.port})
|
|
1266
1380
|
|
|
1267
|
-
# Get health check parameters
|
|
1381
|
+
# Get health check parameters - prefer proxy config, fallback to deployment.endpoints.health
|
|
1268
1382
|
proxy_config = cfg.execution.get("proxy", {}).get("config", {})
|
|
1269
|
-
health_check_path = proxy_config.get(
|
|
1383
|
+
health_check_path = proxy_config.get(
|
|
1384
|
+
"health_check_path", cfg.deployment.endpoints.get("health", "/health")
|
|
1385
|
+
)
|
|
1270
1386
|
health_check_status = proxy_config.get("health_check_status", 200)
|
|
1271
1387
|
haproxy_port = proxy_config.get("haproxy_port", 5009)
|
|
1272
1388
|
|
|
@@ -1301,7 +1417,7 @@ def _generate_haproxy_config(cfg, nodes_ips):
|
|
|
1301
1417
|
)
|
|
1302
1418
|
|
|
1303
1419
|
# Get health check parameters from deployment config
|
|
1304
|
-
health_check_path = cfg.deployment.get("
|
|
1420
|
+
health_check_path = cfg.deployment.endpoints.get("health", "/health")
|
|
1305
1421
|
health_check_status = cfg.deployment.get("health_check_status", 200)
|
|
1306
1422
|
haproxy_port = cfg.deployment.get("haproxy_port", 5009)
|
|
1307
1423
|
|
|
@@ -1461,7 +1577,7 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
|
|
|
1461
1577
|
s += "done\n"
|
|
1462
1578
|
s += "\n"
|
|
1463
1579
|
s += "srun --mpi pmix --overlap "
|
|
1464
|
-
s += "--nodes 1 --ntasks 1 "
|
|
1580
|
+
s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
|
|
1465
1581
|
s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
|
|
1466
1582
|
s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
|
|
1467
1583
|
s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
|
{nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/RECORD
RENAMED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
nemo_evaluator_launcher/__init__.py,sha256=GT38zGwbvBOSeU52WCRx-n9N49LvLGEV1PItgKC8orA,2320
|
|
2
|
-
nemo_evaluator_launcher/package_info.py,sha256=
|
|
2
|
+
nemo_evaluator_launcher/package_info.py,sha256=HOfV93vn4SKitHYaUmZ6OXBBSz64WCDVCQPm6oliO6A,1586
|
|
3
3
|
nemo_evaluator_launcher/api/__init__.py,sha256=U9q_MJK2vRsFaymanhyy0nD1SNAZQZC8oY45RXPX7ac,1024
|
|
4
4
|
nemo_evaluator_launcher/api/functional.py,sha256=di5az6OkMSGBr5bWTT6JLgtqNBU8-fwAb_eZETyVulI,33641
|
|
5
|
-
nemo_evaluator_launcher/api/types.py,sha256=
|
|
5
|
+
nemo_evaluator_launcher/api/types.py,sha256=OVAI_mHYTqGc6IlXmCLRcQ1rbgwG7Oe4qpnbM0kCrfc,3884
|
|
6
6
|
nemo_evaluator_launcher/api/utils.py,sha256=q5HArRj7PKgBfeH3bOX8q1U97yMyQQp72yRRA5JP9PE,818
|
|
7
7
|
nemo_evaluator_launcher/cli/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
8
8
|
nemo_evaluator_launcher/cli/export.py,sha256=GRXxusKDq_1qjMKN6MKOIjZ8x4u5ERgXwHSAGrvsGCY,11211
|
|
@@ -13,14 +13,14 @@ nemo_evaluator_launcher/cli/ls_runs.py,sha256=vJTwRdhVKLolnJuP8AnnQdJBE-BKZfCcCy
|
|
|
13
13
|
nemo_evaluator_launcher/cli/ls_task.py,sha256=sChh9UQJK0Zh1g3RitVtm8o_6qJQuS75JGrHXf3Y0y4,10954
|
|
14
14
|
nemo_evaluator_launcher/cli/ls_tasks.py,sha256=aMzuAiUUeTCeSN3XCtoneOm-BBSnOVfhgIzgxZ8Y8wI,11167
|
|
15
15
|
nemo_evaluator_launcher/cli/main.py,sha256=3BwJXuB9WHmiTKu4FR2dyrVonwpo8YWcPfw3MSmqOu8,8148
|
|
16
|
-
nemo_evaluator_launcher/cli/run.py,sha256=
|
|
16
|
+
nemo_evaluator_launcher/cli/run.py,sha256=2ITc5ZIdwzmjZsQct7D0_SF8UccOb2xqzBQscj1vWEg,9210
|
|
17
17
|
nemo_evaluator_launcher/cli/status.py,sha256=ANdu0JYnfKNvd1gXmdu_0FrbPG-g0A_R4leOuNXzenQ,5947
|
|
18
18
|
nemo_evaluator_launcher/cli/version.py,sha256=GgMNTAd4S0bu3t-uVfVedAf7p6pymWDDwOaNm4WHOxQ,1998
|
|
19
19
|
nemo_evaluator_launcher/common/__init__.py,sha256=6-xb4KpG8-lZbWBI42c_Gax-Sq0kMSW8UG0Vn8dOBlo,744
|
|
20
20
|
nemo_evaluator_launcher/common/execdb.py,sha256=WPzg5Iu2ojvFpBuYahSt3voP_iEUpoO8NgqMLUBwFxA,9767
|
|
21
21
|
nemo_evaluator_launcher/common/helpers.py,sha256=pWYd1P5tbBSAK1JPv7dk_d_Sq92gJ4NIHXiiOEyNiZY,13218
|
|
22
22
|
nemo_evaluator_launcher/common/logging_utils.py,sha256=7QkWlpA80QN5ipTUFJ198IiAsPRS36C6ISAAtNverbA,12338
|
|
23
|
-
nemo_evaluator_launcher/common/mapping.py,sha256=
|
|
23
|
+
nemo_evaluator_launcher/common/mapping.py,sha256=cGjyjvPzOL6T1pshGCbaa1QLkOsqRRZJVcU0_XszY9M,10657
|
|
24
24
|
nemo_evaluator_launcher/common/printing_utils.py,sha256=K9_-ENUIGHcL9i-fmGscfCJoV-uWJlaMKNToNx1Ypmg,2631
|
|
25
25
|
nemo_evaluator_launcher/common/container_metadata/__init__.py,sha256=QxPpIUTWprJybsfsaNoHnpLNIGxIVERW0mAtZ-u-F3c,2145
|
|
26
26
|
nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py,sha256=aa9J0tdvdVYpGPu85vkdpBnFOpSReLdA0SNb45jAC0Y,17525
|
|
@@ -30,7 +30,7 @@ nemo_evaluator_launcher/common/container_metadata/utils.py,sha256=kaINUugS7Jhydm
|
|
|
30
30
|
nemo_evaluator_launcher/configs/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
31
31
|
nemo_evaluator_launcher/configs/default.yaml,sha256=JHFjSl3KByhggRMTTo9nesQATVoz18PJIV6KM5Wng64,974
|
|
32
32
|
nemo_evaluator_launcher/configs/deployment/generic.yaml,sha256=8_Z0fcjZuH6GfV9jJkY_8CS18Tbsn0gV-nK1LfGr_Vg,1262
|
|
33
|
-
nemo_evaluator_launcher/configs/deployment/nim.yaml,sha256=
|
|
33
|
+
nemo_evaluator_launcher/configs/deployment/nim.yaml,sha256=6YGGOyb-JIJqk2fOoo8NalUppbFaml7_lDBQXC9AVmw,1310
|
|
34
34
|
nemo_evaluator_launcher/configs/deployment/none.yaml,sha256=buPada1yMz9ClPDbq63vPDzLGr_IubTLia91DG3i5Lo,684
|
|
35
35
|
nemo_evaluator_launcher/configs/deployment/sglang.yaml,sha256=Yy2uOko7-HeJ1fE0LHK3AHUOAzaeVSfnDp6f7fRBA9s,1358
|
|
36
36
|
nemo_evaluator_launcher/configs/deployment/trtllm.yaml,sha256=1-001_ylJOiQQiqhcIkX_8SFTkOGx47odB_4Myug_9A,568
|
|
@@ -49,7 +49,7 @@ nemo_evaluator_launcher/executors/local/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQn
|
|
|
49
49
|
nemo_evaluator_launcher/executors/local/executor.py,sha256=lLBEv0UoB7PRKdVsYdcodW0klCn90t0ua3Sp41IZr4U,37012
|
|
50
50
|
nemo_evaluator_launcher/executors/local/run.template.sh,sha256=S6qLqnqA3B4P_-ngklCU5dO9rKV0Qom5NWDwDHf5i0g,6103
|
|
51
51
|
nemo_evaluator_launcher/executors/slurm/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
52
|
-
nemo_evaluator_launcher/executors/slurm/executor.py,sha256=
|
|
52
|
+
nemo_evaluator_launcher/executors/slurm/executor.py,sha256=YtTNMKJX-6HJelZk4PI0XPqlVRJsHnOtEt_FszZ45t0,63919
|
|
53
53
|
nemo_evaluator_launcher/executors/slurm/proxy.cfg.template,sha256=nPg_JyBHPcjI1RSxDvl-lBgcbakrbZLl4IAXNBc8LM0,594
|
|
54
54
|
nemo_evaluator_launcher/exporters/__init__.py,sha256=mBXG9FG48FeYrs8sF0zA2mgo1eqBmRgoml7zjJrqDso,1323
|
|
55
55
|
nemo_evaluator_launcher/exporters/base.py,sha256=0BEqS-Zjez-KsrGE9yfo8S5w2uwMW3btBZve3SiiUp0,4307
|
|
@@ -61,9 +61,9 @@ nemo_evaluator_launcher/exporters/utils.py,sha256=5HGBKoFTSCng-GhAJpvfAa7N52_r9U
|
|
|
61
61
|
nemo_evaluator_launcher/exporters/wandb.py,sha256=1qRUV_YE1Ury7rH7KH65AabR7gmEQ38kXBh2XrfiEpE,18082
|
|
62
62
|
nemo_evaluator_launcher/resources/all_tasks_irs.yaml,sha256=g2T5Jb_2zkJFZd4gqHmOLKiKR52ioz9adjOYcaSEnyE,978316
|
|
63
63
|
nemo_evaluator_launcher/resources/mapping.toml,sha256=iB5tHeMYJMnnej4TwKmrmI1rq3zJ-rWymO8eWxvfGvk,3619
|
|
64
|
-
nemo_evaluator_launcher-0.1.
|
|
65
|
-
nemo_evaluator_launcher-0.1.
|
|
66
|
-
nemo_evaluator_launcher-0.1.
|
|
67
|
-
nemo_evaluator_launcher-0.1.
|
|
68
|
-
nemo_evaluator_launcher-0.1.
|
|
69
|
-
nemo_evaluator_launcher-0.1.
|
|
64
|
+
nemo_evaluator_launcher-0.1.67.dist-info/licenses/LICENSE,sha256=DyGb0fqHPZAsd_uXHA0DGcOCqsvrNsImuLC0Ts4s1zI,23413
|
|
65
|
+
nemo_evaluator_launcher-0.1.67.dist-info/METADATA,sha256=u7EwjsAfTzBcYcn9Ye8jQNPWpziL7ThNtJWz3CHNeLM,28760
|
|
66
|
+
nemo_evaluator_launcher-0.1.67.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
67
|
+
nemo_evaluator_launcher-0.1.67.dist-info/entry_points.txt,sha256=HPmLybw-y1y4NxjK7slFS4mtTB7p4pnjRvCtJLvSiZs,174
|
|
68
|
+
nemo_evaluator_launcher-0.1.67.dist-info/top_level.txt,sha256=5PvawNm9TXKqPRjZita1xPOtFiMOipcoRf50FI1iY3s,24
|
|
69
|
+
nemo_evaluator_launcher-0.1.67.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nemo_evaluator_launcher-0.1.56.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/top_level.txt
RENAMED
|
File without changes
|