nemo-evaluator-launcher 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff shows the content changes between these publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- nemo_evaluator_launcher/api/functional.py +28 -2
- nemo_evaluator_launcher/cli/export.py +128 -10
- nemo_evaluator_launcher/cli/run.py +22 -3
- nemo_evaluator_launcher/cli/status.py +3 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +24 -4
- nemo_evaluator_launcher/executors/lepton/executor.py +3 -5
- nemo_evaluator_launcher/executors/local/executor.py +26 -5
- nemo_evaluator_launcher/executors/slurm/executor.py +90 -26
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +5 -0
- nemo_evaluator_launcher/exporters/mlflow.py +105 -32
- nemo_evaluator_launcher/exporters/utils.py +22 -105
- nemo_evaluator_launcher/exporters/wandb.py +117 -38
- nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/METADATA +1 -1
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/RECORD +21 -21
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.12.dist-info → nemo_evaluator_launcher-0.1.13.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/api/functional.py
@@ -456,6 +456,7 @@ def export_results(
                 yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
                 or {}
             )
+            # execution.auto_export contains auto-export destinations
             exec_cfg = cfg_yaml.get("execution") or {}
             auto_exp = (exp_yaml.get("execution") or {}).get(
                 "auto_export"
@@ -463,15 +464,39 @@ def export_results(
             if auto_exp is not None:
                 exec_cfg["auto_export"] = auto_exp
             cfg_yaml["execution"] = exec_cfg
+
+            # top-level export block contains exporter config
+            if "export" in exp_yaml:
+                cfg_yaml["export"] = exp_yaml["export"]
+
+            # Merge evaluation.tasks from export_config (Slurm writes it there)
+            if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
+                eval_cfg = cfg_yaml.get("evaluation") or {}
+                eval_cfg.update(exp_yaml["evaluation"])
+                cfg_yaml["evaluation"] = eval_cfg
+
             # metadata
+            executor_name = (cfg_yaml.get("execution") or {}).get(
+                "type", "local"
+            )
+
             md_job_data = JobData(
                 invocation_id=single_id.split(".")[0],
                 job_id=single_id,
                 timestamp=0.0,
-                executor=
-                data={
+                executor=executor_name,
+                data={
+                    "output_dir": str(Path.cwd().parent),
+                    "storage_type": "remote_local",
+                },
                 config=cfg_yaml,
             )
+            # DEBUG: print what we loaded
+            print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
+            if "evaluation" in cfg_yaml:
+                print(
+                    f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
+                )
         except Exception:
             md_job_data = None
             # fallback to execDB only
@@ -492,6 +517,7 @@ def export_results(
                     "success": job_result.success,
                     "message": job_result.message,
                     "metadata": job_result.metadata or {},
+                    "dest": getattr(job_result, "dest", None),
                 }
             },
             "metadata": job_result.metadata or {},
nemo_evaluator_launcher/cli/export.py
@@ -27,8 +27,8 @@ class ExportCmd:

     # Short usage examples will show up in -h as the class docstring:
     # Examples:
-    # nemo-evaluator-launcher export 8abcd123 --dest local --format json
-    # nemo-evaluator-launcher export 8abcd123.0 9ef01234 --dest local --format csv
+    # nemo-evaluator-launcher export 8abcd123 --dest local --format json --out .
+    # nemo-evaluator-launcher export 8abcd123.0 9ef01234 --dest local --format csv --out results/ -fname processed_results.csv
     # nemo-evaluator-launcher export 8abcd123 --dest jet

     invocation_ids: List[str] = field(
@@ -41,9 +41,17 @@ class ExportCmd:
         choices=["local", "wandb", "mlflow", "gsheets", "jet"],
         help="Export destination.",
     )
+    # overrides for exporter config; use -o similar to run command
+    override: List[str] = field(
+        default_factory=list,
+        action="append",
+        nargs="?",
+        alias=["-o", "--override"],
+        help="Hydra-style overrides for exporter config. Use `export.<dest>.key=value` (e.g., -o export.wandb.entity=org-name).",
+    )
     output_dir: Optional[str] = field(
         default=".",
-        alias=["--output-dir", "-
+        alias=["--output-dir", "-out"],
         help="Output directory (default: current directory).",
     )
     output_filename: Optional[str] = field(
@@ -67,8 +75,8 @@ class ExportCmd:
         alias=["--log-metrics"],
         help="Filter metrics by name (repeatable). Examples: score, f1, mmlu_score_micro.",
     )
-    only_required: bool = field(
-        default=
+    only_required: Optional[bool] = field(
+        default=None,
         alias=["--only-required"],
         help="Copy only required+optional artifacts (default: True). Set to False to copy all available artifacts.",
     )
@@ -76,11 +84,20 @@ class ExportCmd:
     def execute(self) -> None:
         """Execute export."""
         # Import heavy dependencies only when needed
+        from omegaconf import OmegaConf
+
         from nemo_evaluator_launcher.api.functional import export_results

+        # Validation: ensure IDs are provided
+        if not self.invocation_ids:
+            print("Error: No IDs provided. Specify one or more invocation or job IDs.")
+            print(
+                "Usage: nemo-evaluator-launcher export <id> [<id>...] --dest <destination>"
+            )
+            return
+
         config: dict[str, Any] = {
             "copy_logs": self.copy_logs,
-            "only_required": self.only_required,
         }

         # Output handling
@@ -95,20 +112,90 @@ class ExportCmd:
         if self.log_metrics:
             config["log_metrics"] = self.log_metrics

+        # Add only_required if explicitly passed via CLI
+        if self.only_required is not None:
+            config["only_required"] = self.only_required
+
+        # Parse and validate overrides
+        if self.override:
+            # Flatten possible list-of-lists from parser
+            flat_overrides: list[str] = []
+            for item in self.override:
+                if isinstance(item, list):
+                    flat_overrides.extend(str(x) for x in item)
+                else:
+                    flat_overrides.append(str(item))
+
+            try:
+                self._validate_overrides(flat_overrides, self.dest)
+            except ValueError as e:
+                print(f"Error: {e}")
+                return
+
+            # Expand env vars in override vals ($VAR / ${VAR})
+            import os
+
+            from omegaconf import OmegaConf
+
+            expanded_overrides: list[str] = []
+            for ov in flat_overrides:
+                if "=" in ov:
+                    k, v = ov.split("=", 1)
+                    expanded_overrides.append(f"{k}={os.path.expandvars(v)}")
+                else:
+                    expanded_overrides.append(os.path.expandvars(ov))
+
+            dot_cfg = OmegaConf.from_dotlist(expanded_overrides)
+            as_dict = OmegaConf.to_container(dot_cfg, resolve=True) or {}
+            if isinstance(as_dict, dict) and "export" in as_dict:
+                export_map = as_dict.get("export") or {}
+                if isinstance(export_map, dict) and self.dest in export_map:
+                    config.update(export_map[self.dest] or {})
+                else:
+                    config.update(as_dict)
+            else:
+                config.update(as_dict)
+
         if self.format and self.dest != "local":
             print(
                 "Note: --format is only used by --dest local. It will be ignored for other destinations."
             )

-
+        if "only_required" in config and self.only_required is True:
+            config.pop("only_required", None)
+
         print(
             f"Exporting {len(self.invocation_ids)} {'invocations' if len(self.invocation_ids) > 1 else 'invocation'} to {self.dest}..."
         )

         result = export_results(self.invocation_ids, self.dest, config)

-        if not result
-
+        if not result.get("success", False):
+            err = result.get("error", "Unknown error")
+            print(f"\nExport failed: {err}")
+            # Provide actionable guidance for common configuration issues
+            if self.dest == "mlflow":
+                if "tracking_uri" in str(err).lower():
+                    print("\nMLflow requires 'tracking_uri' to be configured.")
+                    print(
+                        "Set it via: -o export.mlflow.tracking_uri=http://mlflow-server:5000"
+                    )
+                elif "not installed" in str(err).lower():
+                    print("\nMLflow package not installed.")
+                    print("Install via: pip install nemo-evaluator-launcher[mlflow]")
+            elif self.dest == "wandb":
+                if "entity" in str(err).lower() or "project" in str(err).lower():
+                    print("\nW&B requires 'entity' and 'project' to be configured.")
+                    print(
+                        "Set via: -o export.wandb.entity=my-org -o export.wandb.project=my-proj"
+                    )
+                elif "not installed" in str(err).lower():
+                    print("\nW&B package not installed.")
+                    print("Install via: pip install nemo-evaluator-launcher[wandb]")
+            elif self.dest == "gsheets":
+                if "not installed" in str(err).lower():
+                    print("\nGoogle Sheets package not installed.")
+                    print("Install via: pip install nemo-evaluator-launcher[gsheets]")
             return

         # Success path
@@ -125,6 +212,9 @@ class ExportCmd:
                 print(f" URL: {metadata['run_url']}")
                 if metadata.get("summary_path"):
                     print(f" Summary: {metadata['summary_path']}")
+                path_hint = job_result.get("dest") or metadata.get("output_dir")
+                if self.dest == "local" and path_hint:
+                    print(f" Path: {path_hint}")
             else:
                 print(f" {job_id} failed: {job_result.get('message', '')}")
         else:
@@ -137,7 +227,6 @@ class ExportCmd:
             # Show summary path if available
             if metadata.get("summary_path"):
                 print(f"Summary: {metadata['summary_path']}")
-
             # Show per-invocation status
             for invocation_id, inv_result in result["invocations"].items():
                 if inv_result.get("success"):
@@ -147,3 +236,32 @@ class ExportCmd:
                     print(
                         f" {invocation_id}: failed, {inv_result.get('error', 'Unknown error')}"
                     )
+
+    def _validate_overrides(self, overrides: List[str], dest: str) -> None:
+        """Validate override list for destination consistency.
+
+        Raises:
+            ValueError: If overrides specify wrong destination or have other issues.
+        """
+        if not overrides:
+            return # nothing to validate
+
+        # Check each override for destination mismatch
+        for override_str in overrides:
+            if override_str.startswith(
+                "export."
+            ): # check if override starts with export.
+                # Extract destination from override path
+                try:
+                    key_part = override_str.split("=")[0] # Get left side before =
+                    parts = key_part.split(".")
+                    if len(parts) >= 2:
+                        override_dest = parts[1]
+                        if override_dest != dest:
+                            raise ValueError(
+                                f"Override destination mismatch: override specifies 'export.{override_dest}' but --dest is '{dest}'. "
+                                f"Either change --dest to '{override_dest}' or use 'export.{dest}' in overrides."
+                            )
+                except (IndexError, AttributeError):
+                    # miconstructed override -> OmegaConf handles this
+                    pass
nemo_evaluator_launcher/cli/run.py
@@ -98,7 +98,17 @@ class Cmd:
             config_dir=self.config_dir,
         )

-
+        try:
+            invocation_id = run_eval(config, self.dry_run)
+        except Exception as e:
+            print(f"\033[31m✗ Job submission failed | Error: {e}\033[0m")
+            raise
+
+        # Print general success message with invocation ID
+        if invocation_id is not None and not self.dry_run:
+            print(
+                f"\033[32m✓ Job submission successful | Invocation ID: {invocation_id}\033[0m"
+            )

         # Save the complete configuration
         if not self.dry_run and invocation_id is not None:
@@ -146,6 +156,15 @@ class Cmd:
         if invocation_id is not None:
             print(f"to check status: nemo-evaluator-launcher status {invocation_id}")
             print(f"to kill all jobs: nemo-evaluator-launcher kill {invocation_id}")
-
-
+
+            # Show actual job IDs and task names
+            print("to kill individual jobs:")
+            # Access tasks - will work after normalization in run_eval
+            tasks = (
+                config.evaluation.tasks
+                if hasattr(config.evaluation, "tasks")
+                else config.evaluation
             )
+            for idx, task in enumerate(tasks):
+                job_id = f"{invocation_id}.{idx}"
+                print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
nemo_evaluator_launcher/cli/status.py
@@ -102,6 +102,8 @@ class Cmd:
             status = job.get("status", "")
             formatted_status = self._format_status_with_indicators(status)

+            # Extract task name
+
             rows.append(
                 [
                     job.get("job_id", ""),
@@ -144,7 +146,7 @@ class Cmd:
             ExecutionState.SUCCESS.value: "\033[32m✓ SUCCESS\033[0m", # Green Unicode checkmark
             ExecutionState.FAILED.value: "\033[31m✗ FAILED\033[0m", # Red Unicode X
             ExecutionState.RUNNING.value: "\033[33m▶ RUNNING\033[0m", # Yellow Unicode play button
-            ExecutionState.PENDING.value: "\033[36m
+            ExecutionState.PENDING.value: "\033[36m⧗ PENDING\033[0m", # Cyan Unicode hourglass (U+29D7)
             ExecutionState.KILLED.value: "\033[35m✗ KILLED\033[0m", # Magenta Unicode X
             # Additional states for error handling
             "not_found": "\033[90m? NOT FOUND\033[0m", # Gray question mark
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py
@@ -428,14 +428,34 @@ def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
             print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
             return True
         else:
-
+            error_msg = result.stderr.strip() if result.stderr else ""
+            output_msg = result.stdout.strip() if result.stdout else ""
+            print(
+                f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
+            )
+            if error_msg:
+                print(f" stderr: {error_msg}")
+            if output_msg:
+                print(f" stdout: {output_msg}")
             return False

-    except subprocess.TimeoutExpired:
-        print(
+    except subprocess.TimeoutExpired as e:
+        print(
+            f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
         return False
     except subprocess.CalledProcessError as e:
-        print(
+        print(
+            f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
         return False
     finally:
         # Clean up temporary file
nemo_evaluator_launcher/executors/lepton/executor.py
@@ -482,7 +482,8 @@ class LeptonExecutor(BaseExecutor):

             if not job_success:
                 raise RuntimeError(
-                    f"Failed to submit Lepton job
+                    f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
+                    f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
                 )

             # Store job metadata in database (with task-specific endpoint info)
@@ -504,8 +505,6 @@ class LeptonExecutor(BaseExecutor):
                 )
             )

-            print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
-
         # Jobs submitted successfully - return immediately (non-blocking)
         print(
             f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
@@ -536,9 +535,8 @@ class LeptonExecutor(BaseExecutor):

             return invocation_id

-        except Exception
+        except Exception:
             # Clean up any created endpoints on failure
-            print(f"❌ Error during evaluation: {e}")
             if cfg.deployment.type != "none" and "endpoint_names" in locals():
                 for endpoint_name in endpoint_names:
                     if endpoint_name:
nemo_evaluator_launcher/executors/local/executor.py
@@ -23,6 +23,7 @@ import os
 import pathlib
 import platform
 import shlex
+import shutil
 import subprocess
 import time
 from typing import List, Optional
@@ -76,6 +77,13 @@ class LocalExecutor(BaseExecutor):
                 f"type {cfg.deployment.type} is not implemented -- add deployment support"
             )

+        # Check if docker is available (skip in dry_run mode)
+        if not dry_run and shutil.which("docker") is None:
+            raise RuntimeError(
+                "Docker is not installed or not in PATH. "
+                "Please install Docker to run local evaluations."
+            )
+
         # Generate invocation ID for this evaluation run
         invocation_id = generate_invocation_id()

@@ -233,35 +241,48 @@ class LocalExecutor(BaseExecutor):
         # To ensure subprocess continues after python exits:
         # - on Unix-like systems, to fully detach the subprocess
         # so it does not die when Python exits, pass start_new_session=True;
-        # - on
+        # - on Windows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
         os_name = platform.system()
+        processes = []
+
         if is_execution_mode_sequential:
             if os_name == "Windows":
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                 )
             else:
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     start_new_session=True,
                 )
+            processes.append(("run_all.sequential.sh", proc, output_dir))
         else:
             for task in cfg.evaluation.tasks:
                 if os_name == "Windows":
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                     )
                 else:
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         start_new_session=True,
                     )
+                processes.append((task.name, proc, output_dir / task.name))
+
+        # Wait briefly and check if bash scripts exited immediately (which means error)
+        time.sleep(0.3)
+
+        for name, proc, work_dir in processes:
+            exit_code = proc.poll()
+            if exit_code is not None and exit_code != 0:
+                error_msg = f"Script for {name} exited with code {exit_code}"
+                raise RuntimeError(f"Job startup failed | {error_msg}")

         print("\nCommands for real-time monitoring:")
         for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
nemo_evaluator_launcher/executors/slurm/executor.py
@@ -174,10 +174,11 @@ class SlurmExecutor(BaseExecutor):
         for idx, (slurm_job_id, remote_runsub_path) in enumerate(
             zip(slurm_job_ids, remote_runsub_paths)
         ):
+            job_id = generate_job_id(invocation_id, idx)
             db.write_job(
                 job=JobData(
                     invocation_id=invocation_id,
-                    job_id=
+                    job_id=job_id,
                     timestamp=time.time(),
                     executor="slurm",
                     data={
@@ -204,7 +205,7 @@ class SlurmExecutor(BaseExecutor):
         """
         db = ExecutionDB()

-        # If id looks like an invocation_id
+        # If id looks like an invocation_id
         if "." not in id:
             jobs = db.get_jobs(id)
             if not jobs:
@@ -605,20 +606,27 @@ def _create_slurm_sbatch_script(
     s += "kill $SERVER_PID # terminate the server to finish gracefully\n\n"

     # auto-export
-
-
+    ae_cfg = cfg.execution.get("auto_export")
+    destinations: list = []
+    if isinstance(ae_cfg, list):
+        destinations = list(ae_cfg)
+    elif isinstance(ae_cfg, dict) or isinstance(ae_cfg, DictConfig):
+        destinations = list(ae_cfg.get("destinations", []) or [])
+
+    if destinations:
+        export_env = dict(cfg.execution.get("env_vars", {}).get("export", {}) or {})
+        s += _generate_auto_export_section(cfg, job_id, destinations, export_env)

     return s


 def _generate_auto_export_section(
     cfg: DictConfig,
-    job_id: str,
+    job_id: str,
+    destinations: list,
+    export_env: dict,
 ) -> str:
     """Generate simple auto-export section for sbatch script."""
-    auto_export_config = cfg.execution.get("auto_export", {})
-    destinations = auto_export_config.get("destinations", [])
-
     if not destinations:
         return ""

@@ -626,18 +634,65 @@ def _generate_auto_export_section(
     s += "EVAL_EXIT_CODE=$?\n"
     s += "if [ $EVAL_EXIT_CODE -eq 0 ]; then\n"
     s += " echo 'Evaluation completed successfully. Starting auto-export...'\n"
-    s += " set +e\n"
+    s += " set +e\n"
     s += " set +x\n"
+    s += " set +u\n"
     s += ' cd "$TASK_DIR/artifacts"\n'
-
-
+
+    # Work with DictConfig; convert only for YAML at the end
+    exec_type = (
+        cfg.execution.type
+        if hasattr(cfg.execution, "type")
+        else cfg.execution.get("type", "slurm")
     )
-
-
+    eval_tasks = (
+        list(cfg.evaluation.tasks)
+        if hasattr(cfg, "evaluation") and hasattr(cfg.evaluation, "tasks")
+        else list((cfg.get("evaluation", {}) or {}).get("tasks", []) or [])
     )
+    export_block = cfg.get("export", {}) or {}
+
+    payload = {
+        "execution": {
+            "auto_export": {
+                "destinations": list(destinations),
+                **({"env_vars": dict(export_env)} if export_env else {}),
+            },
+            "type": exec_type,
+        },
+        "evaluation": {"tasks": eval_tasks},
+    }
+    if export_block:
+        # Convert just this block to plain for YAML
+        payload["export"] = (
+            OmegaConf.to_object(export_block)
+            if OmegaConf.is_config(export_block)
+            else dict(export_block)
+        )
+
+    # Final YAML (single conversion at the end)
+    payload_clean = OmegaConf.to_container(OmegaConf.create(payload), resolve=True)
+    yaml_str = yaml.safe_dump(payload_clean, sort_keys=False)
     s += " cat > export_config.yml << 'EOF'\n"
     s += yaml_str
     s += "EOF\n"
+
+    # write launcher config as config.yml for exporters (no core command)
+    submitted_yaml = yaml.safe_dump(
+        OmegaConf.to_container(cfg, resolve=True), sort_keys=False
+    )
+    s += " cat > config.yml << 'EOF'\n"
+    s += submitted_yaml
+    s += "EOF\n"
+
+    # Export host only env before running auto export
+    for k, v in (export_env or {}).items():
+        if isinstance(v, str) and re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", v):
+            s += f' export {k}="${{{v}}}"\n'
+        else:
+            esc = str(v).replace('"', '\\"')
+            s += f' export {k}="{esc}"\n'
+
     for dest in destinations:
         s += f" echo 'Exporting to {dest}...'\n"
         s += f" nemo-evaluator-launcher export {job_id} --dest {dest} || echo 'Export to {dest} failed'\n"
@@ -656,7 +711,9 @@ def _open_master_connection(
     socket: str,
 ) -> str | None:
     ssh_command = f"ssh -MNf -S {socket} {username}@{hostname}"
-    completed_process = subprocess.run(
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command), capture_output=True
+    )
     if completed_process.returncode == 0:
         return socket
     return None
@@ -694,12 +751,17 @@ def _make_remote_execution_output_dir(
     ssh_command.append(f"{username}@{hostname}")
     ssh_command.append(mkdir_command)
     ssh_command = " ".join(ssh_command)
-    completed_process = subprocess.run(
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command), capture_output=True
+    )
     if completed_process.returncode != 0:
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
+        )
         raise RuntimeError(
-            "failed to make a remote execution output dir\n{}".format(
-                completed_process.stderr.decode("utf-8")
-            )
+            "failed to make a remote execution output dir\n{}".format(error_msg)
         )


@@ -725,13 +787,16 @@ def _rsync_upload_rundirs(
     remote_destination_str = f"{username}@{hostname}:{remote_target}"
     local_sources_str = " ".join(map(str, local_sources))
     rsync_upload_command = f"rsync -qcaz {local_sources_str} {remote_destination_str}"
-    completed_process = subprocess.run(
+    completed_process = subprocess.run(
+        args=shlex.split(rsync_upload_command), capture_output=True
+    )
     if completed_process.returncode != 0:
-
-        "
-
-
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
         )
+        raise RuntimeError("failed to upload local sources\n{}".format(error_msg))


 def _sbatch_remote_runsubs(
@@ -757,10 +822,9 @@ def _sbatch_remote_runsubs(
         args=shlex.split(ssh_command), capture_output=True
     )
     if completed_process.returncode != 0:
+        error_msg = completed_process.stderr.decode("utf-8")
         raise RuntimeError(
-            "failed to submit sbatch scripts for execution\n{}".format(
-                completed_process.stderr.decode("utf-8")
-            )
+            "failed to submit sbatch scripts for execution\n{}".format(error_msg)
         )

     sbatch_output = completed_process.stdout.decode("utf-8")