nemo-evaluator-launcher 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -456,6 +456,7 @@ def export_results(
456
456
  yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
457
457
  or {}
458
458
  )
459
+ # execution.auto_export contains auto-export destinations
459
460
  exec_cfg = cfg_yaml.get("execution") or {}
460
461
  auto_exp = (exp_yaml.get("execution") or {}).get(
461
462
  "auto_export"
@@ -463,15 +464,39 @@ def export_results(
463
464
  if auto_exp is not None:
464
465
  exec_cfg["auto_export"] = auto_exp
465
466
  cfg_yaml["execution"] = exec_cfg
467
+
468
+ # top-level export block contains exporter config
469
+ if "export" in exp_yaml:
470
+ cfg_yaml["export"] = exp_yaml["export"]
471
+
472
+ # Merge evaluation.tasks from export_config (Slurm writes it there)
473
+ if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
474
+ eval_cfg = cfg_yaml.get("evaluation") or {}
475
+ eval_cfg.update(exp_yaml["evaluation"])
476
+ cfg_yaml["evaluation"] = eval_cfg
477
+
466
478
  # metadata
479
+ executor_name = (cfg_yaml.get("execution") or {}).get(
480
+ "type", "local"
481
+ )
482
+
467
483
  md_job_data = JobData(
468
484
  invocation_id=single_id.split(".")[0],
469
485
  job_id=single_id,
470
486
  timestamp=0.0,
471
- executor="local", #
472
- data={"output_dir": str(Path.cwd().parent)},
487
+ executor=executor_name,
488
+ data={
489
+ "output_dir": str(Path.cwd().parent),
490
+ "storage_type": "remote_local",
491
+ },
473
492
  config=cfg_yaml,
474
493
  )
494
+ # DEBUG: print what we loaded
495
+ print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
496
+ if "evaluation" in cfg_yaml:
497
+ print(
498
+ f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
499
+ )
475
500
  except Exception:
476
501
  md_job_data = None
477
502
  # fallback to execDB only
@@ -492,6 +517,7 @@ def export_results(
492
517
  "success": job_result.success,
493
518
  "message": job_result.message,
494
519
  "metadata": job_result.metadata or {},
520
+ "dest": getattr(job_result, "dest", None),
495
521
  }
496
522
  },
497
523
  "metadata": job_result.metadata or {},
@@ -27,8 +27,8 @@ class ExportCmd:
27
27
 
28
28
  # Short usage examples will show up in -h as the class docstring:
29
29
  # Examples:
30
- # nemo-evaluator-launcher export 8abcd123 --dest local --format json -o .
31
- # nemo-evaluator-launcher export 8abcd123.0 9ef01234 --dest local --format csv -o results/ -fname processed_results.csv
30
+ # nemo-evaluator-launcher export 8abcd123 --dest local --format json --out .
31
+ # nemo-evaluator-launcher export 8abcd123.0 9ef01234 --dest local --format csv --out results/ -fname processed_results.csv
32
32
  # nemo-evaluator-launcher export 8abcd123 --dest jet
33
33
 
34
34
  invocation_ids: List[str] = field(
@@ -41,9 +41,17 @@ class ExportCmd:
41
41
  choices=["local", "wandb", "mlflow", "gsheets", "jet"],
42
42
  help="Export destination.",
43
43
  )
44
+ # overrides for exporter config; use -o similar to run command
45
+ override: List[str] = field(
46
+ default_factory=list,
47
+ action="append",
48
+ nargs="?",
49
+ alias=["-o", "--override"],
50
+ help="Hydra-style overrides for exporter config. Use `export.<dest>.key=value` (e.g., -o export.wandb.entity=org-name).",
51
+ )
44
52
  output_dir: Optional[str] = field(
45
53
  default=".",
46
- alias=["--output-dir", "-o"],
54
+ alias=["--output-dir", "-out"],
47
55
  help="Output directory (default: current directory).",
48
56
  )
49
57
  output_filename: Optional[str] = field(
@@ -67,8 +75,8 @@ class ExportCmd:
67
75
  alias=["--log-metrics"],
68
76
  help="Filter metrics by name (repeatable). Examples: score, f1, mmlu_score_micro.",
69
77
  )
70
- only_required: bool = field(
71
- default=True,
78
+ only_required: Optional[bool] = field(
79
+ default=None,
72
80
  alias=["--only-required"],
73
81
  help="Copy only required+optional artifacts (default: True). Set to False to copy all available artifacts.",
74
82
  )
@@ -76,11 +84,20 @@ class ExportCmd:
76
84
  def execute(self) -> None:
77
85
  """Execute export."""
78
86
  # Import heavy dependencies only when needed
87
+ from omegaconf import OmegaConf
88
+
79
89
  from nemo_evaluator_launcher.api.functional import export_results
80
90
 
91
+ # Validation: ensure IDs are provided
92
+ if not self.invocation_ids:
93
+ print("Error: No IDs provided. Specify one or more invocation or job IDs.")
94
+ print(
95
+ "Usage: nemo-evaluator-launcher export <id> [<id>...] --dest <destination>"
96
+ )
97
+ return
98
+
81
99
  config: dict[str, Any] = {
82
100
  "copy_logs": self.copy_logs,
83
- "only_required": self.only_required,
84
101
  }
85
102
 
86
103
  # Output handling
@@ -95,20 +112,90 @@ class ExportCmd:
95
112
  if self.log_metrics:
96
113
  config["log_metrics"] = self.log_metrics
97
114
 
115
+ # Add only_required if explicitly passed via CLI
116
+ if self.only_required is not None:
117
+ config["only_required"] = self.only_required
118
+
119
+ # Parse and validate overrides
120
+ if self.override:
121
+ # Flatten possible list-of-lists from parser
122
+ flat_overrides: list[str] = []
123
+ for item in self.override:
124
+ if isinstance(item, list):
125
+ flat_overrides.extend(str(x) for x in item)
126
+ else:
127
+ flat_overrides.append(str(item))
128
+
129
+ try:
130
+ self._validate_overrides(flat_overrides, self.dest)
131
+ except ValueError as e:
132
+ print(f"Error: {e}")
133
+ return
134
+
135
+ # Expand env vars in override vals ($VAR / ${VAR})
136
+ import os
137
+
138
+ from omegaconf import OmegaConf
139
+
140
+ expanded_overrides: list[str] = []
141
+ for ov in flat_overrides:
142
+ if "=" in ov:
143
+ k, v = ov.split("=", 1)
144
+ expanded_overrides.append(f"{k}={os.path.expandvars(v)}")
145
+ else:
146
+ expanded_overrides.append(os.path.expandvars(ov))
147
+
148
+ dot_cfg = OmegaConf.from_dotlist(expanded_overrides)
149
+ as_dict = OmegaConf.to_container(dot_cfg, resolve=True) or {}
150
+ if isinstance(as_dict, dict) and "export" in as_dict:
151
+ export_map = as_dict.get("export") or {}
152
+ if isinstance(export_map, dict) and self.dest in export_map:
153
+ config.update(export_map[self.dest] or {})
154
+ else:
155
+ config.update(as_dict)
156
+ else:
157
+ config.update(as_dict)
158
+
98
159
  if self.format and self.dest != "local":
99
160
  print(
100
161
  "Note: --format is only used by --dest local. It will be ignored for other destinations."
101
162
  )
102
163
 
103
- # Execute
164
+ if "only_required" in config and self.only_required is True:
165
+ config.pop("only_required", None)
166
+
104
167
  print(
105
168
  f"Exporting {len(self.invocation_ids)} {'invocations' if len(self.invocation_ids) > 1 else 'invocation'} to {self.dest}..."
106
169
  )
107
170
 
108
171
  result = export_results(self.invocation_ids, self.dest, config)
109
172
 
110
- if not result["success"]:
111
- print(f"Export failed: {result.get('error', 'Unknown error')}")
173
+ if not result.get("success", False):
174
+ err = result.get("error", "Unknown error")
175
+ print(f"\nExport failed: {err}")
176
+ # Provide actionable guidance for common configuration issues
177
+ if self.dest == "mlflow":
178
+ if "tracking_uri" in str(err).lower():
179
+ print("\nMLflow requires 'tracking_uri' to be configured.")
180
+ print(
181
+ "Set it via: -o export.mlflow.tracking_uri=http://mlflow-server:5000"
182
+ )
183
+ elif "not installed" in str(err).lower():
184
+ print("\nMLflow package not installed.")
185
+ print("Install via: pip install nemo-evaluator-launcher[mlflow]")
186
+ elif self.dest == "wandb":
187
+ if "entity" in str(err).lower() or "project" in str(err).lower():
188
+ print("\nW&B requires 'entity' and 'project' to be configured.")
189
+ print(
190
+ "Set via: -o export.wandb.entity=my-org -o export.wandb.project=my-proj"
191
+ )
192
+ elif "not installed" in str(err).lower():
193
+ print("\nW&B package not installed.")
194
+ print("Install via: pip install nemo-evaluator-launcher[wandb]")
195
+ elif self.dest == "gsheets":
196
+ if "not installed" in str(err).lower():
197
+ print("\nGoogle Sheets package not installed.")
198
+ print("Install via: pip install nemo-evaluator-launcher[gsheets]")
112
199
  return
113
200
 
114
201
  # Success path
@@ -125,6 +212,9 @@ class ExportCmd:
125
212
  print(f" URL: {metadata['run_url']}")
126
213
  if metadata.get("summary_path"):
127
214
  print(f" Summary: {metadata['summary_path']}")
215
+ path_hint = job_result.get("dest") or metadata.get("output_dir")
216
+ if self.dest == "local" and path_hint:
217
+ print(f" Path: {path_hint}")
128
218
  else:
129
219
  print(f" {job_id} failed: {job_result.get('message', '')}")
130
220
  else:
@@ -137,7 +227,6 @@ class ExportCmd:
137
227
  # Show summary path if available
138
228
  if metadata.get("summary_path"):
139
229
  print(f"Summary: {metadata['summary_path']}")
140
-
141
230
  # Show per-invocation status
142
231
  for invocation_id, inv_result in result["invocations"].items():
143
232
  if inv_result.get("success"):
@@ -147,3 +236,32 @@ class ExportCmd:
147
236
  print(
148
237
  f" {invocation_id}: failed, {inv_result.get('error', 'Unknown error')}"
149
238
  )
239
+
240
+ def _validate_overrides(self, overrides: List[str], dest: str) -> None:
241
+ """Validate override list for destination consistency.
242
+
243
+ Raises:
244
+ ValueError: If overrides specify wrong destination or have other issues.
245
+ """
246
+ if not overrides:
247
+ return # nothing to validate
248
+
249
+ # Check each override for destination mismatch
250
+ for override_str in overrides:
251
+ if override_str.startswith(
252
+ "export."
253
+ ): # check if override starts with export.
254
+ # Extract destination from override path
255
+ try:
256
+ key_part = override_str.split("=")[0] # Get left side before =
257
+ parts = key_part.split(".")
258
+ if len(parts) >= 2:
259
+ override_dest = parts[1]
260
+ if override_dest != dest:
261
+ raise ValueError(
262
+ f"Override destination mismatch: override specifies 'export.{override_dest}' but --dest is '{dest}'. "
263
+ f"Either change --dest to '{override_dest}' or use 'export.{dest}' in overrides."
264
+ )
265
+ except (IndexError, AttributeError):
266
+ # malformed override -> OmegaConf handles this
267
+ pass
@@ -98,7 +98,17 @@ class Cmd:
98
98
  config_dir=self.config_dir,
99
99
  )
100
100
 
101
- invocation_id = run_eval(config, self.dry_run)
101
+ try:
102
+ invocation_id = run_eval(config, self.dry_run)
103
+ except Exception as e:
104
+ print(f"\033[31m✗ Job submission failed | Error: {e}\033[0m")
105
+ raise
106
+
107
+ # Print general success message with invocation ID
108
+ if invocation_id is not None and not self.dry_run:
109
+ print(
110
+ f"\033[32m✓ Job submission successful | Invocation ID: {invocation_id}\033[0m"
111
+ )
102
112
 
103
113
  # Save the complete configuration
104
114
  if not self.dry_run and invocation_id is not None:
@@ -146,6 +156,15 @@ class Cmd:
146
156
  if invocation_id is not None:
147
157
  print(f"to check status: nemo-evaluator-launcher status {invocation_id}")
148
158
  print(f"to kill all jobs: nemo-evaluator-launcher kill {invocation_id}")
149
- print(
150
- f"to kill individual jobs: nemo-evaluator-launcher kill <job_id> (e.g., {invocation_id}.0)"
159
+
160
+ # Show actual job IDs and task names
161
+ print("to kill individual jobs:")
162
+ # Access tasks - will work after normalization in run_eval
163
+ tasks = (
164
+ config.evaluation.tasks
165
+ if hasattr(config.evaluation, "tasks")
166
+ else config.evaluation
151
167
  )
168
+ for idx, task in enumerate(tasks):
169
+ job_id = f"{invocation_id}.{idx}"
170
+ print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
@@ -102,6 +102,8 @@ class Cmd:
102
102
  status = job.get("status", "")
103
103
  formatted_status = self._format_status_with_indicators(status)
104
104
 
105
+ # Extract task name
106
+
105
107
  rows.append(
106
108
  [
107
109
  job.get("job_id", ""),
@@ -144,7 +146,7 @@ class Cmd:
144
146
  ExecutionState.SUCCESS.value: "\033[32m✓ SUCCESS\033[0m", # Green Unicode checkmark
145
147
  ExecutionState.FAILED.value: "\033[31m✗ FAILED\033[0m", # Red Unicode X
146
148
  ExecutionState.RUNNING.value: "\033[33m▶ RUNNING\033[0m", # Yellow Unicode play button
147
- ExecutionState.PENDING.value: "\033[36m PENDING\033[0m", # Cyan Unicode hourglass
149
+ ExecutionState.PENDING.value: "\033[36m PENDING\033[0m", # Cyan Unicode hourglass (U+29D7)
148
150
  ExecutionState.KILLED.value: "\033[35m✗ KILLED\033[0m", # Magenta Unicode X
149
151
  # Additional states for error handling
150
152
  "not_found": "\033[90m? NOT FOUND\033[0m", # Gray question mark
@@ -428,14 +428,34 @@ def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
428
428
  print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
429
429
  return True
430
430
  else:
431
- print(f"❌ Failed to create Lepton endpoint: {result.stderr}")
431
+ error_msg = result.stderr.strip() if result.stderr else ""
432
+ output_msg = result.stdout.strip() if result.stdout else ""
433
+ print(
434
+ f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
435
+ )
436
+ if error_msg:
437
+ print(f" stderr: {error_msg}")
438
+ if output_msg:
439
+ print(f" stdout: {output_msg}")
432
440
  return False
433
441
 
434
- except subprocess.TimeoutExpired:
435
- print(f"❌ Timeout creating Lepton endpoint: {endpoint_name}")
442
+ except subprocess.TimeoutExpired as e:
443
+ print(
444
+ f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
445
+ )
446
+ if hasattr(e, "stderr") and e.stderr:
447
+ print(f" stderr: {e.stderr}")
448
+ if hasattr(e, "stdout") and e.stdout:
449
+ print(f" stdout: {e.stdout}")
436
450
  return False
437
451
  except subprocess.CalledProcessError as e:
438
- print(f"❌ Error creating Lepton endpoint: {e}")
452
+ print(
453
+ f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
454
+ )
455
+ if hasattr(e, "stderr") and e.stderr:
456
+ print(f" stderr: {e.stderr}")
457
+ if hasattr(e, "stdout") and e.stdout:
458
+ print(f" stdout: {e.stdout}")
439
459
  return False
440
460
  finally:
441
461
  # Clean up temporary file
@@ -482,7 +482,8 @@ class LeptonExecutor(BaseExecutor):
482
482
 
483
483
  if not job_success:
484
484
  raise RuntimeError(
485
- f"Failed to submit Lepton job for task: {task.name}. Error: {error_msg}"
485
+ f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
486
+ f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
486
487
  )
487
488
 
488
489
  # Store job metadata in database (with task-specific endpoint info)
@@ -504,8 +505,6 @@ class LeptonExecutor(BaseExecutor):
504
505
  )
505
506
  )
506
507
 
507
- print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
508
-
509
508
  # Jobs submitted successfully - return immediately (non-blocking)
510
509
  print(
511
510
  f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
@@ -536,9 +535,8 @@ class LeptonExecutor(BaseExecutor):
536
535
 
537
536
  return invocation_id
538
537
 
539
- except Exception as e:
538
+ except Exception:
540
539
  # Clean up any created endpoints on failure
541
- print(f"❌ Error during evaluation: {e}")
542
540
  if cfg.deployment.type != "none" and "endpoint_names" in locals():
543
541
  for endpoint_name in endpoint_names:
544
542
  if endpoint_name:
@@ -23,6 +23,7 @@ import os
23
23
  import pathlib
24
24
  import platform
25
25
  import shlex
26
+ import shutil
26
27
  import subprocess
27
28
  import time
28
29
  from typing import List, Optional
@@ -76,6 +77,13 @@ class LocalExecutor(BaseExecutor):
76
77
  f"type {cfg.deployment.type} is not implemented -- add deployment support"
77
78
  )
78
79
 
80
+ # Check if docker is available (skip in dry_run mode)
81
+ if not dry_run and shutil.which("docker") is None:
82
+ raise RuntimeError(
83
+ "Docker is not installed or not in PATH. "
84
+ "Please install Docker to run local evaluations."
85
+ )
86
+
79
87
  # Generate invocation ID for this evaluation run
80
88
  invocation_id = generate_invocation_id()
81
89
 
@@ -233,35 +241,48 @@ class LocalExecutor(BaseExecutor):
233
241
  # To ensure subprocess continues after python exits:
234
242
  # - on Unix-like systems, to fully detach the subprocess
235
243
  # so it does not die when Python exits, pass start_new_session=True;
236
- # - on Widnows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
244
+ # - on Windows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
237
245
  os_name = platform.system()
246
+ processes = []
247
+
238
248
  if is_execution_mode_sequential:
239
249
  if os_name == "Windows":
240
- subprocess.Popen(
250
+ proc = subprocess.Popen(
241
251
  shlex.split("bash run_all.sequential.sh"),
242
252
  cwd=output_dir,
243
253
  creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
244
254
  )
245
255
  else:
246
- subprocess.Popen(
256
+ proc = subprocess.Popen(
247
257
  shlex.split("bash run_all.sequential.sh"),
248
258
  cwd=output_dir,
249
259
  start_new_session=True,
250
260
  )
261
+ processes.append(("run_all.sequential.sh", proc, output_dir))
251
262
  else:
252
263
  for task in cfg.evaluation.tasks:
253
264
  if os_name == "Windows":
254
- subprocess.Popen(
265
+ proc = subprocess.Popen(
255
266
  shlex.split("bash run.sh"),
256
267
  cwd=output_dir / task.name,
257
268
  creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
258
269
  )
259
270
  else:
260
- subprocess.Popen(
271
+ proc = subprocess.Popen(
261
272
  shlex.split("bash run.sh"),
262
273
  cwd=output_dir / task.name,
263
274
  start_new_session=True,
264
275
  )
276
+ processes.append((task.name, proc, output_dir / task.name))
277
+
278
+ # Wait briefly and check if bash scripts exited immediately (which means error)
279
+ time.sleep(0.3)
280
+
281
+ for name, proc, work_dir in processes:
282
+ exit_code = proc.poll()
283
+ if exit_code is not None and exit_code != 0:
284
+ error_msg = f"Script for {name} exited with code {exit_code}"
285
+ raise RuntimeError(f"Job startup failed | {error_msg}")
265
286
 
266
287
  print("\nCommands for real-time monitoring:")
267
288
  for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
@@ -174,10 +174,11 @@ class SlurmExecutor(BaseExecutor):
174
174
  for idx, (slurm_job_id, remote_runsub_path) in enumerate(
175
175
  zip(slurm_job_ids, remote_runsub_paths)
176
176
  ):
177
+ job_id = generate_job_id(invocation_id, idx)
177
178
  db.write_job(
178
179
  job=JobData(
179
180
  invocation_id=invocation_id,
180
- job_id=generate_job_id(invocation_id, idx),
181
+ job_id=job_id,
181
182
  timestamp=time.time(),
182
183
  executor="slurm",
183
184
  data={
@@ -204,7 +205,7 @@ class SlurmExecutor(BaseExecutor):
204
205
  """
205
206
  db = ExecutionDB()
206
207
 
207
- # If id looks like an invocation_id (no dot), get all jobs for it
208
+ # If id looks like an invocation_id
208
209
  if "." not in id:
209
210
  jobs = db.get_jobs(id)
210
211
  if not jobs:
@@ -605,20 +606,27 @@ def _create_slurm_sbatch_script(
605
606
  s += "kill $SERVER_PID # terminate the server to finish gracefully\n\n"
606
607
 
607
608
  # auto-export
608
- if cfg.execution.get("auto_export", {}).get("destinations", []):
609
- s += _generate_auto_export_section(cfg, job_id)
609
+ ae_cfg = cfg.execution.get("auto_export")
610
+ destinations: list = []
611
+ if isinstance(ae_cfg, list):
612
+ destinations = list(ae_cfg)
613
+ elif isinstance(ae_cfg, dict) or isinstance(ae_cfg, DictConfig):
614
+ destinations = list(ae_cfg.get("destinations", []) or [])
615
+
616
+ if destinations:
617
+ export_env = dict(cfg.execution.get("env_vars", {}).get("export", {}) or {})
618
+ s += _generate_auto_export_section(cfg, job_id, destinations, export_env)
610
619
 
611
620
  return s
612
621
 
613
622
 
614
623
  def _generate_auto_export_section(
615
624
  cfg: DictConfig,
616
- job_id: str, # Complete job_id string
625
+ job_id: str,
626
+ destinations: list,
627
+ export_env: dict,
617
628
  ) -> str:
618
629
  """Generate simple auto-export section for sbatch script."""
619
- auto_export_config = cfg.execution.get("auto_export", {})
620
- destinations = auto_export_config.get("destinations", [])
621
-
622
630
  if not destinations:
623
631
  return ""
624
632
 
@@ -626,18 +634,65 @@ def _generate_auto_export_section(
626
634
  s += "EVAL_EXIT_CODE=$?\n"
627
635
  s += "if [ $EVAL_EXIT_CODE -eq 0 ]; then\n"
628
636
  s += " echo 'Evaluation completed successfully. Starting auto-export...'\n"
629
- s += " set +e\n" # per exporter failure allowed
637
+ s += " set +e\n"
630
638
  s += " set +x\n"
639
+ s += " set +u\n"
631
640
  s += ' cd "$TASK_DIR/artifacts"\n'
632
- auto_export_cfg = OmegaConf.to_container(
633
- cfg.execution.get("auto_export", {}), resolve=True
641
+
642
+ # Work with DictConfig; convert only for YAML at the end
643
+ exec_type = (
644
+ cfg.execution.type
645
+ if hasattr(cfg.execution, "type")
646
+ else cfg.execution.get("type", "slurm")
634
647
  )
635
- yaml_str = yaml.safe_dump(
636
- {"execution": {"auto_export": auto_export_cfg}}, sort_keys=False
648
+ eval_tasks = (
649
+ list(cfg.evaluation.tasks)
650
+ if hasattr(cfg, "evaluation") and hasattr(cfg.evaluation, "tasks")
651
+ else list((cfg.get("evaluation", {}) or {}).get("tasks", []) or [])
637
652
  )
653
+ export_block = cfg.get("export", {}) or {}
654
+
655
+ payload = {
656
+ "execution": {
657
+ "auto_export": {
658
+ "destinations": list(destinations),
659
+ **({"env_vars": dict(export_env)} if export_env else {}),
660
+ },
661
+ "type": exec_type,
662
+ },
663
+ "evaluation": {"tasks": eval_tasks},
664
+ }
665
+ if export_block:
666
+ # Convert just this block to plain for YAML
667
+ payload["export"] = (
668
+ OmegaConf.to_object(export_block)
669
+ if OmegaConf.is_config(export_block)
670
+ else dict(export_block)
671
+ )
672
+
673
+ # Final YAML (single conversion at the end)
674
+ payload_clean = OmegaConf.to_container(OmegaConf.create(payload), resolve=True)
675
+ yaml_str = yaml.safe_dump(payload_clean, sort_keys=False)
638
676
  s += " cat > export_config.yml << 'EOF'\n"
639
677
  s += yaml_str
640
678
  s += "EOF\n"
679
+
680
+ # write launcher config as config.yml for exporters (no core command)
681
+ submitted_yaml = yaml.safe_dump(
682
+ OmegaConf.to_container(cfg, resolve=True), sort_keys=False
683
+ )
684
+ s += " cat > config.yml << 'EOF'\n"
685
+ s += submitted_yaml
686
+ s += "EOF\n"
687
+
688
+ # Export host-only env vars before running auto-export
689
+ for k, v in (export_env or {}).items():
690
+ if isinstance(v, str) and re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", v):
691
+ s += f' export {k}="${{{v}}}"\n'
692
+ else:
693
+ esc = str(v).replace('"', '\\"')
694
+ s += f' export {k}="{esc}"\n'
695
+
641
696
  for dest in destinations:
642
697
  s += f" echo 'Exporting to {dest}...'\n"
643
698
  s += f" nemo-evaluator-launcher export {job_id} --dest {dest} || echo 'Export to {dest} failed'\n"
@@ -656,7 +711,9 @@ def _open_master_connection(
656
711
  socket: str,
657
712
  ) -> str | None:
658
713
  ssh_command = f"ssh -MNf -S {socket} {username}@{hostname}"
659
- completed_process = subprocess.run(args=shlex.split(ssh_command))
714
+ completed_process = subprocess.run(
715
+ args=shlex.split(ssh_command), capture_output=True
716
+ )
660
717
  if completed_process.returncode == 0:
661
718
  return socket
662
719
  return None
@@ -694,12 +751,17 @@ def _make_remote_execution_output_dir(
694
751
  ssh_command.append(f"{username}@{hostname}")
695
752
  ssh_command.append(mkdir_command)
696
753
  ssh_command = " ".join(ssh_command)
697
- completed_process = subprocess.run(args=shlex.split(ssh_command))
754
+ completed_process = subprocess.run(
755
+ args=shlex.split(ssh_command), capture_output=True
756
+ )
698
757
  if completed_process.returncode != 0:
758
+ error_msg = (
759
+ completed_process.stderr.decode("utf-8")
760
+ if completed_process.stderr
761
+ else "Unknown error"
762
+ )
699
763
  raise RuntimeError(
700
- "failed to make a remote execution output dir\n{}".format(
701
- completed_process.stderr.decode("utf-8")
702
- )
764
+ "failed to make a remote execution output dir\n{}".format(error_msg)
703
765
  )
704
766
 
705
767
 
@@ -725,13 +787,16 @@ def _rsync_upload_rundirs(
725
787
  remote_destination_str = f"{username}@{hostname}:{remote_target}"
726
788
  local_sources_str = " ".join(map(str, local_sources))
727
789
  rsync_upload_command = f"rsync -qcaz {local_sources_str} {remote_destination_str}"
728
- completed_process = subprocess.run(args=shlex.split(rsync_upload_command))
790
+ completed_process = subprocess.run(
791
+ args=shlex.split(rsync_upload_command), capture_output=True
792
+ )
729
793
  if completed_process.returncode != 0:
730
- raise RuntimeError(
731
- "failed to upload local sources\n{}".format(
732
- completed_process.stderr.decode("utf-8")
733
- )
794
+ error_msg = (
795
+ completed_process.stderr.decode("utf-8")
796
+ if completed_process.stderr
797
+ else "Unknown error"
734
798
  )
799
+ raise RuntimeError("failed to upload local sources\n{}".format(error_msg))
735
800
 
736
801
 
737
802
  def _sbatch_remote_runsubs(
@@ -757,10 +822,9 @@ def _sbatch_remote_runsubs(
757
822
  args=shlex.split(ssh_command), capture_output=True
758
823
  )
759
824
  if completed_process.returncode != 0:
825
+ error_msg = completed_process.stderr.decode("utf-8")
760
826
  raise RuntimeError(
761
- "failed to submit sbatch scripts for execution\n{}".format(
762
- completed_process.stderr.decode("utf-8")
763
- )
827
+ "failed to submit sbatch scripts for execution\n{}".format(error_msg)
764
828
  )
765
829
 
766
830
  sbatch_output = completed_process.stdout.decode("utf-8")