nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic.

Files changed (57)
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,491 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ """Local executor implementation for nemo-evaluator-launcher.
+
+ Handles running evaluation jobs locally using shell scripts and Docker containers.
+ """
+
+ import copy
+ import os
+ import pathlib
+ import platform
+ import shlex
+ import subprocess
+ import time
+ from typing import List, Optional
+
+ import jinja2
+ import yaml
+ from omegaconf import DictConfig, OmegaConf
+
+ from nemo_evaluator_launcher.common.execdb import (
+     ExecutionDB,
+     JobData,
+     generate_invocation_id,
+     generate_job_id,
+ )
+ from nemo_evaluator_launcher.common.helpers import (
+     get_eval_factory_command,
+     get_eval_factory_dataset_size_from_run_config,
+     get_timestamp_string,
+ )
+ from nemo_evaluator_launcher.common.mapping import (
+     get_task_from_mapping,
+     load_tasks_mapping,
+ )
+ from nemo_evaluator_launcher.executors.base import (
+     BaseExecutor,
+     ExecutionState,
+     ExecutionStatus,
+ )
+ from nemo_evaluator_launcher.executors.registry import register_executor
+
+
+ @register_executor("local")
+ class LocalExecutor(BaseExecutor):
+     @classmethod
+     def execute_eval(cls, cfg: DictConfig, dry_run: bool = False) -> str:
+         """Run evaluation jobs locally using the provided configuration.
+
+         Args:
+             cfg: The configuration object for the evaluation run.
+             dry_run: If True, prepare scripts and save them without execution.
+
+         Returns:
+             str: The invocation ID for the evaluation run.
+
+         Raises:
+             NotImplementedError: If deployment is not 'none'.
+             RuntimeError: If the run script fails.
+         """
+         if cfg.deployment.type != "none":
+             raise NotImplementedError(
+                 f"type {cfg.deployment.type} is not implemented -- add deployment support"
+             )
+
+         # Generate invocation ID for this evaluation run
+         invocation_id = generate_invocation_id()
+
+         output_dir = pathlib.Path(cfg.execution.output_dir).absolute() / (
+             get_timestamp_string(include_microseconds=False) + "-" + invocation_id
+         )
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         tasks_mapping = load_tasks_mapping()
+         evaluation_tasks = []
+         job_ids = []
+
+         eval_template = jinja2.Template(
+             open(pathlib.Path(__file__).parent / "run.template.sh", "r").read()
+         )
+
+         execution_mode = cfg.execution.get("mode", "parallel")
+         if execution_mode == "parallel":
+             is_execution_mode_sequential = False
+         elif execution_mode == "sequential":
+             is_execution_mode_sequential = True
+         else:
+             raise ValueError(
+                 "unknown execution mode: {}. Choose one of {}".format(
+                     repr(execution_mode), ["parallel", "sequential"]
+                 )
+             )
+
+         for idx, task in enumerate(cfg.evaluation.tasks):
+             task_definition = get_task_from_mapping(task.name, tasks_mapping)
+
+             # Create job ID as <invocation_id>.<n>
+             job_id = generate_job_id(invocation_id, idx)
+             job_ids.append(job_id)
+             container_name = f"{task.name}-{get_timestamp_string()}"
+
+             # collect all env vars
+             env_vars = copy.deepcopy(dict(cfg.evaluation.get("env_vars", {})))
+             env_vars.update(task.get("env_vars", {}))
+             if cfg.target.api_endpoint.api_key_name:
+                 assert "API_KEY" not in env_vars
+                 env_vars["API_KEY"] = cfg.target.api_endpoint.api_key_name
+
+             # check if the environment variables are set
+             for env_var in env_vars.values():
+                 if os.getenv(env_var) is None:
+                     raise ValueError(
+                         f"Trying to pass an unset environment variable {env_var}."
+                     )
+
+             # check if required env vars are defined:
+             for required_env_var in task_definition.get("required_env_vars", []):
+                 if required_env_var not in env_vars.keys():
+                     raise ValueError(
+                         f"{task.name} task requires environment variable {required_env_var}."
+                         " Specify it in the task subconfig in the 'env_vars' dict as the following"
+                         f" pair {required_env_var}: YOUR_ENV_VAR_NAME"
+                     )
+
+             # format env_vars for a template
+             env_vars = [
+                 f"{env_var_dst}=${env_var_src}"
+                 for env_var_dst, env_var_src in env_vars.items()
+             ]
+
+             eval_image = task_definition["container"]
+             if "container" in task:
+                 eval_image = task["container"]
+
+             task_output_dir = output_dir / task.name
+             task_output_dir.mkdir(parents=True, exist_ok=True)
+             evaluation_task = {
+                 "name": task.name,
+                 "job_id": job_id,
+                 "eval_image": eval_image,
+                 "container_name": container_name,
+                 "env_vars": env_vars,
+                 "output_dir": task_output_dir,
+                 "eval_factory_command": get_eval_factory_command(
+                     cfg, task, task_definition
+                 ),
+             }
+             evaluation_tasks.append(evaluation_task)
+
+             # Check if auto-export is enabled by presence of destination(s)
+             auto_export_config = cfg.execution.get("auto_export", {})
+             auto_export_destinations = auto_export_config.get("destinations", [])
+
+             run_sh_content = (
+                 eval_template.render(
+                     evaluation_tasks=[evaluation_task],
+                     auto_export_destinations=auto_export_destinations,
+                 ).rstrip("\n")
+                 + "\n"
+             )
+
+             (task_output_dir / "run.sh").write_text(run_sh_content)
+
+         run_all_sequentially_sh_content = (
+             eval_template.render(
+                 evaluation_tasks=evaluation_tasks,
+                 auto_export_destinations=auto_export_destinations,
+             ).rstrip("\n")
+             + "\n"
+         )
+         (output_dir / "run_all.sequential.sh").write_text(
+             run_all_sequentially_sh_content
+         )
+
+         # Save launched jobs metadata
+         db = ExecutionDB()
+         for job_id, task, evaluation_task in zip(
+             job_ids, cfg.evaluation.tasks, evaluation_tasks
+         ):
+             db.write_job(
+                 job=JobData(
+                     invocation_id=invocation_id,
+                     job_id=job_id,
+                     timestamp=time.time(),
+                     executor="local",
+                     data={
+                         "output_dir": str(evaluation_task["output_dir"]),
+                         "container": evaluation_task["container_name"],
+                         "eval_image": evaluation_task["eval_image"],
+                     },
+                     config=OmegaConf.to_object(cfg),
+                 )
+             )
+
+         if dry_run:
+             print("\n\n=============================================\n\n")
+             print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
+             if is_execution_mode_sequential:
+                 print(
+                     "\n\n =========== Main script | run_all.sequential.sh ===================== \n\n"
+                 )
+                 with open(output_dir / "run_all.sequential.sh", "r") as f:
+                     print(f.read())
+             else:
+                 for idx, task in enumerate(cfg.evaluation.tasks):
+                     task_output_dir = output_dir / task.name
+                     print(
+                         f"\n\n =========== Task script | {task.name}/run.sh ===================== \n\n"
+                     )
+                     with open(task_output_dir / "run.sh", "r") as f:
+                         print(f.read())
+             print("\nTo execute, run without --dry-run")
+             return invocation_id
+
+         # Launch bash scripts with Popen for non-blocking execution.
+         # To ensure the subprocess continues after Python exits:
+         # - on Unix-like systems, to fully detach the subprocess
+         #   so it does not die when Python exits, pass start_new_session=True;
+         # - on Windows, use the creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
+         os_name = platform.system()
+         if is_execution_mode_sequential:
+             if os_name == "Windows":
+                 subprocess.Popen(
+                     shlex.split("bash run_all.sequential.sh"),
+                     cwd=output_dir,
+                     creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
+                 )
+             else:
+                 subprocess.Popen(
+                     shlex.split("bash run_all.sequential.sh"),
+                     cwd=output_dir,
+                     start_new_session=True,
+                 )
+         else:
+             for task in cfg.evaluation.tasks:
+                 if os_name == "Windows":
+                     subprocess.Popen(
+                         shlex.split("bash run.sh"),
+                         cwd=output_dir / task.name,
+                         creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
+                     )
+                 else:
+                     subprocess.Popen(
+                         shlex.split("bash run.sh"),
+                         cwd=output_dir / task.name,
+                         start_new_session=True,
+                     )
+
+         print("\nCommands for real-time monitoring:")
+         for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
+             log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
+             print(f" tail -f {log_file}")
+
+         print("\nFollow all logs for this invocation:")
+         print(f" tail -f {output_dir}/*/logs/stdout.log")
+
+         return invocation_id
+
+     @staticmethod
+     def get_status(id: str) -> List[ExecutionStatus]:
+         """Get the status of a specific job or all jobs in an invocation group.
+
+         Args:
+             id: Unique job identifier or invocation identifier.
+
+         Returns:
+             List containing the execution status for the job(s).
+         """
+         db = ExecutionDB()
+
+         # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
+         if len(id) == 8 and "." not in id:
+             jobs = db.get_jobs(id)
+             statuses: List[ExecutionStatus] = []
+             for job_id, _ in jobs.items():
+                 statuses.extend(LocalExecutor.get_status(job_id))
+             return statuses
+
+         # Otherwise, treat as job_id
+         job_data = db.get_job(id)
+         if job_data is None:
+             return []
+         if job_data.executor != "local":
+             return []
+
+         output_dir = pathlib.Path(job_data.data.get("output_dir", ""))
+         if not output_dir.exists():
+             return [ExecutionStatus(id=id, state=ExecutionState.PENDING)]
+
+         artifacts_dir = output_dir / "artifacts"
+         progress = _get_progress(artifacts_dir)
+
+         logs_dir = output_dir / "logs"
+         if not logs_dir.exists():
+             return [
+                 ExecutionStatus(
+                     id=id,
+                     state=ExecutionState.PENDING,
+                     progress=dict(progress=progress),
+                 )
+             ]
+
+         # Check if job was killed
+         if job_data.data.get("killed", False):
+             return [
+                 ExecutionStatus(
+                     id=id, state=ExecutionState.KILLED, progress=dict(progress=progress)
+                 )
+             ]
+
+         stage_files = {
+             "pre_start": logs_dir / "stage.pre-start",
+             "running": logs_dir / "stage.running",
+             "exit": logs_dir / "stage.exit",
+         }
+
+         if stage_files["exit"].exists():
+             try:
+                 content = stage_files["exit"].read_text().strip()
+                 if " " in content:
+                     timestamp, exit_code_str = content.rsplit(" ", 1)
+                     exit_code = int(exit_code_str)
+                     if exit_code == 0:
+                         return [
+                             ExecutionStatus(
+                                 id=id,
+                                 state=ExecutionState.SUCCESS,
+                                 progress=dict(progress=progress),
+                             )
+                         ]
+                     else:
+                         return [
+                             ExecutionStatus(
+                                 id=id,
+                                 state=ExecutionState.FAILED,
+                                 progress=dict(progress=progress),
+                             )
+                         ]
+                 else:
+                     return [
+                         ExecutionStatus(
+                             id=id,
+                             state=ExecutionState.FAILED,
+                             progress=dict(progress=progress),
+                         )
+                     ]
+             except (ValueError, OSError):
+                 return [
+                     ExecutionStatus(
+                         id=id,
+                         state=ExecutionState.FAILED,
+                         progress=dict(progress=progress),
+                     )
+                 ]
+         elif stage_files["running"].exists():
+             return [
+                 ExecutionStatus(
+                     id=id,
+                     state=ExecutionState.RUNNING,
+                     progress=dict(progress=progress),
+                 )
+             ]
+         elif stage_files["pre_start"].exists():
+             return [
+                 ExecutionStatus(
+                     id=id,
+                     state=ExecutionState.PENDING,
+                     progress=dict(progress=progress),
+                 )
+             ]
+
+         return [
+             ExecutionStatus(
+                 id=id, state=ExecutionState.PENDING, progress=dict(progress=progress)
+             )
+         ]
+
+     @staticmethod
+     def kill_job(job_id: str) -> None:
+         """Kill a local job by stopping its Docker container and related processes.
+
+         Args:
+             job_id: The job ID to kill.
+
+         Raises:
+             ValueError: If job is not found or invalid.
+             RuntimeError: If Docker container cannot be stopped.
+         """
+         db = ExecutionDB()
+         job_data = db.get_job(job_id)
+
+         if job_data is None:
+             raise ValueError(f"Job {job_id} not found")
+
+         if job_data.executor != "local":
+             raise ValueError(
+                 f"Job {job_id} is not a local job (executor: {job_data.executor})"
+             )
+
+         # Get container name from database
+         container_name = job_data.data.get("container")
+         if not container_name:
+             raise ValueError(f"No container name found for job {job_id}")
+
+         killed_something = False
+
+         # First, try to stop the Docker container if it's running
+         result = subprocess.run(
+             shlex.split(f"docker stop {container_name}"),
+             capture_output=True,
+             text=True,
+             timeout=30,
+         )
+         if result.returncode == 0:
+             killed_something = True
+         # Don't raise an error if the container doesn't exist (the image might still be pulling)
+
+         # Find and kill Docker processes for this container
+         result = subprocess.run(
+             shlex.split(f"pkill -f 'docker run.*{container_name}'"),
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode == 0:
+             killed_something = True
+
+         # Mark job as killed in database if we killed something
+         if killed_something:
+             job_data.data["killed"] = True
+             db.write_job(job_data)
+         else:
+             raise RuntimeError(
+                 f"Could not find or kill job {job_id} (container: {container_name})"
+             )
+
+
+ def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:
+     """Get the progress of a local job.
+
+     Args:
+         artifacts_dir: The directory containing the evaluation artifacts.
+
+     Returns:
+         The progress of the job as a float between 0 and 1.
+     """
+     progress_filepath = artifacts_dir / "progress"
+     if not progress_filepath.exists():
+         return None
+     progress_str = progress_filepath.read_text().strip()
+     try:
+         processed_samples = int(progress_str)
+     except ValueError:
+         return None
+
+     dataset_size = _get_dataset_size(artifacts_dir)
+     if dataset_size is not None:
+         progress = processed_samples / dataset_size
+     else:
+         # NOTE(dfridman): if we don't know the dataset size, report the number of processed samples
+         progress = processed_samples
+     return progress
+
+
+ def _get_dataset_size(artifacts_dir: pathlib.Path) -> Optional[int]:
+     """Get the dataset size for a benchmark.
+
+     Args:
+         artifacts_dir: The directory containing the evaluation artifacts.
+
+     Returns:
+         The dataset size for the benchmark.
+     """
+     run_config = artifacts_dir / "run_config.yml"
+     if not run_config.exists():
+         return None
+     run_config = yaml.safe_load(run_config.read_text())
+     return get_eval_factory_dataset_size_from_run_config(run_config)
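For orientation, a minimal sketch (not part of the package) of driving the local executor above directly with a hand-built DictConfig. The field names mirror the attributes execute_eval() reads; the task name, output path, and env-var name are placeholders, and the full schema (see configs/default.yaml) likely requires additional target/endpoint fields consumed by get_eval_factory_command:

from omegaconf import OmegaConf

from nemo_evaluator_launcher.executors.local.executor import LocalExecutor

# Placeholder config covering only the fields execute_eval() reads directly.
cfg = OmegaConf.create(
    {
        "deployment": {"type": "none"},  # anything else raises NotImplementedError
        "execution": {"output_dir": "results", "mode": "parallel"},
        # api_key_name is the *name* of an env var that must be set in this shell
        "target": {"api_endpoint": {"api_key_name": "MY_API_KEY"}},
        # placeholder task; the name must exist in resources/mapping.toml
        "evaluation": {"tasks": [{"name": "mmlu"}]},
    }
)

# dry_run=True renders and saves run.sh / run_all.sequential.sh without launching Docker.
invocation_id = LocalExecutor.execute_eval(cfg, dry_run=True)

# Status is derived from the stage.pre-start / stage.running / stage.exit marker files.
print(LocalExecutor.get_status(invocation_id))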
@@ -0,0 +1,88 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ # check if docker exists
+ command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }
+
+ {% for task in evaluation_tasks %}
+ # {{ task.job_id }} {{ task.name }}
+
+ task_dir="{{ task.output_dir }}"
+ artifacts_dir="$task_dir/artifacts"
+ logs_dir="$task_dir/logs"
+
+ mkdir -m 777 -p "$task_dir"
+ mkdir -m 777 -p "$artifacts_dir"
+ mkdir -m 777 -p "$logs_dir"
+
+ # Create pre-start stage file
+ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
+
+ # Docker run with eval factory command
+ (
+   echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
+   docker run --rm --shm-size=100g \
+     --name {{ task.container_name }} \
+     --volume "$artifacts_dir":/results \
+     {% for env_var in task.env_vars -%}
+     -e {{ env_var }} \
+     {% endfor -%}
+     {{ task.eval_image }} \
+     bash -c '
+       {{ task.eval_factory_command }} ;
+       exit_code=$?
+       chmod 777 -R /results;
+       if [ "$exit_code" -ne 0 ]; then
+         echo "The evaluation container failed with exit code $exit_code" >&2;
+         exit "$exit_code";
+       fi;
+       echo "Container completed successfully" >&2;
+       exit 0;
+     ' > "$logs_dir/stdout.log" 2>&1
+   exit_code=$?
+   echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $exit_code" > "$logs_dir/stage.exit"
+ ) >> "$logs_dir/stdout.log" 2>&1
+
+
+ {% if auto_export_destinations %}
+ # Monitor job completion and auto-export
+ (
+   # Give it a moment to ensure file is fully written
+   sleep 1
+
+   exit_code=$(tail -1 "$logs_dir/stage.exit" | cut -d' ' -f2)
+   if [ "$exit_code" = "0" ]; then
+     # Log auto-export activity to task logs
+     echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} completed successfully. Starting auto-export..." >> "$logs_dir/stdout.log"
+
+     {% for dest in auto_export_destinations %}
+     echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Exporting job {{ task.job_id }} to {{ dest }}..." >> "$logs_dir/stdout.log"
+     nemo-evaluator-launcher export {{ task.job_id }} --dest {{ dest }} >> "$logs_dir/stdout.log" 2>&1
+     if [ $? -eq 0 ]; then
+       echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Export to {{ dest }} completed successfully" >> "$logs_dir/stdout.log"
+     else
+       echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Export to {{ dest }} failed" >> "$logs_dir/stdout.log"
+     fi
+     {% endfor %}
+
+     echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Auto-export completed for job {{ task.job_id }}" >> "$logs_dir/stdout.log"
+   else
+     echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} failed with exit code $exit_code. Skipping auto-export." >> "$logs_dir/stdout.log"
+   fi
+ )
+
+ {% endif %}
+ {% endfor %}
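The executor renders this template twice: once per task to produce each <task>/run.sh, and once with the full task list to produce run_all.sequential.sh. A standalone sketch of that rendering, with placeholder task values whose keys match the evaluation_task dict assembled in execute_eval above (the template path assumes the file's location inside an installed copy of the package):

import pathlib

import jinja2

# Path assumption: the template as it ships inside the package.
template_path = pathlib.Path("nemo_evaluator_launcher/executors/local/run.template.sh")
template = jinja2.Template(template_path.read_text())

# Placeholder values; keys mirror the dict built in LocalExecutor.execute_eval().
task = {
    "name": "mmlu",                                # placeholder task name
    "job_id": "abcd1234.0",                        # <invocation_id>.<n>
    "eval_image": "example.registry/eval:latest",  # placeholder container image
    "container_name": "mmlu-20250101-000000",
    "env_vars": ["API_KEY=$MY_API_KEY"],           # DST=$SRC pairs, expanded by bash
    "output_dir": "/tmp/results/mmlu",
    "eval_factory_command": "echo 'run evaluation here'",  # placeholder command
}

# One task -> run.sh; the full task list -> run_all.sequential.sh.
print(template.render(evaluation_tasks=[task], auto_export_destinations=[]))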
@@ -0,0 +1,38 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ from typing import Callable, Type
+
+ from nemo_evaluator_launcher.executors.base import BaseExecutor
+
+ _EXECUTOR_REGISTRY: dict[str, Type[BaseExecutor]] = {}
+
+
+ def register_executor(
+     executor_name: str,
+ ) -> Callable[[Type[BaseExecutor]], Type[BaseExecutor]]:
+     def wrapper(executor_cls: Type[BaseExecutor]) -> Type[BaseExecutor]:
+         _EXECUTOR_REGISTRY[executor_name] = executor_cls
+         return executor_cls
+
+     return wrapper
+
+
+ def get_executor(executor_name: str) -> Type[BaseExecutor]:
+     if executor_name not in _EXECUTOR_REGISTRY:
+         raise ValueError(
+             f"Executor {executor_name} not found. Available executors: {list(_EXECUTOR_REGISTRY.keys())}"
+         )
+     return _EXECUTOR_REGISTRY[executor_name]
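A usage sketch for this registry (the "noop" executor is hypothetical, for illustration only): decorating a BaseExecutor subclass registers it under a name, and get_executor() resolves that name back to the class, or raises ValueError listing what is available:

from nemo_evaluator_launcher.executors.base import BaseExecutor
from nemo_evaluator_launcher.executors.registry import get_executor, register_executor


@register_executor("noop")  # hypothetical executor, not shipped in the package
class NoopExecutor(BaseExecutor):
    @classmethod
    def execute_eval(cls, cfg, dry_run=False):
        return "00000000"  # a fake invocation id


# Lookup returns the registered class itself.
assert get_executor("noop") is NoopExecutor

# Unknown names raise with the list of registered executors.
try:
    get_executor("does-not-exist")
except ValueError as err:
    print(err)  # "Executor does-not-exist not found. Available executors: [...]"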
@@ -0,0 +1,15 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #