nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/exporters/wandb.py

@@ -19,7 +19,7 @@ import os
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import yaml

@@ -38,6 +38,7 @@ from nemo_evaluator_launcher.exporters.registry import register_exporter
 from nemo_evaluator_launcher.exporters.utils import (
     extract_accuracy_metrics,
     extract_exporter_config,
+    get_artifact_root,
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
@@ -67,10 +68,41 @@ class WandBExporter(BaseExporter):
             "log_mode", "per_task"
         )  # Default per_task for immediate export

-        # Get metrics
-        metrics = extract_accuracy_metrics(
-            job_data, self.get_job_paths, wandb_config.get("log_metrics", [])
-        )
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="wandb_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": wandb_config.get("log_logs", False),
+                        "only_required": wandb_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"W&B: staging failed for {job_data.job_id}: {e}")
+
+        # Metrics (prefer staged if available)
+        log_metrics = wandb_config.get("log_metrics", [])
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
+
         if not metrics:
             return ExportResult(
                 success=False, dest="wandb", message="No metrics found"
@@ -163,29 +195,92 @@ class WandBExporter(BaseExporter):
             return {"success": False, "error": f"W&B export failed: {str(e)}"}

     def _log_artifacts(
-        self, job_data: JobData, wandb_config: Dict[str, Any], artifact
+        self,
+        job_data: JobData,
+        wandb_config: Dict[str, Any],
+        artifact,
+        register_staging_dir=None,
     ) -> List[str]:
-        """Log evaluation artifacts to WandB using LocalExporter for transfer."""
+        """Log evaluation artifacts to WandB using LocalExporter for staging."""
         if not wandb_config.get("log_artifacts", True):
             return []
         try:
             temp_dir = tempfile.mkdtemp(prefix="wandb_artifacts_")
-            local_exporter = LocalExporter({"output_dir": temp_dir})
+            if callable(register_staging_dir):
+                register_staging_dir(temp_dir)
+            local_exporter = LocalExporter(
+                {
+                    "output_dir": temp_dir,
+                    "copy_logs": wandb_config.get(
+                        "log_logs", wandb_config.get("copy_logs", False)
+                    ),
+                    "only_required": wandb_config.get("only_required", True),
+                    "format": wandb_config.get("format"),
+                    "log_metrics": wandb_config.get("log_metrics", []),
+                    "output_filename": wandb_config.get("output_filename"),
+                }
+            )
             local_result = local_exporter.export_job(job_data)

             if not local_result.success:
                 logger.error(f"Failed to download artifacts: {local_result.message}")
                 return []

-            artifacts_dir = Path(local_result.dest) / "artifacts"
-            logged_names = []
-            task_name = get_task_name(job_data)
-            for fname in get_available_artifacts(artifacts_dir):
-                fpath = artifacts_dir / fname
-                if fpath.exists():
-                    artifact.add_file(str(fpath), name=f"{task_name}/{fname}")
-                    logged_names.append(fname)
-            shutil.rmtree(temp_dir)
+            base_dir = Path(local_result.dest)
+            artifacts_dir = base_dir / "artifacts"
+            logs_dir = base_dir / "logs"
+            logged_names: list[str] = []
+
+            artifact_root = get_artifact_root(job_data)  # "<harness>.<benchmark>"
+
+            # Add config file only when artifacts logging is enabled
+            if wandb_config.get("log_artifacts", True):
+                cfg_added = False
+                for fname in ("config.yml", "run_config.yml"):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        artifact.add_file(str(p), name=f"{artifact_root}/{fname}")
+                        logged_names.append(fname)
+                        cfg_added = True
+                        break
+                if not cfg_added:
+                    with tempfile.NamedTemporaryFile(
+                        "w", suffix=".yaml", delete=False
+                    ) as tmp_cfg:
+                        yaml.dump(
+                            job_data.config or {},
+                            tmp_cfg,
+                            default_flow_style=False,
+                            sort_keys=False,
+                        )
+                        cfg_path = tmp_cfg.name
+                    artifact.add_file(cfg_path, name=f"{artifact_root}/config.yaml")
+                    os.unlink(cfg_path)
+                    logged_names.append("config.yaml")
+
+            files_to_upload: list[Path] = []
+            if wandb_config.get("only_required", True):
+                for fname in get_available_artifacts(artifacts_dir):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        files_to_upload.append(p)
+            else:
+                for p in artifacts_dir.iterdir():
+                    if p.is_file():
+                        files_to_upload.append(p)
+
+            for fpath in files_to_upload:
+                rel = fpath.relative_to(artifacts_dir).as_posix()
+                artifact.add_file(str(fpath), name=f"{artifact_root}/artifacts/{rel}")
+                logged_names.append(rel)
+
+            if wandb_config.get("log_logs", False) and logs_dir.exists():
+                for p in logs_dir.rglob("*"):
+                    if p.is_file():
+                        rel = p.relative_to(logs_dir).as_posix()
+                        artifact.add_file(str(p), name=f"{artifact_root}/logs/{rel}")
+                        logged_names.append(f"logs/{rel}")
+
             return logged_names
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
@@ -193,7 +288,7 @@ class WandBExporter(BaseExporter):

     def _check_existing_run(
         self, identifier: str, job_data: JobData, config: Dict[str, Any]
-    ) -> tuple[bool, str]:
+    ) -> tuple[bool, Optional[str]]:
         """Check if run exists based on webhook metadata then name patterns."""
         try:
             import wandb
@@ -204,7 +299,7 @@
             if not (entity and project):
                 return False, None

-            # # Check webhook metadata for run_id first
+            # Check webhook metadata for run_id first
             webhook_meta = job_data.data.get("webhook_metadata", {})
             if (
                 webhook_meta.get("webhook_source") == "wandb"
@@ -281,10 +376,14 @@
            run_args["resume"] = "allow"

        # Config metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
        run_config = {
            "invocation_id": job_data.invocation_id,
-            "executor": job_data.executor,
+            "executor": exec_type,
        }
+
        if log_mode == "per_task":
            run_config["job_id"] = job_data.job_id
            run_config["harness"] = harness
@@ -306,6 +405,13 @@
        # Initialize
        run = wandb.init(**{k: v for k, v in run_args.items() if v is not None})

+        # Track staging dirs for this run
+        staging_dirs: List[str] = []
+
+        def register_staging_dir(path: str) -> None:
+            if path and os.path.isdir(path):
+                staging_dirs.append(path)
+
        # In multi_task, aggregate lists after init (no overwrite)
        if log_mode == "multi_task":
            try:
@@ -339,34 +445,42 @@
                    "harness": harness,
                },
            )
-            with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp_cfg:
-                yaml.dump(job_data.config or {}, tmp_cfg, default_flow_style=False)
-                cfg_path = tmp_cfg.name
-            artifact.add_file(cfg_path, name="config.yaml")
-            os.unlink(cfg_path)

-            logged_artifacts = self._log_artifacts(job_data, config, artifact)
-            run.log_artifact(artifact)
+            logged_artifacts = self._log_artifacts(
+                job_data, config, artifact, register_staging_dir=register_staging_dir
+            )

-            # charts for each logged metric
            try:
-                for k in metrics.keys():
-                    run.define_metric(k, summary="last")
-            except Exception:
-                pass
+                run.log_artifact(artifact)
+                # charts for each logged metric
+                try:
+                    for k in metrics.keys():
+                        run.define_metric(k, summary="last")
+                except Exception:
+                    pass

-            # Log metrics with per-task step
-            try:
-                step_idx = int(job_data.job_id.split(".")[-1])
-            except Exception:
-                step_idx = 0
-            run.log(metrics, step=step_idx)
+                # Log metrics with per-task step
+                try:
+                    step_idx = int(job_data.job_id.split(".")[-1])
+                except Exception:
+                    step_idx = 0
+                run.log(metrics, step=step_idx)

-            # metrics summary
-            try:
-                run.summary.update(metrics)
-            except Exception:
-                pass
+                # metrics summary
+                try:
+                    run.summary.update(metrics)
+                except Exception:
+                    pass
+            finally:
+                for d in staging_dirs:
+                    try:
+                        shutil.rmtree(d, ignore_errors=True)
+                    except Exception:
+                        pass
+                try:
+                    run.finish()
+                except Exception:
+                    pass


        return {
nemo_evaluator_launcher/package_info.py

@@ -13,10 +13,11 @@
 # limitations under the License.


+# Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 0
-PRE_RELEASE = "rc6"
+PATCH = 41
+PRE_RELEASE = ""

 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -24,12 +25,14 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
 __shortversion__ = ".".join(map(str, VERSION[:3]))
 __version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:])

+# BEGIN(if-changed): check the pyproject.toml, too
 __package_name__ = "nemo_evaluator_launcher"
 __contact_names__ = "NVIDIA"
 __contact_emails__ = "nemo-toolkit@nvidia.com"
 __homepage__ = "https://github.com/NVIDIA-NeMo/Eval"
 __repository_url__ = "https://github.com/NVIDIA-NeMo/Eval"
-__download_url__ = "https://github.com/NVIDIA-NeMo/Eval/releases"
+__download_url__ = "https://github.com/NVIDIA-NeMo/Evaluator/releases"
 __description__ = "Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends"
 __license__ = "Apache2"
 __keywords__ = "deep learning, evaluations, machine learning, gpu, NLP, pytorch, torch"
+# END(if-changed)
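For reference, the formatting lines already in the file turn these constants into the version strings seen in the wheel names; a quick check with the old and new values from this diff:

# Version string composition as defined above (values taken from this diff).
VERSION = (0, 1, 41, "")    # new release
assert ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:]) == "0.1.41"

VERSION = (0, 1, 0, "rc6")  # previous pre-release
assert ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:]) == "0.1.0rc6"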
nemo_evaluator_launcher/resources/mapping.toml

@@ -1,6 +1,6 @@
 # NOTE(agronskiy): checked parity
 [lm-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"

 [lm-evaluation-harness.tasks.chat.ifeval]
 required_env_vars = []
@@ -79,6 +79,8 @@ required_env_vars = []

 [lm-evaluation-harness.tasks.chat.mmlu_redux_instruct]

+[lm-evaluation-harness.tasks.chat.mmlu_cot_0_shot_chat]
+
 [lm-evaluation-harness.tasks.completions.gsm8k]
 required_env_vars = []

@@ -124,7 +126,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [mtbench]
-container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"

 [mtbench.tasks.chat.mtbench]

@@ -134,7 +136,7 @@ container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [ifbench]
-container = "nvcr.io/nvidia/eval-factory/ifbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"

 [ifbench.tasks.chat.ifbench]
 required_env_vars = []
@@ -142,7 +144,7 @@ required_env_vars = []

 ###############################################################################
 [simple_evals]
-container = "nvcr.io/nvidia/eval-factory/simple-evals:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"

 [simple_evals.tasks.chat.gpqa_diamond]
 required_env_vars = ["HF_TOKEN"]
@@ -213,7 +215,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [bigcode-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"

 [bigcode-evaluation-harness.tasks.chat.mbpp]
 required_env_vars = []
@@ -226,12 +228,12 @@ required_env_vars = []
 [bigcode-evaluation-harness.tasks.completions.humaneval]
 required_env_vars = []

-[bigcode-evaluation-harness.tasks.completions.humaneval_instruct]
+[bigcode-evaluation-harness.tasks.chat.humaneval_instruct]


 ###############################################################################
 [livecodebench]
-container = "nvcr.io/nvidia/eval-factory/livecodebench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"

 [livecodebench.tasks.chat.livecodebench_0724_0125]
 required_env_vars = []
@@ -242,7 +244,7 @@ required_env_vars = []

 ###############################################################################
 [scicode]
-container = "nvcr.io/nvidia/eval-factory/scicode:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/scicode:25.10"

 [scicode.tasks.chat.aa_scicode]
 required_env_vars = []
@@ -250,7 +252,7 @@ required_env_vars = []

 ###############################################################################
 [hle]
-container = "nvcr.io/nvidia/eval-factory/hle:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/hle:25.10"

 [hle.tasks.chat.hle]
 required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
@@ -258,7 +260,7 @@ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]

 ###############################################################################
 [bfcl]
-container = "nvcr.io/nvidia/eval-factory/bfcl:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"

 [bfcl.tasks.chat.bfclv2_ast_prompting]
 required_env_vars = []
@@ -267,9 +269,20 @@ required_env_vars = []
 required_env_vars = []


+###############################################################################
+[profbench]
+container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
+
+[profbench.tasks.chat.llm_judge]
+required_env_vars = []
+
+[profbench.tasks.chat.report_generation]
+required_env_vars = []
+
+
 ###############################################################################
 [vlmevalkit]
-container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"

 [vlmevalkit.tasks.vlm.ocrbench]
 required_env_vars = []
@@ -286,15 +299,43 @@ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]

 ###############################################################################
 [garak]
-container = "nvcr.io/nvidia/eval-factory/garak:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/garak:25.10"

 [garak.tasks.chat.garak]
 required_env_vars = []

+###############################################################################
+# NOTE(wprazuch): to verify if the tasks need any env var setting
+[nemo_skills]
+container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
+
+[nemo_skills.tasks.chat.ns_aime2024]
+required_env_vars = ["JUDGE_API_KEY"]
+
+[nemo_skills.tasks.chat.ns_aime2025]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_bfcl_v3]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_gpqa]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_hle]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_mmlu]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_mmlu_pro]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_aa_lcr]
+required_env_vars = ["JUDGE_API_KEY"]

 ###############################################################################
 [safety-harness]
-container = "nvcr.io/nvidia/eval-factory/safety-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"

 [safety-harness.tasks.chat.aegis_v2]
 required_env_vars = ["HF_TOKEN"]
@@ -303,7 +344,7 @@ required_env_vars = ["HF_TOKEN"]
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [helm]
-container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/helm:25.10"

 [helm.tasks.chat.medcalc_bench]

@@ -339,6 +380,6 @@ container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [tooltalk]
-container = "nvcr.io/nvidia/eval-factory/tooltalk:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"

 [tooltalk.tasks.chat.tooltalk]
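A hedged sketch (not launcher API) of how the structure above can be inspected: each harness table carries a container image and nested tasks.<endpoint_type>.<task> tables with optional required_env_vars. Reading it with the standard-library tomllib (the file path is an assumption; inside the package it ships as resources/mapping.toml):

# Sketch: enumerate harness containers and per-task required env vars from mapping.toml.
import tomllib
from pathlib import Path

mapping = tomllib.loads(Path("mapping.toml").read_text())  # assumed local copy of the mapping

for harness, spec in mapping.items():
    container = spec.get("container", "")
    for endpoint_type, tasks in spec.get("tasks", {}).items():  # e.g. "chat", "completions", "vlm"
        for task, task_spec in tasks.items():
            env = (task_spec or {}).get("required_env_vars", [])
            print(f"{harness:32s} {endpoint_type:12s} {task:32s} env={env}  {container}")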