nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0

nemo_evaluator_launcher/exporters/wandb.py

@@ -19,7 +19,7 @@ import os
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import yaml
 
@@ -38,6 +38,7 @@ from nemo_evaluator_launcher.exporters.registry import register_exporter
 from nemo_evaluator_launcher.exporters.utils import (
     extract_accuracy_metrics,
     extract_exporter_config,
+    get_artifact_root,
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
@@ -67,10 +68,41 @@ class WandBExporter(BaseExporter):
             "log_mode", "per_task"
         )  # Default per_task for immediate export
 
-        #
-
-
-
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="wandb_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": wandb_config.get("log_logs", False),
+                        "only_required": wandb_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"W&B: staging failed for {job_data.job_id}: {e}")
+
+        # Metrics (prefer staged if available)
+        log_metrics = wandb_config.get("log_metrics", [])
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
+
         if not metrics:
             return ExportResult(
                 success=False, dest="wandb", message="No metrics found"
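
Taken together, the hunk above stages remote artifacts through LocalExporter before metric extraction. For orientation, the keys it reads via wandb_config.get(...) imply an exporter configuration shaped roughly like the sketch below (key names come from the diff; the entity/project values and the dict as a whole are illustrative placeholders, not the launcher's documented schema):

    # Hypothetical WandB exporter config; key names mirror the
    # wandb_config.get(...) calls in the hunk above, values are placeholders.
    wandb_config = {
        "entity": "my-team",        # W&B entity (placeholder)
        "project": "nemo-evals",    # W&B project (placeholder)
        "log_mode": "per_task",     # default per the diff; "multi_task" also appears
        "log_metrics": [],          # empty -> extract all accuracy metrics
        "log_artifacts": True,      # upload artifacts plus a config file
        "only_required": True,      # restrict uploads to get_available_artifacts(...)
        "log_logs": False,          # when True, the logs/ tree is staged and uploaded
    }
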
@@ -163,29 +195,92 @@ class WandBExporter(BaseExporter):
            return {"success": False, "error": f"W&B export failed: {str(e)}"}
 
     def _log_artifacts(
-        self,
+        self,
+        job_data: JobData,
+        wandb_config: Dict[str, Any],
+        artifact,
+        register_staging_dir=None,
     ) -> List[str]:
-        """Log evaluation artifacts to WandB using LocalExporter for
+        """Log evaluation artifacts to WandB using LocalExporter for staging."""
         if not wandb_config.get("log_artifacts", True):
             return []
         try:
             temp_dir = tempfile.mkdtemp(prefix="wandb_artifacts_")
-
+            if callable(register_staging_dir):
+                register_staging_dir(temp_dir)
+            local_exporter = LocalExporter(
+                {
+                    "output_dir": temp_dir,
+                    "copy_logs": wandb_config.get(
+                        "log_logs", wandb_config.get("copy_logs", False)
+                    ),
+                    "only_required": wandb_config.get("only_required", True),
+                    "format": wandb_config.get("format"),
+                    "log_metrics": wandb_config.get("log_metrics", []),
+                    "output_filename": wandb_config.get("output_filename"),
+                }
+            )
             local_result = local_exporter.export_job(job_data)
 
             if not local_result.success:
                 logger.error(f"Failed to download artifacts: {local_result.message}")
                 return []
 
-
-
-
-
-
-
-
-
-
+            base_dir = Path(local_result.dest)
+            artifacts_dir = base_dir / "artifacts"
+            logs_dir = base_dir / "logs"
+            logged_names: list[str] = []
+
+            artifact_root = get_artifact_root(job_data)  # "<harness>.<benchmark>"
+
+            # Add config file only when artifacts logging is enabled
+            if wandb_config.get("log_artifacts", True):
+                cfg_added = False
+                for fname in ("config.yml", "run_config.yml"):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        artifact.add_file(str(p), name=f"{artifact_root}/{fname}")
+                        logged_names.append(fname)
+                        cfg_added = True
+                        break
+                if not cfg_added:
+                    with tempfile.NamedTemporaryFile(
+                        "w", suffix=".yaml", delete=False
+                    ) as tmp_cfg:
+                        yaml.dump(
+                            job_data.config or {},
+                            tmp_cfg,
+                            default_flow_style=False,
+                            sort_keys=False,
+                        )
+                        cfg_path = tmp_cfg.name
+                    artifact.add_file(cfg_path, name=f"{artifact_root}/config.yaml")
+                    os.unlink(cfg_path)
+                    logged_names.append("config.yaml")
+
+            files_to_upload: list[Path] = []
+            if wandb_config.get("only_required", True):
+                for fname in get_available_artifacts(artifacts_dir):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        files_to_upload.append(p)
+            else:
+                for p in artifacts_dir.iterdir():
+                    if p.is_file():
+                        files_to_upload.append(p)
+
+            for fpath in files_to_upload:
+                rel = fpath.relative_to(artifacts_dir).as_posix()
+                artifact.add_file(str(fpath), name=f"{artifact_root}/artifacts/{rel}")
+                logged_names.append(rel)
+
+            if wandb_config.get("log_logs", False) and logs_dir.exists():
+                for p in logs_dir.rglob("*"):
+                    if p.is_file():
+                        rel = p.relative_to(logs_dir).as_posix()
+                        artifact.add_file(str(p), name=f"{artifact_root}/logs/{rel}")
+                        logged_names.append(f"logs/{rel}")
+
             return logged_names
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
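
The practical effect of this rewrite is that every uploaded file is namespaced under a per-task root inside the W&B artifact, so multi-task invocations no longer collide on top-level names like config.yaml. Assuming get_artifact_root returns the "<harness>.<benchmark>" form noted in the diff (a hypothetical lm-evaluation-harness.ifeval, say), entries land as:

    lm-evaluation-harness.ifeval/config.yml
    lm-evaluation-harness.ifeval/artifacts/<relative path>
    lm-evaluation-harness.ifeval/logs/<relative path>
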
@@ -193,7 +288,7 @@ class WandBExporter(BaseExporter):
 
     def _check_existing_run(
         self, identifier: str, job_data: JobData, config: Dict[str, Any]
-    ) -> tuple[bool, str]:
+    ) -> tuple[bool, Optional[str]]:
         """Check if run exists based on webhook metadata then name patterns."""
         try:
             import wandb
@@ -204,7 +299,7 @@ class WandBExporter(BaseExporter):
             if not (entity and project):
                 return False, None
 
-            #
+            # Check webhook metadata for run_id first
             webhook_meta = job_data.data.get("webhook_metadata", {})
             if (
                 webhook_meta.get("webhook_source") == "wandb"
@@ -281,10 +376,14 @@ class WandBExporter(BaseExporter):
             run_args["resume"] = "allow"
 
         # Config metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
        run_config = {
             "invocation_id": job_data.invocation_id,
-            "executor":
+            "executor": exec_type,
         }
+
         if log_mode == "per_task":
             run_config["job_id"] = job_data.job_id
             run_config["harness"] = harness
@@ -306,6 +405,13 @@ class WandBExporter(BaseExporter):
         # Initialize
         run = wandb.init(**{k: v for k, v in run_args.items() if v is not None})
 
+        # Track staging dirs for this run
+        staging_dirs: List[str] = []
+
+        def register_staging_dir(path: str) -> None:
+            if path and os.path.isdir(path):
+                staging_dirs.append(path)
+
         # In multi_task, aggregate lists after init (no overwrite)
         if log_mode == "multi_task":
             try:
@@ -339,34 +445,42 @@ class WandBExporter(BaseExporter):
                 "harness": harness,
             },
         )
-        with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp_cfg:
-            yaml.dump(job_data.config or {}, tmp_cfg, default_flow_style=False)
-            cfg_path = tmp_cfg.name
-        artifact.add_file(cfg_path, name="config.yaml")
-        os.unlink(cfg_path)
 
-        logged_artifacts = self._log_artifacts(
-
+        logged_artifacts = self._log_artifacts(
+            job_data, config, artifact, register_staging_dir=register_staging_dir
+        )
 
-        # charts for each logged metric
         try:
-
-
-
-
+            run.log_artifact(artifact)
+            # charts for each logged metric
+            try:
+                for k in metrics.keys():
+                    run.define_metric(k, summary="last")
+            except Exception:
+                pass
 
-
-
-
-
-
+            # Log metrics with per-task step
+            try:
+                step_idx = int(job_data.job_id.split(".")[-1])
+            except Exception:
+                step_idx = 0
+            run.log(metrics, step=step_idx)
 
-
-
-
-
+            # metrics summary
+            try:
+                run.summary.update(metrics)
+            except Exception:
+                pass
+        finally:
+            for d in staging_dirs:
+                try:
+                    shutil.rmtree(d, ignore_errors=True)
+                except Exception:
+                    pass
+            try:
+                run.finish()
+            except Exception:
+                pass
 
         return {
             "run_id": run.id,
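
One detail worth calling out from the last hunk: the metrics step is derived from the trailing component of the job id, which lets each task in an invocation log at its own step. A minimal sketch of that parse, assuming job ids of the form <invocation_id>.<n> (the fallback covers ids without a numeric suffix; the sample ids are hypothetical):

    # Sketch of the per-task step derivation used above.
    def step_index(job_id: str) -> int:
        try:
            return int(job_id.split(".")[-1])
        except ValueError:
            return 0

    assert step_index("8f3a2c1d.2") == 2  # third task of the invocation
    assert step_index("adhoc-job") == 0   # non-numeric suffix falls back to 0
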
nemo_evaluator_launcher/package_info.py

@@ -13,10 +13,11 @@
 # limitations under the License.
 
 
+# Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 0
-PRE_RELEASE = "rc6"
+PATCH = 41
+PRE_RELEASE = ""
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -24,12 +25,14 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
 __shortversion__ = ".".join(map(str, VERSION[:3]))
 __version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:])
 
+# BEGIN(if-changed): check the pyproject.toml, too
 __package_name__ = "nemo_evaluator_launcher"
 __contact_names__ = "NVIDIA"
 __contact_emails__ = "nemo-toolkit@nvidia.com"
 __homepage__ = "https://github.com/NVIDIA-NeMo/Eval"
 __repository_url__ = "https://github.com/NVIDIA-NeMo/Eval"
-__download_url__ = "https://github.com/NVIDIA-NeMo/
+__download_url__ = "https://github.com/NVIDIA-NeMo/Evaluator/releases"
 __description__ = "Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends"
 __license__ = "Apache2"
 __keywords__ = "deep learning, evaluations, machine learning, gpu, NLP, pytorch, torch"
+# END(if-changed)
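
The version formula is easy to sanity-check: __version__ joins the first three tuple fields with dots and appends the pre-release tag verbatim, which renders this release as 0.1.41 and the previous wheel as 0.1.0rc6:

    # Worked example of the formula from package_info.py.
    for version in [(0, 1, 41, ""), (0, 1, 0, "rc6")]:
        print(".".join(map(str, version[:3])) + "".join(version[3:]))
    # -> 0.1.41
    # -> 0.1.0rc6
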
nemo_evaluator_launcher/resources/mapping.toml

@@ -1,6 +1,6 @@
 # NOTE(agronskiy): checked parity
 [lm-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.
+container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"
 
 [lm-evaluation-harness.tasks.chat.ifeval]
 required_env_vars = []
@@ -79,6 +79,8 @@ required_env_vars = []
 
 [lm-evaluation-harness.tasks.chat.mmlu_redux_instruct]
 
+[lm-evaluation-harness.tasks.chat.mmlu_cot_0_shot_chat]
+
 [lm-evaluation-harness.tasks.completions.gsm8k]
 required_env_vars = []
 
@@ -124,7 +126,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [mtbench]
-container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"
 
 [mtbench.tasks.chat.mtbench]
 
@@ -134,7 +136,7 @@ container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [ifbench]
-container = "nvcr.io/nvidia/eval-factory/ifbench:25.
+container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"
 
 [ifbench.tasks.chat.ifbench]
 required_env_vars = []
@@ -142,7 +144,7 @@ required_env_vars = []
 
 ###############################################################################
 [simple_evals]
-container = "nvcr.io/nvidia/eval-factory/simple-evals:25.
+container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"
 
 [simple_evals.tasks.chat.gpqa_diamond]
 required_env_vars = ["HF_TOKEN"]
@@ -213,7 +215,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [bigcode-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.
+container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"
 
 [bigcode-evaluation-harness.tasks.chat.mbpp]
 required_env_vars = []
@@ -226,12 +228,12 @@ required_env_vars = []
 [bigcode-evaluation-harness.tasks.completions.humaneval]
 required_env_vars = []
 
-[bigcode-evaluation-harness.tasks.
+[bigcode-evaluation-harness.tasks.chat.humaneval_instruct]
 
 
 ###############################################################################
 [livecodebench]
-container = "nvcr.io/nvidia/eval-factory/livecodebench:25.
+container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"
 
 [livecodebench.tasks.chat.livecodebench_0724_0125]
 required_env_vars = []
@@ -242,7 +244,7 @@ required_env_vars = []
 
 ###############################################################################
 [scicode]
-container = "nvcr.io/nvidia/eval-factory/scicode:25.
+container = "nvcr.io/nvidia/eval-factory/scicode:25.10"
 
 [scicode.tasks.chat.aa_scicode]
 required_env_vars = []
@@ -250,7 +252,7 @@ required_env_vars = []
 
 ###############################################################################
 [hle]
-container = "nvcr.io/nvidia/eval-factory/hle:25.
+container = "nvcr.io/nvidia/eval-factory/hle:25.10"
 
 [hle.tasks.chat.hle]
 required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
@@ -258,7 +260,7 @@ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
 
 ###############################################################################
 [bfcl]
-container = "nvcr.io/nvidia/eval-factory/bfcl:25.
+container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"
 
 [bfcl.tasks.chat.bfclv2_ast_prompting]
 required_env_vars = []
@@ -267,9 +269,20 @@ required_env_vars = []
 required_env_vars = []
 
 
+###############################################################################
+[profbench]
+container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
+
+[profbench.tasks.chat.llm_judge]
+required_env_vars = []
+
+[profbench.tasks.chat.report_generation]
+required_env_vars = []
+
+
 ###############################################################################
 [vlmevalkit]
-container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.
+container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"
 
 [vlmevalkit.tasks.vlm.ocrbench]
 required_env_vars = []
@@ -286,15 +299,43 @@ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
 
 ###############################################################################
 [garak]
-container = "nvcr.io/nvidia/eval-factory/garak:25.
+container = "nvcr.io/nvidia/eval-factory/garak:25.10"
 
 [garak.tasks.chat.garak]
 required_env_vars = []
 
+###############################################################################
+# NOTE(wprazuch): to verify if the tasks need any env var setting
+[nemo_skills]
+container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
+
+[nemo_skills.tasks.chat.ns_aime2024]
+required_env_vars = ["JUDGE_API_KEY"]
+
+[nemo_skills.tasks.chat.ns_aime2025]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_bfcl_v3]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_gpqa]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_hle]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_mmlu]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_mmlu_pro]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_aa_lcr]
+required_env_vars = ["JUDGE_API_KEY"]
 
 ###############################################################################
 [safety-harness]
-container = "nvcr.io/nvidia/eval-factory/safety-harness:25.
+container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"
 
 [safety-harness.tasks.chat.aegis_v2]
 required_env_vars = ["HF_TOKEN"]
@@ -303,7 +344,7 @@ required_env_vars = ["HF_TOKEN"]
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [helm]
-container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/helm:25.10"
 
 [helm.tasks.chat.medcalc_bench]
 
@@ -339,6 +380,6 @@ container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [tooltalk]
-container = "nvcr.io/nvidia/eval-factory/tooltalk:25.
+container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"
 
 [tooltalk.tasks.chat.tooltalk]