nemo-evaluator-launcher 0.1.19__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/PKG-INFO +2 -2
  2. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/pyproject.toml +1 -1
  3. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/helpers.py +106 -32
  4. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/logging_utils.py +12 -4
  5. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/lepton/executor.py +1 -1
  6. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/slurm/executor.py +10 -1
  7. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/package_info.py +1 -1
  8. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/resources/mapping.toml +51 -15
  9. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +2 -2
  10. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/requires.txt +1 -1
  11. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/LICENSE +0 -0
  12. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/README.md +0 -0
  13. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/setup.cfg +0 -0
  14. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/__init__.py +0 -0
  15. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  16. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/api/functional.py +0 -0
  17. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/api/types.py +0 -0
  18. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  19. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  20. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  21. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/info.py +0 -0
  22. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  23. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
  24. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  25. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/main.py +0 -0
  26. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/run.py +0 -0
  27. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/status.py +0 -0
  28. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/cli/version.py +0 -0
  29. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  30. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  31. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  32. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/printing_utils.py +0 -0
  33. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  34. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  35. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  36. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  37. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  38. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
  39. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
  40. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -0
  41. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  42. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
  43. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
  44. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  45. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/base.py +0 -0
  46. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  47. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
  48. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  49. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  50. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/local/executor.py +0 -0
  51. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/local/run.template.sh +0 -0
  52. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  53. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  54. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  55. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  56. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  57. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
  58. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
  59. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  60. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/utils.py +0 -0
  61. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
  62. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +0 -0
  63. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  64. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
  65. {nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.19
3
+ Version: 0.1.26
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
478
478
  Provides-Extra: wandb
479
479
  Requires-Dist: wandb>=0.15.0; extra == "wandb"
480
480
  Provides-Extra: gsheets
481
- Requires-Dist: gsheets>=0.1.0; extra == "gsheets"
481
+ Requires-Dist: gspread>=5.0.0; extra == "gsheets"
482
482
  Provides-Extra: exporters
483
483
  Requires-Dist: mlflow; extra == "exporters"
484
484
  Requires-Dist: wandb; extra == "exporters"
@@ -40,7 +40,7 @@ repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-l
40
40
  [project.optional-dependencies]
41
41
  mlflow = ["mlflow>=2.8.0"]
42
42
  wandb = ["wandb>=0.15.0"]
43
- gsheets = ["gsheets>=0.1.0"]
43
+ gsheets = ["gspread>=5.0.0"]
44
44
  exporters = ["mlflow", "wandb", "gsheets"]
45
45
  all = ["mlflow", "wandb", "gsheets"]
46
46
 
@@ -57,13 +57,38 @@ def _yaml_to_echo_command(
57
57
  )
58
58
 
59
59
 
60
+ def _set_nested_optionally_overriding(
61
+ d: dict, keys: list[str], val: object, *, override_if_exists: bool = False
62
+ ):
63
+ """Sets d[keys[0]]...[keys[-1]] = val, creating intermediate dicts along the way; an existing value is kept unless override_if_exists is True."""
64
+ temp = d
65
+ for key in keys[:-1]:
66
+ temp = temp.setdefault(key, {})
67
+ if override_if_exists or keys[-1] not in temp:
68
+ temp[keys[-1]] = val
69
+
70
+
60
71
  def get_eval_factory_config(
61
- cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
72
+ cfg: DictConfig,
73
+ user_task_config: DictConfig,
62
74
  ) -> dict:
63
75
  """Extract config fields for eval factory.
64
76
 
65
77
  This function extracts the config field similar to how overrides are handled.
78
+
79
+ Overrides will start to be deprecated (or not, but at least a warning will be logged).
66
80
  """
81
+
82
+ if cfg.evaluation.get("overrides") or user_task_config.get("overrides"):
83
+ # TODO(agronskiy): start removing overrides, test `test_start_deprecating_overrides`
84
+ # will start failing soon.
85
+ logger.warning(
86
+ "We are deprecating using old-style dot-delimited overrides "
87
+ "in favour of `nemo_evaluator_config` field. Please check "
88
+ "the documentation."
89
+ )
90
+
91
+ logger.debug("Getting nemo evaluator merged config")
67
92
  # Extract config fields similar to overrides - convert to basic Python types first
68
93
  # Support both new and old format for backward compatibility
69
94
  cfg_config = cfg.evaluation.get("nemo_evaluator_config") or cfg.evaluation.get(
@@ -80,17 +105,73 @@ def get_eval_factory_config(
80
105
  user_config = OmegaConf.to_container(user_config, resolve=True)
81
106
 
82
107
  # Merge the configs
83
- config_fields = copy.deepcopy(cfg_config or {})
84
- config_fields.update(user_config or {})
108
+ merged_nemo_evaluator_config: dict = OmegaConf.to_container(
109
+ OmegaConf.merge(cfg_config, user_config)
110
+ )
85
111
 
86
- return config_fields
112
+ logger.debug(
113
+ "Merged nemo evaluator config, not final",
114
+ source_global_cfg=cfg_config,
115
+ source_task_config=user_config,
116
+ result=merged_nemo_evaluator_config,
117
+ )
118
+
119
+ return merged_nemo_evaluator_config
87
120
 
88
121
 
89
122
  def get_eval_factory_command(
90
123
  cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
91
124
  ) -> CmdAndReadableComment:
92
- config_fields = get_eval_factory_config(cfg, user_task_config, task_definition)
125
+ merged_nemo_evaluator_config = get_eval_factory_config(
126
+ cfg,
127
+ user_task_config,
128
+ )
129
+
130
+ # We now prepare the config to be passed to `nemo-evaluator` command.
131
+ _set_nested_optionally_overriding(
132
+ merged_nemo_evaluator_config,
133
+ ["target", "api_endpoint", "url"],
134
+ get_endpoint_url(
135
+ cfg,
136
+ merged_nemo_evaluator_config=merged_nemo_evaluator_config,
137
+ endpoint_type=task_definition["endpoint_type"],
138
+ ),
139
+ )
140
+ _set_nested_optionally_overriding(
141
+ merged_nemo_evaluator_config,
142
+ ["target", "api_endpoint", "model_id"],
143
+ get_served_model_name(cfg),
144
+ )
145
+ _set_nested_optionally_overriding(
146
+ merged_nemo_evaluator_config,
147
+ ["target", "api_endpoint", "type"],
148
+ task_definition["endpoint_type"],
149
+ )
150
+ _set_nested_optionally_overriding(
151
+ merged_nemo_evaluator_config,
152
+ ["config", "type"],
153
+ task_definition["task"],
154
+ )
155
+ _set_nested_optionally_overriding(
156
+ merged_nemo_evaluator_config,
157
+ ["config", "output_dir"],
158
+ "/results",
159
+ )
160
+ _set_nested_optionally_overriding(
161
+ merged_nemo_evaluator_config,
162
+ ["target", "api_endpoint", "api_key"],
163
+ "API_KEY",
164
+ )
165
+
166
+ create_file_cmd = _yaml_to_echo_command(
167
+ yaml.safe_dump(merged_nemo_evaluator_config), "config_ef.yaml"
168
+ )
169
+ eval_command = (
170
+ "cmd=$(command -v nemo-evaluator >/dev/null 2>&1 && echo nemo-evaluator || echo eval-factory) "
171
+ + "&& $cmd run_eval --run_config config_ef.yaml"
172
+ )
93
173
 
174
+ # NOTE: old-style overrides are being deprecated; see the warning logged in `get_eval_factory_config`.
94
175
  overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
95
176
  overrides.update(dict(user_task_config.get("overrides", {})))
96
177
  # NOTE(dfridman): Temporary fix to make sure that the overrides arg is not split into multiple lines.
@@ -99,18 +180,7 @@ def get_eval_factory_command(
99
180
  k: (v.strip("\n") if isinstance(v, str) else v) for k, v in overrides.items()
100
181
  }
101
182
  overrides_str = ",".join([f"{k}={v}" for k, v in overrides.items()])
102
- model_url = get_endpoint_url(cfg, user_task_config, task_definition)
103
-
104
- model_id = get_served_model_name(cfg)
105
- model_type = task_definition["endpoint_type"]
106
- eval_type = task_definition["task"]
107
-
108
- create_file_cmd = _yaml_to_echo_command(
109
- yaml.safe_dump(config_fields), "config_ef.yaml"
110
- )
111
- eval_command = f"""cmd=$([[ $(command -v nemo-evaluator) ]] && echo 'nemo-evaluator' || echo 'eval-factory') && $cmd run_eval --model_id {model_id} --model_type {model_type} --eval_type {eval_type} --model_url {model_url} --api_key_name API_KEY --output_dir /results --run_config config_ef.yaml"""
112
-
113
- if overrides:
183
+ if overrides_str:
114
184
  eval_command = f"{eval_command} --overrides {overrides_str}"
115
185
 
116
186
  # We return both the command and the debugging base64-decoded strings, useful
@@ -121,24 +191,29 @@ def get_eval_factory_command(
121
191
 
122
192
 
123
193
  def get_endpoint_url(
124
- cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
194
+ cfg: DictConfig,
195
+ merged_nemo_evaluator_config: dict,
196
+ endpoint_type: str,
125
197
  ) -> str:
126
198
  def apply_url_override(url: str) -> str:
127
199
  """Apply user URL override if provided."""
128
- nemo_evaluator_config_url = user_task_config.get(
129
- "nemo_evaluator_config", {}
130
- ).get("target.api_endpoint.url", None)
131
-
132
- override_url = user_task_config.get("overrides", {}).get(
133
- "config.target.api_endpoint.url", None
200
+ nemo_evaluator_config_url = (
201
+ merged_nemo_evaluator_config.get("target", {})
202
+ .get("api_endpoint", {})
203
+ .get("url", None)
134
204
  )
135
- return (
136
- override_url
137
- if override_url is not None
138
- else nemo_evaluator_config_url
139
- if nemo_evaluator_config_url is not None
140
- else url
205
+
206
+ if nemo_evaluator_config_url:
207
+ return nemo_evaluator_config_url
208
+
209
+ # Being deprecated, see `get_eval_factory_config` message.
210
+ overrides_old_style_url = merged_nemo_evaluator_config.get("overrides", {}).get(
211
+ "target.api_endpoint.url", None
141
212
  )
213
+ if overrides_old_style_url:
214
+ return overrides_old_style_url
215
+
216
+ return url
142
217
 
143
218
  if cfg.deployment.type == "none":
144
219
  # For deployment: none, use target URL regardless of executor type
@@ -160,8 +235,7 @@ def get_endpoint_url(
160
235
 
161
236
  else:
162
237
  # Local executor - use localhost
163
- task_endpoint_type = task_definition["endpoint_type"]
164
- endpoint_uri = cfg.deployment.endpoints[task_endpoint_type]
238
+ endpoint_uri = cfg.deployment.endpoints[endpoint_type]
165
239
  endpoint_url = f"http://127.0.0.1:{cfg.deployment.port}{endpoint_uri}"
166
240
  return endpoint_url
167
241
 
@@ -61,8 +61,9 @@ import structlog
61
61
  # both are unset, default would be used.
62
62
  _LOG_LEVEL_ENV_VAR = "NEMO_EVALUATOR_LOG_LEVEL"
63
63
  _DEFAULT_LOG_LEVEL = "WARNING"
64
- _SENSITIVE_KEY_SUBSTRINGS = {
65
- # Keep minimal, broad substrings (normalized: lowercased, no spaces/_/-)
64
+ _SENSITIVE_KEY_SUBSTRINGS_NORMALIZED = {
65
+ # Keep minimal, broad substrings
66
+ # NOTE: normalized: lowercased, no spaces/_/-
66
67
  "authorization", # covers proxy-authorization, etc.
67
68
  "apikey", # covers api_key, api-key, x-api-key, nvidia_api_key, ...
68
69
  "accesskey", # covers access_key / access-key
@@ -73,6 +74,10 @@ _SENSITIVE_KEY_SUBSTRINGS = {
73
74
  "pwd", # common shorthand
74
75
  "passwd", # common variant
75
76
  }
77
+ _ALLOWLISTED_KEYS_SUBSTRINGS = {
78
+ # NOTE: non-normalized (for allowlisting we want more control)
79
+ "_tokens", # This likely would allow us to not redact useful stuff like `limit_tokens`, `max_new_tokens`
80
+ }
76
81
 
77
82
 
78
83
  def _mask(val: object) -> str:
@@ -91,8 +96,11 @@ def _normalize(name: object) -> str:
91
96
 
92
97
 
93
98
  def _is_sensitive_key(key: object) -> bool:
94
- k = _normalize(key)
95
- return any(substr in k for substr in _SENSITIVE_KEY_SUBSTRINGS)
99
+ k_norm = _normalize(key)
100
+ k_non_norm = str(key)
101
+ return any(
102
+ substr in k_norm for substr in _SENSITIVE_KEY_SUBSTRINGS_NORMALIZED
103
+ ) and not any(substr in k_non_norm for substr in _ALLOWLISTED_KEYS_SUBSTRINGS)
96
104
 
97
105
 
98
106
  def _redact_mapping(m: dict) -> dict:
@@ -610,7 +610,7 @@ class LeptonExecutor(BaseExecutor):
610
610
  job_state = lepton_status.get("state", "Unknown")
611
611
 
612
612
  # Map Lepton job states to our execution states
613
- if job_state == "Succeeded":
613
+ if job_state in ["Succeeded", "Completed"]:
614
614
  state = ExecutionState.SUCCESS
615
615
  elif job_state in ["Running", "Pending", "Starting"]:
616
616
  state = ExecutionState.RUNNING
@@ -42,6 +42,7 @@ from nemo_evaluator_launcher.common.helpers import (
42
42
  get_api_key_name,
43
43
  get_endpoint_url,
44
44
  get_eval_factory_command,
45
+ get_eval_factory_config,
45
46
  get_eval_factory_dataset_size_from_run_config,
46
47
  get_health_url,
47
48
  get_timestamp_string,
@@ -453,7 +454,15 @@ def _create_slurm_sbatch_script(
453
454
  # get task from mapping, overrides, urls
454
455
  tasks_mapping = load_tasks_mapping()
455
456
  task_definition = get_task_from_mapping(task.name, tasks_mapping)
456
- health_url = get_health_url(cfg, get_endpoint_url(cfg, task, task_definition))
457
+
458
+ # Create merged config for get_endpoint_url
459
+ merged_nemo_evaluator_config = get_eval_factory_config(cfg, task)
460
+ health_url = get_health_url(
461
+ cfg,
462
+ get_endpoint_url(
463
+ cfg, merged_nemo_evaluator_config, task_definition["endpoint_type"]
464
+ ),
465
+ )
457
466
 
458
467
  # TODO(public release): convert to template
459
468
  s = "#!/bin/bash\n"
@@ -16,7 +16,7 @@
16
16
  # Below is the _next_ version that will be published, not the currently published one.
17
17
  MAJOR = 0
18
18
  MINOR = 1
19
- PATCH = 19
19
+ PATCH = 26
20
20
  PRE_RELEASE = ""
21
21
 
22
22
  # Use the following formatting: (major, minor, patch, pre-release)
@@ -1,6 +1,6 @@
1
1
  # NOTE(agronskiy): checked parity
2
2
  [lm-evaluation-harness]
3
- container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.08.1"
3
+ container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"
4
4
 
5
5
  [lm-evaluation-harness.tasks.chat.ifeval]
6
6
  required_env_vars = []
@@ -124,7 +124,7 @@ required_env_vars = []
124
124
  ###############################################################################
125
125
  # NOTE(agronskiy): checked parity
126
126
  [mtbench]
127
- container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
127
+ container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"
128
128
 
129
129
  [mtbench.tasks.chat.mtbench]
130
130
 
@@ -134,7 +134,7 @@ container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
134
134
  ###############################################################################
135
135
  # NOTE(agronskiy): checked parity
136
136
  [ifbench]
137
- container = "nvcr.io/nvidia/eval-factory/ifbench:25.08.1"
137
+ container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"
138
138
 
139
139
  [ifbench.tasks.chat.ifbench]
140
140
  required_env_vars = []
@@ -142,7 +142,7 @@ required_env_vars = []
142
142
 
143
143
  ###############################################################################
144
144
  [simple_evals]
145
- container = "nvcr.io/nvidia/eval-factory/simple-evals:25.08.1"
145
+ container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"
146
146
 
147
147
  [simple_evals.tasks.chat.gpqa_diamond]
148
148
  required_env_vars = ["HF_TOKEN"]
@@ -213,7 +213,7 @@ required_env_vars = []
213
213
  ###############################################################################
214
214
  # NOTE(agronskiy): checked parity
215
215
  [bigcode-evaluation-harness]
216
- container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.08.1"
216
+ container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"
217
217
 
218
218
  [bigcode-evaluation-harness.tasks.chat.mbpp]
219
219
  required_env_vars = []
@@ -226,12 +226,12 @@ required_env_vars = []
226
226
  [bigcode-evaluation-harness.tasks.completions.humaneval]
227
227
  required_env_vars = []
228
228
 
229
- [bigcode-evaluation-harness.tasks.completions.humaneval_instruct]
229
+ [bigcode-evaluation-harness.tasks.chat.humaneval_instruct]
230
230
 
231
231
 
232
232
  ###############################################################################
233
233
  [livecodebench]
234
- container = "nvcr.io/nvidia/eval-factory/livecodebench:25.08.1"
234
+ container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"
235
235
 
236
236
  [livecodebench.tasks.chat.livecodebench_0724_0125]
237
237
  required_env_vars = []
@@ -242,7 +242,7 @@ required_env_vars = []
242
242
 
243
243
  ###############################################################################
244
244
  [scicode]
245
- container = "nvcr.io/nvidia/eval-factory/scicode:25.08.1"
245
+ container = "nvcr.io/nvidia/eval-factory/scicode:25.10"
246
246
 
247
247
  [scicode.tasks.chat.aa_scicode]
248
248
  required_env_vars = []
@@ -250,7 +250,7 @@ required_env_vars = []
250
250
 
251
251
  ###############################################################################
252
252
  [hle]
253
- container = "nvcr.io/nvidia/eval-factory/hle:25.08.1"
253
+ container = "nvcr.io/nvidia/eval-factory/hle:25.10"
254
254
 
255
255
  [hle.tasks.chat.hle]
256
256
  required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
@@ -258,7 +258,7 @@ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
258
258
 
259
259
  ###############################################################################
260
260
  [bfcl]
261
- container = "nvcr.io/nvidia/eval-factory/bfcl:25.08.1"
261
+ container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"
262
262
 
263
263
  [bfcl.tasks.chat.bfclv2_ast_prompting]
264
264
  required_env_vars = []
@@ -267,9 +267,20 @@ required_env_vars = []
267
267
  required_env_vars = []
268
268
 
269
269
 
270
+ ###############################################################################
271
+ [profbench]
272
+ container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
273
+
274
+ [profbench.tasks.chat.llm_judge]
275
+ required_env_vars = []
276
+
277
+ [profbench.tasks.chat.report_generation]
278
+ required_env_vars = []
279
+
280
+
270
281
  ###############################################################################
271
282
  [vlmevalkit]
272
- container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.08.1"
283
+ container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"
273
284
 
274
285
  [vlmevalkit.tasks.vlm.ocrbench]
275
286
  required_env_vars = []
@@ -286,15 +297,40 @@ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
286
297
 
287
298
  ###############################################################################
288
299
  [garak]
289
- container = "nvcr.io/nvidia/eval-factory/garak:25.08.1"
300
+ container = "nvcr.io/nvidia/eval-factory/garak:25.10"
290
301
 
291
302
  [garak.tasks.chat.garak]
292
303
  required_env_vars = []
293
304
 
305
+ ###############################################################################
306
+ # NOTE(wprazuch): to verify if the tasks need any env var setting
307
+ [nemo_skills]
308
+ container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
309
+
310
+ [nemo_skills.tasks.chat.ns_aime2024]
311
+ required_env_vars = ["JUDGE_API_KEY"]
312
+
313
+ [nemo_skills.tasks.chat.ns_aime2025]
314
+ required_env_vars = []
315
+
316
+ [nemo_skills.tasks.chat.ns_bfcl_v3]
317
+ required_env_vars = []
318
+
319
+ [nemo_skills.tasks.chat.ns_gpqa]
320
+ required_env_vars = ["HF_TOKEN"]
321
+
322
+ [nemo_skills.tasks.chat.ns_hle]
323
+ required_env_vars = []
324
+
325
+ [nemo_skills.tasks.chat.ns_mmlu]
326
+ required_env_vars = ["HF_TOKEN"]
327
+
328
+ [nemo_skills.tasks.chat.ns_mmlu_pro]
329
+ required_env_vars = ["HF_TOKEN"]
294
330
 
295
331
  ###############################################################################
296
332
  [safety-harness]
297
- container = "nvcr.io/nvidia/eval-factory/safety-harness:25.08.1"
333
+ container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"
298
334
 
299
335
  [safety-harness.tasks.chat.aegis_v2]
300
336
  required_env_vars = ["HF_TOKEN"]
@@ -303,7 +339,7 @@ required_env_vars = ["HF_TOKEN"]
303
339
  ###############################################################################
304
340
  # NOTE(agronskiy): checked parity
305
341
  [helm]
306
- container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
342
+ container = "nvcr.io/nvidia/eval-factory/helm:25.10"
307
343
 
308
344
  [helm.tasks.chat.medcalc_bench]
309
345
 
@@ -339,6 +375,6 @@ container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
339
375
  ###############################################################################
340
376
  # NOTE(agronskiy): checked parity
341
377
  [tooltalk]
342
- container = "nvcr.io/nvidia/eval-factory/tooltalk:25.08.1"
378
+ container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"
343
379
 
344
380
  [tooltalk.tasks.chat.tooltalk]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.19
3
+ Version: 0.1.26
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
478
478
  Provides-Extra: wandb
479
479
  Requires-Dist: wandb>=0.15.0; extra == "wandb"
480
480
  Provides-Extra: gsheets
481
- Requires-Dist: gsheets>=0.1.0; extra == "gsheets"
481
+ Requires-Dist: gspread>=5.0.0; extra == "gsheets"
482
482
  Provides-Extra: exporters
483
483
  Requires-Dist: mlflow; extra == "exporters"
484
484
  Requires-Dist: wandb; extra == "exporters"
@@ -21,7 +21,7 @@ wandb
21
21
  gsheets
22
22
 
23
23
  [gsheets]
24
- gsheets>=0.1.0
24
+ gspread>=5.0.0
25
25
 
26
26
  [mlflow]
27
27
  mlflow>=2.8.0