nemo-evaluator-launcher 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nemo-evaluator-launcher might be problematic.

Files changed (60)
  1. nemo_evaluator_launcher/__init__.py +79 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +698 -0
  4. nemo_evaluator_launcher/api/types.py +98 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +267 -0
  8. nemo_evaluator_launcher/cli/info.py +512 -0
  9. nemo_evaluator_launcher/cli/kill.py +41 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +134 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
  12. nemo_evaluator_launcher/cli/main.py +226 -0
  13. nemo_evaluator_launcher/cli/run.py +200 -0
  14. nemo_evaluator_launcher/cli/status.py +164 -0
  15. nemo_evaluator_launcher/cli/version.py +55 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +283 -0
  18. nemo_evaluator_launcher/common/helpers.py +366 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +357 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/common/printing_utils.py +93 -0
  22. nemo_evaluator_launcher/configs/__init__.py +15 -0
  23. nemo_evaluator_launcher/configs/default.yaml +28 -0
  24. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  25. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  26. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  27. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  28. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
  29. nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
  30. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  31. nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
  32. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
  33. nemo_evaluator_launcher/executors/__init__.py +22 -0
  34. nemo_evaluator_launcher/executors/base.py +120 -0
  35. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  36. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
  37. nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
  38. nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
  39. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  40. nemo_evaluator_launcher/executors/local/executor.py +605 -0
  41. nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
  42. nemo_evaluator_launcher/executors/registry.py +38 -0
  43. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  44. nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
  45. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  46. nemo_evaluator_launcher/exporters/base.py +121 -0
  47. nemo_evaluator_launcher/exporters/gsheets.py +409 -0
  48. nemo_evaluator_launcher/exporters/local.py +502 -0
  49. nemo_evaluator_launcher/exporters/mlflow.py +619 -0
  50. nemo_evaluator_launcher/exporters/registry.py +40 -0
  51. nemo_evaluator_launcher/exporters/utils.py +624 -0
  52. nemo_evaluator_launcher/exporters/wandb.py +490 -0
  53. nemo_evaluator_launcher/package_info.py +38 -0
  54. nemo_evaluator_launcher/resources/mapping.toml +380 -0
  55. nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
  56. nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
  57. nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
  58. nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
  59. nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
  60. nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/common/helpers.py
@@ -0,0 +1,366 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ import base64
+ import copy
+ import datetime
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import yaml
+ from omegaconf import DictConfig, OmegaConf
+
+ from nemo_evaluator_launcher.cli.version import get_versions
+ from nemo_evaluator_launcher.common.logging_utils import logger
+
+
+ @dataclass(frozen=True)
+ class CmdAndReadableComment:
+     """See the docstring of `_str_to_echo_command`."""
+
+     # Actual command. Might include hard-to-debug elements such as base64-encoded
+     # configs.
+     cmd: str
+     # A debuggable, readable comment that can be passed along to accompany
+     # the actual command.
+     debug: str
+     # Whether the content might be potentially unsafe. This flag is useful for
+     # downstream callers who want to raise exceptions, e.g. when a script was
+     # saved that would execute this command.
+     is_potentially_unsafe: bool = False
+
+
+ def _str_to_echo_command(str_to_save: str, filename: str) -> CmdAndReadableComment:
+     """Create a safe (see below) echo command saving a string to a file.
+
+     Safety in this context means the ability to pass such an echo command through
+     `bash -c '...'` boundaries, for example.
+
+     Naturally, encoding with base64 creates debuggability issues. For that reason,
+     the second output of the function is the string with bash comment signs prepended.
+     """
+     str_to_save_b64 = base64.b64encode(str_to_save.encode("utf-8")).decode("utf-8")
+     debug_str = "\n".join(
+         [f"# Contents of {filename}"] + ["# " + s for s in str_to_save.splitlines()]
+     )
+     return CmdAndReadableComment(
+         cmd=f'echo "{str_to_save_b64}" | base64 -d > {filename}', debug=debug_str
+     )
+
+
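For orientation, a minimal sketch of the round trip this helper performs (the filename and payload are invented; the base64 string is what Python's base64 produces for this input):

    out = _str_to_echo_command("export FOO=1\necho ready", filename="pre_cmd.sh")
    # out.cmd is a single shell-safe line:
    #   echo "ZXhwb3J0IEZPTz0xCmVjaG8gcmVhZHk=" | base64 -d > pre_cmd.sh
    # out.debug keeps the human-readable version for logs and generated scripts:
    #   # Contents of pre_cmd.sh
    #   # export FOO=1
    #   # echo ready
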
+ def _set_nested_optionally_overriding(
+     d: dict, keys: list[str], val: object, *, override_if_exists: bool = False
+ ):
+     """Set d[keys[0]]...[keys[-1]] = val, creating intermediate keys along the way.
+
+     By default an existing leaf value is kept; pass `override_if_exists=True` to
+     replace it.
+     """
+     temp = d
+     for key in keys[:-1]:
+         temp = temp.setdefault(key, {})
+     if override_if_exists or keys[-1] not in temp:
+         temp[keys[-1]] = val
+
+
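A small sketch of the setter's semantics (dictionary values invented for illustration):

    d = {"target": {"api_endpoint": {"url": "http://host/v1"}}}
    # By default an existing leaf wins, so the URL is kept as-is...
    _set_nested_optionally_overriding(d, ["target", "api_endpoint", "url"], "http://other/v1")
    assert d["target"]["api_endpoint"]["url"] == "http://host/v1"
    # ...while missing paths are created along the way.
    _set_nested_optionally_overriding(d, ["metadata", "pre_cmd"], "")
    assert d["metadata"]["pre_cmd"] == ""
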
+ def get_eval_factory_config(
+     cfg: DictConfig,
+     user_task_config: DictConfig,
+ ) -> dict:
+     """Extract config fields for eval factory.
+
+     This function extracts the config field similarly to how overrides are handled.
+
+     Overrides will start to be deprecated (or not, but at least a warning will be logged).
+     """
+
+     if cfg.evaluation.get("overrides") or user_task_config.get("overrides"):
+         # TODO(agronskiy): start removing overrides, test `test_start_deprecating_overrides`
+         # will start failing soon.
+         logger.warning(
+             "We are deprecating old-style dot-delimited overrides "
+             "in favour of the `nemo_evaluator_config` field. Please check "
+             "the documentation."
+         )
+
+     logger.debug("Getting nemo evaluator merged config")
+     # Extract config fields similarly to overrides - convert to basic Python types first.
+     # Support both the new and the old format for backward compatibility.
+     cfg_config = cfg.evaluation.get("nemo_evaluator_config") or cfg.evaluation.get(
+         "config", {}
+     )
+     user_config = user_task_config.get("nemo_evaluator_config") or user_task_config.get(
+         "config", {}
+     )
+
+     # Convert OmegaConf objects to basic Python types.
+     if cfg_config:
+         cfg_config = OmegaConf.to_container(cfg_config, resolve=True)
+     if user_config:
+         user_config = OmegaConf.to_container(user_config, resolve=True)
+
+     # Merge the configs; task-level (user) values take precedence.
+     merged_nemo_evaluator_config: dict = OmegaConf.to_container(
+         OmegaConf.merge(cfg_config, user_config)
+     )
+
+     logger.debug(
+         "Merged nemo evaluator config, not final",
+         source_global_cfg=cfg_config,
+         source_task_config=user_config,
+         result=merged_nemo_evaluator_config,
+     )
+
+     return merged_nemo_evaluator_config
+
+
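The merge direction matters: with OmegaConf.merge, later arguments win, so task-level values override global ones leaf by leaf. A sketch with invented parameter names:

    from omegaconf import OmegaConf

    cfg_config = {"config": {"params": {"temperature": 0.0, "max_new_tokens": 1024}}}
    user_config = {"config": {"params": {"temperature": 0.7}}}
    merged = OmegaConf.to_container(OmegaConf.merge(cfg_config, user_config))
    # {'config': {'params': {'temperature': 0.7, 'max_new_tokens': 1024}}}
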
+ def get_eval_factory_command(
+     cfg: DictConfig,
+     user_task_config: DictConfig,
+     task_definition: dict,
+ ) -> CmdAndReadableComment:
+     # This gets the eval_factory_config merged from both top-level and task-level.
+     merged_nemo_evaluator_config = get_eval_factory_config(
+         cfg,
+         user_task_config,
+     )
+
+     # We now prepare the config to be passed to the `nemo-evaluator` command.
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "url"],
+         get_endpoint_url(
+             cfg,
+             merged_nemo_evaluator_config=merged_nemo_evaluator_config,
+             endpoint_type=task_definition["endpoint_type"],
+         ),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "model_id"],
+         get_served_model_name(cfg),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "type"],
+         task_definition["endpoint_type"],
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["config", "type"],
+         task_definition["task"],
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["config", "output_dir"],
+         "/results",
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "api_key"],
+         "API_KEY",
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         [
+             "metadata",
+             "launcher_resolved_config",
+         ],
+         OmegaConf.to_container(cfg, resolve=True),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["metadata", "versioning"],
+         get_versions(),
+     )
+
+     # Now get the pre_cmd, either from `evaluation.pre_cmd` or the task-level
+     # pre_cmd. Note the order -- the task level wins.
+     pre_cmd: str = (
+         user_task_config.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
+     )
+
+     is_potentially_unsafe = False
+     if pre_cmd:
+         logger.warning(
+             "Found non-empty pre_cmd that might be a security risk if executed. "
+             "Setting `is_potentially_unsafe` to `True`",
+             pre_cmd=pre_cmd,
+         )
+         is_potentially_unsafe = True
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["metadata", "pre_cmd"],
+         pre_cmd,
+     )
+
+     create_pre_script_cmd = _str_to_echo_command(pre_cmd, filename="pre_cmd.sh")
+
+     create_yaml_cmd = _str_to_echo_command(
+         yaml.safe_dump(merged_nemo_evaluator_config), "config_ef.yaml"
+     )
+
+     # NOTE: we use `source` to allow tricks like exports etc. (if needed) -- it runs
+     # in the same shell as the command.
+     eval_command = (
+         "cmd=$(command -v nemo-evaluator >/dev/null 2>&1 && echo nemo-evaluator || echo eval-factory) "
+         + "&& source pre_cmd.sh "
+         + "&& $cmd run_eval --run_config config_ef.yaml"
+     )
+
+     # NOTE: see the note and test above about deprecating overrides.
+     overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
+     overrides.update(dict(user_task_config.get("overrides", {})))
+     # NOTE(dfridman): Temporary fix to make sure that the overrides arg is not split
+     # into multiple lines. Consider passing a JSON object on the Eval Factory side.
+     overrides = {
+         k: (v.strip("\n") if isinstance(v, str) else v) for k, v in overrides.items()
+     }
+     overrides_str = ",".join([f"{k}={v}" for k, v in overrides.items()])
+     if overrides_str:
+         eval_command = f"{eval_command} --overrides {overrides_str}"
+
+     # We return both the command and the debugging base64-decoded strings, useful
+     # for exposing when building scripts.
+     return CmdAndReadableComment(
+         cmd=create_pre_script_cmd.cmd
+         + " && "
+         + create_yaml_cmd.cmd
+         + " && "
+         + eval_command,
+         debug=create_pre_script_cmd.debug + "\n\n" + create_yaml_cmd.debug,
+         is_potentially_unsafe=is_potentially_unsafe,
+     )
+
+
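Assembled, the returned cmd is one shell line of roughly the following shape (a sketch; base64 payloads abbreviated, not literal output):

    result = get_eval_factory_command(cfg, user_task_config, task_definition)
    # result.cmd ~
    #   echo "<b64 of pre_cmd.sh>" | base64 -d > pre_cmd.sh
    #   && echo "<b64 of config_ef.yaml>" | base64 -d > config_ef.yaml
    #   && cmd=$(command -v nemo-evaluator >/dev/null 2>&1 && echo nemo-evaluator || echo eval-factory)
    #   && source pre_cmd.sh && $cmd run_eval --run_config config_ef.yaml
    # (with ' --overrides k=v,...' appended when old-style overrides are present)
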
+ def get_endpoint_url(
+     cfg: DictConfig,
+     merged_nemo_evaluator_config: dict,
+     endpoint_type: str,
+ ) -> str:
+     def apply_url_override(url: str) -> str:
+         """Apply user URL override if provided."""
+         nemo_evaluator_config_url = (
+             merged_nemo_evaluator_config.get("target", {})
+             .get("api_endpoint", {})
+             .get("url", None)
+         )
+
+         if nemo_evaluator_config_url:
+             return nemo_evaluator_config_url
+
+         # Being deprecated, see the `get_eval_factory_config` message.
+         overrides_old_style_url = merged_nemo_evaluator_config.get("overrides", {}).get(
+             "target.api_endpoint.url", None
+         )
+         if overrides_old_style_url:
+             return overrides_old_style_url
+
+         return url
+
+     if cfg.deployment.type == "none":
+         # For deployment: none, use the target URL regardless of executor type.
+         if OmegaConf.is_missing(cfg.target.api_endpoint, "url"):
+             raise ValueError(
+                 "API endpoint URL is not set. Add `target.api_endpoint.url` to your config "
+                 "OR override via CLI"
+             )
+         return apply_url_override(cfg.target.api_endpoint.url)
+
+     elif (
+         hasattr(cfg, "target")
+         and hasattr(cfg.target, "api_endpoint")
+         and hasattr(cfg.target.api_endpoint, "url")
+         and not OmegaConf.is_missing(cfg.target.api_endpoint, "url")
+     ):
+         # For the Lepton executor with a dynamically set target URL.
+         return apply_url_override(cfg.target.api_endpoint.url)
+
+     else:
+         # Local executor - use localhost.
+         endpoint_uri = cfg.deployment.endpoints[endpoint_type]
+         endpoint_url = f"http://127.0.0.1:{cfg.deployment.port}{endpoint_uri}"
+         return endpoint_url
+
+
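The URL precedence implemented above, from highest to lowest: (1) target.api_endpoint.url inside the merged nemo_evaluator_config, (2) the old-style "target.api_endpoint.url" override (being deprecated), (3) the URL derived from cfg -- either cfg.target.api_endpoint.url or the local deployment address. A sketch with an invented endpoint:

    merged = {"target": {"api_endpoint": {"url": "http://my-endpoint.example/v1/chat/completions"}}}
    # Provided cfg passes the branch checks above, the merged-config URL wins:
    # get_endpoint_url(cfg, merged, "chat") -> "http://my-endpoint.example/v1/chat/completions"
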
+ def get_health_url(cfg: DictConfig, endpoint_url: str) -> str:
+     if cfg.deployment.type == "none":
+         logger.warning("Using endpoint URL as health URL", will_be_used=endpoint_url)
+         return endpoint_url  # TODO(public release) is using model url as health url OK?
+     health_uri = cfg.deployment.endpoints["health"]
+     health_url = f"http://127.0.0.1:{cfg.deployment.port}{health_uri}"
+     return health_url
+
+
+ def get_served_model_name(cfg: DictConfig) -> str:
+     if cfg.deployment.type == "none":
+         return str(cfg.target.api_endpoint.model_id)
+     else:
+         return str(cfg.deployment.served_model_name)
+
+
+ def get_api_key_name(cfg: DictConfig) -> str | None:
+     res = cfg.get("target", {}).get("api_endpoint", {}).get("api_key_name", None)
+     return str(res) if res else None
+
+
+ def get_timestamp_string(include_microseconds: bool = True) -> str:
+     """Get timestamp in format YYYYmmdd_HHMMSS_ffffff."""
+     dt = datetime.datetime.now()
+     fmt = "%Y%m%d_%H%M%S"
+     if include_microseconds:
+         fmt += "_%f"
+     dts = datetime.datetime.strftime(dt, fmt)
+     return dts
+
+
+ def get_eval_factory_dataset_size_from_run_config(run_config: dict) -> Optional[int]:
+     config = run_config["config"]
+     limit_samples = config["params"].get("limit_samples", None)
+     if limit_samples is not None:
+         return int(limit_samples)
+
+     # TODO(dfridman): Move `dataset_size` values to the corresponding
+     # `framework.yaml` in Eval Factory.
+     dataset_sizes = {
+         ("lm-evaluation-harness", "ifeval"): 541,
+         ("simple_evals", "gpqa_diamond"): 198,
+         ("simple_evals", "gpqa_diamond_nemo"): 198,
+         ("simple_evals", "AA_math_test_500"): 500,
+         ("simple_evals", "math_test_500_nemo"): 500,
+         ("simple_evals", "aime_2024_nemo"): 30,
+         ("simple_evals", "AA_AIME_2024"): 30,
+         ("simple_evals", "aime_2025_nemo"): 30,
+         ("simple_evals", "AIME_2025"): 30,
+         ("simple_evals", "humaneval"): 164,
+         ("simple_evals", "mmlu"): 14042,
+         ("simple_evals", "mmlu_pro"): 12032,
+         ("bigcode-evaluation-harness", "mbpp"): 500,
+         ("bigcode-evaluation-harness", "humaneval"): 164,
+         ("livecodebench", "livecodebench_0724_0125"): 315,
+         ("livecodebench", "livecodebench_0824_0225"): 279,
+         ("hle", "hle"): 2684,
+         ("scicode", "aa_scicode"): 338,
+     }
+     dataset_size = dataset_sizes.get((run_config["framework_name"], config["type"]))
+     if dataset_size is None:
+         return None
+     else:
+         dataset_size = int(dataset_size)
+
+     downsampling_ratio = (
+         config["params"].get("extra", {}).get("downsampling_ratio", None)
+     )
+     if downsampling_ratio is not None:
+         dataset_size = int(round(dataset_size * downsampling_ratio))
+
+     n_samples = int(config["params"].get("extra", {}).get("n_samples", 1))
+     dataset_size *= n_samples
+     return dataset_size
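
A worked example of the size arithmetic above (run_config invented for illustration):

    run_config = {
        "framework_name": "simple_evals",
        "config": {
            "type": "mmlu",
            "params": {"extra": {"downsampling_ratio": 0.1, "n_samples": 4}},
        },
    }
    # 14042 * 0.1 -> 1404 (rounded), then * 4 samples:
    assert get_eval_factory_dataset_size_from_run_config(run_config) == 5616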