nemo-evaluator-launcher 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (57) hide show
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +678 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +134 -0
  12. nemo_evaluator_launcher/cli/main.py +143 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +120 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +194 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +996 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +38 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0.dist-info/METADATA +494 -0
  53. nemo_evaluator_launcher-0.1.0.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,295 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import importlib
17
+ import pathlib
18
+ import sys
19
+ from importlib import resources
20
+ from typing import Any, Optional
21
+
22
+ import requests
23
+
24
+ if sys.version_info >= (3, 11):
25
+ import tomllib
26
+ else:
27
+ import tomli as tomllib
28
+
29
+ from nemo_evaluator_launcher.common.logging_utils import logger
30
+
31
+ # Configuration constants
32
+ # For below, see docs: https://docs.github.com/en/rest/repos/contents
33
+ MAPPING_URL = "https://raw.githubusercontent.com/NVIDIA-NeMo/Eval/main/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/mapping.toml"
34
+ CACHE_DIR = pathlib.Path.home() / ".nemo-evaluator" / "cache"
35
+ CACHE_FILENAME = "mapping.toml"
36
+ INTERNAL_RESOURCES_PKG = "nemo_evaluator_launcher_internal.resources"
37
+
38
+
39
def _ensure_cache_dir() -> None:
    """Create the cache directory, including missing parents, if it is absent."""
    CACHE_DIR.mkdir(exist_ok=True, parents=True)
42
+
43
+
44
def _get_cache_file() -> pathlib.Path:
    """Return the location of the cached mapping file.

    Returns:
        pathlib.Path: ``CACHE_DIR`` joined with ``CACHE_FILENAME``.
    """
    return CACHE_DIR.joinpath(CACHE_FILENAME)
51
+
52
+
53
def _download_latest_mapping() -> Optional[bytes]:
    """Fetch the newest mapping file from ``MAPPING_URL``.

    Returns:
        Optional[bytes]: Raw bytes of the downloaded mapping, or None when the
            request fails (the failure is logged as a warning).
    """
    try:
        reply = requests.get(MAPPING_URL, timeout=10)
        reply.raise_for_status()

        # A raw GitHub URL serves the file itself, so the body is the mapping.
        payload = reply.content
        assert isinstance(payload, bytes)

        logger.debug("Successfully downloaded mapping from remote URL")
        return payload
    except (requests.RequestException, OSError) as e:
        logger.warning("Failed to download mapping from remote URL", error=str(e))
        return None
72
+
73
+
74
def _load_cached_mapping() -> Optional[dict[Any, Any]]:
    """Load mapping from cache file.

    Returns:
        Optional[dict]: Parsed TOML mapping, or None if the cache file does not
            exist, cannot be read, or does not hold valid UTF-8 encoded TOML.
    """
    cache_file = _get_cache_file()
    try:
        with open(cache_file, "rb") as f:
            mapping = tomllib.load(f)
    except FileNotFoundError:
        # A missing cache is an expected state, so it is not worth a warning.
        return None
    except (OSError, UnicodeDecodeError, tomllib.TOMLDecodeError) as e:
        # tomllib.load decodes the bytes as UTF-8 before parsing, so a corrupted
        # cache can surface as UnicodeDecodeError rather than TOMLDecodeError.
        logger.warning("Failed to load mapping from cache", error=str(e))
        return None

    logger.debug("Loaded mapping from cache")
    return mapping  # type: ignore[no-any-return]
92
+
93
+
94
def _save_mapping_to_cache(mapping_bytes: bytes) -> None:
    """Persist raw mapping bytes into the cache file.

    Filesystem failures are logged as warnings and are not raised.

    Args:
        mapping_bytes: Raw mapping content to write.
    """
    try:
        _ensure_cache_dir()
        # Overwrites any previously cached mapping.
        _get_cache_file().write_bytes(mapping_bytes)
    except OSError as e:
        logger.warning("Failed to save mapping to cache", error=str(e))
110
+
111
+
112
def _load_packaged_resource(
    resource_name: str, pkg_name: str = "nemo_evaluator_launcher.resources"
) -> dict[str, Any]:
    """Read and parse a TOML resource bundled inside a package.

    Args:
        resource_name: File name of the resource within the package.
        pkg_name: Importable package that holds the resource.

    Returns:
        dict[str, Any]: Parsed TOML content of the resource.

    Raises:
        RuntimeError: If opening the resource fails with an OSError or its
            content is not valid TOML.
    """
    try:
        resource_path = resources.files(pkg_name).joinpath(resource_name)
        with resource_path.open("rb") as stream:
            parsed: dict[str, Any] = tomllib.load(stream)
        logger.info(
            "Loaded resource from packaged file", resource=resource_name, pkg=pkg_name
        )
        return parsed
    except (OSError, tomllib.TOMLDecodeError) as e:
        logger.error(
            "Failed to load from packaged file",
            resource=resource_name,
            pkg=pkg_name,
            error=str(e),
        )
        raise RuntimeError(f"Failed to load {resource_name} from packaged file") from e
136
+
137
+
138
+ def _process_mapping(mapping_toml: dict) -> dict:
139
+ """Process the raw mapping TOML into the expected format.
140
+
141
+ Args:
142
+ mapping_toml: Raw mapping TOML data.
143
+ Returns:
144
+ dict: Processed mapping in the expected format.
145
+ """
146
+ mapping = {}
147
+ for harness_name, harness_data in mapping_toml.items():
148
+ assert isinstance(harness_data["tasks"], dict)
149
+ for endpoint_type, harness_tasks in harness_data["tasks"].items():
150
+ assert isinstance(harness_tasks, dict)
151
+ for task_name, task_data in harness_tasks.items():
152
+ assert isinstance(task_data, dict)
153
+ key = (harness_name, task_name)
154
+ if key in mapping:
155
+ raise KeyError(
156
+ f"(harness,task)-tuple key {repr(key)} already exists in the mapping"
157
+ )
158
+ mapping[key] = {
159
+ "task": task_name,
160
+ "harness": harness_name,
161
+ "container": harness_data["container"],
162
+ "endpoint_type": endpoint_type,
163
+ }
164
+ for task_data_key in task_data.keys():
165
+ if task_data_key in mapping[key]:
166
+ raise KeyError(
167
+ f"{repr(task_data_key)} is not allowed as key under {repr(key)} in the mapping"
168
+ )
169
+ mapping[key].update(task_data)
170
+ return mapping
171
+
172
+
173
def load_tasks_mapping(
    latest: bool = False,
    mapping_toml: pathlib.Path | str | None = None,
) -> dict[tuple[str, str], dict]:
    """Load tasks mapping.

    The source of the mapping is chosen in this order:
        1. If latest==True -> fetch MAPPING_URL, save it to the cache and use it;
           if the download fails, fall back to the previously cached copy.
        2. Else if mapping_toml is not None -> load mapping from this path.
        3. Otherwise (default) -> load the packaged mapping.

    If the `nemo_evaluator_launcher_internal` package can be imported, its
    packaged mapping is merged on top of the result (internal entries win).
    Any failure while loading that internal mapping is logged and ignored.

    Args:
        latest: Whether to fetch the mapping from MAPPING_URL.
        mapping_toml: Path to a mapping TOML file; ignored when latest is True.

    Returns:
        dict: Mapping of (harness_name, task_name) to dict holding their configuration.

    Raises:
        RuntimeError: If latest is True and neither the download nor the cache
            yields a mapping, or if the packaged mapping cannot be loaded.
    """
    local_mapping: dict = {}
    if latest:
        mapping_bytes = _download_latest_mapping()
        if mapping_bytes:
            # Parse and validate before caching, so that a malformed download
            # never overwrites a previously good cache file.
            local_mapping = _process_mapping(
                tomllib.loads(mapping_bytes.decode("utf-8"))
            )
            _save_mapping_to_cache(mapping_bytes)
        else:
            # Fallback to cached mapping; raise only if cache is missing/invalid
            cached = _load_cached_mapping()
            if not cached:
                raise RuntimeError("could not download latest mapping")
            local_mapping = _process_mapping(cached)
    elif mapping_toml is not None:
        with open(mapping_toml, "rb") as f:
            local_mapping = _process_mapping(tomllib.load(f))
    else:
        local_mapping = _process_mapping(_load_packaged_resource(CACHE_FILENAME))

    # TODO: make more elegant. We consider it ok to avoid a fully-blown plugin system.
    # Check if nemo_evaluator_launcher_internal is available and load its mapping.toml
    # CAVEAT: lazy-loading here, not somewhere top level, is important, to ensure
    # order of package initialization.
    try:
        importlib.import_module("nemo_evaluator_launcher_internal")
        logger.debug("Internal package available, loading internal mapping")
        internal_mapping = _process_mapping(
            _load_packaged_resource(CACHE_FILENAME, INTERNAL_RESOURCES_PKG)
        )

        # Merge internal mapping with local mapping (internal takes precedence)
        local_mapping.update(internal_mapping)
        logger.info(
            "Successfully merged internal mapping", internal_tasks=len(internal_mapping)
        )
    except ImportError:
        logger.debug("Internal package not available, using external mapping only")
    except Exception as e:
        # Deliberate best-effort: the internal mapping is optional.
        logger.warning("Failed to load internal mapping", error=str(e))

    return local_mapping
232
+
233
+
234
def get_task_from_mapping(query: str, mapping: dict[Any, Any]) -> dict[Any, Any]:
    """Unambiguously selects one task from the mapping based on the query.

    Args:
        query: Either `task_name` or `harness_name.task_name`.
        mapping: The object returned from `load_tasks_mapping` function.

    Returns:
        dict: Task data.

    Raises:
        ValueError: If the query matches no task, if a bare task name matches
            tasks of several harnesses, or if the query contains more than one
            '.' character.
    """
    num_dots = query.count(".")

    # if there are no dots in query, treat it like a task name
    if num_dots == 0:
        matching_keys = [key for key in mapping if key[1] == query]
        if not matching_keys:
            raise ValueError(f"task {repr(query)} does not exist in the mapping")
        if len(matching_keys) > 1:
            matching_queries = [
                f"{harness_name}.{task_name}"
                for harness_name, task_name in matching_keys
            ]
            raise ValueError(
                f"there are multiple tasks named {repr(query)} in the mapping,"
                f" please select one of {repr(matching_queries)}"
            )
        return mapping[matching_keys[0]]  # type: ignore[no-any-return]

    # if there is one dot in query, treat it like "{harness_name}.{task_name}"
    if num_dots == 1:
        harness_name, task_name = query.split(".")
        key = (harness_name, task_name)
        # Dict keys are unique, so a direct lookup is enough: there can never
        # be more than one exact (harness, task) match.
        if key not in mapping:
            raise ValueError(
                f"harness.task {repr(query)} does not exist in the mapping"
            )
        return mapping[key]  # type: ignore[no-any-return]

    # invalid query
    raise ValueError(
        f"invalid query={repr(query)} for task mapping,"
        " it must contain exactly zero or one occurrence of '.' character"
    )
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
@@ -0,0 +1,28 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ defaults:
17
+ - execution: local
18
+ - deployment: none
19
+ - _self_
20
+
21
+ # NOTE(dfridman): If deployment is used, `target` parameters will be automatically populated.
22
+ target:
23
+ api_endpoint:
24
+ url: ???
25
+ model_id: ???
26
+ api_key_name: "<YOUR_API_KEY_NAME>" # NOTE: the name of the env var
27
+
28
+ evaluation: []
@@ -0,0 +1,32 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ type: nim
17
+ image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
18
+ served_model_name: ???
19
+ port: 8000
20
+
21
+ # NIM containers use default entrypoint - no custom command needed
22
+ # Configuration is done via environment variables in lepton_config
23
+
24
+ endpoints:
25
+ chat: /v1/chat/completions
26
+ completions: /v1/completions
27
+ health: /health
28
+ # Note: Environment variables should be configured in lepton_config.envs
29
+ # Auto-derived environment variables from deployment config:
30
+ # - SERVED_MODEL_NAME (from served_model_name)
31
+ # - NIM_MODEL_NAME (from served_model_name for NIM)
32
+ # - MODEL_PORT (from port)
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ type: none
@@ -0,0 +1,38 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ type: sglang
17
+ image: lmsysorg/sglang:latest
18
+ checkpoint_path: ???
19
+ served_model_name: ???
20
+ port: 8000
21
+ tensor_parallel_size: 8
22
+ data_parallel_size: 1
23
+ extra_args: ""
24
+ env_vars: {} # {name: value} dict
25
+
26
+ endpoints:
27
+ chat: /v1/chat/completions
28
+ completions: /v1/completions
29
+ health: /health
30
+
31
+ command: python3 -m sglang.launch_server
32
+ --model-path ${oc.select:deployment.hf_model_handle,/checkpoint}
33
+ --host 0.0.0.0
34
+ --port ${deployment.port}
35
+ --served-model-name ${deployment.served_model_name}
36
+ --tp ${deployment.tensor_parallel_size}
37
+ --dp ${deployment.data_parallel_size}
38
+ ${deployment.extra_args}
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ type: vllm
17
+ image: vllm/vllm-openai:latest
18
+ checkpoint_path: ???
19
+ served_model_name: ???
20
+ port: 8000
21
+ tensor_parallel_size: 8
22
+ pipeline_parallel_size: 1
23
+ data_parallel_size: 1
24
+ extra_args: ""
25
+ env_vars: {} # {name: value} dict
26
+
27
+ endpoints:
28
+ chat: /v1/chat/completions
29
+ completions: /v1/completions
30
+ health: /health
31
+
32
+ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
33
+ --tensor-parallel-size=${deployment.tensor_parallel_size}
34
+ --pipeline-parallel-size=${deployment.pipeline_parallel_size}
35
+ --data-parallel-size=${deployment.data_parallel_size}
36
+ --port ${deployment.port}
37
+ --trust-remote-code
38
+ --served-model-name ${deployment.served_model_name}
39
+ --enforce-eager
40
+ --gpu-memory-utilization 0.95
41
+ ${deployment.extra_args}
@@ -0,0 +1,92 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # =============================================================================
17
+ # LEPTON EXECUTION CONFIGURATION - Default Environment
18
+ # Following the same pattern as SLURM internal configs
19
+ # =============================================================================
20
+
21
+ type: lepton
22
+ output_dir: ???
23
+
24
+ # Environment variables passed to evaluation containers
25
+ env_var_names: []
26
+
27
+ # Evaluation task settings for Lepton batch execution
28
+ evaluation_tasks:
29
+ # Resource requirements for evaluation tasks
30
+ resource_shape: "cpu.small" # Can be overridden to gpu.small, gpu.1xh200, etc
31
+
32
+ # Timeout for individual evaluation tasks (in seconds)
33
+ timeout: 3600 # 1 hour per evaluation task
34
+
35
+ # Whether to use shared storage for results
36
+ use_shared_storage: true
37
+
38
+ # Lepton platform infrastructure settings for this environment
39
+ lepton_platform:
40
+ # Deployment configuration (for model endpoints)
41
+ deployment:
42
+ # Node group for endpoint deployments
43
+ node_group: "nv-int-multiteam-nebius-h200-01-mjgbgffo"
44
+
45
+ # Endpoint readiness timeout (in seconds)
46
+ endpoint_readiness_timeout: 1200 # 20 minutes
47
+
48
+ # Platform defaults that get merged into all deployments
49
+ platform_defaults:
50
+ # Health checks
51
+ health:
52
+ readiness: {}
53
+ liveness: {}
54
+
55
+ # Logging
56
+ log:
57
+ enable_collection: true
58
+
59
+ # Monitoring and routing
60
+ metrics: {}
61
+ routing_policy: {}
62
+
63
+ # Queue configuration for resource prioritization
64
+ queue_config:
65
+ priority_class: "mid-4000"
66
+
67
+ # Security and infrastructure
68
+ enable_rdma: false
69
+ user_security_context: {}
70
+
71
+ # NGC registry access
72
+ image_pull_secrets:
73
+ - "ngc-general-access-and-private-registry-in-sa-nvex-org"
74
+
75
+ # Task execution configuration (for evaluation tasks)
76
+ tasks:
77
+ # Node group for evaluation tasks
78
+ node_group: "nv-int-multiteam-nebius-h200-01"
79
+
80
+ # Default environment variables for all tasks
81
+ env_vars: {}
82
+
83
+ # Storage mounts for task execution
84
+ mounts:
85
+ # Main workspace mount
86
+ - from: "node-nfs:lepton-shared-fs"
87
+ path: "/shared/nemo-evaluator-launcher-workspace"
88
+ mount_path: "/workspace"
89
+
90
+ # Image pull secrets for task containers
91
+ image_pull_secrets:
92
+ - "lepton-nvidia"
@@ -0,0 +1,17 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ type: local
17
+ output_dir: ???
@@ -0,0 +1,33 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Each Slurm cluster has its own flavour; below we provide some defaults that might meet one's needs.
17
+ hostname: ???
18
+ username: ${oc.env:USER}
19
+ account: ???
20
+ partition: batch
21
+ num_nodes: 1
22
+ ntasks_per_node: 1
23
+ gres: gpu:8
24
+ walltime: 01:00:00
25
+ subproject: nemo-evaluator-launcher
26
+ output_dir: ???
27
+ env_vars:
28
+ deployment: {}
29
+ evaluation: {}
30
+ mounts:
31
+ deployment: {}
32
+ evaluation: {}
33
+ mount_home: true
@@ -0,0 +1,22 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # NOTE(dfridman): importing all executors will register them in the registry
17
+
18
+ from nemo_evaluator_launcher.executors.lepton.executor import LeptonExecutor
19
+ from nemo_evaluator_launcher.executors.local.executor import LocalExecutor
20
+ from nemo_evaluator_launcher.executors.slurm.executor import SlurmExecutor
21
+
22
+ __all__ = ["LeptonExecutor", "LocalExecutor", "SlurmExecutor"]