nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (57)
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,310 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import base64
17
+ import importlib
18
+ import os
19
+ import pathlib
20
+ import sys
21
+ from importlib import resources
22
+ from typing import Any, Optional
23
+
24
+ import requests
25
+
26
+ if sys.version_info >= (3, 11):
27
+ import tomllib
28
+ else:
29
+ import tomli as tomllib
30
+
31
+ from nemo_evaluator_launcher.common.logging_utils import logger
32
+
33
# Configuration constants
# For below, see docs: https://docs.gitlab.com/api/repository_files/
# NOTE(review): placeholder URL -- the comment above references the GitLab API
# while the TODO mentions GitHub; confirm the intended remote before wiring
# up `_download_latest_mapping`.
MAPPING_URL = "TODO: set to github actual one"
# Local cache location for the downloaded mapping file.
CACHE_DIR = pathlib.Path.home() / ".nemo-evaluator" / "cache"
CACHE_FILENAME = "mapping.toml"
# Optional internal-resources package probed lazily by load_tasks_mapping().
INTERNAL_RESOURCES_PKG = "nemo_evaluator_launcher_internal.resources"
39
+
40
+
41
def _ensure_cache_dir() -> None:
    """Ensure the cache directory exists."""
    # parents=True creates intermediate dirs; exist_ok makes this idempotent.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
44
+
45
+
46
def _get_cache_file() -> pathlib.Path:
    """Return the path of the cached mapping file.

    Returns:
        pathlib.Path: CACHE_DIR joined with CACHE_FILENAME.
    """
    return CACHE_DIR.joinpath(CACHE_FILENAME)
53
+
54
+
55
def _download_latest_mapping() -> Optional[bytes]:
    """Download latest mapping from MAPPING_URL and return raw bytes.

    Returns:
        Optional[bytes]: Downloaded mapping bytes, or None if download fails.
    """
    # NOTE(review): this function is a stub -- the unconditional raise below
    # makes everything after it unreachable. The dead code is kept as the
    # intended implementation for when MAPPING_URL points at a real endpoint.
    # Callers of latest=True (load_tasks_mapping) will currently see this error.
    raise NotImplementedError("This logic is still not implemented")
    try:
        # Get GitLab token from environment
        gitlab_token = os.environ.get("GITLAB_TOKEN", "")
        if not gitlab_token:
            logger.warning(
                "GITLAB_TOKEN not set, download may fail due to authentication"
            )
        headers = {"PRIVATE-TOKEN": gitlab_token, "Content-Type": "application/json"}

        response = requests.get(MAPPING_URL, headers=headers, timeout=10)
        response.raise_for_status()

        # GitLab API returns JSON with content in base64
        response_data = response.json()
        if "content" in response_data:
            mapping_bytes = base64.b64decode(response_data["content"])
        else:
            # Non-JSON-wrapped responses: take the raw body as the mapping.
            mapping_bytes = response.content
        assert isinstance(mapping_bytes, bytes)

        logger.debug("Successfully downloaded mapping from remote URL")
        return mapping_bytes
    except (requests.RequestException, OSError) as e:
        # Best-effort download: signal failure with None so callers can fall
        # back to the cache.
        logger.warning("Failed to download mapping from remote URL", error=str(e))
        return None
87
+
88
+
89
def _load_cached_mapping() -> Optional[dict[Any, Any]]:
    """Read and parse the cached mapping TOML, if present.

    Returns:
        Optional[dict]: Parsed mapping data, or None when the cache file is
        missing or cannot be read/parsed.
    """
    cache_path = _get_cache_file()
    if not cache_path.exists():
        return None

    try:
        with cache_path.open("rb") as fh:
            parsed = tomllib.load(fh)
        logger.debug("Loaded mapping from cache")
        return parsed  # type: ignore[no-any-return]
    except (OSError, tomllib.TOMLDecodeError) as err:
        # Corrupt or unreadable cache is treated the same as a missing one.
        logger.warning("Failed to load mapping from cache", error=str(err))
        return None
107
+
108
+
109
def _save_mapping_to_cache(mapping_bytes: bytes) -> None:
    """Persist raw mapping bytes to the cache file (best effort).

    Args:
        mapping_bytes: Raw TOML bytes to write.
    """
    try:
        _ensure_cache_dir()
        # Overwrite the cached mapping atomically from the caller's view:
        # a single write of the full payload.
        _get_cache_file().write_bytes(mapping_bytes)
    except OSError as err:
        # Caching is an optimization; failures are logged, never raised.
        logger.warning("Failed to save mapping to cache", error=str(err))
125
+
126
+
127
def _load_packaged_resource(
    resource_name: str, pkg_name: str = "nemo_evaluator_launcher.resources"
) -> dict[str, Any]:
    """Load and parse a TOML resource bundled inside a package.

    Args:
        resource_name: File name of the packaged resource (e.g. "mapping.toml").
        pkg_name: Dotted package that holds the resource files.

    Returns:
        dict: Parsed TOML content.

    Raises:
        RuntimeError: If the resource cannot be read or parsed.
    """
    try:
        resource = resources.files(pkg_name).joinpath(resource_name)
        with resource.open("rb") as fh:
            parsed: dict[str, Any] = tomllib.load(fh)
        logger.info(
            "Loaded resource from packaged file", resource=resource_name, pkg=pkg_name
        )
        return parsed
    except (OSError, tomllib.TOMLDecodeError) as e:
        logger.error(
            "Failed to load from packaged file",
            resource=resource_name,
            pkg=pkg_name,
            error=str(e),
        )
        # Re-raise as RuntimeError so callers need not know about TOML details.
        raise RuntimeError(f"Failed to load {resource_name} from packaged file") from e
151
+
152
+
153
+ def _process_mapping(mapping_toml: dict) -> dict:
154
+ """Process the raw mapping TOML into the expected format.
155
+
156
+ Args:
157
+ mapping_toml: Raw mapping TOML data.
158
+ Returns:
159
+ dict: Processed mapping in the expected format.
160
+ """
161
+ mapping = {}
162
+ for harness_name, harness_data in mapping_toml.items():
163
+ assert isinstance(harness_data["tasks"], dict)
164
+ for endpoint_type, harness_tasks in harness_data["tasks"].items():
165
+ assert isinstance(harness_tasks, dict)
166
+ for task_name, task_data in harness_tasks.items():
167
+ assert isinstance(task_data, dict)
168
+ key = (harness_name, task_name)
169
+ if key in mapping:
170
+ raise KeyError(
171
+ f"(harness,task)-tuple key {repr(key)} already exists in the mapping"
172
+ )
173
+ mapping[key] = {
174
+ "task": task_name,
175
+ "harness": harness_name,
176
+ "container": harness_data["container"],
177
+ "endpoint_type": endpoint_type,
178
+ }
179
+ for task_data_key in task_data.keys():
180
+ if task_data_key in mapping[key]:
181
+ raise KeyError(
182
+ f"{repr(task_data_key)} is not allowed as key under {repr(key)} in the mapping"
183
+ )
184
+ mapping[key].update(task_data)
185
+ return mapping
186
+
187
+
188
def load_tasks_mapping(
    latest: bool = False,
    mapping_toml: pathlib.Path | str | None = None,
) -> dict[tuple[str, str], dict]:
    """Load tasks mapping.

    The function obeys the following priority rules:
    1. (Default) If latest==False and mapping_toml is None -> load packaged mapping.
    2. If latest==True -> fetch MAPPING_URL, save to cache, load it.
    3. If mapping_toml is not None -> load mapping from this path.

    Args:
        latest: When True, try to download the newest mapping; falls back to
            the local cache only when the download returns None.
        mapping_toml: Optional path to a mapping TOML file to load instead of
            the packaged resource (ignored when latest=True).

    Returns:
        dict: Mapping of (harness_name, task_name) to dict holding their configuration.

    Raises:
        RuntimeError: If latest=True and neither download nor cache yields data.

    NOTE(review): `_download_latest_mapping` currently raises
    NotImplementedError unconditionally, and that exception is not caught
    here, so latest=True presently propagates it -- confirm intended behavior.
    """
    local_mapping: dict = {}
    if latest:
        mapping_bytes = _download_latest_mapping()
        if mapping_bytes:
            # Refresh the cache before parsing so a later parse failure still
            # leaves the newest bytes cached.
            _save_mapping_to_cache(mapping_bytes)
            local_mapping = _process_mapping(
                tomllib.loads(mapping_bytes.decode("utf-8"))
            )
        else:
            # Fallback to cached mapping; raise only if cache is missing/invalid
            cached = _load_cached_mapping()
            if cached:
                local_mapping = _process_mapping(cached)
            else:
                raise RuntimeError("could not download latest mapping")

    elif mapping_toml is not None:
        # Explicit user-provided mapping file.
        with open(mapping_toml, "rb") as f:
            local_mapping = _process_mapping(tomllib.load(f))
    else:
        # Default path: the mapping.toml shipped in this package's resources.
        local_mapping = _process_mapping(_load_packaged_resource(CACHE_FILENAME))

    # TODO: make more elegant. We consider it ok to avoid a fully-blown plugin system.
    # Check if nemo_evaluator_launcher_internal is available and load its mapping.toml
    # CAVEAT: lazy-loading here, not somewhere top level, is important, to ensure
    # order of package initialization.
    try:
        importlib.import_module("nemo_evaluator_launcher_internal")
        logger.debug("Internal package available, loading internal mapping")
        internal_mapping = _process_mapping(
            _load_packaged_resource(CACHE_FILENAME, INTERNAL_RESOURCES_PKG)
        )

        # Merge internal mapping with local mapping (internal takes precedence)
        local_mapping.update(internal_mapping)
        logger.info(
            "Successfully merged internal mapping", internal_tasks=len(internal_mapping)
        )
    except ImportError:
        logger.debug("Internal package not available, using external mapping only")
    except Exception as e:
        # Internal mapping is strictly optional: any failure degrades to the
        # external mapping instead of aborting.
        logger.warning("Failed to load internal mapping", error=str(e))

    return local_mapping
247
+
248
+
249
def get_task_from_mapping(query: str, mapping: dict[Any, Any]) -> dict[Any, Any]:
    """Unambiguously select one task from the mapping based on the query.

    Args:
        query: Either `task_name` or `harness_name.task_name`.
        mapping: The object returned from `load_tasks_mapping` function.

    Returns:
        dict: Task data.

    Raises:
        ValueError: If the query is malformed, matches nothing, or is ambiguous.
    """
    dot_count = query.count(".")

    # Guard: anything with two or more dots cannot be resolved.
    if dot_count > 1:
        raise ValueError(
            f"invalid query={repr(query)} for task mapping,"
            " it must contain exactly zero or one occurrence of '.' character"
        )

    if dot_count == 0:
        # Bare task name: search across all harnesses.
        hits = [k for k in mapping if k[1] == query]
        if not hits:
            raise ValueError(f"task {repr(query)} does not exist in the mapping")
        if len(hits) > 1:
            # Ambiguous: tell the caller which qualified names would work.
            candidates = [f"{harness}.{task}" for harness, task in hits]
            raise ValueError(
                f"there are multiple tasks named {repr(query)} in the mapping,"
                f" please select one of {repr(candidates)}"
            )
        return mapping[hits[0]]  # type: ignore[no-any-return]

    # Exactly one dot: "{harness_name}.{task_name}" lookup.
    harness_name, task_name = query.split(".")
    hits = [k for k in mapping if k == (harness_name, task_name)]
    if not hits:
        raise ValueError(
            f"harness.task {repr(query)} does not exist in the mapping"
        )
    if len(hits) > 1:
        # Duplicate tuple keys cannot occur in a well-formed dict mapping.
        raise ValueError(
            f"there are multiple matches for {repr(query)} in the mapping,"
            " which means the mapping is not correct"
        )
    return mapping[hits[0]]  # type: ignore[no-any-return]
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
@@ -0,0 +1,28 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# Hydra composition defaults: local execution, no deployment, then this file.
defaults:
  - execution: local
  - deployment: none
  - _self_

# NOTE(dfridman): If deployment is used, `target` parameters will be automatically populated.
target:
  api_endpoint:
    url: ???        # endpoint URL; `???` marks a mandatory value to be provided
    model_id: ???   # model identifier; mandatory
    api_key_name: "<YOUR_API_KEY_NAME>" # NOTE: the name of the env var

# List of evaluation entries; empty by default.
evaluation: []
@@ -0,0 +1,32 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# NIM deployment config.
type: nim
image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
served_model_name: ???
port: 8000

# NIM containers use default entrypoint - no custom command needed
# Configuration is done via environment variables in lepton_config

# OpenAI-compatible endpoint paths exposed by the container.
endpoints:
  chat: /v1/chat/completions
  completions: /v1/completions
  health: /health
# Note: Environment variables should be configured in lepton_config.envs
# Auto-derived environment variables from deployment config:
# - SERVED_MODEL_NAME (from served_model_name)
# - NIM_MODEL_NAME (from served_model_name for NIM)
# - MODEL_PORT (from port)
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# "none" deployment: the launcher deploys no model; the evaluation target is
# presumably supplied via the top-level `target.api_endpoint` config instead.
type: none
@@ -0,0 +1,38 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# SGLang deployment config.
type: sglang
image: lmsysorg/sglang:latest
checkpoint_path: ???       # mandatory (OmegaConf `???` marker)
served_model_name: ???     # mandatory
port: 8000
tensor_parallel_size: 8
data_parallel_size: 1
extra_args: ""             # appended verbatim to the launch command
env_vars: {} # {name: value} dict

endpoints:
  chat: /v1/chat/completions
  completions: /v1/completions
  health: /health

# Launch command; `oc.select` falls back to /checkpoint when
# deployment.hf_model_handle is not set.
command: python3 -m sglang.launch_server
  --model-path ${oc.select:deployment.hf_model_handle,/checkpoint}
  --host 0.0.0.0
  --port ${deployment.port}
  --served-model-name ${deployment.served_model_name}
  --tp ${deployment.tensor_parallel_size}
  --dp ${deployment.data_parallel_size}
  ${deployment.extra_args}
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# vLLM deployment config.
type: vllm
image: vllm/vllm-openai:latest
checkpoint_path: ???       # mandatory (OmegaConf `???` marker)
served_model_name: ???     # mandatory
port: 8000
tensor_parallel_size: 8
pipeline_parallel_size: 1
data_parallel_size: 1
extra_args: ""             # appended verbatim to the serve command
env_vars: {} # {name: value} dict

endpoints:
  chat: /v1/chat/completions
  completions: /v1/completions
  health: /health

# Serve command; `oc.select` falls back to /checkpoint when
# deployment.hf_model_handle is not set.
command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
  --tensor-parallel-size=${deployment.tensor_parallel_size}
  --pipeline-parallel-size=${deployment.pipeline_parallel_size}
  --data-parallel-size=${deployment.data_parallel_size}
  --port ${deployment.port}
  --trust-remote-code
  --served-model-name ${deployment.served_model_name}
  --enforce-eager
  --gpu-memory-utilization 0.95
  ${deployment.extra_args}
@@ -0,0 +1,92 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# =============================================================================
# LEPTON EXECUTION CONFIGURATION - Default Environment
# Following the same pattern as SLURM internal configs
# =============================================================================

type: lepton
output_dir: ???   # mandatory (OmegaConf `???` marker)

# Environment variables passed to evaluation containers
env_var_names: []

# Evaluation task settings for Lepton batch execution
evaluation_tasks:
  # Resource requirements for evaluation tasks
  resource_shape: "cpu.small" # Can be overridden to gpu.small, gpu.1xh200, etc

  # Timeout for individual evaluation tasks (in seconds)
  timeout: 3600 # 1 hour per evaluation task

  # Whether to use shared storage for results
  use_shared_storage: true

# Lepton platform infrastructure settings for this environment
lepton_platform:
  # Deployment configuration (for model endpoints)
  deployment:
    # Node group for endpoint deployments
    node_group: "nv-int-multiteam-nebius-h200-01-mjgbgffo"

    # Endpoint readiness timeout (in seconds)
    endpoint_readiness_timeout: 1200 # 20 minutes

    # Platform defaults that get merged into all deployments
    platform_defaults:
      # Health checks
      health:
        readiness: {}
        liveness: {}

      # Logging
      log:
        enable_collection: true

      # Monitoring and routing
      metrics: {}
      routing_policy: {}

      # Queue configuration for resource prioritization
      queue_config:
        priority_class: "mid-4000"

      # Security and infrastructure
      enable_rdma: false
      user_security_context: {}

      # NGC registry access
      image_pull_secrets:
        - "ngc-general-access-and-private-registry-in-sa-nvex-org"

  # Task execution configuration (for evaluation tasks)
  tasks:
    # Node group for evaluation tasks
    node_group: "nv-int-multiteam-nebius-h200-01"

    # Default environment variables for all tasks
    env_vars: {}

    # Storage mounts for task execution
    mounts:
      # Main workspace mount
      - from: "node-nfs:lepton-shared-fs"
        path: "/shared/nemo-evaluator-launcher-workspace"
        mount_path: "/workspace"

    # Image pull secrets for task containers
    image_pull_secrets:
      - "lepton-nvidia"
@@ -0,0 +1,17 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# Local execution config: runs evaluations on the current machine.
type: local
output_dir: ???   # mandatory (OmegaConf `???` marker): where results are written
@@ -0,0 +1,33 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
# Each slurm cluster has its own flavour, below we provide some defaults that might meet one's needs.
hostname: ???                # mandatory (OmegaConf `???` marker): cluster hostname
username: ${oc.env:USER}     # defaults to the local $USER via the env resolver
account: ???                 # mandatory: Slurm account
partition: batch
num_nodes: 1
ntasks_per_node: 1
gres: gpu:8
walltime: 01:00:00
subproject: nemo-evaluator-launcher
output_dir: ???              # mandatory: directory for job artifacts
# Per-stage environment variables (deployment vs evaluation containers).
env_vars:
  deployment: {}
  evaluation: {}
# Per-stage container mounts.
mounts:
  deployment: {}
  evaluation: {}
mount_home: true
@@ -0,0 +1,22 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # NOTE(dfridman): importing all executors will register them in the registry
17
+
18
+ from nemo_evaluator_launcher.executors.lepton.executor import LeptonExecutor
19
+ from nemo_evaluator_launcher.executors.local.executor import LocalExecutor
20
+ from nemo_evaluator_launcher.executors.slurm.executor import SlurmExecutor
21
+
22
+ __all__ = ["LeptonExecutor", "LocalExecutor", "SlurmExecutor"]