nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- nemo_evaluator_launcher/__init__.py +65 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +641 -0
- nemo_evaluator_launcher/api/types.py +89 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +148 -0
- nemo_evaluator_launcher/cli/info.py +117 -0
- nemo_evaluator_launcher/cli/kill.py +39 -0
- nemo_evaluator_launcher/cli/ls_runs.py +113 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
- nemo_evaluator_launcher/cli/main.py +136 -0
- nemo_evaluator_launcher/cli/run.py +135 -0
- nemo_evaluator_launcher/cli/status.py +118 -0
- nemo_evaluator_launcher/cli/version.py +52 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +189 -0
- nemo_evaluator_launcher/common/helpers.py +157 -0
- nemo_evaluator_launcher/common/logging_utils.py +349 -0
- nemo_evaluator_launcher/common/mapping.py +310 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +97 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +491 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +112 -0
- nemo_evaluator_launcher/exporters/gsheets.py +391 -0
- nemo_evaluator_launcher/exporters/local.py +488 -0
- nemo_evaluator_launcher/exporters/mlflow.py +448 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +669 -0
- nemo_evaluator_launcher/exporters/wandb.py +376 -0
- nemo_evaluator_launcher/package_info.py +35 -0
- nemo_evaluator_launcher/resources/mapping.toml +344 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import base64
|
|
17
|
+
import importlib
|
|
18
|
+
import os
|
|
19
|
+
import pathlib
|
|
20
|
+
import sys
|
|
21
|
+
from importlib import resources
|
|
22
|
+
from typing import Any, Optional
|
|
23
|
+
|
|
24
|
+
import requests
|
|
25
|
+
|
|
26
|
+
if sys.version_info >= (3, 11):
|
|
27
|
+
import tomllib
|
|
28
|
+
else:
|
|
29
|
+
import tomli as tomllib
|
|
30
|
+
|
|
31
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
32
|
+
|
|
33
|
+
# Configuration constants
# For below, see docs: https://docs.gitlab.com/api/repository_files/
# NOTE(review): the docs link above is for the GitLab API, while the placeholder
# value below mentions GitHub -- confirm which host the mapping is served from.
MAPPING_URL = "TODO: set to github actual one"
# Cache directory, located under the current user's home directory.
CACHE_DIR = pathlib.Path.home() / ".nemo-evaluator" / "cache"
# File name of the mapping; used for the cache file and for the packaged resource.
CACHE_FILENAME = "mapping.toml"
# Package searched for an additional mapping.toml when the optional
# nemo_evaluator_launcher_internal package can be imported.
INTERNAL_RESOURCES_PKG = "nemo_evaluator_launcher_internal.resources"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ensure_cache_dir() -> None:
    """Create the cache directory (and any missing parents) if it is absent."""
    cache_root = CACHE_DIR
    # exist_ok makes repeated calls harmless once the directory is in place.
    cache_root.mkdir(exist_ok=True, parents=True)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_cache_file() -> pathlib.Path:
    """Build the location of the cached mapping file.

    Returns:
        pathlib.Path: ``CACHE_DIR`` joined with ``CACHE_FILENAME``.
    """
    return CACHE_DIR.joinpath(CACHE_FILENAME)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _download_latest_mapping() -> Optional[bytes]:
    """Download latest mapping from MAPPING_URL and return raw bytes.

    The request is only attempted when ``MAPPING_URL`` is an http(s) URL. While
    it still holds a placeholder value, ``NotImplementedError`` is raised, which
    is what this function did unconditionally before.

    Returns:
        Optional[bytes]: Downloaded mapping bytes, or None if download fails.

    Raises:
        NotImplementedError: If ``MAPPING_URL`` is not an http(s) URL yet.
    """
    # Guarded rather than unconditional, so the download logic below is no
    # longer dead code once a real URL is configured.
    if not MAPPING_URL.startswith(("http://", "https://")):
        raise NotImplementedError("This logic is still not implemented")

    try:
        # Get GitLab token from environment
        gitlab_token = os.environ.get("GITLAB_TOKEN", "")
        if not gitlab_token:
            logger.warning(
                "GITLAB_TOKEN not set, download may fail due to authentication"
            )
        headers = {"PRIVATE-TOKEN": gitlab_token, "Content-Type": "application/json"}

        response = requests.get(MAPPING_URL, headers=headers, timeout=10)
        response.raise_for_status()

        # GitLab API returns JSON with content in base64. A body that is not
        # JSON at all is treated as the raw mapping file instead of an error.
        try:
            response_data = response.json()
        except ValueError:
            response_data = None

        if isinstance(response_data, dict) and "content" in response_data:
            mapping_bytes = base64.b64decode(response_data["content"])
        else:
            mapping_bytes = response.content

        logger.debug("Successfully downloaded mapping from remote URL")
        return mapping_bytes
    except (requests.RequestException, OSError, ValueError, TypeError) as e:
        # ValueError/TypeError cover a malformed base64 payload; the caller
        # treats None as "download failed" and falls back to the cache.
        logger.warning("Failed to download mapping from remote URL", error=str(e))
        return None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _load_cached_mapping() -> Optional[dict[Any, Any]]:
    """Read and parse the cached mapping file, if there is one.

    Returns:
        Optional[dict]: Parsed TOML data, or None when the cache file does not
        exist, cannot be read, or is not valid TOML.
    """
    cached_path = _get_cache_file()
    if cached_path.exists():
        try:
            with cached_path.open("rb") as stream:
                parsed = tomllib.load(stream)
            logger.debug("Loaded mapping from cache")
            return parsed  # type: ignore[no-any-return]
        except (OSError, tomllib.TOMLDecodeError) as err:
            # A broken cache is reported but never fatal at this level.
            logger.warning("Failed to load mapping from cache", error=str(err))
    return None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _save_mapping_to_cache(mapping_bytes: bytes) -> None:
    """Write the raw mapping bytes to the cache file.

    Failures to create the directory or write the file are logged as a
    warning and otherwise ignored.

    Args:
        mapping_bytes: Mapping data to save.
    """
    try:
        _ensure_cache_dir()
        # Overwrites any previous cache content.
        _get_cache_file().write_bytes(mapping_bytes)
    except OSError as err:
        logger.warning("Failed to save mapping to cache", error=str(err))
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _load_packaged_resource(
    resource_name: str, pkg_name: str = "nemo_evaluator_launcher.resources"
) -> dict[str, Any]:
    """Parse a TOML resource shipped inside a Python package.

    Args:
        resource_name: The name of the resource to load.
        pkg_name: Importable package that contains the resource.

    Returns:
        dict: The parsed TOML document.

    Raises:
        RuntimeError: If the resource cannot be opened or is not valid TOML.
    """
    try:
        resource_path = resources.files(pkg_name).joinpath(resource_name)
        with resource_path.open("rb") as stream:
            parsed: dict[str, Any] = tomllib.load(stream)
        logger.info(
            "Loaded resource from packaged file", resource=resource_name, pkg=pkg_name
        )
        return parsed
    except (OSError, tomllib.TOMLDecodeError) as err:
        logger.error(
            "Failed to load from packaged file",
            resource=resource_name,
            pkg=pkg_name,
            error=str(err),
        )
        # Chain the original error so the root cause stays visible.
        raise RuntimeError(f"Failed to load {resource_name} from packaged file") from err
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _process_mapping(mapping_toml: dict) -> dict:
|
|
154
|
+
"""Process the raw mapping TOML into the expected format.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
mapping_toml: Raw mapping TOML data.
|
|
158
|
+
Returns:
|
|
159
|
+
dict: Processed mapping in the expected format.
|
|
160
|
+
"""
|
|
161
|
+
mapping = {}
|
|
162
|
+
for harness_name, harness_data in mapping_toml.items():
|
|
163
|
+
assert isinstance(harness_data["tasks"], dict)
|
|
164
|
+
for endpoint_type, harness_tasks in harness_data["tasks"].items():
|
|
165
|
+
assert isinstance(harness_tasks, dict)
|
|
166
|
+
for task_name, task_data in harness_tasks.items():
|
|
167
|
+
assert isinstance(task_data, dict)
|
|
168
|
+
key = (harness_name, task_name)
|
|
169
|
+
if key in mapping:
|
|
170
|
+
raise KeyError(
|
|
171
|
+
f"(harness,task)-tuple key {repr(key)} already exists in the mapping"
|
|
172
|
+
)
|
|
173
|
+
mapping[key] = {
|
|
174
|
+
"task": task_name,
|
|
175
|
+
"harness": harness_name,
|
|
176
|
+
"container": harness_data["container"],
|
|
177
|
+
"endpoint_type": endpoint_type,
|
|
178
|
+
}
|
|
179
|
+
for task_data_key in task_data.keys():
|
|
180
|
+
if task_data_key in mapping[key]:
|
|
181
|
+
raise KeyError(
|
|
182
|
+
f"{repr(task_data_key)} is not allowed as key under {repr(key)} in the mapping"
|
|
183
|
+
)
|
|
184
|
+
mapping[key].update(task_data)
|
|
185
|
+
return mapping
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def load_tasks_mapping(
    latest: bool = False,
    mapping_toml: pathlib.Path | str | None = None,
) -> dict[tuple[str, str], dict]:
    """Load tasks mapping.

    The source of the mapping is chosen in this order:
        1. If latest==True -> fetch MAPPING_URL, save to cache, load it; when
           the download yields nothing, fall back to the cached copy.
           ``mapping_toml`` is not consulted in this case.
        2. Else if mapping_toml is not None -> load mapping from this path.
        3. Otherwise (default) -> load the packaged mapping.

    Afterwards, if ``nemo_evaluator_launcher_internal`` can be imported, its
    packaged mapping is merged on top; internal entries win on key clashes.

    Returns:
        dict: Mapping of (harness_name, task_name) to dict holding their configuration.

    Raises:
        RuntimeError: If ``latest`` is set and neither the download nor the
            cache provides a mapping.
    """
    local_mapping: dict = {}
    if latest:
        downloaded = _download_latest_mapping()
        if downloaded:
            _save_mapping_to_cache(downloaded)
            raw_toml = tomllib.loads(downloaded.decode("utf-8"))
        else:
            # Fallback to cached mapping; raise only if cache is missing/invalid
            raw_toml = _load_cached_mapping()
            if not raw_toml:
                raise RuntimeError("could not download latest mapping")
        local_mapping = _process_mapping(raw_toml)
    elif mapping_toml is None:
        local_mapping = _process_mapping(_load_packaged_resource(CACHE_FILENAME))
    else:
        with open(mapping_toml, "rb") as toml_file:
            local_mapping = _process_mapping(tomllib.load(toml_file))

    # TODO: make more elegant. We consider it ok to avoid a fully-blown plugin system.
    # Check if nemo_evaluator_launcher_internal is available and load its mapping.toml
    # CAVEAT: lazy-loading here, not somewhere top level, is important, to ensure
    # order of package initialization.
    try:
        importlib.import_module("nemo_evaluator_launcher_internal")
        logger.debug("Internal package available, loading internal mapping")
        internal_raw = _load_packaged_resource(CACHE_FILENAME, INTERNAL_RESOURCES_PKG)
        internal_mapping = _process_mapping(internal_raw)

        # Merge internal mapping with local mapping (internal takes precedence)
        local_mapping.update(internal_mapping)
        logger.info(
            "Successfully merged internal mapping", internal_tasks=len(internal_mapping)
        )
    except ImportError:
        logger.debug("Internal package not available, using external mapping only")
    except Exception as err:
        # Best effort: a broken internal mapping must not block the public one.
        logger.warning("Failed to load internal mapping", error=str(err))

    return local_mapping
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def get_task_from_mapping(query: str, mapping: dict[Any, Any]) -> dict[Any, Any]:
    """Unambiguously selects one task from the mapping based on the query.

    Args:
        query: Either `task_name` or `harness_name.task_name`.
        mapping: The object returned from `load_tasks_mapping` function.

    Returns:
        dict: Task data.

    Raises:
        ValueError: If the query contains more than one '.', matches no task,
            or is a bare task name that exists under several harnesses.
    """
    num_dots = query.count(".")

    # if there are no dots in query, treat it like a task name
    if num_dots == 0:
        matching_keys = [key for key in mapping if key[1] == query]
        if len(matching_keys) == 1:
            return mapping[matching_keys[0]]  # type: ignore[no-any-return]
        if matching_keys:
            # Ambiguous: list the fully qualified alternatives for the user.
            matching_queries = [
                f"{harness_name}.{task_name}"
                for harness_name, task_name in matching_keys
            ]
            raise ValueError(
                f"there are multiple tasks named {repr(query)} in the mapping,"
                f" please select one of {repr(matching_queries)}"
            )
        raise ValueError(f"task {repr(query)} does not exist in the mapping")

    # if there is one dot in query, treat it like "{harness_name}.{task_name}"
    if num_dots == 1:
        harness_name, task_name = query.split(".")
        key = (harness_name, task_name)
        # Keys are unique, so a direct lookup replaces scanning every key and
        # the "multiple matches" case cannot occur.
        if key not in mapping:
            raise ValueError(
                f"harness.task {repr(query)} does not exist in the mapping"
            )
        return mapping[key]  # type: ignore[no-any-return]

    # invalid query
    raise ValueError(
        f"invalid query={repr(query)} for task mapping,"
        " it must contain exactly zero or one occurrence of '.' character"
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
defaults:
|
|
17
|
+
- execution: local
|
|
18
|
+
- deployment: none
|
|
19
|
+
- _self_
|
|
20
|
+
|
|
21
|
+
# NOTE(dfridman): If deployment is used, `target` parameters will be automatically populated.
|
|
22
|
+
target:
|
|
23
|
+
api_endpoint:
|
|
24
|
+
url: ???
|
|
25
|
+
model_id: ???
|
|
26
|
+
api_key_name: "<YOUR_API_KEY_NAME>" # NOTE: the name of the env var
|
|
27
|
+
|
|
28
|
+
evaluation: []
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
type: nim
|
|
17
|
+
image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
|
|
18
|
+
served_model_name: ???
|
|
19
|
+
port: 8000
|
|
20
|
+
|
|
21
|
+
# NIM containers use default entrypoint - no custom command needed
|
|
22
|
+
# Configuration is done via environment variables in lepton_config
|
|
23
|
+
|
|
24
|
+
endpoints:
|
|
25
|
+
chat: /v1/chat/completions
|
|
26
|
+
completions: /v1/completions
|
|
27
|
+
health: /health
|
|
28
|
+
# Note: Environment variables should be configured in lepton_config.envs
|
|
29
|
+
# Auto-derived environment variables from deployment config:
|
|
30
|
+
# - SERVED_MODEL_NAME (from served_model_name)
|
|
31
|
+
# - NIM_MODEL_NAME (from served_model_name for NIM)
|
|
32
|
+
# - MODEL_PORT (from port)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
type: none
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
type: sglang
|
|
17
|
+
image: lmsysorg/sglang:latest
|
|
18
|
+
checkpoint_path: ???
|
|
19
|
+
served_model_name: ???
|
|
20
|
+
port: 8000
|
|
21
|
+
tensor_parallel_size: 8
|
|
22
|
+
data_parallel_size: 1
|
|
23
|
+
extra_args: ""
|
|
24
|
+
env_vars: {} # {name: value} dict
|
|
25
|
+
|
|
26
|
+
endpoints:
|
|
27
|
+
chat: /v1/chat/completions
|
|
28
|
+
completions: /v1/completions
|
|
29
|
+
health: /health
|
|
30
|
+
|
|
31
|
+
command: python3 -m sglang.launch_server
|
|
32
|
+
--model-path ${oc.select:deployment.hf_model_handle,/checkpoint}
|
|
33
|
+
--host 0.0.0.0
|
|
34
|
+
--port ${deployment.port}
|
|
35
|
+
--served-model-name ${deployment.served_model_name}
|
|
36
|
+
--tp ${deployment.tensor_parallel_size}
|
|
37
|
+
--dp ${deployment.data_parallel_size}
|
|
38
|
+
${deployment.extra_args}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
type: vllm
|
|
17
|
+
image: vllm/vllm-openai:latest
|
|
18
|
+
checkpoint_path: ???
|
|
19
|
+
served_model_name: ???
|
|
20
|
+
port: 8000
|
|
21
|
+
tensor_parallel_size: 8
|
|
22
|
+
pipeline_parallel_size: 1
|
|
23
|
+
data_parallel_size: 1
|
|
24
|
+
extra_args: ""
|
|
25
|
+
env_vars: {} # {name: value} dict
|
|
26
|
+
|
|
27
|
+
endpoints:
|
|
28
|
+
chat: /v1/chat/completions
|
|
29
|
+
completions: /v1/completions
|
|
30
|
+
health: /health
|
|
31
|
+
|
|
32
|
+
command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
|
|
33
|
+
--tensor-parallel-size=${deployment.tensor_parallel_size}
|
|
34
|
+
--pipeline-parallel-size=${deployment.pipeline_parallel_size}
|
|
35
|
+
--data-parallel-size=${deployment.data_parallel_size}
|
|
36
|
+
--port ${deployment.port}
|
|
37
|
+
--trust-remote-code
|
|
38
|
+
--served-model-name ${deployment.served_model_name}
|
|
39
|
+
--enforce-eager
|
|
40
|
+
--gpu-memory-utilization 0.95
|
|
41
|
+
${deployment.extra_args}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# =============================================================================
|
|
17
|
+
# LEPTON EXECUTION CONFIGURATION - Default Environment
|
|
18
|
+
# Following the same pattern as SLURM internal configs
|
|
19
|
+
# =============================================================================
|
|
20
|
+
|
|
21
|
+
type: lepton
|
|
22
|
+
output_dir: ???
|
|
23
|
+
|
|
24
|
+
# Environment variables passed to evaluation containers
|
|
25
|
+
env_var_names: []
|
|
26
|
+
|
|
27
|
+
# Evaluation task settings for Lepton batch execution
|
|
28
|
+
evaluation_tasks:
|
|
29
|
+
# Resource requirements for evaluation tasks
|
|
30
|
+
resource_shape: "cpu.small" # Can be overridden to gpu.small, gpu.1xh200, etc
|
|
31
|
+
|
|
32
|
+
# Timeout for individual evaluation tasks (in seconds)
|
|
33
|
+
timeout: 3600 # 1 hour per evaluation task
|
|
34
|
+
|
|
35
|
+
# Whether to use shared storage for results
|
|
36
|
+
use_shared_storage: true
|
|
37
|
+
|
|
38
|
+
# Lepton platform infrastructure settings for this environment
|
|
39
|
+
lepton_platform:
|
|
40
|
+
# Deployment configuration (for model endpoints)
|
|
41
|
+
deployment:
|
|
42
|
+
# Node group for endpoint deployments
|
|
43
|
+
node_group: "nv-int-multiteam-nebius-h200-01-mjgbgffo"
|
|
44
|
+
|
|
45
|
+
# Endpoint readiness timeout (in seconds)
|
|
46
|
+
endpoint_readiness_timeout: 1200 # 20 minutes
|
|
47
|
+
|
|
48
|
+
# Platform defaults that get merged into all deployments
|
|
49
|
+
platform_defaults:
|
|
50
|
+
# Health checks
|
|
51
|
+
health:
|
|
52
|
+
readiness: {}
|
|
53
|
+
liveness: {}
|
|
54
|
+
|
|
55
|
+
# Logging
|
|
56
|
+
log:
|
|
57
|
+
enable_collection: true
|
|
58
|
+
|
|
59
|
+
# Monitoring and routing
|
|
60
|
+
metrics: {}
|
|
61
|
+
routing_policy: {}
|
|
62
|
+
|
|
63
|
+
# Queue configuration for resource prioritization
|
|
64
|
+
queue_config:
|
|
65
|
+
priority_class: "mid-4000"
|
|
66
|
+
|
|
67
|
+
# Security and infrastructure
|
|
68
|
+
enable_rdma: false
|
|
69
|
+
user_security_context: {}
|
|
70
|
+
|
|
71
|
+
# NGC registry access
|
|
72
|
+
image_pull_secrets:
|
|
73
|
+
- "ngc-general-access-and-private-registry-in-sa-nvex-org"
|
|
74
|
+
|
|
75
|
+
# Task execution configuration (for evaluation tasks)
|
|
76
|
+
tasks:
|
|
77
|
+
# Node group for evaluation tasks
|
|
78
|
+
node_group: "nv-int-multiteam-nebius-h200-01"
|
|
79
|
+
|
|
80
|
+
# Default environment variables for all tasks
|
|
81
|
+
env_vars: {}
|
|
82
|
+
|
|
83
|
+
# Storage mounts for task execution
|
|
84
|
+
mounts:
|
|
85
|
+
# Main workspace mount
|
|
86
|
+
- from: "node-nfs:lepton-shared-fs"
|
|
87
|
+
path: "/shared/nemo-evaluator-launcher-workspace"
|
|
88
|
+
mount_path: "/workspace"
|
|
89
|
+
|
|
90
|
+
# Image pull secrets for task containers
|
|
91
|
+
image_pull_secrets:
|
|
92
|
+
- "lepton-nvidia"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
type: local
|
|
17
|
+
output_dir: ???
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Each Slurm cluster is set up differently; the defaults below are a starting point and may need adjusting.
|
|
17
|
+
hostname: ???
|
|
18
|
+
username: ${oc.env:USER}
|
|
19
|
+
account: ???
|
|
20
|
+
partition: batch
|
|
21
|
+
num_nodes: 1
|
|
22
|
+
ntasks_per_node: 1
|
|
23
|
+
gres: gpu:8
|
|
24
|
+
walltime: 01:00:00
|
|
25
|
+
subproject: nemo-evaluator-launcher
|
|
26
|
+
output_dir: ???
|
|
27
|
+
env_vars:
|
|
28
|
+
deployment: {}
|
|
29
|
+
evaluation: {}
|
|
30
|
+
mounts:
|
|
31
|
+
deployment: {}
|
|
32
|
+
evaluation: {}
|
|
33
|
+
mount_home: true
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# NOTE(dfridman): importing all executors will register them in the registry
|
|
17
|
+
|
|
18
|
+
from nemo_evaluator_launcher.executors.lepton.executor import LeptonExecutor
|
|
19
|
+
from nemo_evaluator_launcher.executors.local.executor import LocalExecutor
|
|
20
|
+
from nemo_evaluator_launcher.executors.slurm.executor import SlurmExecutor
|
|
21
|
+
|
|
22
|
+
# Public names re-exported by this package; per the note above, the imports
# themselves are what registers each executor.
__all__ = ["LeptonExecutor", "LocalExecutor", "SlurmExecutor"]
|