nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (57) hide show
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,394 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Lepton job helper functions for nemo-evaluator-launcher.
17
+
18
+ Handles Lepton job creation, management, and monitoring.
19
+ """
20
+
21
+ import json
22
+ import subprocess
23
+ import time
24
+ from typing import Any, List, Union
25
+
26
+ from leptonai.api.v1.types.affinity import LeptonResourceAffinity
27
+ from leptonai.api.v1.types.common import LeptonVisibility, Metadata
28
+ from leptonai.api.v1.types.deployment import EnvVar, LeptonContainer, Mount
29
+ from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
30
+
31
+ # Import lepton dependencies
32
+ from leptonai.api.v2.client import APIClient
33
+ from omegaconf import DictConfig
34
+
35
+ from nemo_evaluator_launcher.common.logging_utils import logger
36
+
37
+ # =============================================================================
38
+ # LEPTON JOB MANAGEMENT FUNCTIONS
39
+ # =============================================================================
40
+
41
+
42
def create_lepton_job(
    job_name: str,
    container_image: str,
    command: List[str],
    resource_shape: str = "cpu.small",
    env_vars: dict[Any, Any] | None = None,
    mounts: List[Union[dict[Any, Any], DictConfig]] | None = None,
    timeout: int = 3600,
    node_group: str | None = None,
    image_pull_secrets: List[str] | None = None,
) -> tuple[bool, str]:
    """Submit a Lepton batch job for an evaluation run.

    Public entry point; every argument is forwarded unchanged to the
    API-client based implementation.

    Args:
        job_name: Name for the job.
        container_image: Docker image to use for the job.
        command: Command to run in the container.
        resource_shape: Resource requirements (cpu.small, gpu.a10, etc).
        env_vars: Environment variables for the job.
        mounts: Storage mounts for the job.
        timeout: Job timeout in seconds.
        node_group: Node group for job placement.
        image_pull_secrets: Secrets for pulling container images.

    Returns:
        Tuple of (success: bool, error_message: str).
    """
    # Keyword forwarding keeps the call robust against parameter reordering
    # in the private implementation.
    outcome = _create_lepton_job_api(
        job_name=job_name,
        container_image=container_image,
        command=command,
        resource_shape=resource_shape,
        env_vars=env_vars,
        mounts=mounts,
        timeout=timeout,
        node_group=node_group,
        image_pull_secrets=image_pull_secrets,
    )
    return outcome
80
+
81
+
82
def _create_lepton_job_api(
    job_name: str,
    container_image: str,
    command: List[str],
    resource_shape: str,
    env_vars: dict[Any, Any] | None = None,
    mounts: List[Union[dict[Any, Any], DictConfig]] | None = None,
    timeout: int = 3600,
    node_group: str | None = None,
    image_pull_secrets: List[str] | None = None,
) -> tuple[bool, str]:
    """Create a Lepton job using the API client (preferred method).

    Builds the environment variables, mounts and optional node-group
    affinity, assembles a job specification with one completion and one
    parallel worker, and submits it through ``APIClient``.

    Args:
        job_name: Name for the job.
        container_image: Docker image to use for the job.
        command: Command to run in the container.
        resource_shape: Resource shape placed in the job specification.
        env_vars: Mapping of variable name to either a direct value (stored
            as ``str(value)``) or a secret reference shaped like
            ``{"value_from": {"secret_name_ref": "<secret name>"}}``.
        mounts: Mount definitions; each entry must be a ``dict`` or
            ``DictConfig`` and is expanded as keyword arguments to ``Mount``.
        timeout: Accepted for interface compatibility; this function does
            not currently use it.
        node_group: Name of the node group to pin the job to. When the name
            is not found a warning is logged and the job is created without
            affinity.
        image_pull_secrets: Secrets for pulling container images.

    Returns:
        ``(True, "")`` when the job was created, otherwise
        ``(False, error_message)``. Exceptions raised inside the function
        are caught and reported through the error message.
    """
    try:
        client = APIClient()

        # Prepare environment variables (direct values and secret references).
        lepton_env_vars = []
        if env_vars:
            for key, value in env_vars.items():
                # Handle both regular dicts and OmegaConf objects.
                if isinstance(value, (dict, DictConfig)) and "value_from" in value:
                    # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
                    # Imported lazily because it is only needed for secrets.
                    from leptonai.api.v1.types.deployment import EnvValue

                    # Convert OmegaConf to dict if needed.
                    value_dict = dict(value) if isinstance(value, DictConfig) else value
                    lepton_env_vars.append(
                        EnvVar(
                            name=key,
                            value_from=EnvValue(
                                secret_name_ref=value_dict["value_from"]["secret_name_ref"]
                            ),
                        )
                    )
                else:
                    # Direct value.
                    lepton_env_vars.append(EnvVar(name=key, value=str(value)))

        # Prepare mounts.
        lepton_mounts = []
        if mounts:
            for mount in mounts:
                # Handle both regular dicts and OmegaConf DictConfig objects.
                if isinstance(mount, (dict, DictConfig)):
                    try:
                        # Convert DictConfig to regular dict if needed.
                        mount_dict: dict[Any, Any] = dict(mount)
                        lepton_mounts.append(Mount(**mount_dict))
                    except Exception as e:
                        return False, f"Invalid mount configuration: {e}"
                else:
                    return (  # type: ignore[unreachable]
                        False,
                        f"Mount must be a dictionary or DictConfig, got {type(mount)}",
                    )

        # Handle node group affinity.
        affinity = None
        if node_group:
            # Look the requested group up by name among all node groups.
            node_group_map = {ng.metadata.name: ng for ng in client.nodegroup.list_all()}
            node_group_obj = node_group_map.get(node_group)
            if node_group_obj is None:
                # Previously ignored silently; surface it so that a typo in
                # the node group name does not go unnoticed.
                logger.warn(
                    "Requested node group not found; creating job without node affinity",
                    job_name=job_name,
                    node_group=node_group,
                )
            else:
                valid_node_ids = [
                    node.metadata.id_
                    for node in client.nodegroup.list_nodes(node_group_obj)
                ]
                affinity = LeptonResourceAffinity(
                    allowed_dedicated_node_groups=[node_group_obj.metadata.id_],
                    allowed_nodes_in_node_group=valid_node_ids,
                )

        # Create job specification.
        job_spec = LeptonJobUserSpec(
            resource_shape=resource_shape,
            affinity=affinity,
            container=LeptonContainer(image=container_image, command=command),
            envs=lepton_env_vars,
            mounts=lepton_mounts,
            image_pull_secrets=image_pull_secrets or [],
            shared_memory_size=1024,  # 1GB - appropriate for CPU tasks
            completions=1,
            parallelism=1,
            intra_job_communication=False,
        )

        # Create the job.
        job = LeptonJob(
            metadata=Metadata(name=job_name, visibility=LeptonVisibility.PRIVATE),
            spec=job_spec,
        )

        response = client.job.create(job)
        logger.info(
            "Successfully created Lepton job",
            job_name=job_name,
            id=response.metadata.id_,
        )
        return True, ""

    except Exception as e:
        # Boundary handler: callers receive failures through the return value.
        error_msg = f"Error creating Lepton job via API: {e}"
        logger.error("Error creating Lepton job via API", err=str(e))
        return False, error_msg
189
+
190
+
191
def get_lepton_job_status(job_name_or_id: str) -> dict[Any, Any] | None:
    """Look up the status of a Lepton job through the API client.

    Args:
        job_name_or_id: Name or ID of the job.

    Returns:
        Job status dictionary if successful, None otherwise.
    """
    # Public entry point; the lookup itself lives in the API helper.
    status = _get_lepton_job_status_api(job_name_or_id)
    return status
201
+
202
+
203
def _get_lepton_job_status_api(job_name_or_id: str) -> dict[Any, Any] | None:
    """Get job status using API client (preferred method).

    Identifiers longer than 20 characters are first tried as a job ID; if
    that lookup fails or is skipped, all jobs are listed and matched by
    name.

    Args:
        job_name_or_id: Name or ID of the job.

    Returns:
        ``None`` when the job is not found or an error occurs. Otherwise a
        dictionary with ``id``, ``name`` and ``state``; when the job has a
        status it also carries ``start_time``, ``end_time``, ``ready``,
        ``active``, ``succeeded`` and ``failed``. A job without a status is
        reported with state ``"Unknown"``.
    """
    try:
        client = APIClient()

        # Try to get job by ID first, then by name.
        job = None
        try:
            # If it looks like an ID, try that first.
            if len(job_name_or_id) > 20:  # Job IDs are longer
                job = client.job.get(job_name_or_id)
        except Exception as e:
            # Best-effort lookup: a long *name* legitimately fails here, so
            # fall through to the name search, but leave a trace for debugging.
            logger.debug(
                "Job lookup by ID failed, falling back to lookup by name",
                job_name_or_id=job_name_or_id,
                err=str(e),
            )

        # If not found by ID, list all jobs and find by name.
        if not job:
            job = next(
                (j for j in client.job.list_all() if j.metadata.name == job_name_or_id),
                None,
            )

        if not job:
            logger.warn(
                "Not found when getting job status via API",
                job_name_or_id=job_name_or_id,
            )
            return None

        if not job.status:
            return {
                "id": job.metadata.id_,
                "name": job.metadata.name,
                "state": "Unknown",
            }

        # Handle enum states: extract "Completed" from
        # "LeptonJobState.Completed"; a string without a dot is kept as is.
        state = str(job.status.state).split(".")[-1]

        return {
            "id": job.metadata.id_,
            "name": job.metadata.name,
            "state": state,
            "start_time": getattr(job.status, "start_time", None),
            "end_time": getattr(job.status, "end_time", None),
            "ready": getattr(job.status, "ready", 0),
            "active": getattr(job.status, "active", 0),
            "succeeded": getattr(job.status, "succeeded", 0),
            "failed": getattr(job.status, "failed", 0),
        }

    except Exception as e:
        # Boundary handler: callers treat None as "status unavailable".
        logger.error("Error getting job status via API", err=str(e))
        return None
265
+
266
+
267
def _get_lepton_job_status_cli(job_name: str) -> dict[Any, Any] | None:
    """Get job status using CLI (fallback method).

    Runs ``lep job get --name <job_name>`` with a 30 second timeout and
    parses its standard output as JSON.

    Args:
        job_name: Name of the job.

    Returns:
        The parsed job information, or ``None`` when the command cannot be
        started, times out, exits with a non-zero code, or prints output
        that is not valid JSON.
    """
    try:
        result = subprocess.run(
            ["lep", "job", "get", "--name", job_name],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing or non-executable ``lep`` binary
        # (FileNotFoundError / PermissionError). CalledProcessError cannot
        # occur because ``check=True`` is not used.
        return None

    if result.returncode != 0:
        return None

    try:
        # The job info contains the status information.
        job_info: dict[Any, Any] = json.loads(result.stdout)
    except json.JSONDecodeError:
        return None
    return job_info
289
+
290
+
291
+ def list_lepton_jobs(prefix: str | None = None) -> List[dict[Any, Any]]:
292
+ """List Lepton jobs, optionally filtered by name prefix.
293
+
294
+ Args:
295
+ prefix: Optional prefix to filter job names.
296
+
297
+ Returns:
298
+ List of job information dictionaries.
299
+ """
300
+ try:
301
+ result = subprocess.run(
302
+ ["lep", "job", "list"], capture_output=True, text=True, timeout=30
303
+ )
304
+
305
+ if result.returncode == 0:
306
+ jobs_info: dict[Any, Any] = json.loads(result.stdout)
307
+ jobs: List[dict[Any, Any]] = jobs_info.get("jobs", [])
308
+
309
+ if prefix:
310
+ jobs = [job for job in jobs if job.get("name", "").startswith(prefix)]
311
+
312
+ return jobs
313
+ else:
314
+ return []
315
+ except (
316
+ subprocess.TimeoutExpired,
317
+ subprocess.CalledProcessError,
318
+ json.JSONDecodeError,
319
+ ):
320
+ return []
321
+
322
+
323
def delete_lepton_job(job_name: str) -> bool:
    """Delete/cancel a Lepton job.

    Runs ``lep job remove --name <job_name>`` with a 60 second timeout.

    Args:
        job_name: Name of the job to delete.

    Returns:
        True if the command exited with code 0, False otherwise (including
        when the command cannot be started or times out).
    """
    try:
        result = subprocess.run(
            ["lep", "job", "remove", "--name", job_name],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing or non-executable ``lep`` binary, which
        # would otherwise escape as an unhandled FileNotFoundError.
        return False

    return result.returncode == 0
343
+
344
+
345
def wait_for_lepton_jobs_completion(job_names: List[str], timeout: int = 3600) -> dict:
    """Wait for multiple Lepton jobs to complete.

    Polls every job roughly every 10 seconds until each one reaches a
    terminal state or ``timeout`` seconds have elapsed, printing progress
    along the way.

    Args:
        job_names: List of job names to monitor. Duplicate names are
            monitored once.
        timeout: Maximum time to wait in seconds.

    Returns:
        Dictionary mapping job names to the last status dictionary obtained
        for them. Jobs whose status could never be fetched are absent.
    """
    poll_interval = 10  # seconds between polling rounds

    # "Completed" is the success state produced by the status helper in this
    # module ("LeptonJobState.Completed" -> "Completed"); "Succeeded" is kept
    # for backward compatibility.
    success_states = {"Succeeded", "Completed"}
    failure_states = {"Failed", "Cancelled"}

    start_time = time.time()
    job_statuses = {}
    completed_jobs: set[str] = set()

    # De-duplicate while preserving order; with duplicates the set of
    # completed names could never reach the length of the input list.
    unique_job_names = list(dict.fromkeys(job_names))

    print(f"⏳ Monitoring {len(unique_job_names)} evaluation jobs...")

    while (
        len(completed_jobs) < len(unique_job_names)
        and (time.time() - start_time) < timeout
    ):
        for job_name in unique_job_names:
            if job_name in completed_jobs:
                continue

            status = get_lepton_job_status(job_name)
            if not status:
                continue

            job_state = status.get("state", "Unknown")
            job_statuses[job_name] = status

            if job_state in success_states:
                completed_jobs.add(job_name)
                print(f"✅ Job {job_name}: {job_state}")
            elif job_state in failure_states:
                completed_jobs.add(job_name)
                print(f"❌ Job {job_name}: {job_state}")
            else:
                print(f"⏳ Job {job_name}: {job_state}")

        if len(completed_jobs) < len(unique_job_names):
            # Never sleep past the deadline.
            remaining = timeout - (time.time() - start_time)
            if remaining > 0:
                time.sleep(min(poll_interval, remaining))

    # Final status check for jobs that did not finish in time.
    for job_name in unique_job_names:
        if job_name not in completed_jobs:
            status = get_lepton_job_status(job_name)
            if status:
                job_statuses[job_name] = status
                print(
                    f"⏰ Job {job_name}: Timeout (still {status.get('state', 'Unknown')})"
                )

    return job_statuses
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #