nemo-evaluator-launcher 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (60) hide show
  1. nemo_evaluator_launcher/__init__.py +79 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +698 -0
  4. nemo_evaluator_launcher/api/types.py +98 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +267 -0
  8. nemo_evaluator_launcher/cli/info.py +512 -0
  9. nemo_evaluator_launcher/cli/kill.py +41 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +134 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
  12. nemo_evaluator_launcher/cli/main.py +226 -0
  13. nemo_evaluator_launcher/cli/run.py +200 -0
  14. nemo_evaluator_launcher/cli/status.py +164 -0
  15. nemo_evaluator_launcher/cli/version.py +55 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +283 -0
  18. nemo_evaluator_launcher/common/helpers.py +366 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +357 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/common/printing_utils.py +93 -0
  22. nemo_evaluator_launcher/configs/__init__.py +15 -0
  23. nemo_evaluator_launcher/configs/default.yaml +28 -0
  24. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  25. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  26. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  27. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  28. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
  29. nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
  30. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  31. nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
  32. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
  33. nemo_evaluator_launcher/executors/__init__.py +22 -0
  34. nemo_evaluator_launcher/executors/base.py +120 -0
  35. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  36. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
  37. nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
  38. nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
  39. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  40. nemo_evaluator_launcher/executors/local/executor.py +605 -0
  41. nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
  42. nemo_evaluator_launcher/executors/registry.py +38 -0
  43. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  44. nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
  45. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  46. nemo_evaluator_launcher/exporters/base.py +121 -0
  47. nemo_evaluator_launcher/exporters/gsheets.py +409 -0
  48. nemo_evaluator_launcher/exporters/local.py +502 -0
  49. nemo_evaluator_launcher/exporters/mlflow.py +619 -0
  50. nemo_evaluator_launcher/exporters/registry.py +40 -0
  51. nemo_evaluator_launcher/exporters/utils.py +624 -0
  52. nemo_evaluator_launcher/exporters/wandb.py +490 -0
  53. nemo_evaluator_launcher/package_info.py +38 -0
  54. nemo_evaluator_launcher/resources/mapping.toml +380 -0
  55. nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
  56. nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
  57. nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
  58. nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
  59. nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
  60. nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
@@ -0,0 +1,398 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Lepton job helper functions for nemo-evaluator-launcher.
17
+
18
+ Handles Lepton job creation, management, and monitoring.
19
+ """
20
+
21
+ import json
22
+ import subprocess
23
+ import time
24
+ from typing import Any, List, Union
25
+
26
+ from omegaconf import DictConfig
27
+
28
+ from nemo_evaluator_launcher.common.logging_utils import logger
29
+
30
+ # =============================================================================
31
+ # LEPTON JOB MANAGEMENT FUNCTIONS
32
+ # =============================================================================
33
+
34
+
35
def create_lepton_job(
    job_name: str,
    container_image: str,
    command: List[str],
    resource_shape: str = "cpu.small",
    env_vars: dict[Any, Any] | None = None,
    mounts: List[Union[dict[Any, Any], DictConfig]] | None = None,
    timeout: int = 3600,
    node_group: str | None = None,
    image_pull_secrets: List[str] | None = None,
) -> tuple[bool, str]:
    """Submit a Lepton batch job for an evaluation run.

    Public entry point: every argument is forwarded unchanged to
    ``_create_lepton_job_api``, which performs the actual submission.

    Args:
        job_name: Name for the job.
        container_image: Docker image to use for the job.
        command: Command to run in the container.
        resource_shape: Resource requirements (cpu.small, gpu.a10, etc).
        env_vars: Environment variables for the job.
        mounts: Storage mounts for the job.
        timeout: Job timeout in seconds.
        node_group: Node group for job placement.
        image_pull_secrets: Secrets for pulling container images.

    Returns:
        Tuple of (success: bool, error_message: str).
    """
    # Keyword forwarding keeps the mapping between the two signatures explicit.
    return _create_lepton_job_api(
        job_name=job_name,
        container_image=container_image,
        command=command,
        resource_shape=resource_shape,
        env_vars=env_vars,
        mounts=mounts,
        timeout=timeout,
        node_group=node_group,
        image_pull_secrets=image_pull_secrets,
    )
73
+
74
+
75
def _create_lepton_job_api(
    job_name: str,
    container_image: str,
    command: List[str],
    resource_shape: str,
    env_vars: dict[Any, Any] | None = None,
    mounts: List[Union[dict[Any, Any], DictConfig]] | None = None,
    timeout: int = 3600,
    node_group: str | None = None,
    image_pull_secrets: List[str] | None = None,
) -> tuple[bool, str]:
    """Create a Lepton job using the ``leptonai`` API client.

    Builds the environment variables, mounts and optional node-group
    affinity, wraps them in a ``LeptonJobUserSpec`` and submits the job with
    ``APIClient().job.create``.

    Args:
        job_name: Name for the job.
        container_image: Container image to run.
        command: Command to run in the container.
        resource_shape: Resource shape passed to the job spec.
        env_vars: Mapping of variable name to either a direct value (stored
            as ``str(value)``) or a secret reference shaped like
            ``{"value_from": {"secret_name_ref": "<secret name>"}}``.
        mounts: Mount definitions. Each entry must be a ``dict`` or
            ``DictConfig``; its items are passed as keyword arguments to
            ``Mount``.
        timeout: Accepted so the signature mirrors ``create_lepton_job``;
            this function does not apply it to the job spec.
        node_group: Name of the node group to pin the job to. When no node
            group with this name exists a warning is logged and the job is
            created without affinity.
        image_pull_secrets: Secrets for pulling container images.

    Returns:
        ``(True, "")`` when the job was created, otherwise
        ``(False, error_message)``. Exceptions raised while building or
        submitting the job are caught, logged and reported through the
        returned message.
    """
    try:
        # Imported locally: a missing leptonai installation is reported as a
        # (False, message) result by the handler at the bottom.
        from leptonai.api.v1.types.affinity import LeptonResourceAffinity
        from leptonai.api.v1.types.common import LeptonVisibility, Metadata
        from leptonai.api.v1.types.deployment import (
            EnvValue,
            EnvVar,
            LeptonContainer,
            Mount,
        )
        from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
        from leptonai.api.v2.client import APIClient

        client = APIClient()

        # Prepare environment variables (direct values and secret references).
        lepton_env_vars = []
        if env_vars:
            for key, value in env_vars.items():
                # Handle both regular dicts and OmegaConf objects.
                if isinstance(value, (dict, DictConfig)) and "value_from" in value:
                    # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
                    value_dict = dict(value) if isinstance(value, DictConfig) else value
                    lepton_env_vars.append(
                        EnvVar(
                            name=key,
                            value_from=EnvValue(
                                secret_name_ref=value_dict["value_from"][
                                    "secret_name_ref"
                                ]
                            ),
                        )
                    )
                else:
                    # Direct value.
                    lepton_env_vars.append(EnvVar(name=key, value=str(value)))

        # Prepare mounts.
        lepton_mounts = []
        if mounts:
            for mount in mounts:
                if not isinstance(mount, (dict, DictConfig)):
                    return (
                        False,
                        f"Mount must be a dictionary or DictConfig, got {type(mount)}",
                    )
                try:
                    # dict() also flattens a DictConfig into plain keyword args.
                    mount_dict: dict[Any, Any] = dict(mount)
                    lepton_mounts.append(Mount(**mount_dict))
                except Exception as e:
                    return False, f"Invalid mount configuration: {e}"

        # Resolve the node group name to an affinity constraint.
        affinity = None
        if node_group:
            node_group_map = {
                ng.metadata.name: ng for ng in client.nodegroup.list_all()
            }
            if node_group in node_group_map:
                node_group_obj = node_group_map[node_group]
                valid_node_ids = [
                    node.metadata.id_
                    for node in client.nodegroup.list_nodes(node_group_obj)
                ]
                affinity = LeptonResourceAffinity(
                    allowed_dedicated_node_groups=[node_group_obj.metadata.id_],
                    allowed_nodes_in_node_group=valid_node_ids,
                )
            else:
                # Previously this case was silent: the job was submitted with
                # no affinity and nothing told the user the name was wrong.
                logger.warn(
                    "Requested node group not found; creating job without "
                    "node group affinity",
                    node_group=node_group,
                    available_node_groups=list(node_group_map),
                )

        # Create job specification.
        job_spec = LeptonJobUserSpec(
            resource_shape=resource_shape,
            affinity=affinity,
            container=LeptonContainer(image=container_image, command=command),
            envs=lepton_env_vars,
            mounts=lepton_mounts,
            image_pull_secrets=image_pull_secrets or [],
            shared_memory_size=1024,  # 1GB - appropriate for CPU tasks
            completions=1,
            parallelism=1,
            intra_job_communication=False,
        )

        # Create the job.
        job = LeptonJob(
            metadata=Metadata(name=job_name, visibility=LeptonVisibility.PRIVATE),
            spec=job_spec,
        )

        response = client.job.create(job)
        logger.info(
            "Successfully created Lepton job",
            job_name=job_name,
            id=response.metadata.id_,
        )
        return True, ""

    except Exception as e:
        error_msg = f"Error creating Lepton job via API: {e}"
        logger.error("Error creating Lepton job via API", err=str(e))
        return False, error_msg
190
+
191
+
192
def get_lepton_job_status(job_name_or_id: str) -> dict[Any, Any] | None:
    """Look up the status of a Lepton job.

    Public wrapper that delegates to ``_get_lepton_job_status_api``.

    Args:
        job_name_or_id: Name or ID of the job.

    Returns:
        Job status dictionary if successful, None otherwise.
    """
    status = _get_lepton_job_status_api(job_name_or_id)
    return status
202
+
203
+
204
def _get_lepton_job_status_api(job_name_or_id: str) -> dict[Any, Any] | None:
    """Fetch a job's status through the ``leptonai`` API client.

    Identifiers longer than 20 characters are first tried as a job ID; if
    that yields nothing, all jobs are listed and matched by name.

    Args:
        job_name_or_id: Name or ID of the job.

    Returns:
        A dictionary with ``id``, ``name`` and ``state`` (plus timing and
        counter fields when the job carries a status), or None when the job
        is not found or any error occurs.
    """
    try:
        # Import leptonai dependencies locally
        from leptonai.api.v2.client import APIClient

        api = APIClient()

        found = None
        try:
            # Job IDs are longer than names, so only long values are tried as IDs.
            if len(job_name_or_id) > 20:
                found = api.job.get(job_name_or_id)
        except Exception:
            # A failed ID lookup is not fatal; fall through to the name search.
            pass

        if not found:
            # Scan every job for a matching name; first match wins.
            for candidate in api.job.list_all():
                if candidate.metadata.name == job_name_or_id:
                    found = candidate
                    break

        if not found:
            logger.warn(
                "Not found when getting job status via API",
                job_name_or_id=job_name_or_id,
            )
            return None

        if not found.status:
            return {
                "id": found.metadata.id_,
                "name": found.metadata.name,
                "state": "Unknown",
            }

        # Enum states stringify as "LeptonJobState.Completed"; keep only the
        # part after the last dot ("Completed"). Plain strings pass through.
        state = str(found.status.state).rpartition(".")[2]

        report: dict[Any, Any] = {
            "id": found.metadata.id_,
            "name": found.metadata.name,
            "state": state,
        }
        # Timestamps default to None, counters default to 0.
        for field in ("start_time", "end_time"):
            report[field] = getattr(found.status, field, None)
        for field in ("ready", "active", "succeeded", "failed"):
            report[field] = getattr(found.status, field, 0)
        return report

    except Exception as e:
        logger.error("Error getting job status via API", err=str(e))
        return None
269
+
270
+
271
+ def _get_lepton_job_status_cli(job_name: str) -> dict[Any, Any] | None:
272
+ """Get job status using CLI (fallback method)."""
273
+ try:
274
+ result = subprocess.run(
275
+ ["lep", "job", "get", "--name", job_name],
276
+ capture_output=True,
277
+ text=True,
278
+ timeout=30,
279
+ )
280
+
281
+ if result.returncode == 0:
282
+ job_info: dict[Any, Any] = json.loads(result.stdout)
283
+ # Return the job info which contains status information
284
+ return job_info
285
+ else:
286
+ return None
287
+ except (
288
+ subprocess.TimeoutExpired,
289
+ subprocess.CalledProcessError,
290
+ json.JSONDecodeError,
291
+ ):
292
+ return None
293
+
294
+
295
+ def list_lepton_jobs(prefix: str | None = None) -> List[dict[Any, Any]]:
296
+ """List Lepton jobs, optionally filtered by name prefix.
297
+
298
+ Args:
299
+ prefix: Optional prefix to filter job names.
300
+
301
+ Returns:
302
+ List of job information dictionaries.
303
+ """
304
+ try:
305
+ result = subprocess.run(
306
+ ["lep", "job", "list"], capture_output=True, text=True, timeout=30
307
+ )
308
+
309
+ if result.returncode == 0:
310
+ jobs_info: dict[Any, Any] = json.loads(result.stdout)
311
+ jobs: List[dict[Any, Any]] = jobs_info.get("jobs", [])
312
+
313
+ if prefix:
314
+ jobs = [job for job in jobs if job.get("name", "").startswith(prefix)]
315
+
316
+ return jobs
317
+ else:
318
+ return []
319
+ except (
320
+ subprocess.TimeoutExpired,
321
+ subprocess.CalledProcessError,
322
+ json.JSONDecodeError,
323
+ ):
324
+ return []
325
+
326
+
327
def delete_lepton_job(job_name: str) -> bool:
    """Delete/cancel a Lepton job.

    Runs ``lep job remove --name <job_name>`` with a 60 second timeout.

    Args:
        job_name: Name of the job to delete.

    Returns:
        True if the CLI exited with code 0, False otherwise — including
        when the CLI cannot be started (for example, ``lep`` is not
        installed) or the call times out.
    """
    try:
        result = subprocess.run(
            ["lep", "job", "remove", "--name", job_name],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing or non-executable `lep` binary;
        # SubprocessError covers TimeoutExpired.
        return False

    return result.returncode == 0
347
+
348
+
349
def wait_for_lepton_jobs_completion(job_names: List[str], timeout: int = 3600) -> dict:
    """Wait for multiple Lepton jobs to complete.

    Polls ``get_lepton_job_status`` for every unfinished job, sleeping up to
    10 seconds between rounds, until all jobs reach a terminal state or the
    timeout elapses. Progress is printed to standard output.

    Args:
        job_names: List of job names to monitor. Duplicate names are
            monitored once.
        timeout: Maximum time to wait in seconds.

    Returns:
        Dictionary mapping job names to the last status dictionary that was
        retrieved for them. Jobs whose status could never be retrieved are
        absent from the result.
    """
    # Lepton job states stringify as e.g. "LeptonJobState.Completed", so a
    # finished job is reported as "Completed". The older names are kept so
    # any status source that still emits them keeps working.
    success_states = {"Succeeded", "Completed"}
    terminal_states = success_states | {"Failed", "Cancelled"}
    poll_interval = 10  # seconds between polling rounds

    # De-duplicate while preserving order; with duplicates the completed set
    # could never grow to len(job_names) and the loop would run to timeout.
    unique_names = list(dict.fromkeys(job_names))

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    job_statuses: dict = {}
    completed_jobs: set[str] = set()

    print(f"⏳ Monitoring {len(unique_names)} evaluation jobs...")

    while len(completed_jobs) < len(unique_names) and time.monotonic() < deadline:
        for job_name in unique_names:
            if job_name in completed_jobs:
                continue

            status = get_lepton_job_status(job_name)
            if not status:
                continue

            job_state = status.get("state", "Unknown")
            job_statuses[job_name] = status

            if job_state in terminal_states:
                completed_jobs.add(job_name)
                if job_state in success_states:
                    print(f"✅ Job {job_name}: {job_state}")
                else:
                    print(f"❌ Job {job_name}: {job_state}")
            else:
                print(f"⏳ Job {job_name}: {job_state}")

        if len(completed_jobs) < len(unique_names):
            # Never sleep past the deadline.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_interval, remaining))

    # Final status check for jobs that did not finish in time.
    for job_name in unique_names:
        if job_name not in completed_jobs:
            status = get_lepton_job_status(job_name)
            if status:
                job_statuses[job_name] = status
                print(
                    f"⏰ Job {job_name}: Timeout (still {status.get('state', 'Unknown')})"
                )

    return job_statuses
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #