nemo_evaluator_launcher-0.1.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nemo-evaluator-launcher might be problematic.

Files changed (57)
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/exporters/wandb.py
@@ -0,0 +1,376 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ """Weights & Biases results exporter."""
+
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ import yaml
+
+ try:
+     import wandb
+
+     WANDB_AVAILABLE = True
+ except ImportError:
+     WANDB_AVAILABLE = False
+
+ from nemo_evaluator_launcher.common.execdb import JobData
+ from nemo_evaluator_launcher.common.logging_utils import logger
+ from nemo_evaluator_launcher.exporters.base import BaseExporter, ExportResult
+ from nemo_evaluator_launcher.exporters.local import LocalExporter
+ from nemo_evaluator_launcher.exporters.registry import register_exporter
+ from nemo_evaluator_launcher.exporters.utils import (
+     extract_accuracy_metrics,
+     extract_exporter_config,
+     get_available_artifacts,
+     get_benchmark_info,
+     get_task_name,
+ )
+
+
+ @register_exporter("wandb")
+ class WandBExporter(BaseExporter):
+     """Export accuracy metrics to W&B."""
+
+     def supports_executor(self, executor_type: str) -> bool:
+         return True
+
+     def is_available(self) -> bool:
+         return WANDB_AVAILABLE
+
+     def export_job(self, job_data: JobData) -> ExportResult:
+         """Export single job - same logic as invocation but for one job."""
+         if not self.is_available():
+             return ExportResult(
+                 success=False, dest="wandb", message="wandb package not installed"
+             )
+
+         try:
+             wandb_config = extract_exporter_config(job_data, "wandb", self.config)
+             log_mode = wandb_config.get(
+                 "log_mode", "per_task"
+             )  # Default per_task for immediate export
+
+             # Get metrics
+             metrics = extract_accuracy_metrics(
+                 job_data, self.get_job_paths, wandb_config.get("log_metrics", [])
+             )
+             if not metrics:
+                 return ExportResult(
+                     success=False, dest="wandb", message="No metrics found"
+                 )
+
+             # Choose either jobId or invocationId based on log_mode
+             if log_mode == "per_task":
+                 # Create separate run per task
+                 task_name = get_task_name(job_data)
+                 identifier = f"{job_data.invocation_id}-{task_name}"
+                 should_resume = False
+                 run_id = None
+             elif log_mode == "multi_task":
+                 # Append to shared run by invocation_id
+                 identifier = job_data.invocation_id
+                 should_resume, run_id = self._check_existing_run(
+                     identifier, job_data, wandb_config
+                 )
+             result = self._create_wandb_run(
+                 identifier, wandb_config, metrics, job_data, should_resume, run_id
+             )
+             return ExportResult(
+                 success=True, dest="wandb", message="Export completed", metadata=result
+             )
+
+         except Exception as e:
+             logger.error(f"W&B export failed: {e}")
+             return ExportResult(
+                 success=False, dest="wandb", message=f"Failed: {str(e)}"
+             )
+
+     def export_invocation(self, invocation_id: str) -> Dict[str, Any]:
+         """Export all jobs in invocation as one W&B run."""
+         if not self.is_available():
+             return {"success": False, "error": "wandb package not installed"}
+
+         jobs = self.db.get_jobs(invocation_id)
+         if not jobs:
+             return {
+                 "success": False,
+                 "error": f"No jobs found for invocation {invocation_id}",
+             }
+
+         try:
+             first_job = list(jobs.values())[0]
+             wandb_config = extract_exporter_config(first_job, "wandb", self.config)
+
+             all_metrics = {}
+             for _, job_data in jobs.items():
+                 log_metrics = wandb_config.get("log_metrics", [])
+                 job_metrics = extract_accuracy_metrics(
+                     job_data, self.get_job_paths, log_metrics
+                 )
+                 all_metrics.update(job_metrics)
+
+             if not all_metrics:
+                 return {
+                     "success": False,
+                     "error": "No accuracy metrics found in any job",
+                 }
+
+             should_resume, run_id = self._check_existing_run(
+                 invocation_id, first_job, wandb_config
+             )
+
+             result = self._create_wandb_run(
+                 invocation_id,
+                 wandb_config,
+                 all_metrics,
+                 first_job,
+                 should_resume,
+                 run_id,
+             )
+
+             return {
+                 "success": True,
+                 "invocation_id": invocation_id,
+                 "jobs": {
+                     job_id: {
+                         "success": True,
+                         "message": "Contributed to invocation run",
+                     }
+                     for job_id in jobs.keys()
+                 },
+                 "metadata": result,
+             }
+
+         except Exception as e:
+             logger.error(f"W&B export failed for invocation {invocation_id}: {e}")
+             return {"success": False, "error": f"W&B export failed: {str(e)}"}
+
+     def _log_artifacts(
+         self, job_data: JobData, wandb_config: Dict[str, Any], artifact
+     ) -> List[str]:
+         """Log evaluation artifacts to WandB using LocalExporter for transfer."""
+         if not wandb_config.get("log_artifacts", True):
+             return []
+         try:
+             temp_dir = tempfile.mkdtemp(prefix="wandb_artifacts_")
+             local_exporter = LocalExporter({"output_dir": temp_dir})
+             local_result = local_exporter.export_job(job_data)
+
+             if not local_result.success:
+                 logger.error(f"Failed to download artifacts: {local_result.message}")
+                 return []
+
+             artifacts_dir = Path(local_result.dest) / "artifacts"
+             logged_names = []
+             task_name = get_task_name(job_data)
+             for fname in get_available_artifacts(artifacts_dir):
+                 fpath = artifacts_dir / fname
+                 if fpath.exists():
+                     artifact.add_file(str(fpath), name=f"{task_name}/{fname}")
+                     logged_names.append(fname)
+             shutil.rmtree(temp_dir)
+             return logged_names
+         except Exception as e:
+             logger.error(f"Error logging artifacts: {e}")
+             return []
+
+     def _check_existing_run(
+         self, identifier: str, job_data: JobData, config: Dict[str, Any]
+     ) -> tuple[bool, str]:
+         """Check if run exists based on webhook metadata then name patterns."""
+         try:
+             import wandb
+
+             api = wandb.Api()
+             entity = config.get("entity")
+             project = config.get("project")
+             if not (entity and project):
+                 return False, None
+
+             # Check webhook metadata for run_id first
+             webhook_meta = job_data.data.get("webhook_metadata", {})
+             if (
+                 webhook_meta.get("webhook_source") == "wandb"
+                 and config.get("triggered_by_webhook")
+                 and "run_id" in webhook_meta
+             ):
+                 try:
+                     # Verify the run actually exists
+                     run = api.run(f"{entity}/{project}/{webhook_meta['run_id']}")
+                     return True, run.id
+                 except Exception:
+                     pass
+
+             # Check explicit name first
+             if config.get("name"):
+                 runs = api.runs(f"{entity}/{project}")
+                 for run in runs:
+                     if run.display_name == config["name"]:
+                         return True, run.id
+
+             # Check default pattern
+             default_run_name = f"eval-{identifier}"
+             runs = api.runs(f"{entity}/{project}")
+             for run in runs:
+                 if run.display_name == default_run_name:
+                     return True, run.id
+
+             return False, None
+         except Exception:
+             return False, None
+
+     def _create_wandb_run(
+         self,
+         identifier: str,
+         config: Dict[str, Any],
+         metrics: Dict[str, float],
+         job_data: JobData,
+         should_resume: bool,
+         existing_run_id: str,
+     ) -> Dict[str, Any]:
+         """Create or resume W&B run for single job."""
+         log_mode = config.get("log_mode", "per_task")
+         task_name = get_task_name(job_data)
+         bench_info = get_benchmark_info(job_data)
+         benchmark = bench_info.get("benchmark", task_name)
+         harness = bench_info.get("harness", "unknown")
+
+         if config.get("name"):
+             run_name = config["name"]
+         else:
+             run_name = (
+                 f"eval-{job_data.invocation_id}-{benchmark}"
+                 if log_mode == "per_task"
+                 else f"eval-{identifier}"
+             )
+
+         run_args = {
+             "entity": config.get("entity"),
+             "project": config.get("project"),
+             "name": run_name,
+             "group": config.get("group", job_data.invocation_id),
+             "job_type": config.get("job_type", "evaluation"),
+             "tags": config.get("tags"),
+             "notes": config.get("description"),
+         }
+
+         # resume for multi_task runs
+         if log_mode == "multi_task":
+             stable_id = config.get("run_id") or identifier  # invocation_id
+             run_args["id"] = stable_id
+             run_args["resume"] = "allow"
+         elif should_resume:
+             run_args["id"] = existing_run_id
+             run_args["resume"] = "allow"
+
+         # Config metadata
+         run_config = {
+             "invocation_id": job_data.invocation_id,
+             "executor": job_data.executor,
+         }
+         if log_mode == "per_task":
+             run_config["job_id"] = job_data.job_id
+             run_config["harness"] = harness
+             run_config["benchmark"] = benchmark
+
+         if config.get("triggered_by_webhook"):
+             run_config.update(
+                 {
+                     "webhook_triggered": True,
+                     "webhook_source": config.get("webhook_source"),
+                     "source_artifact": config.get("source_artifact"),
+                     "config_source": config.get("config_source"),
+                 }
+             )
+
+         run_config.update(config.get("extra_metadata", {}))
+         run_args["config"] = run_config
+
+         # Initialize
+         run = wandb.init(**{k: v for k, v in run_args.items() if v is not None})
+
+         # In multi_task, aggregate lists after init (no overwrite)
+         if log_mode == "multi_task":
+             try:
+                 benchmarks = list(run.config.get("benchmarks", []))
+                 if benchmark not in benchmarks:
+                     benchmarks.append(benchmark)
+                 harnesses = list(run.config.get("harnesses", []))
+                 if harness not in harnesses:
+                     harnesses.append(harness)
+                 run.config.update(
+                     {"benchmarks": benchmarks, "harnesses": harnesses},
+                     allow_val_change=True,
+                 )
+             except Exception:
+                 pass
+
+         # Artifact naming
+         artifact_name = (
+             f"{job_data.invocation_id}_{benchmark}"
+             if log_mode == "per_task"
+             else job_data.invocation_id
+         )
+         artifact = wandb.Artifact(
+             name=artifact_name,
+             type="evaluation_result",
+             description="Evaluation results",
+             metadata={
+                 "invocation_id": job_data.invocation_id,
+                 "task": task_name,
+                 "benchmark": benchmark,
+                 "harness": harness,
+             },
+         )
+         with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp_cfg:
+             yaml.dump(job_data.config or {}, tmp_cfg, default_flow_style=False)
+             cfg_path = tmp_cfg.name
+         artifact.add_file(cfg_path, name="config.yaml")
+         os.unlink(cfg_path)
+
+         logged_artifacts = self._log_artifacts(job_data, config, artifact)
+         run.log_artifact(artifact)
+
+         # charts for each logged metric
+         try:
+             for k in metrics.keys():
+                 run.define_metric(k, summary="last")
+         except Exception:
+             pass
+
+         # Log metrics with per-task step
+         try:
+             step_idx = int(job_data.job_id.split(".")[-1])
+         except Exception:
+             step_idx = 0
+         run.log(metrics, step=step_idx)
+
+         # metrics summary
+         try:
+             run.summary.update(metrics)
+         except Exception:
+             pass
+
+         return {
+             "run_id": run.id,
+             "run_url": run.url,
+             "metrics_logged": len(metrics),
+             "artifacts_logged": len(logged_artifacts),
+         }
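
For orientation only (not part of the wheel): a minimal sketch of driving this exporter directly from Python. It assumes the exporter accepts a plain config dict in its constructor, as LocalExporter does in _log_artifacts above; the entity, project, and invocation id are placeholders, and in normal use the launcher's export command would be the entry point instead.

# Hedged usage sketch; "my-team", "nemo-evals", and "abcdef12" are placeholder values.
from nemo_evaluator_launcher.exporters.wandb import WandBExporter

exporter = WandBExporter(
    {
        "entity": "my-team",       # W&B entity (assumed placeholder)
        "project": "nemo-evals",   # W&B project (assumed placeholder)
        "log_mode": "multi_task",  # one shared run per invocation, per the logic above
        "log_artifacts": True,     # round-trips artifacts through LocalExporter
    }
)

if exporter.is_available():  # False when the wandb package is not installed
    result = exporter.export_invocation("abcdef12")
    if result.get("success"):
        print(result["metadata"]["run_url"])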
nemo_evaluator_launcher/package_info.py
@@ -0,0 +1,35 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ MAJOR = 0
+ MINOR = 1
+ PATCH = 0
+ PRE_RELEASE = "rc2"
+
+ # Use the following formatting: (major, minor, patch, pre-release)
+ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
+
+ __shortversion__ = ".".join(map(str, VERSION[:3]))
+ __version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:])
+
+ __package_name__ = "nemo_evaluator_launcher"
+ __contact_names__ = "NVIDIA"
+ __contact_emails__ = "nemo-toolkit@nvidia.com"
+ __homepage__ = "https://github.com/NVIDIA-NeMo/Eval"
+ __repository_url__ = "https://github.com/NVIDIA-NeMo/Eval"
+ __download_url__ = "https://github.com/NVIDIA-NeMo/Eval/releases"
+ __description__ = "Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends"
+ __license__ = "Apache2"
+ __keywords__ = "deep learning, evaluations, machine learning, gpu, NLP, pytorch, torch"
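
As a quick cross-check (not part of the package), the tuple above composes into the pre-release string that appears in the wheel filename:

# Reproduces the version-string composition from package_info.py.
VERSION = (0, 1, 0, "rc2")
shortversion = ".".join(map(str, VERSION[:3]))  # "0.1.0"
version = shortversion + "".join(VERSION[3:])   # "0.1.0rc2"
assert version == "0.1.0rc2"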