nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/exporters/mlflow.py
@@ -15,13 +15,11 @@
 #
 """Evaluation results exporter for MLflow tracking."""
 
-import shutil
+import os
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, List
 
-import yaml
-
 try:
     import mlflow
 
@@ -37,9 +35,11 @@ from nemo_evaluator_launcher.exporters.registry import register_exporter
 from nemo_evaluator_launcher.exporters.utils import (
     extract_accuracy_metrics,
     extract_exporter_config,
+    get_artifact_root,
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
+    mlflow_sanitize,
 )
 
 
@@ -100,11 +100,57 @@ class MLflowExporter(BaseExporter):
         # Extract config using common utility
         mlflow_config = extract_exporter_config(job_data, "mlflow", self.config)
 
-        # Extract metrics
+        # resolve tracking_uri with fallbacks
+        tracking_uri = mlflow_config.get("tracking_uri")
+        if not tracking_uri:
+            tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
+        # allow env var name
+        if tracking_uri and "://" not in tracking_uri:
+            tracking_uri = os.getenv(tracking_uri, tracking_uri)
+
+        if not tracking_uri:
+            return ExportResult(
+                success=False,
+                dest="mlflow",
+                message="tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
+            )
+
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", False
+                        ),  # log_logs -> copy_logs
+                        "only_required": mlflow_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"Failed staging artifacts for {job_data.job_id}: {e}")
+
+        # Extract metrics (prefer staged if available)
         log_metrics = mlflow_config.get("log_metrics", [])
-        accuracy_metrics = extract_accuracy_metrics(
-            job_data, self.get_job_paths, log_metrics
-        )
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
 
         if not accuracy_metrics:
             return ExportResult(
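
The hunk above reworks tracking-URI resolution: an explicit export.mlflow.tracking_uri wins, then the MLFLOW_TRACKING_URI environment variable, and a value without a "://" scheme is additionally treated as the name of an environment variable to read. A minimal standalone sketch of that fallback order (the helper name is illustrative, not part of the package):

```python
import os
from typing import Optional

def resolve_tracking_uri(configured: Optional[str]) -> Optional[str]:
    # Hypothetical helper mirroring the fallback order in the hunk above.
    uri = configured or os.getenv("MLFLOW_TRACKING_URI")
    # A schemeless value is treated as an env var *name*, falling back to
    # the literal value if that variable is unset.
    if uri and "://" not in uri:
        uri = os.getenv(uri, uri)
    return uri

# With MLFLOW_TRACKING_URI=http://mlflow.internal:5000 in the environment:
#   resolve_tracking_uri(None)          -> "http://mlflow.internal:5000"
#   resolve_tracking_uri("MY_TRACKING") -> value of $MY_TRACKING, else "MY_TRACKING"
```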
@@ -112,12 +158,6 @@ class MLflowExporter(BaseExporter):
             )
 
         # Set up MLflow
-        tracking_uri = mlflow_config.get("tracking_uri")
-        if not tracking_uri:
-            return ExportResult(
-                success=False, dest="mlflow", message="tracking_uri is required"
-            )
-
         tracking_uri = tracking_uri.rstrip("/")
         mlflow.set_tracking_uri(tracking_uri)
 
@@ -149,10 +189,13 @@ class MLflowExporter(BaseExporter):
             }
         )
 
-        # Truncate params
+        # Sanitize params
         safe_params = {
-            str(k)[:250]: str(v)[:250] for k, v in all_params.items() if v
+            mlflow_sanitize(k, "param_key"): mlflow_sanitize(v, "param_value")
+            for k, v in (all_params or {}).items()
+            if v is not None
         }
+
         # Prepare tags
         tags = {}
         if mlflow_config.get("tags"):
@@ -162,7 +205,10 @@ class MLflowExporter(BaseExporter):
         benchmark = bench_info.get("benchmark", get_task_name(job_data))
         harness = bench_info.get("harness", "unknown")
 
-        # Tag the run with invocation_id and task metadata (task_name is benchmark-only)
+        # Tag the run with invocation_id and task metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
         tags.update(
             {
                 "invocation_id": job_data.invocation_id,
@@ -170,11 +216,16 @@ class MLflowExporter(BaseExporter):
                 "task_name": benchmark,
                 "benchmark": benchmark,
                 "harness": harness,
-                "executor": job_data.executor,
+                "executor": exec_type,
             }
         )
-        # Truncate tags
-        safe_tags = {str(k)[:250]: str(v)[:5000] for k, v in tags.items() if v}
+
+        # Sanitize tags
+        safe_tags = {
+            mlflow_sanitize(k, "tag_key"): mlflow_sanitize(v, "tag_value")
+            for k, v in (tags or {}).items()
+            if v is not None
+        }
 
         # skip run if it already exists
         exists, existing_run_id = self._get_existing_run_info(
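
mlflow_sanitize lives in exporters/utils.py and its body is not part of this diff. For orientation only, here is a sketch of what such a sanitizer plausibly does, assuming it enforces MLflow's documented key character set (alphanumerics, underscores, dashes, periods, spaces, slashes) and the length limits the old truncation code used (250 for keys, 5000 for tag values):

```python
import re

# Illustrative stand-in for mlflow_sanitize; the real implementation is in
# exporters/utils.py and is not shown here. Limits follow the truncation
# the old code applied; the character rule follows MLflow's key validation.
_MAX_LEN = {"param_key": 250, "param_value": 250, "tag_key": 250,
            "tag_value": 5000, "metric": 250}

def sanitize(value: object, kind: str) -> str:
    text = str(value)
    if kind in ("param_key", "tag_key", "metric"):
        # Map characters MLflow rejects in keys to "_".
        text = re.sub(r"[^0-9A-Za-z_\-./ ]", "_", text)
    return text[: _MAX_LEN[kind]]

# sanitize("score (exact_match)", "metric") -> "score _exact_match_"
```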
@@ -193,26 +244,34 @@ class MLflowExporter(BaseExporter):
         if safe_tags:
             mlflow.set_tags(safe_tags)
 
-        # Set run name)
+        # Set run name
         run_name = (
            mlflow_config.get("run_name")
            or f"eval-{job_data.invocation_id}-{benchmark}"
         )
-        mlflow.set_tag("mlflow.runName", run_name)
+        mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
         # Set description only if provided
         description = mlflow_config.get("description")
         if description:
-            mlflow.set_tag("mlflow.note.content", str(description)[:5000])
+            mlflow.set_tag(
+                "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+            )
 
         # Log parameters
         mlflow.log_params(safe_params)
 
-        # Log metrics
-        mlflow.log_metrics(accuracy_metrics)
+        # Sanitize metric keys before logging
+        safe_metrics = {
+            mlflow_sanitize(k, "metric"): v
+            for k, v in (accuracy_metrics or {}).items()
+        }
+        mlflow.log_metrics(safe_metrics)
 
         # Log artifacts
-        artifacts_logged = self._log_artifacts(job_data, mlflow_config)
+        artifacts_logged = self._log_artifacts(
+            job_data, mlflow_config, staged_base_dir
+        )
 
         # Build run URL
         run_url = None
@@ -242,7 +301,10 @@ class MLflowExporter(BaseExporter):
         )
 
     def _log_artifacts(
-        self, job_data: JobData, mlflow_config: Dict[str, Any]
+        self,
+        job_data: JobData,
+        mlflow_config: Dict[str, Any],
+        pre_staged_dir: Path = None,
     ) -> List[str]:
         """Log evaluation artifacts to MLflow using LocalExporter for transfer."""
 
@@ -251,44 +313,103 @@ class MLflowExporter(BaseExporter):
             return []
 
         try:
-            # Use LocalExporter to get files locally first
-            temp_dir = tempfile.mkdtemp(prefix="mlflow_artifacts_")
-            local_exporter = LocalExporter({"output_dir": temp_dir})
-            local_result = local_exporter.export_job(job_data)
-
-            if not local_result.success:
-                logger.error(f"Failed to download artifacts: {local_result.message}")
-                return []
-
-            artifacts_dir = Path(local_result.dest) / "artifacts"
-            logged_names = []
-
-            task_name = get_task_name(job_data)
-            artifact_path = task_name
-
-            # Log config at root level
-            with tempfile.TemporaryDirectory() as tmpdir:
-                cfg_file = Path(tmpdir) / "config.yaml"
-                with cfg_file.open("w") as f:
-                    yaml.dump(
-                        job_data.config or {},
-                        f,
-                        default_flow_style=False,
-                        sort_keys=False,
+            should_cleanup = False
+            # Use pre-staged dir if available; otherwise stage now
+            if pre_staged_dir and pre_staged_dir.exists():
+                base_dir = pre_staged_dir
+            else:
+                temp_dir = tempfile.mkdtemp(prefix="mlflow_artifacts_")
+                local_exporter = LocalExporter(
+                    {
+                        "output_dir": str(temp_dir),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", mlflow_config.get("copy_logs", False)
+                        ),
+                        "only_required": mlflow_config.get("only_required", True),
+                        "format": mlflow_config.get("format", None),
+                        "log_metrics": mlflow_config.get("log_metrics", []),
+                        "output_filename": mlflow_config.get("output_filename", None),
+                    }
+                )
+                local_result = local_exporter.export_job(job_data)
+                if not local_result.success:
+                    logger.error(
+                        f"Failed to download artifacts: {local_result.message}"
+                    )
+                    return []
+                base_dir = Path(local_result.dest)
+                should_cleanup = True
+
+            artifacts_dir = base_dir / "artifacts"
+            logs_dir = base_dir / "logs"
+            logged_names: list[str] = []
+            artifact_path = get_artifact_root(job_data)  # "<harness>.<benchmark>"
+
+            # Log config at root level (or synthesize)
+            cfg_logged = False
+            for fname in ("config.yml", "run_config.yml"):
+                p = artifacts_dir / fname
+                if p.exists():
+                    mlflow.log_artifact(str(p))
+                    cfg_logged = True
+                    break
+            if not cfg_logged:
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    from yaml import dump as ydump
+
+                    cfg_file = Path(tmpdir) / "config.yaml"
+                    cfg_file.write_text(
+                        ydump(
+                            job_data.config or {},
+                            default_flow_style=False,
+                            sort_keys=False,
+                        )
                     )
-                mlflow.log_artifact(str(cfg_file))
+                    mlflow.log_artifact(str(cfg_file))
+
+            # Choose files to upload
+            files_to_upload: list[Path] = []
+            if mlflow_config.get("only_required", True):
+                for fname in get_available_artifacts(artifacts_dir):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        files_to_upload.append(p)
+            else:
+                for p in artifacts_dir.iterdir():  # top-level files only
+                    if p.is_file():
+                        files_to_upload.append(p)
+
+            # Upload artifacts (with DEBUG per-file)
+            for fpath in files_to_upload:
+                rel = fpath.relative_to(artifacts_dir).as_posix()
+                parent = os.path.dirname(rel)
+                mlflow.log_artifact(
+                    str(fpath),
+                    artifact_path=f"{artifact_path}/artifacts/{parent}".rstrip("/"),
+                )
+                logged_names.append(rel)
+                logger.debug(f"mlflow upload artifact: {rel}")
+
+            # Optionally upload logs under "<harness.task>/logs"
+            if mlflow_config.get("log_logs", False) and logs_dir.exists():
+                for p in logs_dir.iterdir():
+                    if p.is_file():
+                        rel = p.name
+                        mlflow.log_artifact(
+                            str(p), artifact_path=f"{artifact_path}/logs"
+                        )
+                        logged_names.append(f"logs/{rel}")
+                        logger.debug(f"mlflow upload log: {rel}")
+
+            logger.info(
+                f"MLflow upload summary: files={len(logged_names)}, only_required={mlflow_config.get('only_required', True)}, log_logs={mlflow_config.get('log_logs', False)}"
+            )
+            if should_cleanup:
+                import shutil
 
-            # Then log results files
-            for fname in get_available_artifacts(artifacts_dir):
-                file_path = artifacts_dir / fname
-                if file_path.exists():
-                    mlflow.log_artifact(str(file_path), artifact_path=artifact_path)
-                    logged_names.append(fname)
+                shutil.rmtree(base_dir, ignore_errors=True)
 
-            # cleanup temp
-            shutil.rmtree(temp_dir)
             return logged_names
-
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
             return []
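
The rewritten upload loop pins every file under a per-task root returned by get_artifact_root (per the inline comment, shaped like "<harness>.<benchmark>"), so a run that exports several jobs keeps their files separated. A small sketch of the resulting layout using plain mlflow calls; the file names and the "lm-eval.mmlu" root are illustrative stand-ins, not values from the package:

```python
import pathlib
import mlflow

# Create tiny demo files so the example runs as-is (local ./mlruns tracking).
for name in ("config.yml", "results.yml", "stdout.log"):
    pathlib.Path(name).write_text("demo\n")

with mlflow.start_run():
    mlflow.log_artifact("config.yml")  # config stays at the run root
    mlflow.log_artifact("results.yml", artifact_path="lm-eval.mmlu/artifacts")
    mlflow.log_artifact("stdout.log", artifact_path="lm-eval.mmlu/logs")

# The run's artifact tree is then:
#   config.yml
#   lm-eval.mmlu/artifacts/results.yml
#   lm-eval.mmlu/logs/stdout.log
```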
@@ -312,13 +433,56 @@ class MLflowExporter(BaseExporter):
         # Extract config using common utility
         mlflow_config = extract_exporter_config(first_job, "mlflow", self.config)
 
+        # resolve tracking_uri with fallbacks
+        tracking_uri = mlflow_config.get("tracking_uri") or os.getenv(
+            "MLFLOW_TRACKING_URI"
+        )
+        if tracking_uri and "://" not in tracking_uri:
+            tracking_uri = os.getenv(tracking_uri, tracking_uri)
+        if not tracking_uri:
+            return {
+                "success": False,
+                "error": "tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
+            }
+
         # Collect metrics from ALL jobs
         all_metrics = {}
+        staged_map: dict[str, Path] = {}
+        for job_id, job_data in jobs.items():
+            try:
+                paths = self.get_job_paths(job_data)
+                if paths.get("storage_type") == "remote_ssh":
+                    tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_inv_stage_"))
+                    LocalExporter(
+                        {
+                            "output_dir": str(tmp_stage),
+                            "copy_logs": mlflow_config.get("log_logs", False),
+                            "only_required": mlflow_config.get(
+                                "only_required", True
+                            ),
+                        }
+                    ).export_job(job_data)
+                    staged_map[job_id] = (
+                        tmp_stage / job_data.invocation_id / job_data.job_id
+                    )
+            except Exception as e:
+                logger.warning(f"Staging failed for {job_id}: {e}")
+
         for job_id, job_data in jobs.items():
             log_metrics = mlflow_config.get("log_metrics", [])
-            job_metrics = extract_accuracy_metrics(
-                job_data, self.get_job_paths, log_metrics
-            )
+            if job_id in staged_map and (staged_map[job_id] / "artifacts").exists():
+                job_metrics = extract_accuracy_metrics(
+                    job_data,
+                    lambda _: {
+                        "artifacts_dir": staged_map[job_id] / "artifacts",
+                        "storage_type": "local_filesystem",
+                    },
+                    log_metrics,
+                )
+            else:
+                job_metrics = extract_accuracy_metrics(
+                    job_data, self.get_job_paths, log_metrics
+                )
             all_metrics.update(job_metrics)
 
         if not all_metrics:
@@ -328,10 +492,6 @@ class MLflowExporter(BaseExporter):
             }
 
         # Set up MLflow
-        tracking_uri = mlflow_config.get("tracking_uri")
-        if not tracking_uri:
-            return {"success": False, "error": "tracking_uri is required"}
-
         tracking_uri = tracking_uri.rstrip("/")
         mlflow.set_tracking_uri(tracking_uri)
 
@@ -341,9 +501,12 @@ class MLflowExporter(BaseExporter):
         mlflow.set_experiment(experiment_name)
 
         # Prepare parameters for invocation
+        inv_exec_type = (first_job.config or {}).get("execution", {}).get(
+            "type"
+        ) or first_job.executor
         all_params = {
             "invocation_id": invocation_id,
-            "executor": first_job.executor,
+            "executor": inv_exec_type,
             "timestamp": str(first_job.timestamp),
             "jobs_count": str(len(jobs)),
         }
@@ -399,23 +562,31 @@ class MLflowExporter(BaseExporter):
 
         # Set run name
         run_name = mlflow_config.get("run_name") or f"eval-{invocation_id}"
-        mlflow.set_tag("mlflow.runName", run_name)
+        mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
         # Set description
         description = mlflow_config.get("description")
         if description:
-            mlflow.set_tag("mlflow.note.content", str(description)[:5000])
+            mlflow.set_tag(
+                "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+            )
 
         # Log parameters
         mlflow.log_params(safe_params)
 
-        # Log ALL metrics
-        mlflow.log_metrics(all_metrics)
+        # Sanitize metric keys
+        safe_all_metrics = {
+            mlflow_sanitize(k, "metric"): v
+            for k, v in (all_metrics or {}).items()
+        }
+        mlflow.log_metrics(safe_all_metrics)
 
         # Log artifacts from all jobs
         total_artifacts = 0
-        for job_data in jobs.values():
-            artifacts_logged = self._log_artifacts(job_data, mlflow_config)
+        for job_id, job_data in jobs.items():
+            artifacts_logged = self._log_artifacts(
+                job_data, mlflow_config, staged_map.get(job_id)
+            )
             total_artifacts += len(artifacts_logged)
 
         # Build run URL
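
For orientation, the configuration keys the exporter reads across the hunks above can be collected into a single sketch. Keys are taken from the .get(...) calls in the diff; values are illustrative, and the stated defaults follow the fallbacks in the code:

```python
# Illustrative export.mlflow config; keys appear in the hunks above.
mlflow_export_config = {
    "tracking_uri": "http://mlflow.internal:5000",  # or an env var *name*; else MLFLOW_TRACKING_URI is used
    "run_name": None,          # default: f"eval-{invocation_id}-{benchmark}"
    "description": None,       # logged as the mlflow.note.content tag
    "tags": {},                # merged into run tags, then sanitized
    "log_metrics": [],         # filter passed to extract_accuracy_metrics
    "only_required": True,     # upload only the required artifact files (default True)
    "log_logs": False,         # also stage and upload logs/ (default False)
}
```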