nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. nemo_evaluator_launcher/api/functional.py +159 -5
  2. nemo_evaluator_launcher/cli/logs.py +102 -0
  3. nemo_evaluator_launcher/cli/ls_task.py +280 -0
  4. nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
  5. nemo_evaluator_launcher/cli/main.py +29 -2
  6. nemo_evaluator_launcher/cli/run.py +114 -16
  7. nemo_evaluator_launcher/cli/version.py +26 -23
  8. nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
  9. nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
  10. nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
  11. nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
  12. nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
  13. nemo_evaluator_launcher/common/helpers.py +200 -51
  14. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  15. nemo_evaluator_launcher/common/mapping.py +341 -155
  16. nemo_evaluator_launcher/common/printing_utils.py +25 -12
  17. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  18. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
  19. nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
  20. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +14 -0
  21. nemo_evaluator_launcher/executors/base.py +31 -1
  22. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
  23. nemo_evaluator_launcher/executors/lepton/executor.py +107 -9
  24. nemo_evaluator_launcher/executors/local/executor.py +383 -24
  25. nemo_evaluator_launcher/executors/local/run.template.sh +54 -2
  26. nemo_evaluator_launcher/executors/slurm/executor.py +559 -64
  27. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  28. nemo_evaluator_launcher/exporters/utils.py +32 -46
  29. nemo_evaluator_launcher/package_info.py +1 -1
  30. nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
  31. nemo_evaluator_launcher/resources/mapping.toml +64 -315
  32. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/METADATA +4 -3
  33. nemo_evaluator_launcher-0.1.56.dist-info/RECORD +69 -0
  34. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/entry_points.txt +1 -0
  35. nemo_evaluator_launcher-0.1.19.dist-info/RECORD +0 -60
  36. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/WHEEL +0 -0
  37. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/licenses/LICENSE +0 -0
  38. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/slurm/proxy.cfg.template
@@ -0,0 +1,26 @@
+global
+    log stdout format raw local0
+    maxconn 4096
+
+defaults
+    log global
+    mode http
+    option httplog
+    timeout connect 10s
+    timeout client 100000s
+    timeout server 100000s
+
+frontend service_frontend
+    bind *:{{ haproxy_port }}
+    default_backend service_backend
+
+backend service_backend
+    mode http
+    option httpchk GET {{ health_check_path }}
+    http-check expect status {{ health_check_status }}
+    option http-server-close
+    balance leastconn
+    {% for node in nodes %}
+    server node{{ loop.index }} {{ node.ip }}:{{ node.port }} check
+    {% endfor %}
+
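The `{{ ... }}` placeholders and the `{% for %}` loop are Jinja2 syntax, so the Slurm executor presumably fills `haproxy_port`, `health_check_path`, `health_check_status`, and the per-node `ip`/`port` entries before writing the proxy config. A minimal rendering sketch, assuming plain `jinja2.Template` and made-up values (the executor's actual rendering path is not part of this diff):

# Minimal sketch: render proxy.cfg.template with Jinja2 and example values.
# The variable names come from the template above; the concrete values, the
# template path, and the use of jinja2.Template here are illustrative only.
from pathlib import Path

from jinja2 import Template

template_text = Path(
    "nemo_evaluator_launcher/executors/slurm/proxy.cfg.template"
).read_text()

config = Template(template_text).render(
    haproxy_port=8080,                # frontend bind port (example)
    health_check_path="/v1/health",   # probed via "option httpchk GET <path>" (example)
    health_check_status=200,          # expected status for a healthy backend (example)
    nodes=[                           # one "server nodeN ip:port check" line per entry
        {"ip": "10.0.0.1", "port": 8000},
        {"ip": "10.0.0.2", "port": 8000},
    ],
)
print(config)

With `balance leastconn`, each rendered `server` line becomes a least-connections backend, and HAProxy only routes to nodes whose health check returns the expected status.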
nemo_evaluator_launcher/exporters/utils.py
@@ -471,15 +471,12 @@ def _extract_metrics_from_results(results: dict) -> Dict[str, float]:
         section_data = results.get(section)
         if isinstance(section_data, dict):
             for task_name, task_data in section_data.items():
-                if isinstance(task_data, dict) and "metrics" in task_data:
-                    task_metrics = _extract_task_metrics(
-                        task_name, task_data["metrics"]
-                    )
-                    _safe_update_metrics(
-                        target=metrics,
-                        source=task_metrics,
-                        context=f" while extracting results for task '{task_name}'",
-                    )
+                task_metrics = _extract_task_metrics(task_name, task_data)
+                _safe_update_metrics(
+                    target=metrics,
+                    source=task_metrics,
+                    context=f" while extracting results for task '{task_name}'",
+                )
     return metrics


@@ -518,54 +515,43 @@ def _extract_from_json_files(artifacts_dir: Path) -> Dict[str, float]:
     return metrics


-def _extract_task_metrics(task_name: str, metrics_data: dict) -> Dict[str, float]:
+def _extract_task_metrics(task_name: str, task_data: dict) -> Dict[str, float]:
     """Extract metrics from a task's metrics data."""
     extracted = {}
-    score_patterns = [
-        "acc",
-        "accuracy",
-        "score",
-        "exact_match",
-        "f1",
-        "em",
-        "pass@1",
-        "pass@k",
-    ]

-    for metric_name, metric_data in metrics_data.items():
-        # Only extract score-like metrics
-        if not any(pattern in metric_name.lower() for pattern in score_patterns):
-            continue
+    metrics_data = task_data.get("metrics", {})
+    if "groups" in task_data:
+        for group_name, group_data in task_data["groups"].items():
+            group_extracted = _extract_task_metrics(
+                f"{task_name}_{group_name}", group_data
+            )
+            _safe_update_metrics(
+                target=extracted,
+                source=group_extracted,
+                context=f" in task '{task_name}'",
+            )

+    for metric_name, metric_data in metrics_data.items():
         try:
-            if isinstance(metric_data, dict):
-                if "scores" in metric_data:
-                    # Handle nested scores (e.g., mmlu macro/micro)
-                    for score_type, score_data in metric_data["scores"].items():
-                        if isinstance(score_data, dict) and "value" in score_data:
-                            key = f"{task_name}_{metric_name}_{score_type}"
-                            _safe_set_metric(
-                                container=extracted,
-                                key=key,
-                                new_value=score_data["value"],
-                                context=f" in task '{task_name}'",
-                            )
-                elif "value" in metric_data:
+            for score_type, score_data in metric_data["scores"].items():
+                if score_type != metric_name:
+                    key = f"{task_name}_{metric_name}_{score_type}"
+                else:
                     key = f"{task_name}_{metric_name}"
-                    _safe_set_metric(
-                        container=extracted,
-                        key=key,
-                        new_value=metric_data["value"],
-                        context=f" in task '{task_name}'",
-                    )
-            elif isinstance(metric_data, (int, float)):
-                key = f"{task_name}_{metric_name}"
                 _safe_set_metric(
                     container=extracted,
                     key=key,
-                    new_value=metric_data,
+                    new_value=score_data["value"],
                     context=f" in task '{task_name}'",
                 )
+            for stat_name, stat_value in metric_data.get("stats", {}).items():
+                stats_key = f"{key}_{stat_name}"
+                _safe_set_metric(
+                    container=extracted,
+                    key=stats_key,
+                    new_value=stat_value,
+                    context=f" in task '{task_name}'",
+                )
         except (ValueError, TypeError) as e:
             logger.warning(
                 f"Failed to extract metric {metric_name} for task {task_name}: {e}"
nemo_evaluator_launcher/package_info.py
@@ -16,7 +16,7 @@
 # Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 19
+PATCH = 56
 PRE_RELEASE = ""

 # Use the following formatting: (major, minor, patch, pre-release)
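The only change here bumps the next-to-be-published patch version from 19 to 56. As a hypothetical sketch of how such constants are commonly assembled, following the "(major, minor, patch, pre-release)" comment in the hunk (the joining logic below is assumed, not taken from the file):

# Hypothetical assembly of the version string from the bumped constants.
# Only PATCH = 56 is shown in the diff; everything else here is illustrative.
MAJOR = 0
MINOR = 1
PATCH = 56
PRE_RELEASE = ""

VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)            # (major, minor, patch, pre-release)
__version__ = f"{MAJOR}.{MINOR}.{PATCH}{PRE_RELEASE}"   # -> "0.1.56"
print(__version__)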