nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/slurm/proxy.cfg.template
@@ -0,0 +1,26 @@
+ global
+     log stdout format raw local0
+     maxconn 4096
+
+ defaults
+     log global
+     mode http
+     option httplog
+     timeout connect 10s
+     timeout client 100000s
+     timeout server 100000s
+
+ frontend service_frontend
+     bind *:{{ haproxy_port }}
+     default_backend service_backend
+
+ backend service_backend
+     mode http
+     option httpchk GET {{ health_check_path }}
+     http-check expect status {{ health_check_status }}
+     option http-server-close
+     balance leastconn
+     {% for node in nodes %}
+     server node{{ loop.index }} {{ node.ip }}:{{ node.port }} check
+     {% endfor %}
+
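The new template uses Jinja-style placeholders (haproxy_port, health_check_path, health_check_status, nodes) and balances requests across backend nodes with leastconn. Below is a minimal sketch of rendering such a template with the jinja2 package; the port, health-check path, and node addresses are invented placeholders, and this is not necessarily how the Slurm executor itself renders the file.

# Minimal rendering sketch (placeholder values; assumes the jinja2 package).
from jinja2 import Template

with open("proxy.cfg.template") as f:
    template_text = f.read()

config = Template(template_text).render(
    haproxy_port=8080,                      # hypothetical frontend port
    health_check_path="/v1/health",         # hypothetical health endpoint
    health_check_status=200,
    nodes=[
        {"ip": "10.0.0.11", "port": 8000},  # hypothetical backend nodes
        {"ip": "10.0.0.12", "port": 8000},
    ],
)
print(config)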
nemo_evaluator_launcher/exporters/base.py
@@ -70,6 +70,15 @@ class BaseExporter(ABC):

      def get_job_paths(self, job_data: JobData) -> Dict[str, Any]:
          """Get result paths based on executor type from job metadata."""
+         # Special case: remote executor artifacts accessed locally (remote auto-export)
+         if job_data.data.get("storage_type") == "remote_local":
+             output_dir = Path(job_data.data["output_dir"])
+             return {
+                 "artifacts_dir": output_dir / "artifacts",
+                 "logs_dir": output_dir / "logs",
+                 "storage_type": "remote_local",
+             }
+
          if job_data.executor == "local":
              output_dir = Path(job_data.data["output_dir"])
              return {
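The dict returned by the new remote_local branch is what downstream exporters key their staging logic on (see the LocalExporter hunks further below). A minimal standalone sketch of its shape, with an invented output_dir:

from pathlib import Path

# Hypothetical job metadata carrying the two keys the new branch reads.
data = {"storage_type": "remote_local", "output_dir": "/results/abc123/0"}

output_dir = Path(data["output_dir"])
paths = {
    "artifacts_dir": output_dir / "artifacts",  # /results/abc123/0/artifacts
    "logs_dir": output_dir / "logs",            # /results/abc123/0/logs
    "storage_type": "remote_local",
}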
nemo_evaluator_launcher/exporters/gsheets.py
@@ -15,6 +15,7 @@
  #
  """Google Sheets evaluation results exporter."""

+ import os
  import shutil
  import tempfile
  from pathlib import Path
@@ -89,28 +90,38 @@ class GSheetsExporter(BaseExporter):
          }

          try:
+             # Load exporter config from the first job (supports job-embedded config and CLI overrides)
+             first_job = next(iter(jobs.values()))
+             gsheets_config = extract_exporter_config(first_job, "gsheets", self.config)
+
              # Connect to Google Sheets
-             service_account_file = self.config.get("service_account_file")
-             spreadsheet_name = self.config.get(
+             service_account_file = gsheets_config.get("service_account_file")
+             spreadsheet_name = gsheets_config.get(
                  "spreadsheet_name", "NeMo Evaluator Launcher Results"
              )

              if service_account_file:
-                 gc = gspread.service_account(filename=service_account_file)
+                 gc = gspread.service_account(
+                     filename=os.path.expanduser(service_account_file)
+                 )
              else:
                  gc = gspread.service_account()

              # Get or create spreadsheet
+             spreadsheet_id = gsheets_config.get("spreadsheet_id")
              try:
-                 sh = gc.open(spreadsheet_name)
+                 if spreadsheet_id:
+                     sh = gc.open_by_key(spreadsheet_id)
+                 else:
+                     sh = gc.open(spreadsheet_name)
                  logger.info(f"Opened existing spreadsheet: {spreadsheet_name}")
              except gspread.SpreadsheetNotFound:
+                 if spreadsheet_id:
+                     raise  # Can't create with explicit ID
                  sh = gc.create(spreadsheet_name)
                  logger.info(f"Created new spreadsheet: {spreadsheet_name}")
-                 sh.share("", perm_type="anyone", role="reader")

              worksheet = sh.sheet1
-
              # Extract metrics from ALL jobs first to determine headers
              all_job_metrics = {}
              results = {}
@@ -226,16 +237,23 @@ class GSheetsExporter(BaseExporter):
              )

              if service_account_file:
-                 gc = gspread.service_account(filename=service_account_file)
+                 gc = gspread.service_account(
+                     filename=os.path.expanduser(service_account_file)
+                 )
              else:
                  gc = gspread.service_account()

              # Get or create spreadsheet
+             spreadsheet_id = gsheets_config.get("spreadsheet_id")
              try:
-                 sh = gc.open(spreadsheet_name)
+                 if spreadsheet_id:
+                     sh = gc.open_by_key(spreadsheet_id)
+                 else:
+                     sh = gc.open(spreadsheet_name)
              except gspread.SpreadsheetNotFound:
+                 if spreadsheet_id:
+                     raise  # Can't create with explicit ID
                  sh = gc.create(spreadsheet_name)
-                 sh.share("", perm_type="anyone", role="reader")

              worksheet = sh.sheet1

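The gsheets changes add an optional spreadsheet_id, expand "~" in the service-account path, and stop auto-sharing newly created sheets with anyone who has the link. A minimal sketch of the resulting lookup pattern, assuming the gspread package and a valid service-account JSON; the path and names below are placeholders:

import os
import gspread

service_account_file = "~/keys/gsheets-sa.json"       # placeholder path
spreadsheet_id = None                                  # set to a sheet ID to open by key
spreadsheet_name = "NeMo Evaluator Launcher Results"   # default name used above

gc = gspread.service_account(filename=os.path.expanduser(service_account_file))
try:
    sh = gc.open_by_key(spreadsheet_id) if spreadsheet_id else gc.open(spreadsheet_name)
except gspread.SpreadsheetNotFound:
    if spreadsheet_id:
        raise  # an explicit ID that cannot be opened is not auto-created
    sh = gc.create(spreadsheet_name)
worksheet = sh.sheet1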
nemo_evaluator_launcher/exporters/local.py
@@ -62,6 +62,7 @@ class LocalExporter(BaseExporter):
          """Export job artifacts to local directory."""
          # Merge auto-export + CLI config
          cfg = extract_exporter_config(job_data, "local", self.config)
+         skip_validation = bool(cfg.get("skip_validation", False))

          output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
          job_export_dir = output_dir / job_data.invocation_id / job_data.job_id
@@ -74,25 +75,34 @@ class LocalExporter(BaseExporter):
          # Stage artifacts per storage type
          if paths["storage_type"] == "local_filesystem":
              exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
-         elif paths["storage_type"] == "remote_ssh":
-             exported_files = ssh_download_artifacts(
-                 paths, job_export_dir, cfg, None
-             )
-         elif paths["storage_type"] == "gitlab_ci_local":
+         elif paths["storage_type"] == "remote_local":
+             # Same as local_filesystem (we're on the remote machine, accessing locally)
              exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
-         elif paths["storage_type"] == "gitlab_remote":
-             raise NotImplementedError("Unsupported storage type")
-             # exported_files = self._download_gitlab_remote_artifacts(
-             #     paths, job_export_dir
-             # )
+         elif paths["storage_type"] == "remote_ssh":
+             cp = ssh_setup_masters({job_data.job_id: job_data})
+             try:
+                 exported_files = ssh_download_artifacts(
+                     paths, job_export_dir, cfg, cp
+                 )
+             finally:
+                 ssh_cleanup_masters(cp)
          else:
-             raise ValueError(
-                 f"Cannot export from storage type: {paths['storage_type']}"
+             raise NotImplementedError(
+                 f"Export not implemented for storage type: {paths['storage_type']}"
              )

          # Validate artifacts
          artifacts_dir = job_export_dir / "artifacts"
-         validation = validate_artifacts(artifacts_dir)
+         validation = (
+             validate_artifacts(artifacts_dir)
+             if not skip_validation
+             else {
+                 "can_export": True,
+                 "missing_required": [],
+                 "missing_optional": [],
+                 "message": "Validation skipped",
+             }
+         )

          # Save metadata
          self._save_job_metadata(job_data, job_export_dir)
@@ -125,6 +135,8 @@ class LocalExporter(BaseExporter):
                  logger.warning(f"Failed to create {fmt} summary: {e}")
                  msg += " (summary failed)"

+         meta["output_dir"] = str(job_export_dir.resolve())
+
          return ExportResult(
              success=True, dest=str(job_export_dir), message=msg, metadata=meta
          )
@@ -266,10 +278,12 @@ class LocalExporter(BaseExporter):
      ) -> List[str]:
          exported_files: List[str] = []
          copy_logs = bool(cfg.get("copy_logs", False))
+         copy_artifacts = bool(cfg.get("copy_artifacts", True))
          only_required = bool(cfg.get("only_required", True))

+         # separate logic for artifacts and logs
          # artifacts/
-         if paths["artifacts_dir"].exists():
+         if copy_artifacts and paths["artifacts_dir"].exists():
              if only_required:
                  names = [
                      a
@@ -283,7 +297,7 @@ class LocalExporter(BaseExporter):
                      shutil.copy2(src, dst)
                      exported_files.append(str(dst))
              else:
-                 # Copy everything under artifacts/ recursively
+                 # Restore recursive copy (test_copy_all_tree expects nested files)
                  shutil.copytree(
                      paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True
                  )
@@ -297,7 +311,7 @@ class LocalExporter(BaseExporter):

          # logs/
          # If only_required is False → always copy logs; otherwise respect copy_logs
-         if (not only_required or copy_logs) and paths["logs_dir"].exists():
+         if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
              shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
              exported_files.extend(
                  [str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]
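Taken together, the LocalExporter hunks add skip_validation and copy_artifacts knobs alongside the existing copy_logs/only_required pair. A standalone sketch of the resulting staging decisions follows; the only_required name filtering is omitted for brevity, the cfg keys and defaults mirror the hunks above, and this is not the launcher's actual exporter code.

# Standalone sketch of the artifact/log staging decisions (illustrative only).
import shutil
from pathlib import Path

def stage_artifacts(paths: dict, export_dir: Path, cfg: dict) -> list[str]:
    exported: list[str] = []
    copy_logs = bool(cfg.get("copy_logs", False))
    copy_artifacts = bool(cfg.get("copy_artifacts", True))
    only_required = bool(cfg.get("only_required", True))

    # artifacts/: skipped entirely when copy_artifacts is False
    if copy_artifacts and paths["artifacts_dir"].exists():
        shutil.copytree(paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True)
        exported += [str(f) for f in (export_dir / "artifacts").rglob("*") if f.is_file()]

    # logs/: copied when only_required is False, or when copy_logs is True
    if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
        shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
        exported += [str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]

    return exported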