nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
global
|
|
2
|
+
log stdout format raw local0
|
|
3
|
+
maxconn 4096
|
|
4
|
+
|
|
5
|
+
defaults
|
|
6
|
+
log global
|
|
7
|
+
mode http
|
|
8
|
+
option httplog
|
|
9
|
+
timeout connect 10s
|
|
10
|
+
timeout client 100000s
|
|
11
|
+
timeout server 100000s
|
|
12
|
+
|
|
13
|
+
frontend service_frontend
|
|
14
|
+
bind *:{{ haproxy_port }}
|
|
15
|
+
default_backend service_backend
|
|
16
|
+
|
|
17
|
+
backend service_backend
|
|
18
|
+
mode http
|
|
19
|
+
option httpchk GET {{ health_check_path }}
|
|
20
|
+
http-check expect status {{ health_check_status }}
|
|
21
|
+
option http-server-close
|
|
22
|
+
balance leastconn
|
|
23
|
+
{% for node in nodes %}
|
|
24
|
+
server node{{ loop.index }} {{ node.ip }}:{{ node.port }} check
|
|
25
|
+
{% endfor %}
|
|
26
|
+
|
|
@@ -70,6 +70,15 @@ class BaseExporter(ABC):
|
|
|
70
70
|
|
|
71
71
|
def get_job_paths(self, job_data: JobData) -> Dict[str, Any]:
|
|
72
72
|
"""Get result paths based on executor type from job metadata."""
|
|
73
|
+
# Special case: remote executor artifacts accessed locally (remote auto-export)
|
|
74
|
+
if job_data.data.get("storage_type") == "remote_local":
|
|
75
|
+
output_dir = Path(job_data.data["output_dir"])
|
|
76
|
+
return {
|
|
77
|
+
"artifacts_dir": output_dir / "artifacts",
|
|
78
|
+
"logs_dir": output_dir / "logs",
|
|
79
|
+
"storage_type": "remote_local",
|
|
80
|
+
}
|
|
81
|
+
|
|
73
82
|
if job_data.executor == "local":
|
|
74
83
|
output_dir = Path(job_data.data["output_dir"])
|
|
75
84
|
return {
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
#
|
|
16
16
|
"""Google Sheets evaluation results exporter."""
|
|
17
17
|
|
|
18
|
+
import os
|
|
18
19
|
import shutil
|
|
19
20
|
import tempfile
|
|
20
21
|
from pathlib import Path
|
|
@@ -89,28 +90,38 @@ class GSheetsExporter(BaseExporter):
|
|
|
89
90
|
}
|
|
90
91
|
|
|
91
92
|
try:
|
|
93
|
+
# Load exporter config from the first job (supports job-embedded config and CLI overrides)
|
|
94
|
+
first_job = next(iter(jobs.values()))
|
|
95
|
+
gsheets_config = extract_exporter_config(first_job, "gsheets", self.config)
|
|
96
|
+
|
|
92
97
|
# Connect to Google Sheets
|
|
93
|
-
service_account_file =
|
|
94
|
-
spreadsheet_name =
|
|
98
|
+
service_account_file = gsheets_config.get("service_account_file")
|
|
99
|
+
spreadsheet_name = gsheets_config.get(
|
|
95
100
|
"spreadsheet_name", "NeMo Evaluator Launcher Results"
|
|
96
101
|
)
|
|
97
102
|
|
|
98
103
|
if service_account_file:
|
|
99
|
-
gc = gspread.service_account(
|
|
104
|
+
gc = gspread.service_account(
|
|
105
|
+
filename=os.path.expanduser(service_account_file)
|
|
106
|
+
)
|
|
100
107
|
else:
|
|
101
108
|
gc = gspread.service_account()
|
|
102
109
|
|
|
103
110
|
# Get or create spreadsheet
|
|
111
|
+
spreadsheet_id = gsheets_config.get("spreadsheet_id")
|
|
104
112
|
try:
|
|
105
|
-
|
|
113
|
+
if spreadsheet_id:
|
|
114
|
+
sh = gc.open_by_key(spreadsheet_id)
|
|
115
|
+
else:
|
|
116
|
+
sh = gc.open(spreadsheet_name)
|
|
106
117
|
logger.info(f"Opened existing spreadsheet: {spreadsheet_name}")
|
|
107
118
|
except gspread.SpreadsheetNotFound:
|
|
119
|
+
if spreadsheet_id:
|
|
120
|
+
raise # Can't create with explicit ID
|
|
108
121
|
sh = gc.create(spreadsheet_name)
|
|
109
122
|
logger.info(f"Created new spreadsheet: {spreadsheet_name}")
|
|
110
|
-
sh.share("", perm_type="anyone", role="reader")
|
|
111
123
|
|
|
112
124
|
worksheet = sh.sheet1
|
|
113
|
-
|
|
114
125
|
# Extract metrics from ALL jobs first to determine headers
|
|
115
126
|
all_job_metrics = {}
|
|
116
127
|
results = {}
|
|
@@ -226,16 +237,23 @@ class GSheetsExporter(BaseExporter):
|
|
|
226
237
|
)
|
|
227
238
|
|
|
228
239
|
if service_account_file:
|
|
229
|
-
gc = gspread.service_account(
|
|
240
|
+
gc = gspread.service_account(
|
|
241
|
+
filename=os.path.expanduser(service_account_file)
|
|
242
|
+
)
|
|
230
243
|
else:
|
|
231
244
|
gc = gspread.service_account()
|
|
232
245
|
|
|
233
246
|
# Get or create spreadsheet
|
|
247
|
+
spreadsheet_id = gsheets_config.get("spreadsheet_id")
|
|
234
248
|
try:
|
|
235
|
-
|
|
249
|
+
if spreadsheet_id:
|
|
250
|
+
sh = gc.open_by_key(spreadsheet_id)
|
|
251
|
+
else:
|
|
252
|
+
sh = gc.open(spreadsheet_name)
|
|
236
253
|
except gspread.SpreadsheetNotFound:
|
|
254
|
+
if spreadsheet_id:
|
|
255
|
+
raise # Can't create with explicit ID
|
|
237
256
|
sh = gc.create(spreadsheet_name)
|
|
238
|
-
sh.share("", perm_type="anyone", role="reader")
|
|
239
257
|
|
|
240
258
|
worksheet = sh.sheet1
|
|
241
259
|
|
|
@@ -62,6 +62,7 @@ class LocalExporter(BaseExporter):
|
|
|
62
62
|
"""Export job artifacts to local directory."""
|
|
63
63
|
# Merge auto-export + CLI config
|
|
64
64
|
cfg = extract_exporter_config(job_data, "local", self.config)
|
|
65
|
+
skip_validation = bool(cfg.get("skip_validation", False))
|
|
65
66
|
|
|
66
67
|
output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
|
|
67
68
|
job_export_dir = output_dir / job_data.invocation_id / job_data.job_id
|
|
@@ -74,25 +75,34 @@ class LocalExporter(BaseExporter):
|
|
|
74
75
|
# Stage artifacts per storage type
|
|
75
76
|
if paths["storage_type"] == "local_filesystem":
|
|
76
77
|
exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
|
|
77
|
-
elif paths["storage_type"] == "
|
|
78
|
-
|
|
79
|
-
paths, job_export_dir, cfg, None
|
|
80
|
-
)
|
|
81
|
-
elif paths["storage_type"] == "gitlab_ci_local":
|
|
78
|
+
elif paths["storage_type"] == "remote_local":
|
|
79
|
+
# Same as local_filesystem (we're on the remote machine, accessing locally)
|
|
82
80
|
exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
|
|
83
|
-
elif paths["storage_type"] == "
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
81
|
+
elif paths["storage_type"] == "remote_ssh":
|
|
82
|
+
cp = ssh_setup_masters({job_data.job_id: job_data})
|
|
83
|
+
try:
|
|
84
|
+
exported_files = ssh_download_artifacts(
|
|
85
|
+
paths, job_export_dir, cfg, cp
|
|
86
|
+
)
|
|
87
|
+
finally:
|
|
88
|
+
ssh_cleanup_masters(cp)
|
|
88
89
|
else:
|
|
89
|
-
raise
|
|
90
|
-
f"
|
|
90
|
+
raise NotImplementedError(
|
|
91
|
+
f"Export not implemented for storage type: {paths['storage_type']}"
|
|
91
92
|
)
|
|
92
93
|
|
|
93
94
|
# Validate artifacts
|
|
94
95
|
artifacts_dir = job_export_dir / "artifacts"
|
|
95
|
-
validation =
|
|
96
|
+
validation = (
|
|
97
|
+
validate_artifacts(artifacts_dir)
|
|
98
|
+
if not skip_validation
|
|
99
|
+
else {
|
|
100
|
+
"can_export": True,
|
|
101
|
+
"missing_required": [],
|
|
102
|
+
"missing_optional": [],
|
|
103
|
+
"message": "Validation skipped",
|
|
104
|
+
}
|
|
105
|
+
)
|
|
96
106
|
|
|
97
107
|
# Save metadata
|
|
98
108
|
self._save_job_metadata(job_data, job_export_dir)
|
|
@@ -125,6 +135,8 @@ class LocalExporter(BaseExporter):
|
|
|
125
135
|
logger.warning(f"Failed to create {fmt} summary: {e}")
|
|
126
136
|
msg += " (summary failed)"
|
|
127
137
|
|
|
138
|
+
meta["output_dir"] = str(job_export_dir.resolve())
|
|
139
|
+
|
|
128
140
|
return ExportResult(
|
|
129
141
|
success=True, dest=str(job_export_dir), message=msg, metadata=meta
|
|
130
142
|
)
|
|
@@ -266,10 +278,12 @@ class LocalExporter(BaseExporter):
|
|
|
266
278
|
) -> List[str]:
|
|
267
279
|
exported_files: List[str] = []
|
|
268
280
|
copy_logs = bool(cfg.get("copy_logs", False))
|
|
281
|
+
copy_artifacts = bool(cfg.get("copy_artifacts", True))
|
|
269
282
|
only_required = bool(cfg.get("only_required", True))
|
|
270
283
|
|
|
284
|
+
# separate logic for artifacts and logs
|
|
271
285
|
# artifacts/
|
|
272
|
-
if paths["artifacts_dir"].exists():
|
|
286
|
+
if copy_artifacts and paths["artifacts_dir"].exists():
|
|
273
287
|
if only_required:
|
|
274
288
|
names = [
|
|
275
289
|
a
|
|
@@ -283,7 +297,7 @@ class LocalExporter(BaseExporter):
|
|
|
283
297
|
shutil.copy2(src, dst)
|
|
284
298
|
exported_files.append(str(dst))
|
|
285
299
|
else:
|
|
286
|
-
#
|
|
300
|
+
# Restore recursive copy (test_copy_all_tree expects nested files)
|
|
287
301
|
shutil.copytree(
|
|
288
302
|
paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True
|
|
289
303
|
)
|
|
@@ -297,7 +311,7 @@ class LocalExporter(BaseExporter):
|
|
|
297
311
|
|
|
298
312
|
# logs/
|
|
299
313
|
# If only_required is False → always copy logs; otherwise respect copy_logs
|
|
300
|
-
if (not only_required or copy_logs) and paths["logs_dir"].exists():
|
|
314
|
+
if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
|
|
301
315
|
shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
|
|
302
316
|
exported_files.extend(
|
|
303
317
|
[str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]
|