nemo-evaluator-launcher 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- nemo_evaluator_launcher/__init__.py +79 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +698 -0
- nemo_evaluator_launcher/api/types.py +98 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +267 -0
- nemo_evaluator_launcher/cli/info.py +512 -0
- nemo_evaluator_launcher/cli/kill.py +41 -0
- nemo_evaluator_launcher/cli/ls_runs.py +134 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
- nemo_evaluator_launcher/cli/main.py +226 -0
- nemo_evaluator_launcher/cli/run.py +200 -0
- nemo_evaluator_launcher/cli/status.py +164 -0
- nemo_evaluator_launcher/cli/version.py +55 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +283 -0
- nemo_evaluator_launcher/common/helpers.py +366 -0
- nemo_evaluator_launcher/common/logging_utils.py +357 -0
- nemo_evaluator_launcher/common/mapping.py +295 -0
- nemo_evaluator_launcher/common/printing_utils.py +93 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +120 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +605 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +121 -0
- nemo_evaluator_launcher/exporters/gsheets.py +409 -0
- nemo_evaluator_launcher/exporters/local.py +502 -0
- nemo_evaluator_launcher/exporters/mlflow.py +619 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +624 -0
- nemo_evaluator_launcher/exporters/wandb.py +490 -0
- nemo_evaluator_launcher/package_info.py +38 -0
- nemo_evaluator_launcher/resources/mapping.toml +380 -0
- nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
- nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Evaluation results exporter for MLflow tracking."""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import tempfile
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Dict, List
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
import mlflow
|
|
25
|
+
|
|
26
|
+
MLFLOW_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
MLFLOW_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
from nemo_evaluator_launcher.common.execdb import JobData
|
|
31
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
32
|
+
from nemo_evaluator_launcher.exporters.base import BaseExporter, ExportResult
|
|
33
|
+
from nemo_evaluator_launcher.exporters.local import LocalExporter
|
|
34
|
+
from nemo_evaluator_launcher.exporters.registry import register_exporter
|
|
35
|
+
from nemo_evaluator_launcher.exporters.utils import (
|
|
36
|
+
extract_accuracy_metrics,
|
|
37
|
+
extract_exporter_config,
|
|
38
|
+
get_artifact_root,
|
|
39
|
+
get_available_artifacts,
|
|
40
|
+
get_benchmark_info,
|
|
41
|
+
get_task_name,
|
|
42
|
+
mlflow_sanitize,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@register_exporter("mlflow")
class MLflowExporter(BaseExporter):
    """Exporter that pushes evaluation accuracy metrics, params, tags and
    artifacts for a job (or a whole invocation) to an MLflow tracking server.
    """

    def supports_executor(self, executor_type: str) -> bool:
        """Report whether jobs from *executor_type* can be exported.

        Always True: MLflow export only needs access to the job's artifacts,
        which are staged locally when necessary.
        """
        return True

    def is_available(self) -> bool:
        """Return True when the optional ``mlflow`` package was importable."""
        # MLFLOW_AVAILABLE is set by the guarded ``import mlflow`` at the
        # top of this module.
        return MLFLOW_AVAILABLE
|
|
55
|
+
|
|
56
|
+
def _get_existing_run_info(
|
|
57
|
+
self, job_data: JobData, config: Dict[str, Any]
|
|
58
|
+
) -> tuple[bool, str]:
|
|
59
|
+
"""Check if MLflow run exists for this invocation/job."""
|
|
60
|
+
try:
|
|
61
|
+
import mlflow
|
|
62
|
+
|
|
63
|
+
tracking_uri = config.get("tracking_uri")
|
|
64
|
+
if not tracking_uri:
|
|
65
|
+
return False, None
|
|
66
|
+
|
|
67
|
+
mlflow.set_tracking_uri(tracking_uri)
|
|
68
|
+
experiment_name = config.get("experiment_name", "nemo-evaluator-launcher")
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
experiment = mlflow.get_experiment_by_name(experiment_name)
|
|
72
|
+
if not experiment:
|
|
73
|
+
return False, None
|
|
74
|
+
|
|
75
|
+
# Search for runs with matching invocation_id tag
|
|
76
|
+
runs = mlflow.search_runs(
|
|
77
|
+
experiment_ids=[experiment.experiment_id],
|
|
78
|
+
filter_string=f"tags.invocation_id = '{job_data.invocation_id}'",
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if not runs.empty:
|
|
82
|
+
existing_run = runs.iloc[0]
|
|
83
|
+
return True, existing_run.run_id
|
|
84
|
+
|
|
85
|
+
except Exception:
|
|
86
|
+
pass
|
|
87
|
+
|
|
88
|
+
return False, None
|
|
89
|
+
except ImportError:
|
|
90
|
+
return False, None
|
|
91
|
+
|
|
92
|
+
    def export_job(self, job_data: JobData) -> ExportResult:
        """Export a single job's accuracy metrics and artifacts to MLflow.

        Resolves the tracking URI (config value, ``MLFLOW_TRACKING_URI``, or
        an env-var *name* stored in config), stages remote artifacts locally
        when needed, extracts accuracy metrics, and logs metrics, params,
        tags and artifacts into one MLflow run.

        Args:
            job_data: Job record holding invocation/job ids, executor info
                and the stored run config.

        Returns:
            ExportResult describing success/failure; on success ``metadata``
            carries run id, experiment id, run URL and counts.
        """
        if not self.is_available():
            return ExportResult(
                success=False, dest="mlflow", message="mlflow package not installed"
            )

        try:
            # Extract config using common utility
            mlflow_config = extract_exporter_config(job_data, "mlflow", self.config)

            # resolve tracking_uri with fallbacks
            tracking_uri = mlflow_config.get("tracking_uri")
            if not tracking_uri:
                tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
            # allow env var name: a value without "://" is treated as the name
            # of an environment variable holding the real URI.
            if tracking_uri and "://" not in tracking_uri:
                tracking_uri = os.getenv(tracking_uri, tracking_uri)

            if not tracking_uri:
                return ExportResult(
                    success=False,
                    dest="mlflow",
                    message="tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
                )

            # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
            # NOTE(review): tmp_stage below is created with mkdtemp but never
            # removed in this method; _log_artifacts only cleans dirs it
            # creates itself, so this staging dir leaks — confirm/clean up.
            staged_base_dir = None
            try:
                paths = self.get_job_paths(job_data)
                if paths.get("storage_type") == "remote_ssh":
                    tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_stage_"))
                    LocalExporter(
                        {
                            "output_dir": str(tmp_stage),
                            "copy_logs": mlflow_config.get(
                                "log_logs", False
                            ),  # log_logs -> copy_logs
                            "only_required": mlflow_config.get("only_required", True),
                        }
                    ).export_job(job_data)
                    staged_base_dir = (
                        tmp_stage / job_data.invocation_id / job_data.job_id
                    )
            except Exception as e:
                logger.warning(f"Failed staging artifacts for {job_data.job_id}: {e}")

            # Extract metrics (prefer staged if available)
            log_metrics = mlflow_config.get("log_metrics", [])
            if staged_base_dir and (staged_base_dir / "artifacts").exists():
                accuracy_metrics = extract_accuracy_metrics(
                    job_data,
                    # Fake path-resolver pointing at the staged local copy.
                    lambda _: {
                        "artifacts_dir": staged_base_dir / "artifacts",
                        "storage_type": "local_filesystem",
                    },
                    log_metrics,
                )
            else:
                accuracy_metrics = extract_accuracy_metrics(
                    job_data, self.get_job_paths, log_metrics
                )

            if not accuracy_metrics:
                return ExportResult(
                    success=False, dest="mlflow", message="No accuracy metrics found"
                )

            # Set up MLflow
            tracking_uri = tracking_uri.rstrip("/")
            mlflow.set_tracking_uri(tracking_uri)

            # Set experiment
            experiment_name = mlflow_config.get(
                "experiment_name", "nemo-evaluator-launcher"
            )
            mlflow.set_experiment(experiment_name)

            # Prepare parameters
            all_params = {
                "invocation_id": job_data.invocation_id,
                "executor": job_data.executor,
                "timestamp": str(job_data.timestamp),
            }

            # Add extra metadata if provided
            if mlflow_config.get("extra_metadata"):
                all_params.update(mlflow_config["extra_metadata"])

            # Add webhook info if available
            if mlflow_config.get("triggered_by_webhook"):
                all_params.update(
                    {
                        "webhook_triggered": "true",
                        "webhook_source": mlflow_config.get("webhook_source"),
                        "source_artifact": mlflow_config.get("source_artifact"),
                        "config_source": mlflow_config.get("config_source"),
                    }
                )

            # Sanitize params (MLflow restricts key/value characters/length)
            safe_params = {
                mlflow_sanitize(k, "param_key"): mlflow_sanitize(v, "param_value")
                for k, v in (all_params or {}).items()
                if v is not None
            }

            # Prepare tags
            tags = {}
            if mlflow_config.get("tags"):
                tags.update({k: v for k, v in mlflow_config["tags"].items() if v})

            bench_info = get_benchmark_info(job_data)
            benchmark = bench_info.get("benchmark", get_task_name(job_data))
            harness = bench_info.get("harness", "unknown")

            # Tag the run with invocation_id and task metadata
            exec_type = (job_data.config or {}).get("execution", {}).get(
                "type"
            ) or job_data.executor
            tags.update(
                {
                    "invocation_id": job_data.invocation_id,
                    "job_id": job_data.job_id,
                    "task_name": benchmark,
                    "benchmark": benchmark,
                    "harness": harness,
                    "executor": exec_type,
                }
            )

            # Sanitize tags
            safe_tags = {
                mlflow_sanitize(k, "tag_key"): mlflow_sanitize(v, "tag_value")
                for k, v in (tags or {}).items()
                if v is not None
            }

            # skip run if it already exists
            exists, existing_run_id = self._get_existing_run_info(
                job_data, mlflow_config
            )
            if exists and mlflow_config.get("skip_existing"):
                return ExportResult(
                    success=True,
                    dest="mlflow",
                    message=f"Run already exists: {existing_run_id}, skipped",
                )

            # run
            with mlflow.start_run() as run:
                # Set tags
                if safe_tags:
                    mlflow.set_tags(safe_tags)

                # Set run name (via the reserved mlflow.runName tag)
                run_name = (
                    mlflow_config.get("run_name")
                    or f"eval-{job_data.invocation_id}-{benchmark}"
                )
                mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))

                # Set description only if provided
                description = mlflow_config.get("description")
                if description:
                    mlflow.set_tag(
                        "mlflow.note.content", mlflow_sanitize(description, "tag_value")
                    )

                # Log parameters
                mlflow.log_params(safe_params)

                # Sanitize metric keys before logging
                safe_metrics = {
                    mlflow_sanitize(k, "metric"): v
                    for k, v in (accuracy_metrics or {}).items()
                }
                mlflow.log_metrics(safe_metrics)

                # Log artifacts (reuses the staged copy when available)
                artifacts_logged = self._log_artifacts(
                    job_data, mlflow_config, staged_base_dir
                )

                # Build run URL (only meaningful for an HTTP tracking server)
                run_url = None
                if tracking_uri.startswith(("http://", "https://")):
                    run_url = f"{tracking_uri}/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}"

                return ExportResult(
                    success=True,
                    dest="mlflow",
                    message=f"Logged {len(accuracy_metrics)} metrics to MLflow",
                    metadata={
                        "run_id": run.info.run_id,
                        "experiment_id": run.info.experiment_id,
                        "tracking_uri": tracking_uri,
                        "run_url": run_url,
                        "invocation_id": job_data.invocation_id,
                        "metrics_logged": len(accuracy_metrics),
                        "params_logged": len(safe_params),
                        "artifacts_logged": len(artifacts_logged),
                    },
                )

        except Exception as e:
            logger.error(f"MLflow export failed: {e}")
            return ExportResult(
                success=False, dest="mlflow", message=f"Failed: {str(e)}"
            )
|
|
302
|
+
|
|
303
|
+
def _log_artifacts(
|
|
304
|
+
self,
|
|
305
|
+
job_data: JobData,
|
|
306
|
+
mlflow_config: Dict[str, Any],
|
|
307
|
+
pre_staged_dir: Path = None,
|
|
308
|
+
) -> List[str]:
|
|
309
|
+
"""Log evaluation artifacts to MLflow using LocalExporter for transfer."""
|
|
310
|
+
|
|
311
|
+
# Check if artifacts should be logged (default: True)
|
|
312
|
+
if not mlflow_config.get("log_artifacts", True):
|
|
313
|
+
return []
|
|
314
|
+
|
|
315
|
+
try:
|
|
316
|
+
should_cleanup = False
|
|
317
|
+
# Use pre-staged dir if available; otherwise stage now
|
|
318
|
+
if pre_staged_dir and pre_staged_dir.exists():
|
|
319
|
+
base_dir = pre_staged_dir
|
|
320
|
+
else:
|
|
321
|
+
temp_dir = tempfile.mkdtemp(prefix="mlflow_artifacts_")
|
|
322
|
+
local_exporter = LocalExporter(
|
|
323
|
+
{
|
|
324
|
+
"output_dir": str(temp_dir),
|
|
325
|
+
"copy_logs": mlflow_config.get(
|
|
326
|
+
"log_logs", mlflow_config.get("copy_logs", False)
|
|
327
|
+
),
|
|
328
|
+
"only_required": mlflow_config.get("only_required", True),
|
|
329
|
+
"format": mlflow_config.get("format", None),
|
|
330
|
+
"log_metrics": mlflow_config.get("log_metrics", []),
|
|
331
|
+
"output_filename": mlflow_config.get("output_filename", None),
|
|
332
|
+
}
|
|
333
|
+
)
|
|
334
|
+
local_result = local_exporter.export_job(job_data)
|
|
335
|
+
if not local_result.success:
|
|
336
|
+
logger.error(
|
|
337
|
+
f"Failed to download artifacts: {local_result.message}"
|
|
338
|
+
)
|
|
339
|
+
return []
|
|
340
|
+
base_dir = Path(local_result.dest)
|
|
341
|
+
should_cleanup = True
|
|
342
|
+
|
|
343
|
+
artifacts_dir = base_dir / "artifacts"
|
|
344
|
+
logs_dir = base_dir / "logs"
|
|
345
|
+
logged_names: list[str] = []
|
|
346
|
+
artifact_path = get_artifact_root(job_data) # "<harness>.<benchmark>"
|
|
347
|
+
|
|
348
|
+
# Log config at root level (or synthesize)
|
|
349
|
+
cfg_logged = False
|
|
350
|
+
for fname in ("config.yml", "run_config.yml"):
|
|
351
|
+
p = artifacts_dir / fname
|
|
352
|
+
if p.exists():
|
|
353
|
+
mlflow.log_artifact(str(p))
|
|
354
|
+
cfg_logged = True
|
|
355
|
+
break
|
|
356
|
+
if not cfg_logged:
|
|
357
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
358
|
+
from yaml import dump as ydump
|
|
359
|
+
|
|
360
|
+
cfg_file = Path(tmpdir) / "config.yaml"
|
|
361
|
+
cfg_file.write_text(
|
|
362
|
+
ydump(
|
|
363
|
+
job_data.config or {},
|
|
364
|
+
default_flow_style=False,
|
|
365
|
+
sort_keys=False,
|
|
366
|
+
)
|
|
367
|
+
)
|
|
368
|
+
mlflow.log_artifact(str(cfg_file))
|
|
369
|
+
|
|
370
|
+
# Choose files to upload
|
|
371
|
+
files_to_upload: list[Path] = []
|
|
372
|
+
if mlflow_config.get("only_required", True):
|
|
373
|
+
for fname in get_available_artifacts(artifacts_dir):
|
|
374
|
+
p = artifacts_dir / fname
|
|
375
|
+
if p.exists():
|
|
376
|
+
files_to_upload.append(p)
|
|
377
|
+
else:
|
|
378
|
+
for p in artifacts_dir.iterdir(): # top-level files only
|
|
379
|
+
if p.is_file():
|
|
380
|
+
files_to_upload.append(p)
|
|
381
|
+
|
|
382
|
+
# Upload artifacts (with DEBUG per-file)
|
|
383
|
+
for fpath in files_to_upload:
|
|
384
|
+
rel = fpath.relative_to(artifacts_dir).as_posix()
|
|
385
|
+
parent = os.path.dirname(rel)
|
|
386
|
+
mlflow.log_artifact(
|
|
387
|
+
str(fpath),
|
|
388
|
+
artifact_path=f"{artifact_path}/artifacts/{parent}".rstrip("/"),
|
|
389
|
+
)
|
|
390
|
+
logged_names.append(rel)
|
|
391
|
+
logger.debug(f"mlflow upload artifact: {rel}")
|
|
392
|
+
|
|
393
|
+
# Optionally upload logs under "<harness.task>/logs"
|
|
394
|
+
if mlflow_config.get("log_logs", False) and logs_dir.exists():
|
|
395
|
+
for p in logs_dir.iterdir():
|
|
396
|
+
if p.is_file():
|
|
397
|
+
rel = p.name
|
|
398
|
+
mlflow.log_artifact(
|
|
399
|
+
str(p), artifact_path=f"{artifact_path}/logs"
|
|
400
|
+
)
|
|
401
|
+
logged_names.append(f"logs/{rel}")
|
|
402
|
+
logger.debug(f"mlflow upload log: {rel}")
|
|
403
|
+
|
|
404
|
+
logger.info(
|
|
405
|
+
f"MLflow upload summary: files={len(logged_names)}, only_required={mlflow_config.get('only_required', True)}, log_logs={mlflow_config.get('log_logs', False)}"
|
|
406
|
+
)
|
|
407
|
+
if should_cleanup:
|
|
408
|
+
import shutil
|
|
409
|
+
|
|
410
|
+
shutil.rmtree(base_dir, ignore_errors=True)
|
|
411
|
+
|
|
412
|
+
return logged_names
|
|
413
|
+
except Exception as e:
|
|
414
|
+
logger.error(f"Error logging artifacts: {e}")
|
|
415
|
+
return []
|
|
416
|
+
|
|
417
|
+
    def export_invocation(self, invocation_id: str) -> Dict[str, Any]:
        """Export all jobs in invocation as one MLflow run.

        Aggregates accuracy metrics from every job of the invocation into a
        single MLflow run (per-job artifacts are uploaded under the same
        run). Tracking URI resolution mirrors export_job.

        Args:
            invocation_id: Id used to look up the jobs in the exec DB.

        Returns:
            Dict with ``success``, per-job ``jobs`` statuses, and run
            ``metadata`` (or ``error`` on failure).
        """
        if not self.is_available():
            return {"success": False, "error": "mlflow package not installed"}

        jobs = self.db.get_jobs(invocation_id)
        if not jobs:
            return {
                "success": False,
                "error": f"No jobs found for invocation {invocation_id}",
            }

        try:
            # Get first job for config access
            first_job = list(jobs.values())[0]

            # Extract config using common utility
            mlflow_config = extract_exporter_config(first_job, "mlflow", self.config)

            # resolve tracking_uri with fallbacks; a value without "://" is
            # treated as the name of an env var holding the real URI.
            tracking_uri = mlflow_config.get("tracking_uri") or os.getenv(
                "MLFLOW_TRACKING_URI"
            )
            if tracking_uri and "://" not in tracking_uri:
                tracking_uri = os.getenv(tracking_uri, tracking_uri)
            if not tracking_uri:
                return {
                    "success": False,
                    "error": "tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
                }

            # Collect metrics from ALL jobs
            # NOTE(review): each tmp_stage below is created with mkdtemp and
            # never removed in this method (staged_map entries are only
            # reused by _log_artifacts) — these staging dirs leak; confirm.
            all_metrics = {}
            staged_map: dict[str, Path] = {}
            for job_id, job_data in jobs.items():
                try:
                    paths = self.get_job_paths(job_data)
                    if paths.get("storage_type") == "remote_ssh":
                        tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_inv_stage_"))
                        LocalExporter(
                            {
                                "output_dir": str(tmp_stage),
                                "copy_logs": mlflow_config.get("log_logs", False),
                                "only_required": mlflow_config.get(
                                    "only_required", True
                                ),
                            }
                        ).export_job(job_data)
                        staged_map[job_id] = (
                            tmp_stage / job_data.invocation_id / job_data.job_id
                        )
                except Exception as e:
                    logger.warning(f"Staging failed for {job_id}: {e}")

            for job_id, job_data in jobs.items():
                log_metrics = mlflow_config.get("log_metrics", [])
                if job_id in staged_map and (staged_map[job_id] / "artifacts").exists():
                    # Path-resolver pointing at the staged local copy; the
                    # lambda is called immediately inside
                    # extract_accuracy_metrics, so binding job_id from the
                    # loop is safe here.
                    job_metrics = extract_accuracy_metrics(
                        job_data,
                        lambda _: {
                            "artifacts_dir": staged_map[job_id] / "artifacts",
                            "storage_type": "local_filesystem",
                        },
                        log_metrics,
                    )
                else:
                    job_metrics = extract_accuracy_metrics(
                        job_data, self.get_job_paths, log_metrics
                    )
                # Later jobs overwrite identical metric keys from earlier ones.
                all_metrics.update(job_metrics)

            if not all_metrics:
                return {
                    "success": False,
                    "error": "No accuracy metrics found in any job",
                }

            # Set up MLflow
            tracking_uri = tracking_uri.rstrip("/")
            mlflow.set_tracking_uri(tracking_uri)

            experiment_name = mlflow_config.get(
                "experiment_name", "nemo-evaluator-launcher"
            )
            mlflow.set_experiment(experiment_name)

            # Prepare parameters for invocation
            inv_exec_type = (first_job.config or {}).get("execution", {}).get(
                "type"
            ) or first_job.executor
            all_params = {
                "invocation_id": invocation_id,
                "executor": inv_exec_type,
                "timestamp": str(first_job.timestamp),
                "jobs_count": str(len(jobs)),
            }

            # Add webhook info if available
            if mlflow_config.get("triggered_by_webhook"):
                all_params.update(
                    {
                        "webhook_triggered": "true",
                        "webhook_source": mlflow_config.get("webhook_source"),
                        "source_artifact": mlflow_config.get("source_artifact"),
                        "config_source": mlflow_config.get("config_source"),
                    }
                )

            if mlflow_config.get("extra_metadata"):
                all_params.update(mlflow_config["extra_metadata"])

            # Prepare tags
            tags = {"invocation_id": invocation_id}
            if mlflow_config.get("tags"):
                tags.update({k: v for k, v in mlflow_config["tags"].items() if v})

            # Truncate
            # NOTE(review): unlike export_job this uses plain truncation
            # instead of mlflow_sanitize for params/tags — inconsistent;
            # confirm whether sanitization is needed here too.
            safe_params = {
                str(k)[:250]: str(v)[:250] for k, v in all_params.items() if v
            }
            safe_tags = {str(k)[:250]: str(v)[:5000] for k, v in tags.items() if v}

            # Check for existing run
            exists, existing_run_id = self._get_existing_run_info(
                first_job, mlflow_config
            )
            if exists and mlflow_config.get("skip_existing"):
                return {
                    "success": True,
                    "invocation_id": invocation_id,
                    "jobs": {
                        job_id: {
                            "success": True,
                            "message": f"Run already exists: {existing_run_id}, skipped",
                        }
                        for job_id in jobs.keys()
                    },
                    "metadata": {"run_id": existing_run_id, "skipped": True},
                }

            # Create MLflow run with ALL metrics
            with mlflow.start_run() as run:
                # Set tags
                if safe_tags:
                    mlflow.set_tags(safe_tags)

                # Set run name (via the reserved mlflow.runName tag)
                run_name = mlflow_config.get("run_name") or f"eval-{invocation_id}"
                mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))

                # Set description
                description = mlflow_config.get("description")
                if description:
                    mlflow.set_tag(
                        "mlflow.note.content", mlflow_sanitize(description, "tag_value")
                    )

                # Log parameters
                mlflow.log_params(safe_params)

                # Sanitize metric keys
                safe_all_metrics = {
                    mlflow_sanitize(k, "metric"): v
                    for k, v in (all_metrics or {}).items()
                }
                mlflow.log_metrics(safe_all_metrics)

                # Log artifacts from all jobs
                total_artifacts = 0
                for job_id, job_data in jobs.items():
                    artifacts_logged = self._log_artifacts(
                        job_data, mlflow_config, staged_map.get(job_id)
                    )
                    total_artifacts += len(artifacts_logged)

                # Build run URL (only meaningful for an HTTP tracking server)
                run_url = None
                if tracking_uri.startswith(("http://", "https://")):
                    run_url = f"{tracking_uri}/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}"

                return {
                    "success": True,
                    "invocation_id": invocation_id,
                    "jobs": {
                        job_id: {
                            "success": True,
                            "message": "Contributed to invocation run",
                        }
                        for job_id in jobs.keys()
                    },
                    "metadata": {
                        "run_id": run.info.run_id,
                        "experiment_id": run.info.experiment_id,
                        "tracking_uri": tracking_uri,
                        "run_url": run_url,
                        "metrics_logged": len(all_metrics),
                        "params_logged": len(safe_params),
                        "artifacts_logged": total_artifacts,
                    },
                }
        except Exception as e:
            logger.error(f"MLflow export failed for invocation {invocation_id}: {e}")
            return {"success": False, "error": f"MLflow export failed: {str(e)}"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
from typing import Callable, Dict
|
|
17
|
+
|
|
18
|
+
from nemo_evaluator_launcher.exporters.base import BaseExporter
|
|
19
|
+
|
|
20
|
+
# Maps exporter name -> exporter *class* (not instance); populated by the
# @register_exporter decorator at import time.
_EXPORTER_REGISTRY: Dict[str, "type[BaseExporter]"] = {}


def register_exporter(name: str) -> Callable:
    """Return a class decorator that registers an exporter class under *name*.

    The decorated class is stored in the module-level registry and returned
    unchanged, so the decorator is transparent to the class definition.
    """

    def wrapper(cls: "type[BaseExporter]") -> "type[BaseExporter]":
        _EXPORTER_REGISTRY[name] = cls
        return cls

    return wrapper


def get_exporter(name: str) -> "type[BaseExporter]":
    """Return the exporter class registered under *name*.

    Raises:
        ValueError: If *name* was never registered; the message lists the
            available exporter names.
    """
    if name not in _EXPORTER_REGISTRY:
        raise ValueError(
            f"Unknown exporter: {name}. Available: {list(_EXPORTER_REGISTRY.keys())}"
        )
    return _EXPORTER_REGISTRY[name]


def available_exporters() -> list[str]:
    """Return the names of all registered exporters."""
    return list(_EXPORTER_REGISTRY.keys())
|