nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
```diff
--- a/nemo_evaluator_launcher/exporters/mlflow.py
+++ b/nemo_evaluator_launcher/exporters/mlflow.py
@@ -15,13 +15,11 @@
 #
 """Evaluation results exporter for MLflow tracking."""
 
-import
+import os
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, List
 
-import yaml
-
 try:
     import mlflow
 
@@ -37,9 +35,11 @@ from nemo_evaluator_launcher.exporters.registry import register_exporter
 from nemo_evaluator_launcher.exporters.utils import (
     extract_accuracy_metrics,
     extract_exporter_config,
+    get_artifact_root,
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
+    mlflow_sanitize,
 )
 
 
@@ -100,11 +100,57 @@ class MLflowExporter(BaseExporter):
         # Extract config using common utility
         mlflow_config = extract_exporter_config(job_data, "mlflow", self.config)
 
-        #
+        # resolve tracking_uri with fallbacks
+        tracking_uri = mlflow_config.get("tracking_uri")
+        if not tracking_uri:
+            tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
+        # allow env var name
+        if tracking_uri and "://" not in tracking_uri:
+            tracking_uri = os.getenv(tracking_uri, tracking_uri)
+
+        if not tracking_uri:
+            return ExportResult(
+                success=False,
+                dest="mlflow",
+                message="tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
+            )
+
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", False
+                        ),  # log_logs -> copy_logs
+                        "only_required": mlflow_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"Failed staging artifacts for {job_data.job_id}: {e}")
+
+        # Extract metrics (prefer staged if available)
         log_metrics = mlflow_config.get("log_metrics", [])
-
-
-
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
 
         if not accuracy_metrics:
             return ExportResult(
@@ -112,12 +158,6 @@ class MLflowExporter(BaseExporter):
             )
 
         # Set up MLflow
-        tracking_uri = mlflow_config.get("tracking_uri")
-        if not tracking_uri:
-            return ExportResult(
-                success=False, dest="mlflow", message="tracking_uri is required"
-            )
-
         tracking_uri = tracking_uri.rstrip("/")
         mlflow.set_tracking_uri(tracking_uri)
 
@@ -149,10 +189,13 @@ class MLflowExporter(BaseExporter):
             }
         )
 
-        #
+        # Sanitize params
         safe_params = {
-
+            mlflow_sanitize(k, "param_key"): mlflow_sanitize(v, "param_value")
+            for k, v in (all_params or {}).items()
+            if v is not None
         }
+
         # Prepare tags
         tags = {}
         if mlflow_config.get("tags"):
@@ -162,7 +205,10 @@ class MLflowExporter(BaseExporter):
         benchmark = bench_info.get("benchmark", get_task_name(job_data))
         harness = bench_info.get("harness", "unknown")
 
-        # Tag the run with invocation_id and task metadata
+        # Tag the run with invocation_id and task metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
         tags.update(
             {
                 "invocation_id": job_data.invocation_id,
@@ -170,11 +216,16 @@ class MLflowExporter(BaseExporter):
                 "task_name": benchmark,
                 "benchmark": benchmark,
                 "harness": harness,
-                "executor":
+                "executor": exec_type,
             }
         )
-
-
+
+        # Sanitize tags
+        safe_tags = {
+            mlflow_sanitize(k, "tag_key"): mlflow_sanitize(v, "tag_value")
+            for k, v in (tags or {}).items()
+            if v is not None
+        }
 
         # skip run if it already exists
         exists, existing_run_id = self._get_existing_run_info(
@@ -193,26 +244,34 @@ class MLflowExporter(BaseExporter):
             if safe_tags:
                 mlflow.set_tags(safe_tags)
 
-            # Set run name
+            # Set run name
             run_name = (
                 mlflow_config.get("run_name")
                 or f"eval-{job_data.invocation_id}-{benchmark}"
             )
-            mlflow.set_tag("mlflow.runName", run_name)
+            mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
             # Set description only if provided
             description = mlflow_config.get("description")
             if description:
-                mlflow.set_tag(
+                mlflow.set_tag(
+                    "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+                )
 
             # Log parameters
             mlflow.log_params(safe_params)
 
-            #
-
+            # Sanitize metric keys before logging
+            safe_metrics = {
+                mlflow_sanitize(k, "metric"): v
+                for k, v in (accuracy_metrics or {}).items()
+            }
+            mlflow.log_metrics(safe_metrics)
 
             # Log artifacts
-            artifacts_logged = self._log_artifacts(
+            artifacts_logged = self._log_artifacts(
+                job_data, mlflow_config, staged_base_dir
+            )
 
             # Build run URL
             run_url = None
@@ -242,7 +301,10 @@ class MLflowExporter(BaseExporter):
         )
 
     def _log_artifacts(
-        self,
+        self,
+        job_data: JobData,
+        mlflow_config: Dict[str, Any],
+        pre_staged_dir: Path = None,
     ) -> List[str]:
         """Log evaluation artifacts to MLflow using LocalExporter for transfer."""
 
@@ -251,44 +313,103 @@ class MLflowExporter(BaseExporter):
             return []
 
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            f
-
-
+            should_cleanup = False
+            # Use pre-staged dir if available; otherwise stage now
+            if pre_staged_dir and pre_staged_dir.exists():
+                base_dir = pre_staged_dir
+            else:
+                temp_dir = tempfile.mkdtemp(prefix="mlflow_artifacts_")
+                local_exporter = LocalExporter(
+                    {
+                        "output_dir": str(temp_dir),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", mlflow_config.get("copy_logs", False)
+                        ),
+                        "only_required": mlflow_config.get("only_required", True),
+                        "format": mlflow_config.get("format", None),
+                        "log_metrics": mlflow_config.get("log_metrics", []),
+                        "output_filename": mlflow_config.get("output_filename", None),
+                    }
+                )
+                local_result = local_exporter.export_job(job_data)
+                if not local_result.success:
+                    logger.error(
+                        f"Failed to download artifacts: {local_result.message}"
+                    )
+                    return []
+                base_dir = Path(local_result.dest)
+                should_cleanup = True
+
+            artifacts_dir = base_dir / "artifacts"
+            logs_dir = base_dir / "logs"
+            logged_names: list[str] = []
+            artifact_path = get_artifact_root(job_data)  # "<harness>.<benchmark>"
+
+            # Log config at root level (or synthesize)
+            cfg_logged = False
+            for fname in ("config.yml", "run_config.yml"):
+                p = artifacts_dir / fname
+                if p.exists():
+                    mlflow.log_artifact(str(p))
+                    cfg_logged = True
+                    break
+            if not cfg_logged:
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    from yaml import dump as ydump
+
+                    cfg_file = Path(tmpdir) / "config.yaml"
+                    cfg_file.write_text(
+                        ydump(
+                            job_data.config or {},
+                            default_flow_style=False,
+                            sort_keys=False,
+                        )
                     )
-
+                    mlflow.log_artifact(str(cfg_file))
+
+            # Choose files to upload
+            files_to_upload: list[Path] = []
+            if mlflow_config.get("only_required", True):
+                for fname in get_available_artifacts(artifacts_dir):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        files_to_upload.append(p)
+            else:
+                for p in artifacts_dir.iterdir():  # top-level files only
+                    if p.is_file():
+                        files_to_upload.append(p)
+
+            # Upload artifacts (with DEBUG per-file)
+            for fpath in files_to_upload:
+                rel = fpath.relative_to(artifacts_dir).as_posix()
+                parent = os.path.dirname(rel)
+                mlflow.log_artifact(
+                    str(fpath),
+                    artifact_path=f"{artifact_path}/artifacts/{parent}".rstrip("/"),
+                )
+                logged_names.append(rel)
+                logger.debug(f"mlflow upload artifact: {rel}")
+
+            # Optionally upload logs under "<harness.task>/logs"
+            if mlflow_config.get("log_logs", False) and logs_dir.exists():
+                for p in logs_dir.iterdir():
+                    if p.is_file():
+                        rel = p.name
+                        mlflow.log_artifact(
+                            str(p), artifact_path=f"{artifact_path}/logs"
+                        )
+                        logged_names.append(f"logs/{rel}")
+                        logger.debug(f"mlflow upload log: {rel}")
+
+            logger.info(
+                f"MLflow upload summary: files={len(logged_names)}, only_required={mlflow_config.get('only_required', True)}, log_logs={mlflow_config.get('log_logs', False)}"
+            )
+            if should_cleanup:
+                import shutil
 
-
-            for fname in get_available_artifacts(artifacts_dir):
-                file_path = artifacts_dir / fname
-                if file_path.exists():
-                    mlflow.log_artifact(str(file_path), artifact_path=artifact_path)
-                    logged_names.append(fname)
+                shutil.rmtree(base_dir, ignore_errors=True)
 
-            # cleanup temp
-            shutil.rmtree(temp_dir)
             return logged_names
-
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
             return []
@@ -312,13 +433,56 @@ class MLflowExporter(BaseExporter):
         # Extract config using common utility
         mlflow_config = extract_exporter_config(first_job, "mlflow", self.config)
 
+        # resolve tracking_uri with fallbacks
+        tracking_uri = mlflow_config.get("tracking_uri") or os.getenv(
+            "MLFLOW_TRACKING_URI"
+        )
+        if tracking_uri and "://" not in tracking_uri:
+            tracking_uri = os.getenv(tracking_uri, tracking_uri)
+        if not tracking_uri:
+            return {
+                "success": False,
+                "error": "tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
+            }
+
         # Collect metrics from ALL jobs
         all_metrics = {}
+        staged_map: dict[str, Path] = {}
+        for job_id, job_data in jobs.items():
+            try:
+                paths = self.get_job_paths(job_data)
+                if paths.get("storage_type") == "remote_ssh":
+                    tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_inv_stage_"))
+                    LocalExporter(
+                        {
+                            "output_dir": str(tmp_stage),
+                            "copy_logs": mlflow_config.get("log_logs", False),
+                            "only_required": mlflow_config.get(
+                                "only_required", True
+                            ),
+                        }
+                    ).export_job(job_data)
+                    staged_map[job_id] = (
+                        tmp_stage / job_data.invocation_id / job_data.job_id
+                    )
+            except Exception as e:
+                logger.warning(f"Staging failed for {job_id}: {e}")
+
         for job_id, job_data in jobs.items():
             log_metrics = mlflow_config.get("log_metrics", [])
-
-
-
+            if job_id in staged_map and (staged_map[job_id] / "artifacts").exists():
+                job_metrics = extract_accuracy_metrics(
+                    job_data,
+                    lambda _: {
+                        "artifacts_dir": staged_map[job_id] / "artifacts",
+                        "storage_type": "local_filesystem",
+                    },
+                    log_metrics,
+                )
+            else:
+                job_metrics = extract_accuracy_metrics(
+                    job_data, self.get_job_paths, log_metrics
+                )
             all_metrics.update(job_metrics)
 
         if not all_metrics:
@@ -328,10 +492,6 @@ class MLflowExporter(BaseExporter):
             }
 
         # Set up MLflow
-        tracking_uri = mlflow_config.get("tracking_uri")
-        if not tracking_uri:
-            return {"success": False, "error": "tracking_uri is required"}
-
         tracking_uri = tracking_uri.rstrip("/")
         mlflow.set_tracking_uri(tracking_uri)
 
@@ -341,9 +501,12 @@ class MLflowExporter(BaseExporter):
         mlflow.set_experiment(experiment_name)
 
         # Prepare parameters for invocation
+        inv_exec_type = (first_job.config or {}).get("execution", {}).get(
+            "type"
+        ) or first_job.executor
         all_params = {
             "invocation_id": invocation_id,
-            "executor":
+            "executor": inv_exec_type,
             "timestamp": str(first_job.timestamp),
             "jobs_count": str(len(jobs)),
         }
@@ -399,23 +562,31 @@ class MLflowExporter(BaseExporter):
 
         # Set run name
         run_name = mlflow_config.get("run_name") or f"eval-{invocation_id}"
-        mlflow.set_tag("mlflow.runName", run_name)
+        mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
         # Set description
         description = mlflow_config.get("description")
        if description:
-            mlflow.set_tag(
+            mlflow.set_tag(
+                "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+            )
 
         # Log parameters
         mlflow.log_params(safe_params)
 
-        #
-
+        # Sanitize metric keys
+        safe_all_metrics = {
+            mlflow_sanitize(k, "metric"): v
+            for k, v in (all_metrics or {}).items()
+        }
+        mlflow.log_metrics(safe_all_metrics)
 
         # Log artifacts from all jobs
         total_artifacts = 0
-        for job_data in jobs.
-            artifacts_logged = self._log_artifacts(
+        for job_id, job_data in jobs.items():
+            artifacts_logged = self._log_artifacts(
+                job_data, mlflow_config, staged_map.get(job_id)
+            )
             total_artifacts += len(artifacts_logged)
 
         # Build run URL
```
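
The new `tracking_uri` resolution (added in both the per-job and the invocation-level paths above) falls back from the exporter config to the `MLFLOW_TRACKING_URI` environment variable, and additionally treats a scheme-less value as the *name* of an environment variable to dereference. A minimal standalone sketch of that fallback chain; the function name and env-var names are chosen here for illustration:

```python
import os

def resolve_tracking_uri(configured: str | None) -> str | None:
    # Config value wins; otherwise fall back to the standard MLflow env var.
    uri = configured or os.getenv("MLFLOW_TRACKING_URI")
    # A value with no "://" is treated as the name of another env var that
    # holds the real URI; if that variable is unset, the value is kept as-is.
    if uri and "://" not in uri:
        uri = os.getenv(uri, uri)
    return uri

os.environ["MY_MLFLOW_URI"] = "https://mlflow.example.com"  # illustrative
assert resolve_tracking_uri("MY_MLFLOW_URI") == "https://mlflow.example.com"
assert resolve_tracking_uri("http://localhost:5000") == "http://localhost:5000"
```

One consequence of the dereferencing rule: a scheme-less value in `export.mlflow.tracking_uri` passes through unchanged only if no environment variable of that name exists.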
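
Run names, descriptions, parameter keys/values, tag keys/values, and metric keys now all pass through `mlflow_sanitize` (imported from `exporters/utils.py`; its implementation is not part of this diff). MLflow only accepts alphanumerics, underscores, dashes, periods, spaces, and slashes in these keys, so a plausible stand-in, shown purely to make the sanitization pattern concrete, is:

```python
import re

def sanitize(value: object) -> str:
    """Hypothetical stand-in for mlflow_sanitize(); the real helper lives in
    exporters/utils.py and is not shown in this diff."""
    # MLflow rejects keys outside [A-Za-z0-9_-. /], so map anything else to "_".
    return re.sub(r"[^A-Za-z0-9_\-. /]", "_", str(value))

accuracy_metrics = {"mmlu:acc": 0.71}  # illustrative; ":" is not a legal key char
safe_metrics = {sanitize(k): v for k, v in accuracy_metrics.items()}
print(safe_metrics)  # {'mmlu_acc': 0.71}
```

The diff applies this same comprehension shape before `log_params`, `set_tags`, and `log_metrics`, so a single malformed name no longer aborts the whole export on MLflow's key validation.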
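
The rewritten `_log_artifacts` logs the run config at the artifact root and nests everything else under `get_artifact_root(job_data)`, which the inline comment describes as `"<harness>.<benchmark>"`: result files go to `<root>/artifacts/<subdir>` and, when `log_logs` is enabled, log files go to `<root>/logs`. A sketch of the per-file destination computation, using illustrative paths and an illustrative harness/benchmark name:

```python
import os
from pathlib import Path

def dest_path(artifacts_dir: Path, fpath: Path, artifact_root: str) -> str:
    # Mirrors the upload loop: keep the file's sub-directory (if any) under
    # "<harness>.<benchmark>/artifacts/"; mlflow.log_artifact supplies the
    # file name itself.
    rel = fpath.relative_to(artifacts_dir).as_posix()
    parent = os.path.dirname(rel)
    return f"{artifact_root}/artifacts/{parent}".rstrip("/")

stage = Path("/tmp/stage/artifacts")  # illustrative staging dir
print(dest_path(stage, stage / "results.yml", "lm-eval.mmlu"))
# -> lm-eval.mmlu/artifacts
print(dest_path(stage, stage / "subdir" / "preds.jsonl", "lm-eval.mmlu"))
# -> lm-eval.mmlu/artifacts/subdir
```

This is only the path computation; the actual upload in the diff is `mlflow.log_artifact(str(fpath), artifact_path=...)`.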