nemo-evaluator-launcher 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of nemo-evaluator-launcher might be problematic.
- nemo_evaluator_launcher/api/functional.py +19 -29
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -1
- nemo_evaluator_launcher/configs/execution/local.yaml +1 -0
- nemo_evaluator_launcher/executors/base.py +23 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +17 -71
- nemo_evaluator_launcher/executors/local/executor.py +48 -7
- nemo_evaluator_launcher/executors/local/run.template.sh +18 -6
- nemo_evaluator_launcher/executors/slurm/executor.py +40 -22
- nemo_evaluator_launcher/exporters/local.py +25 -16
- nemo_evaluator_launcher/exporters/mlflow.py +168 -70
- nemo_evaluator_launcher/exporters/utils.py +85 -33
- nemo_evaluator_launcher/exporters/wandb.py +40 -5
- nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/METADATA +1 -1
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/RECORD +19 -19
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.14.dist-info → nemo_evaluator_launcher-0.1.15.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/exporters/mlflow.py

@@ -16,13 +16,10 @@
 """Evaluation results exporter for MLflow tracking."""
 
 import os
-import shutil
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, List
 
-import yaml
-
 try:
     import mlflow
 
@@ -42,6 +39,7 @@ from nemo_evaluator_launcher.exporters.utils import (
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
+    mlflow_sanitize,
 )
 
 
@@ -117,11 +115,42 @@ class MLflowExporter(BaseExporter):
                 message="tracking_uri is required (set export.mlflow.tracking_uri or MLFLOW_TRACKING_URI)",
             )
 
-        #
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", False
+                        ),  # log_logs -> copy_logs
+                        "only_required": mlflow_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"Failed staging artifacts for {job_data.job_id}: {e}")
+
+        # Extract metrics (prefer staged if available)
         log_metrics = mlflow_config.get("log_metrics", [])
-
-
-
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            accuracy_metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
 
         if not accuracy_metrics:
             return ExportResult(
@@ -160,10 +189,13 @@
                 }
             )
 
-        #
+        # Sanitize params
         safe_params = {
-
+            mlflow_sanitize(k, "param_key"): mlflow_sanitize(v, "param_value")
+            for k, v in (all_params or {}).items()
+            if v is not None
         }
+
         # Prepare tags
         tags = {}
         if mlflow_config.get("tags"):
@@ -173,7 +205,10 @@
         benchmark = bench_info.get("benchmark", get_task_name(job_data))
         harness = bench_info.get("harness", "unknown")
 
-        # Tag the run with invocation_id and task metadata
+        # Tag the run with invocation_id and task metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
         tags.update(
             {
                 "invocation_id": job_data.invocation_id,
@@ -181,11 +216,16 @@
                 "task_name": benchmark,
                 "benchmark": benchmark,
                 "harness": harness,
-                "executor":
+                "executor": exec_type,
             }
         )
-
-
+
+        # Sanitize tags
+        safe_tags = {
+            mlflow_sanitize(k, "tag_key"): mlflow_sanitize(v, "tag_value")
+            for k, v in (tags or {}).items()
+            if v is not None
+        }
 
         # skip run if it already exists
         exists, existing_run_id = self._get_existing_run_info(
@@ -204,26 +244,34 @@
             if safe_tags:
                 mlflow.set_tags(safe_tags)
 
-            # Set run name
+            # Set run name
             run_name = (
                 mlflow_config.get("run_name")
                 or f"eval-{job_data.invocation_id}-{benchmark}"
             )
-            mlflow.set_tag("mlflow.runName", run_name)
+            mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
             # Set description only if provided
             description = mlflow_config.get("description")
             if description:
-                mlflow.set_tag(
+                mlflow.set_tag(
+                    "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+                )
 
             # Log parameters
             mlflow.log_params(safe_params)
 
-            #
-
+            # Sanitize metric keys before logging
+            safe_metrics = {
+                mlflow_sanitize(k, "metric"): v
+                for k, v in (accuracy_metrics or {}).items()
+            }
+            mlflow.log_metrics(safe_metrics)
 
             # Log artifacts
-            artifacts_logged = self._log_artifacts(
+            artifacts_logged = self._log_artifacts(
+                job_data, mlflow_config, staged_base_dir
+            )
 
             # Build run URL
             run_url = None
@@ -253,7 +301,10 @@
         )
 
     def _log_artifacts(
-        self,
+        self,
+        job_data: JobData,
+        mlflow_config: Dict[str, Any],
+        pre_staged_dir: Path = None,
     ) -> List[str]:
         """Log evaluation artifacts to MLflow using LocalExporter for transfer."""
 
@@ -262,34 +313,39 @@
             return []
 
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            should_cleanup = False
+            # Use pre-staged dir if available; otherwise stage now
+            if pre_staged_dir and pre_staged_dir.exists():
+                base_dir = pre_staged_dir
+            else:
+                temp_dir = tempfile.mkdtemp(prefix="mlflow_artifacts_")
+                local_exporter = LocalExporter(
+                    {
+                        "output_dir": str(temp_dir),
+                        "copy_logs": mlflow_config.get(
+                            "log_logs", mlflow_config.get("copy_logs", False)
+                        ),
+                        "only_required": mlflow_config.get("only_required", True),
+                        "format": mlflow_config.get("format", None),
+                        "log_metrics": mlflow_config.get("log_metrics", []),
+                        "output_filename": mlflow_config.get("output_filename", None),
+                    }
+                )
+                local_result = local_exporter.export_job(job_data)
+                if not local_result.success:
+                    logger.error(
+                        f"Failed to download artifacts: {local_result.message}"
+                    )
+                    return []
+                base_dir = Path(local_result.dest)
+                should_cleanup = True
 
-            base_dir = Path(local_result.dest)
             artifacts_dir = base_dir / "artifacts"
             logs_dir = base_dir / "logs"
             logged_names: list[str] = []
-
             artifact_path = get_artifact_root(job_data)  # "<harness>.<benchmark>"
 
-            # Log config at root level
+            # Log config at root level (or synthesize)
             cfg_logged = False
            for fname in ("config.yml", "run_config.yml"):
                 p = artifacts_dir / fname
@@ -299,16 +355,19 @@
                     break
             if not cfg_logged:
                 with tempfile.TemporaryDirectory() as tmpdir:
+                    from yaml import dump as ydump
+
                     cfg_file = Path(tmpdir) / "config.yaml"
-
-
+                    cfg_file.write_text(
+                        ydump(
                             job_data.config or {},
-                            f,
                             default_flow_style=False,
                             sort_keys=False,
                         )
+                    )
                     mlflow.log_artifact(str(cfg_file))
 
+            # Choose files to upload
             files_to_upload: list[Path] = []
             if mlflow_config.get("only_required", True):
                 for fname in get_available_artifacts(artifacts_dir):
@@ -316,10 +375,11 @@
                     if p.exists():
                         files_to_upload.append(p)
             else:
-                for p in artifacts_dir.iterdir():
+                for p in artifacts_dir.iterdir():  # top-level files only
                     if p.is_file():
                         files_to_upload.append(p)
 
+            # Upload artifacts (with DEBUG per-file)
             for fpath in files_to_upload:
                 rel = fpath.relative_to(artifacts_dir).as_posix()
                 parent = os.path.dirname(rel)
@@ -328,32 +388,28 @@
                     artifact_path=f"{artifact_path}/artifacts/{parent}".rstrip("/"),
                 )
                 logged_names.append(rel)
+                logger.debug(f"mlflow upload artifact: {rel}")
 
             # Optionally upload logs under "<harness.task>/logs"
             if mlflow_config.get("log_logs", False) and logs_dir.exists():
-                for p in logs_dir.
+                for p in logs_dir.iterdir():
                     if p.is_file():
+                        rel = p.name
                         mlflow.log_artifact(
-                            str(p),
-                            artifact_path=f"{artifact_path}/logs",
+                            str(p), artifact_path=f"{artifact_path}/logs"
                         )
-                        logged_names.append(f"logs/{
+                        logged_names.append(f"logs/{rel}")
+                        logger.debug(f"mlflow upload log: {rel}")
 
-            # Debug summary of what we uploaded
             logger.info(
                 f"MLflow upload summary: files={len(logged_names)}, only_required={mlflow_config.get('only_required', True)}, log_logs={mlflow_config.get('log_logs', False)}"
             )
-            if
-
-                preview = "\n - " + "\n - ".join(sorted(logged_names)[:50])
-                logger.debug(f"Uploaded files preview (first 50):{preview}")
-            except Exception:
-                pass
+            if should_cleanup:
+                import shutil
 
-
-                shutil.rmtree(temp_dir)
-            return logged_names
+                shutil.rmtree(base_dir, ignore_errors=True)
 
+            return logged_names
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
             return []
@@ -391,11 +447,42 @@ class MLflowExporter(BaseExporter):
 
         # Collect metrics from ALL jobs
         all_metrics = {}
+        staged_map: dict[str, Path] = {}
+        for job_id, job_data in jobs.items():
+            try:
+                paths = self.get_job_paths(job_data)
+                if paths.get("storage_type") == "remote_ssh":
+                    tmp_stage = Path(tempfile.mkdtemp(prefix="mlflow_inv_stage_"))
+                    LocalExporter(
+                        {
+                            "output_dir": str(tmp_stage),
+                            "copy_logs": mlflow_config.get("log_logs", False),
+                            "only_required": mlflow_config.get(
+                                "only_required", True
+                            ),
+                        }
+                    ).export_job(job_data)
+                    staged_map[job_id] = (
+                        tmp_stage / job_data.invocation_id / job_data.job_id
+                    )
+            except Exception as e:
+                logger.warning(f"Staging failed for {job_id}: {e}")
+
         for job_id, job_data in jobs.items():
             log_metrics = mlflow_config.get("log_metrics", [])
-
-
-
+            if job_id in staged_map and (staged_map[job_id] / "artifacts").exists():
+                job_metrics = extract_accuracy_metrics(
+                    job_data,
+                    lambda _: {
+                        "artifacts_dir": staged_map[job_id] / "artifacts",
+                        "storage_type": "local_filesystem",
+                    },
+                    log_metrics,
+                )
+            else:
+                job_metrics = extract_accuracy_metrics(
+                    job_data, self.get_job_paths, log_metrics
+                )
             all_metrics.update(job_metrics)
 
         if not all_metrics:
@@ -414,9 +501,12 @@
         mlflow.set_experiment(experiment_name)
 
         # Prepare parameters for invocation
+        inv_exec_type = (first_job.config or {}).get("execution", {}).get(
+            "type"
+        ) or first_job.executor
         all_params = {
             "invocation_id": invocation_id,
-            "executor":
+            "executor": inv_exec_type,
             "timestamp": str(first_job.timestamp),
             "jobs_count": str(len(jobs)),
         }
@@ -472,23 +562,31 @@
 
             # Set run name
             run_name = mlflow_config.get("run_name") or f"eval-{invocation_id}"
-            mlflow.set_tag("mlflow.runName", run_name)
+            mlflow.set_tag("mlflow.runName", mlflow_sanitize(run_name, "tag_value"))
 
             # Set description
             description = mlflow_config.get("description")
             if description:
-                mlflow.set_tag(
+                mlflow.set_tag(
+                    "mlflow.note.content", mlflow_sanitize(description, "tag_value")
+                )
 
             # Log parameters
             mlflow.log_params(safe_params)
 
-            #
-
+            # Sanitize metric keys
+            safe_all_metrics = {
+                mlflow_sanitize(k, "metric"): v
+                for k, v in (all_metrics or {}).items()
+            }
+            mlflow.log_metrics(safe_all_metrics)
 
             # Log artifacts from all jobs
             total_artifacts = 0
-            for job_data in jobs.
-                artifacts_logged = self._log_artifacts(
+            for job_id, job_data in jobs.items():
+                artifacts_logged = self._log_artifacts(
+                    job_data, mlflow_config, staged_map.get(job_id)
+                )
                 total_artifacts += len(artifacts_logged)
 
             # Build run URL
nemo_evaluator_launcher/exporters/utils.py

@@ -16,6 +16,7 @@
 """Shared utilities for metrics and configuration handling."""
 
 import json
+import re
 import subprocess
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Tuple
@@ -306,21 +307,28 @@ def ssh_setup_masters(jobs: Dict[str, JobData]) -> Dict[Tuple[str, str], str]:
     remote_pairs: set[tuple[str, str]] = set()
     for jd in jobs.values():
         try:
-
-
-
+            # Preferred: explicit 'paths' from job data
+            p = (jd.data or {}).get("paths") or {}
+            if (
+                p.get("storage_type") == "remote_ssh"
+                and p.get("username")
+                and p.get("hostname")
+            ):
+                remote_pairs.add((p["username"], p["hostname"]))
+                continue
+            # Fallback: common slurm fields (works with BaseExporter.get_job_paths)
+            d = jd.data or {}
+            if jd.executor == "slurm" and d.get("username") and d.get("hostname"):
+                remote_pairs.add((d["username"], d["hostname"]))
         except Exception:
             pass
 
     if not remote_pairs:
-        return {}
+        return {}
 
-    # Ensure connections directory exists (like execDB does)
     CONNECTIONS_DIR.mkdir(parents=True, exist_ok=True)
-
     control_paths: Dict[Tuple[str, str], str] = {}
     for username, hostname in remote_pairs:
-        # Simple socket name
         socket_path = CONNECTIONS_DIR / f"{username}_{hostname}.sock"
         try:
             cmd = [
@@ -371,9 +379,10 @@ def ssh_download_artifacts(
     config: Dict[str, Any] | None = None,
    control_paths: Dict[Tuple[str, str], str] | None = None,
 ) -> List[str]:
-    """Download artifacts via SSH with optional connection reuse."""
+    """Download artifacts/logs via SSH with optional connection reuse."""
     exported_files: List[str] = []
     copy_logs = bool((config or {}).get("copy_logs", False))
+    copy_artifacts = bool((config or {}).get("copy_artifacts", True))
     only_required = bool((config or {}).get("only_required", True))
 
     control_path = None
@@ -390,44 +399,49 @@
                 str(local_path),
             ]
         )
-
-        return result.returncode == 0
+        return subprocess.run(cmd, capture_output=True).returncode == 0
 
     export_dir.mkdir(parents=True, exist_ok=True)
-    (export_dir / "artifacts").mkdir(parents=True, exist_ok=True)
-
-    available_local = (
-        get_available_artifacts(paths.get("artifacts_dir", Path()))
-        if not only_required
-        else None
-    )
-    artifact_names = (
-        [a for a in get_relevant_artifacts()]
-        if only_required
-        else (available_local or [])
-    )
-
-    for artifact in artifact_names:
-        remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
-        local_file = export_dir / "artifacts" / artifact
-        if scp_file(remote_file, local_file):
-            exported_files.append(str(local_file))
 
+    # Artifacts
+    if copy_artifacts:
+        art_dir = export_dir / "artifacts"
+        art_dir.mkdir(parents=True, exist_ok=True)
+
+        if only_required:
+            for artifact in get_relevant_artifacts():
+                remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
+                local_file = art_dir / artifact
+                local_file.parent.mkdir(parents=True, exist_ok=True)
+                if scp_file(remote_file, local_file):
+                    exported_files.append(str(local_file))
+        else:
+            # Copy known files individually to avoid subfolders and satisfy tests
+            for artifact in get_available_artifacts(paths.get("artifacts_dir", Path())):
+                remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
+                local_file = art_dir / artifact
+                if scp_file(remote_file, local_file):
+                    exported_files.append(str(local_file))
+
+    # Logs (top-level only)
     if copy_logs:
-        remote_logs = f"{paths['remote_path']}/logs"
         local_logs = export_dir / "logs"
+        remote_logs = f"{paths['remote_path']}/logs"
         cmd = (
             ["scp", "-r"]
             + ssh_opts
             + [
-                f"{paths['username']}@{paths['hostname']}:{remote_logs}",
+                f"{paths['username']}@{paths['hostname']}:{remote_logs}/.",
                 str(local_logs),
             ]
         )
         if subprocess.run(cmd, capture_output=True).returncode == 0:
-
-
-
+            for p in local_logs.iterdir():
+                if p.is_dir():
+                    import shutil
+
+                    shutil.rmtree(p, ignore_errors=True)
+            exported_files.extend([str(f) for f in local_logs.glob("*") if f.is_file()])
 
     return exported_files
 
@@ -584,3 +598,41 @@ def _safe_update_metrics(
     """Update target from source safely, raising on collisions with detailed values."""
     for k, v in source.items():
         _safe_set_metric(target, k, v, context)
+
+
+# =============================================================================
+# MLFLOW FUNCTIONS
+# =============================================================================
+
+# MLflow constants
+_MLFLOW_KEY_MAX = 250
+_MLFLOW_PARAM_VAL_MAX = 250
+_MLFLOW_TAG_VAL_MAX = 5000
+
+_INVALID_KEY_CHARS = re.compile(r"[^/\w.\- ]")
+_MULTI_UNDERSCORE = re.compile(r"_+")
+
+
+def mlflow_sanitize(s: Any, kind: str = "key") -> str:
+    """
+    Sanitize strings for MLflow logging.
+
+    kind:
+      - "key", "metric", "tag_key", "param_key": apply key rules
+      - "tag_value": apply tag value rules
+      - "param_value": apply param value rules
+    """
+    s = "" if s is None else str(s)
+
+    if kind in ("key", "metric", "tag_key", "param_key"):
+        # common replacements
+        s = s.replace("pass@", "pass_at_")
+        # drop disallowed chars, collapse underscores, trim
+        s = _INVALID_KEY_CHARS.sub("_", s)
+        s = _MULTI_UNDERSCORE.sub("_", s).strip()
+        return s[:_MLFLOW_KEY_MAX] or "key"
+
+    # values: normalize whitespace, enforce length
+    s = s.replace("\n", " ").replace("\r", " ").strip()
+    max_len = _MLFLOW_TAG_VAL_MAX if kind == "tag_value" else _MLFLOW_PARAM_VAL_MAX
+    return s[:max_len]
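A quick illustrative check of the new mlflow_sanitize helper (not part of the package; the import path follows the mlflow.py import hunk above, and the expected values follow the rules in the function body shown here):

    from nemo_evaluator_launcher.exporters.utils import mlflow_sanitize

    # Key rules: "pass@" -> "pass_at_", other disallowed characters -> "_"
    assert mlflow_sanitize("pass@1", "metric") == "pass_at_1"
    # Value rules: newlines flattened to spaces, None becomes an empty string
    assert mlflow_sanitize("multi\nline value", "tag_value") == "multi line value"
    assert mlflow_sanitize(None, "param_value") == ""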
nemo_evaluator_launcher/exporters/wandb.py

@@ -68,10 +68,41 @@ class WandBExporter(BaseExporter):
             "log_mode", "per_task"
         )  # Default per_task for immediate export
 
-        #
-
-
-
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="wandb_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": wandb_config.get("log_logs", False),
+                        "only_required": wandb_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"W&B: staging failed for {job_data.job_id}: {e}")
+
+        # Metrics (prefer staged if available)
+        log_metrics = wandb_config.get("log_metrics", [])
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
+
         if not metrics:
             return ExportResult(
                 success=False, dest="wandb", message="No metrics found"
@@ -345,10 +376,14 @@ class WandBExporter(BaseExporter):
             run_args["resume"] = "allow"
 
         # Config metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
         run_config = {
             "invocation_id": job_data.invocation_id,
-            "executor":
+            "executor": exec_type,
         }
+
         if log_mode == "per_task":
             run_config["job_id"] = job_data.job_id
             run_config["harness"] = harness