nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/api/functional.py +55 -5
- nemo_evaluator_launcher/cli/ls_task.py +280 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
- nemo_evaluator_launcher/cli/main.py +17 -2
- nemo_evaluator_launcher/cli/run.py +41 -1
- nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
- nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
- nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
- nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
- nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
- nemo_evaluator_launcher/common/helpers.py +44 -28
- nemo_evaluator_launcher/common/mapping.py +341 -155
- nemo_evaluator_launcher/common/printing_utils.py +18 -12
- nemo_evaluator_launcher/executors/lepton/executor.py +26 -8
- nemo_evaluator_launcher/executors/local/executor.py +6 -2
- nemo_evaluator_launcher/executors/slurm/executor.py +141 -9
- nemo_evaluator_launcher/package_info.py +1 -1
- nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
- nemo_evaluator_launcher/resources/mapping.toml +62 -354
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/METADATA +2 -1
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/RECORD +25 -18
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/top_level.txt +0 -0
|
@@ -34,7 +34,7 @@ from nemo_evaluator_launcher.common.execdb import (
|
|
|
34
34
|
from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
|
|
35
35
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
36
36
|
from nemo_evaluator_launcher.common.mapping import (
|
|
37
|
-
|
|
37
|
+
get_task_definition_for_job,
|
|
38
38
|
load_tasks_mapping,
|
|
39
39
|
)
|
|
40
40
|
from nemo_evaluator_launcher.common.printing_utils import red
|
|
@@ -293,8 +293,10 @@ class LeptonExecutor(BaseExecutor):
|
|
|
293
293
|
return
|
|
294
294
|
|
|
295
295
|
# Construct the full endpoint URL
|
|
296
|
-
task_definition =
|
|
297
|
-
task.name,
|
|
296
|
+
task_definition = get_task_definition_for_job(
|
|
297
|
+
task_query=task.name,
|
|
298
|
+
base_mapping=tasks_mapping,
|
|
299
|
+
container=task.get("container"),
|
|
298
300
|
)
|
|
299
301
|
task_endpoint_type = task_definition["endpoint_type"]
|
|
300
302
|
endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
|
|
@@ -383,7 +385,11 @@ class LeptonExecutor(BaseExecutor):
|
|
|
383
385
|
|
|
384
386
|
# Submit each evaluation task as a Lepton job
|
|
385
387
|
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
386
|
-
task_definition =
|
|
388
|
+
task_definition = get_task_definition_for_job(
|
|
389
|
+
task_query=task.name,
|
|
390
|
+
base_mapping=tasks_mapping,
|
|
391
|
+
container=task.get("container"),
|
|
392
|
+
)
|
|
387
393
|
|
|
388
394
|
# Create job ID and Lepton job name (max 36 chars)
|
|
389
395
|
job_id = generate_job_id(invocation_id, idx)
|
|
@@ -889,9 +895,13 @@ def _dry_run_lepton(
|
|
|
889
895
|
) -> None:
|
|
890
896
|
print("DRY RUN: Lepton job configurations prepared")
|
|
891
897
|
try:
|
|
892
|
-
# validate tasks
|
|
898
|
+
# validate tasks (container overrides are supported)
|
|
893
899
|
for task in cfg.evaluation.tasks:
|
|
894
|
-
|
|
900
|
+
_ = get_task_definition_for_job(
|
|
901
|
+
task_query=task.name,
|
|
902
|
+
base_mapping=tasks_mapping,
|
|
903
|
+
container=task.get("container"),
|
|
904
|
+
)
|
|
895
905
|
|
|
896
906
|
# nice-to-have checks (existing endpoint URL or endpoints mapping)
|
|
897
907
|
if getattr(cfg.deployment, "type", None) == "none":
|
|
@@ -909,7 +919,11 @@ def _dry_run_lepton(
|
|
|
909
919
|
else:
|
|
910
920
|
endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
|
|
911
921
|
for task in cfg.evaluation.tasks:
|
|
912
|
-
td =
|
|
922
|
+
td = get_task_definition_for_job(
|
|
923
|
+
task_query=task.name,
|
|
924
|
+
base_mapping=tasks_mapping,
|
|
925
|
+
container=task.get("container"),
|
|
926
|
+
)
|
|
913
927
|
etype = td.get("endpoint_type")
|
|
914
928
|
if etype not in endpoints_cfg:
|
|
915
929
|
raise ValueError(
|
|
@@ -928,7 +942,11 @@ def _dry_run_lepton(
|
|
|
928
942
|
getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
|
|
929
943
|
)
|
|
930
944
|
for task in cfg.evaluation.tasks:
|
|
931
|
-
td =
|
|
945
|
+
td = get_task_definition_for_job(
|
|
946
|
+
task_query=task.name,
|
|
947
|
+
base_mapping=tasks_mapping,
|
|
948
|
+
container=task.get("container"),
|
|
949
|
+
)
|
|
932
950
|
required = td.get("required_env_vars", []) or []
|
|
933
951
|
for var in required:
|
|
934
952
|
# Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
|
|
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
|
|
|
49
49
|
)
|
|
50
50
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
51
51
|
from nemo_evaluator_launcher.common.mapping import (
|
|
52
|
-
|
|
52
|
+
get_task_definition_for_job,
|
|
53
53
|
load_tasks_mapping,
|
|
54
54
|
)
|
|
55
55
|
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
|
|
@@ -123,7 +123,11 @@ class LocalExecutor(BaseExecutor):
|
|
|
123
123
|
|
|
124
124
|
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
125
125
|
timestamp = get_timestamp_string()
|
|
126
|
-
task_definition =
|
|
126
|
+
task_definition = get_task_definition_for_job(
|
|
127
|
+
task_query=task.name,
|
|
128
|
+
base_mapping=tasks_mapping,
|
|
129
|
+
container=task.get("container"),
|
|
130
|
+
)
|
|
127
131
|
|
|
128
132
|
if cfg.deployment.type != "none":
|
|
129
133
|
# container name
|
|
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
|
|
|
49
49
|
)
|
|
50
50
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
51
51
|
from nemo_evaluator_launcher.common.mapping import (
|
|
52
|
-
|
|
52
|
+
get_task_definition_for_job,
|
|
53
53
|
load_tasks_mapping,
|
|
54
54
|
)
|
|
55
55
|
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
|
|
@@ -109,7 +109,11 @@ class SlurmExecutor(BaseExecutor):
|
|
|
109
109
|
(local_task_subdir / "artifacts").mkdir()
|
|
110
110
|
|
|
111
111
|
# resolve eval image and pass directly via task override
|
|
112
|
-
task_definition =
|
|
112
|
+
task_definition = get_task_definition_for_job(
|
|
113
|
+
task_query=task.name,
|
|
114
|
+
base_mapping=tasks_mapping,
|
|
115
|
+
container=task.get("container"),
|
|
116
|
+
)
|
|
113
117
|
eval_image = task_definition["container"]
|
|
114
118
|
if "container" in task:
|
|
115
119
|
eval_image = task["container"]
|
|
@@ -201,6 +205,22 @@ class SlurmExecutor(BaseExecutor):
|
|
|
201
205
|
hostname=cfg.execution.hostname,
|
|
202
206
|
socket=socket,
|
|
203
207
|
)
|
|
208
|
+
|
|
209
|
+
if socket_or_none is None:
|
|
210
|
+
raise RuntimeError(
|
|
211
|
+
f"Failed to connect to the cluster {cfg.execution.hostname} as user {cfg.execution.username}. "
|
|
212
|
+
"Please check your SSH configuration."
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
# Validate that all mount paths exist on the remote host
|
|
216
|
+
mount_paths = _collect_mount_paths(cfg)
|
|
217
|
+
_validate_remote_paths_exist(
|
|
218
|
+
paths=mount_paths,
|
|
219
|
+
username=cfg.execution.username,
|
|
220
|
+
hostname=cfg.execution.hostname,
|
|
221
|
+
socket=socket_or_none,
|
|
222
|
+
)
|
|
223
|
+
|
|
204
224
|
_make_remote_execution_output_dir(
|
|
205
225
|
dirpath=cfg.execution.output_dir,
|
|
206
226
|
username=cfg.execution.username,
|
|
@@ -507,7 +527,11 @@ def _create_slurm_sbatch_script(
|
|
|
507
527
|
"""
|
|
508
528
|
# get task from mapping, overrides, urls
|
|
509
529
|
tasks_mapping = load_tasks_mapping()
|
|
510
|
-
task_definition =
|
|
530
|
+
task_definition = get_task_definition_for_job(
|
|
531
|
+
task_query=task.name,
|
|
532
|
+
base_mapping=tasks_mapping,
|
|
533
|
+
container=task.get("container"),
|
|
534
|
+
)
|
|
511
535
|
|
|
512
536
|
# TODO(public release): convert to template
|
|
513
537
|
s = "#!/bin/bash\n"
|
|
@@ -531,7 +555,8 @@ def _create_slurm_sbatch_script(
|
|
|
531
555
|
)
|
|
532
556
|
s += "#SBATCH --job-name {}\n".format(job_name)
|
|
533
557
|
s += "#SBATCH --exclusive\n"
|
|
534
|
-
s += "#SBATCH --
|
|
558
|
+
s += "#SBATCH --no-requeue\n" # We have our own auto-resume logic
|
|
559
|
+
s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
|
|
535
560
|
s += "\n"
|
|
536
561
|
s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
|
|
537
562
|
s += "\n"
|
|
@@ -696,7 +721,7 @@ def _create_slurm_sbatch_script(
|
|
|
696
721
|
s += "--no-container-mount-home "
|
|
697
722
|
|
|
698
723
|
s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
|
|
699
|
-
s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.
|
|
724
|
+
s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.log")
|
|
700
725
|
s += "bash -c '\n"
|
|
701
726
|
s += eval_factory_command
|
|
702
727
|
s += "'\n\n"
|
|
@@ -817,8 +842,8 @@ def _generate_auto_export_section(
|
|
|
817
842
|
if not cfg.execution.get("mounts", {}).get("mount_home", True):
|
|
818
843
|
s += "--no-container-mount-home "
|
|
819
844
|
|
|
820
|
-
s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
|
|
821
|
-
s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.
|
|
845
|
+
s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,{remote_task_subdir}/logs:{remote_task_subdir}/logs "
|
|
846
|
+
s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
|
|
822
847
|
s += " bash -c '\n"
|
|
823
848
|
# FIXME(martas): would be good to install specific version
|
|
824
849
|
s += " pip install nemo-evaluator-launcher[all]\n"
|
|
@@ -1336,7 +1361,7 @@ def _generate_deployment_srun_command(
|
|
|
1336
1361
|
s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
|
|
1337
1362
|
if not cfg.execution.get("mounts", {}).get("mount_home", True):
|
|
1338
1363
|
s += "--no-container-mount-home "
|
|
1339
|
-
s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.
|
|
1364
|
+
s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.log")
|
|
1340
1365
|
|
|
1341
1366
|
deployment_env_var_names = list(
|
|
1342
1367
|
cfg.execution.get("env_vars", {}).get("deployment", {})
|
|
@@ -1439,7 +1464,7 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
|
|
|
1439
1464
|
s += "--nodes 1 --ntasks 1 "
|
|
1440
1465
|
s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
|
|
1441
1466
|
s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
|
|
1442
|
-
s += f"--output {remote_task_subdir}/logs/proxy-%A.
|
|
1467
|
+
s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
|
|
1443
1468
|
s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
|
|
1444
1469
|
s += "PROXY_PID=$! # capture the PID of the proxy background srun process\n"
|
|
1445
1470
|
s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
|
|
@@ -1454,3 +1479,110 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
|
|
|
1454
1479
|
s += "\n"
|
|
1455
1480
|
|
|
1456
1481
|
return s
|
|
1482
|
+
|
|
1483
|
+
|
|
1484
|
+
def _collect_mount_paths(cfg: DictConfig) -> List[str]:
    """Gather every host-side (source) path that the run will mount.

    Deployment-related paths (``deployment.checkpoint_path``,
    ``deployment.cache_path`` and the keys of ``execution.mounts.deployment``)
    are only considered when ``cfg.deployment.type`` is not ``"none"``. The
    keys of ``execution.mounts.evaluation`` are always included.

    Args:
        cfg: The configuration object for the evaluation run.

    Returns:
        Source paths in discovery order: checkpoint path, cache path,
        deployment mounts, then evaluation mounts. Duplicates are not removed.
    """

    def _mount_sources(section: str) -> List[str]:
        # Mount mappings are keyed by source path; the values are not needed here.
        return list(cfg.execution.get("mounts", {}).get(section, {}).keys())

    collected: List[str] = []

    if cfg.deployment.type != "none":
        # Optional single-path settings; falsy values (unset/empty) are skipped.
        for option in ("checkpoint_path", "cache_path"):
            value = cfg.deployment.get(option)
            if value:
                collected.append(value)
        collected.extend(_mount_sources("deployment"))

    collected.extend(_mount_sources("evaluation"))
    return collected
|
|
1509
|
+
|
|
1510
|
+
|
|
1511
|
+
def _validate_remote_paths_exist(
    paths: List[str],
    username: str,
    hostname: str,
    socket: str | None,
) -> None:
    """Validate that all specified paths exist as directories on the remote host.

    All paths are checked in a single SSH invocation. For every unique path the
    remote shell runs ``test -d`` and prints either ``EXISTS:<path>`` or
    ``MISSING:<path>``; the ``MISSING:`` lines are collected from stdout.

    Args:
        paths: List of directory paths to validate. Duplicates are checked once.
        username: SSH username.
        hostname: SSH hostname.
        socket: control socket location or None

    Raises:
        RuntimeError: If the SSH command exits with a non-zero return code.
        ValueError: If any paths do not exist as directories on the remote host.
    """
    if not paths:
        return

    # Remove duplicates while preserving order
    unique_paths = list(dict.fromkeys(paths))

    exists_marker = "EXISTS:"
    missing_marker = "MISSING:"

    # Build one remote shell command that checks all paths at once.
    # ssh joins its trailing arguments with spaces and hands the result to the
    # remote shell, so every path must be quoted for that shell. shlex.quote
    # keeps whitespace and shell metacharacters in a path literal.
    checks = []
    for path in unique_paths:
        path_text = str(path)
        # printf '%s\n' is used instead of echo so that backslashes in a path
        # are reported verbatim regardless of the remote shell's echo flavour.
        checks.append(
            f"test -d {shlex.quote(path_text)} "
            f"&& printf '%s\\n' {shlex.quote(exists_marker + path_text)} "
            f"|| printf '%s\\n' {shlex.quote(missing_marker + path_text)}"
        )
    remote_command = " ; ".join(checks)

    # Assemble argv directly rather than joining into a string and re-splitting:
    # re-splitting would strip the quoting applied above and would also break a
    # control socket path that contains whitespace.
    ssh_argv = ["ssh"]
    if socket is not None:
        ssh_argv.extend(["-S", str(socket)])
    ssh_argv.append(f"{username}@{hostname}")
    ssh_argv.append(remote_command)

    logger.info(
        "Validating mount directories exist on remote host",
        cmd=shlex.join(ssh_argv),
    )
    completed_process = subprocess.run(
        args=ssh_argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if completed_process.returncode != 0:
        error_msg = (
            completed_process.stderr.decode("utf-8")
            if completed_process.stderr
            else "Unknown error"
        )
        logger.error(
            "Error validating remote paths",
            code=completed_process.returncode,
            msg=error_msg,
        )
        raise RuntimeError(f"Failed to validate remote paths: {error_msg}")

    # Parse output to find missing paths. Only the leading marker is stripped,
    # so a path that itself contains "MISSING:" is reported unchanged.
    output = completed_process.stdout.decode("utf-8")
    missing_paths = [
        line[len(missing_marker):]
        for line in output.splitlines()
        if line.startswith(missing_marker)
    ]

    if missing_paths:
        error_message = (
            f"The following mount paths do not exist as directories on {username}@{hostname}:\n"
            + "\n".join(f"  - {path}" for path in missing_paths)
            + "\n\nMount paths must be directories. Please create these directories on the cluster or update your configuration."
        )
        logger.error("Mount validation failed", missing_paths=missing_paths)
        raise ValueError(error_message)
|