nemo-evaluator-launcher 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/{debug.py → info.py} +170 -63
- nemo_evaluator_launcher/cli/main.py +10 -10
- nemo_evaluator_launcher/cli/run.py +39 -13
- nemo_evaluator_launcher/cli/status.py +9 -8
- nemo_evaluator_launcher/common/helpers.py +36 -4
- nemo_evaluator_launcher/common/printing_utils.py +93 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +5 -4
- nemo_evaluator_launcher/executors/lepton/executor.py +138 -23
- nemo_evaluator_launcher/executors/local/executor.py +28 -13
- nemo_evaluator_launcher/executors/local/run.template.sh +4 -1
- nemo_evaluator_launcher/executors/slurm/executor.py +22 -7
- nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/METADATA +1 -1
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/RECORD +19 -18
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Printing utils for more structured or visually appealing prints.
|
|
17
|
+
|
|
18
|
+
NOTE: use printing only for main application output that matters. For logging,
|
|
19
|
+
see `logging_utils.py`.
|
|
20
|
+
|
|
21
|
+
USAGE:
|
|
22
|
+
```
|
|
23
|
+
from nemo_evaluator_launcher.common.printing_utils import red, bold
|
|
24
|
+
print(bold(red("some red bold")))
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import os
|
|
31
|
+
|
|
32
|
+
# Name of the environment variable that turns colored output off. Any of the
# values "1", "true", "yes" or "y" (case-insensitive) disables colors; if the
# variable is unset or holds anything else, colors stay enabled.
_DISABLE_COLOR_ENV_VAR = "NEMO_EVALUATOR_DISABLE_COLOR"

# Spellings of the env var value that are treated as "disable colors".
_DISABLE_COLOR_TRUTHY_VALUES = frozenset({"1", "true", "yes", "y"})


def _is_color_disabled() -> bool:
    """Return True when colorless output was requested via the env var.

    The value is compared as a whole (after trimming surrounding whitespace
    and lowercasing) rather than by substring, so values that merely contain
    "1" or "y" -- for example "10" or "nay" -- do not disable colors by
    accident.

    Returns:
        True if the env var is set to one of the accepted truthy spellings,
        False otherwise (including when it is unset or empty).
    """
    raw_value = os.environ.get(_DISABLE_COLOR_ENV_VAR, "0")
    return raw_value.strip().lower() in _DISABLE_COLOR_TRUTHY_VALUES
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ANSI escape sequences used by the helpers below, keyed by style name.
_CODES: dict[str, str] = {
    "green": "\033[32m",
    "red": "\033[31m",
    "red_bg": "\033[41m",  # red background
    "cyan": "\033[36m",
    "yellow": "\033[33m",
    "magenta": "\033[35m",
    "grey": "\033[90m",
    "bold": "\033[1m",
    "reset": "\033[0m",
}

# When colors are disabled, blank out every code (including "reset") so the
# helpers below hand their input back unchanged.
if _is_color_disabled():
    for c in list(_CODES):
        _CODES[c] = ""


def _colorize(style: str, s: str) -> str:
    """Wrap ``s`` in the code stored under ``style``, followed by the reset code."""
    return _CODES[style] + s + _CODES["reset"]


def green(s: str) -> str:
    """Return ``s`` wrapped in the green foreground code and a trailing reset."""
    return _colorize("green", s)


def red(s: str) -> str:
    """Return ``s`` wrapped in the red foreground code and a trailing reset."""
    return _colorize("red", s)


def red_bg(s: str) -> str:
    """Return ``s`` wrapped in the red background code and a trailing reset."""
    return _colorize("red_bg", s)


def cyan(s: str) -> str:
    """Return ``s`` wrapped in the cyan foreground code and a trailing reset."""
    return _colorize("cyan", s)


def yellow(s: str) -> str:
    """Return ``s`` wrapped in the yellow foreground code and a trailing reset."""
    return _colorize("yellow", s)


def magenta(s: str) -> str:
    """Return ``s`` wrapped in the magenta foreground code and a trailing reset."""
    return _colorize("magenta", s)


def grey(s: str) -> str:
    """Return ``s`` wrapped in the grey foreground code and a trailing reset."""
    return _colorize("grey", s)


def bold(s: str) -> str:
    """Return ``s`` wrapped in the bold code and a trailing reset."""
    return _colorize("bold", s)
|
|
@@ -14,16 +14,17 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
#
|
|
16
16
|
# Each slurm cluster has its own flavour, below we provide some defaults that might meet one's needs.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
type: slurm # Executor is chosen based on this field
|
|
18
|
+
hostname: ??? # SLURM headnode (login) hostname (required)
|
|
19
|
+
username: ${oc.env:USER} # Defaults to $USER env var
|
|
20
|
+
account: ??? # SLURM account allocation (required)
|
|
21
|
+
output_dir: ??? # Absolute path accessible on compute nodes (required)
|
|
20
22
|
partition: batch
|
|
21
23
|
num_nodes: 1
|
|
22
24
|
ntasks_per_node: 1
|
|
23
25
|
gres: gpu:8
|
|
24
26
|
walltime: 01:00:00
|
|
25
27
|
subproject: nemo-evaluator-launcher
|
|
26
|
-
output_dir: ???
|
|
27
28
|
env_vars:
|
|
28
29
|
deployment: {}
|
|
29
30
|
evaluation: {}
|
|
@@ -78,9 +78,32 @@ class LeptonExecutor(BaseExecutor):
|
|
|
78
78
|
"LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
+
# Load tasks mapping
|
|
82
|
+
tasks_mapping = load_tasks_mapping()
|
|
83
|
+
job_ids = []
|
|
84
|
+
lepton_job_names = []
|
|
85
|
+
endpoint_names = [] # Track multiple endpoints
|
|
86
|
+
db = ExecutionDB()
|
|
87
|
+
|
|
81
88
|
# Generate invocation ID
|
|
82
89
|
invocation_id = generate_invocation_id()
|
|
83
90
|
|
|
91
|
+
# DRY-RUN mode
|
|
92
|
+
if dry_run:
|
|
93
|
+
output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
|
|
94
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
95
|
+
|
|
96
|
+
# Validate configuration
|
|
97
|
+
_dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
|
|
98
|
+
|
|
99
|
+
if cfg.deployment.type == "none":
|
|
100
|
+
print("Using existing endpoint (deployment: none)")
|
|
101
|
+
print("using shared endpoint")
|
|
102
|
+
else:
|
|
103
|
+
print(f"with endpoint type '{cfg.deployment.type}'")
|
|
104
|
+
|
|
105
|
+
return invocation_id
|
|
106
|
+
|
|
84
107
|
# For deployment: none, we use the existing endpoint for all tasks
|
|
85
108
|
if cfg.deployment.type == "none":
|
|
86
109
|
print("📌 Using existing endpoint (deployment: none)")
|
|
@@ -88,13 +111,6 @@ class LeptonExecutor(BaseExecutor):
|
|
|
88
111
|
print(f"✅ Using shared endpoint: {shared_endpoint_url}")
|
|
89
112
|
|
|
90
113
|
try:
|
|
91
|
-
# Load tasks mapping
|
|
92
|
-
tasks_mapping = load_tasks_mapping()
|
|
93
|
-
job_ids = []
|
|
94
|
-
lepton_job_names = []
|
|
95
|
-
endpoint_names = [] # Track multiple endpoints
|
|
96
|
-
db = ExecutionDB()
|
|
97
|
-
|
|
98
114
|
# Create local directory for outputs
|
|
99
115
|
output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
|
|
100
116
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -139,8 +155,13 @@ class LeptonExecutor(BaseExecutor):
|
|
|
139
155
|
task_index = str(idx)
|
|
140
156
|
endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
|
|
141
157
|
|
|
142
|
-
# Ensure we don't exceed 36 character limit
|
|
143
158
|
if len(endpoint_name) > 36:
|
|
159
|
+
logger.info(
|
|
160
|
+
"Lepton endpoint name will be deployed under name {task_name}",
|
|
161
|
+
task_name=task.name,
|
|
162
|
+
original=endpoint_name,
|
|
163
|
+
limit=36,
|
|
164
|
+
)
|
|
144
165
|
# Truncate task name further if needed
|
|
145
166
|
max_task_len = (
|
|
146
167
|
36
|
|
@@ -151,7 +172,19 @@ class LeptonExecutor(BaseExecutor):
|
|
|
151
172
|
) # 3 hyphens
|
|
152
173
|
short_task_name = sanitized_task_name[:max_task_len]
|
|
153
174
|
endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
|
|
175
|
+
logger.info(
|
|
176
|
+
"Lepton endpoint name is auto-generated",
|
|
177
|
+
task_name=task.name,
|
|
178
|
+
original=endpoint_name,
|
|
179
|
+
truncated=endpoint_name,
|
|
180
|
+
limit=36,
|
|
181
|
+
)
|
|
154
182
|
|
|
183
|
+
logger.info(
|
|
184
|
+
"Lepton endpoint name (auto-generated)",
|
|
185
|
+
task_name=task.name,
|
|
186
|
+
endpoint_name=endpoint_name,
|
|
187
|
+
)
|
|
155
188
|
endpoint_names.append(endpoint_name)
|
|
156
189
|
endpoint_creation_tasks.append((idx, task, endpoint_name))
|
|
157
190
|
|
|
@@ -298,20 +331,6 @@ class LeptonExecutor(BaseExecutor):
|
|
|
298
331
|
f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
|
|
299
332
|
)
|
|
300
333
|
|
|
301
|
-
if dry_run:
|
|
302
|
-
print("🔍 DRY RUN: Lepton job configurations prepared")
|
|
303
|
-
print(f" - Tasks: {len(cfg.evaluation.tasks)}")
|
|
304
|
-
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
305
|
-
if cfg.deployment.type == "none":
|
|
306
|
-
print(f" - Task {idx}: {task.name} using shared endpoint")
|
|
307
|
-
else:
|
|
308
|
-
print(
|
|
309
|
-
f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
|
|
310
|
-
)
|
|
311
|
-
print(f" - Output directory: {output_dir}")
|
|
312
|
-
print("\nTo submit jobs, run the executor without --dry-run")
|
|
313
|
-
return invocation_id
|
|
314
|
-
|
|
315
334
|
# ================================================================
|
|
316
335
|
# JOB SUBMISSION (Sequential, as before)
|
|
317
336
|
# ================================================================
|
|
@@ -334,8 +353,18 @@ class LeptonExecutor(BaseExecutor):
|
|
|
334
353
|
max_base_length = 36 - 1 - len(suffix) # -1 for the hyphen
|
|
335
354
|
if len(base_job_name) > max_base_length:
|
|
336
355
|
base_job_name = base_job_name[:max_base_length]
|
|
356
|
+
logger.info(
|
|
357
|
+
"Lepton job auto-generated name",
|
|
358
|
+
task_name=task.name,
|
|
359
|
+
job_name=f"{base_job_name}-{suffix}",
|
|
360
|
+
)
|
|
337
361
|
|
|
338
362
|
lepton_job_name = f"{base_job_name}-{suffix}"
|
|
363
|
+
logger.info(
|
|
364
|
+
"Lepton job name (auto-generated)",
|
|
365
|
+
task_name=task.name,
|
|
366
|
+
job_name=lepton_job_name,
|
|
367
|
+
)
|
|
339
368
|
job_ids.append(job_id)
|
|
340
369
|
lepton_job_names.append(lepton_job_name)
|
|
341
370
|
|
|
@@ -377,7 +406,12 @@ class LeptonExecutor(BaseExecutor):
|
|
|
377
406
|
cfg.target.api_endpoint.url = full_endpoint_url
|
|
378
407
|
|
|
379
408
|
# Generate command with the correct endpoint URL
|
|
380
|
-
|
|
409
|
+
eval_command_struct = get_eval_factory_command(
|
|
410
|
+
cfg, task, task_definition
|
|
411
|
+
)
|
|
412
|
+
eval_command = eval_command_struct.cmd
|
|
413
|
+
# Debug string for explainability of some base64-parts of the command
|
|
414
|
+
eval_command_debug_comment = eval_command_struct.debug
|
|
381
415
|
|
|
382
416
|
finally:
|
|
383
417
|
# Restore original URL and struct mode
|
|
@@ -402,6 +436,7 @@ class LeptonExecutor(BaseExecutor):
|
|
|
402
436
|
task_name=task.name,
|
|
403
437
|
invocation_id=invocation_id,
|
|
404
438
|
eval_command=eval_command, # Pass the fixed command
|
|
439
|
+
eval_command_debug_comment=eval_command_debug_comment,
|
|
405
440
|
)
|
|
406
441
|
|
|
407
442
|
# Prepare job command to run the launch script
|
|
@@ -705,6 +740,7 @@ def _create_evaluation_launch_script(
|
|
|
705
740
|
task_name: str,
|
|
706
741
|
invocation_id: str,
|
|
707
742
|
eval_command: str,
|
|
743
|
+
eval_command_debug_comment: str,
|
|
708
744
|
) -> str:
|
|
709
745
|
"""Create bash script for running evaluation in Lepton job container.
|
|
710
746
|
|
|
@@ -718,6 +754,7 @@ def _create_evaluation_launch_script(
|
|
|
718
754
|
task_name: Name of the evaluation task.
|
|
719
755
|
invocation_id: Unique invocation identifier.
|
|
720
756
|
eval_command: The evaluation command with correct endpoint URL.
|
|
757
|
+
eval_command_debug_comment: The debug comment for placing into the script and easy debug
|
|
721
758
|
|
|
722
759
|
Returns:
|
|
723
760
|
String containing the bash launch script.
|
|
@@ -750,6 +787,8 @@ echo "Invocation ID: {invocation_id}"
|
|
|
750
787
|
echo "Endpoint URL: {endpoint_url}"
|
|
751
788
|
echo "Command: {eval_command_modified}"
|
|
752
789
|
|
|
790
|
+
{eval_command_debug_comment}
|
|
791
|
+
|
|
753
792
|
# Execute the evaluation with proper error handling
|
|
754
793
|
set +e
|
|
755
794
|
{eval_command_modified}
|
|
@@ -773,6 +812,82 @@ exit 0
|
|
|
773
812
|
return script
|
|
774
813
|
|
|
775
814
|
|
|
815
|
+
def _dry_run_lepton(
    cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
) -> None:
    """Validate a Lepton run configuration and print a dry-run summary.

    Checks performed, in order:
      1. every task in ``cfg.evaluation.tasks`` resolves through
         ``get_task_from_mapping``;
      2. when ``deployment.type == "none"``, ``target.api_endpoint.url`` is
         set; otherwise ``deployment.endpoints`` holds a "/"-prefixed path for
         each task's ``endpoint_type``;
      3. every entry of a task's ``required_env_vars`` is present under
         ``execution.lepton_platform.tasks.env_vars`` (``API_KEY`` may instead
         be satisfied by ``target.api_endpoint.api_key_name``).

    Validation problems are not propagated to the caller: the first failure is
    printed, logged via ``logger.error`` and the function returns.

    Args:
        cfg: Run configuration to validate.
        tasks_mapping: Mapping handed to ``get_task_from_mapping`` to resolve
            task definitions.
        invocation_id: When given, used as the last component of the printed
            output directory; otherwise a ``<invocation_id>`` placeholder is
            printed.
    """
    print("DRY RUN: Lepton job configurations prepared")
    try:
        # validate tasks: resolve every definition once, up front, and reuse it
        # in the checks below instead of looking each task up repeatedly.
        resolved_tasks = [
            (task, get_task_from_mapping(task.name, tasks_mapping))
            for task in cfg.evaluation.tasks
        ]

        # nice-to-have checks (existing endpoint URL or endpoints mapping)
        if getattr(cfg.deployment, "type", None) == "none":
            tgt = getattr(cfg, "target", {})
            api = (
                tgt.get("api_endpoint")
                if isinstance(tgt, dict)
                else getattr(tgt, "api_endpoint", None)
            ) or {}
            url = api.get("url") if isinstance(api, dict) else getattr(api, "url", None)
            if not url or not str(url).strip():
                raise ValueError(
                    "target.api_endpoint.url must be set when deployment.type == 'none'"
                )
        else:
            endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
            for task, td in resolved_tasks:
                etype = td.get("endpoint_type")
                if etype not in endpoints_cfg:
                    raise ValueError(
                        f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
                    )
                path = endpoints_cfg.get(etype)
                if not isinstance(path, str) or not path.startswith("/"):
                    raise ValueError(
                        f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
                    )

        # lepton env var presence (reference-level)
        # `or {}` also covers keys that exist but are set to null, which would
        # otherwise surface as an unrelated "'NoneType' has no attribute 'get'".
        lepton_platform = getattr(cfg.execution, "lepton_platform", None) or {}
        tasks_cfg = lepton_platform.get("tasks", {}) or {}
        lepton_env_vars = tasks_cfg.get("env_vars", {}) or {}
        target_cfg = getattr(cfg, "target", None) or {}
        api_key_name = getattr(
            target_cfg.get("api_endpoint", {}), "api_key_name", None
        )
        for task, td in resolved_tasks:
            required = td.get("required_env_vars", []) or []
            for var in required:
                if var == "API_KEY":
                    # API_KEY may come from either the Lepton env vars or the
                    # target's api_key_name.
                    if not ("API_KEY" in lepton_env_vars or api_key_name):
                        raise ValueError(
                            f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
                            "or target.api_endpoint.api_key_name"
                        )
                elif var not in lepton_env_vars:
                    raise ValueError(
                        f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
                    )

        # success (use realized output directory if invocation_id is available)
        preview_output_dir = Path(cfg.execution.output_dir).absolute() / (
            invocation_id or "<invocation_id>"
        )
        print(f" - Tasks: {len(cfg.evaluation.tasks)}")
        for idx, task in enumerate(cfg.evaluation.tasks):
            print(f" - Task {idx}: {task.name}")
        print(f" - Output directory: {preview_output_dir}")
        print("\nTo run evaluation, execute run command without --dry-run")
    except Exception as e:
        # Dry-run is report-only: surface the problem to the user and the log
        # instead of raising.
        print(f"❌ Configuration invalid: {e}")
        logger.error("Lepton dry-run validation failed", error=str(e))
|
|
889
|
+
|
|
890
|
+
|
|
776
891
|
def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
|
|
777
892
|
"""Helper method that returns statuses if id is the invocation id"""
|
|
778
893
|
jobs = db.get_jobs(id)
|
|
@@ -47,6 +47,7 @@ from nemo_evaluator_launcher.common.mapping import (
|
|
|
47
47
|
get_task_from_mapping,
|
|
48
48
|
load_tasks_mapping,
|
|
49
49
|
)
|
|
50
|
+
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
|
|
50
51
|
from nemo_evaluator_launcher.executors.base import (
|
|
51
52
|
BaseExecutor,
|
|
52
53
|
ExecutionState,
|
|
@@ -155,6 +156,16 @@ class LocalExecutor(BaseExecutor):
|
|
|
155
156
|
|
|
156
157
|
task_output_dir = output_dir / task.name
|
|
157
158
|
task_output_dir.mkdir(parents=True, exist_ok=True)
|
|
159
|
+
eval_factory_command_struct = get_eval_factory_command(
|
|
160
|
+
cfg, task, task_definition
|
|
161
|
+
)
|
|
162
|
+
eval_factory_command = eval_factory_command_struct.cmd
|
|
163
|
+
# The debug comment for placing into the script and easy debug. Reason
|
|
164
|
+
# (see `CmdAndReadableComment`) is the current way of passing the command
|
|
165
|
+
# is base64-encoded config `echo`-ed into file.
|
|
166
|
+
# TODO(agronskiy): cleaner way is to encode everything with base64, not
|
|
167
|
+
# some parts (like ef_config.yaml) and just output as logs somewhere.
|
|
168
|
+
eval_factory_command_debug_comment = eval_factory_command_struct.debug
|
|
158
169
|
evaluation_task = {
|
|
159
170
|
"name": task.name,
|
|
160
171
|
"job_id": job_id,
|
|
@@ -162,9 +173,8 @@ class LocalExecutor(BaseExecutor):
|
|
|
162
173
|
"container_name": container_name,
|
|
163
174
|
"env_vars": env_vars,
|
|
164
175
|
"output_dir": task_output_dir,
|
|
165
|
-
"eval_factory_command":
|
|
166
|
-
|
|
167
|
-
),
|
|
176
|
+
"eval_factory_command": eval_factory_command,
|
|
177
|
+
"eval_factory_command_debug_comment": eval_factory_command_debug_comment,
|
|
168
178
|
}
|
|
169
179
|
evaluation_tasks.append(evaluation_task)
|
|
170
180
|
|
|
@@ -198,23 +208,28 @@ class LocalExecutor(BaseExecutor):
|
|
|
198
208
|
)
|
|
199
209
|
|
|
200
210
|
if dry_run:
|
|
201
|
-
print("\n\n=============================================\n\n")
|
|
202
|
-
print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
|
|
211
|
+
print(bold("\n\n=============================================\n\n"))
|
|
212
|
+
print(bold(cyan(f"DRY RUN: Scripts prepared and saved to {output_dir}")))
|
|
203
213
|
if is_execution_mode_sequential:
|
|
204
214
|
print(
|
|
205
|
-
|
|
215
|
+
cyan(
|
|
216
|
+
"\n\n=========== Main script | run_all.sequential.sh =====================\n\n"
|
|
217
|
+
)
|
|
206
218
|
)
|
|
219
|
+
|
|
207
220
|
with open(output_dir / "run_all.sequential.sh", "r") as f:
|
|
208
|
-
print(f.read())
|
|
221
|
+
print(grey(f.read()))
|
|
209
222
|
else:
|
|
210
223
|
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
211
224
|
task_output_dir = output_dir / task.name
|
|
212
225
|
print(
|
|
213
|
-
|
|
226
|
+
cyan(
|
|
227
|
+
f"\n\n=========== Task script | {task.name}/run.sh =====================\n\n"
|
|
228
|
+
)
|
|
214
229
|
)
|
|
215
230
|
with open(task_output_dir / "run.sh", "r") as f:
|
|
216
|
-
print(f.read())
|
|
217
|
-
print("\nTo execute, run without --dry-run")
|
|
231
|
+
print(grey(f.read()))
|
|
232
|
+
print(bold("\nTo execute, run without --dry-run"))
|
|
218
233
|
return invocation_id
|
|
219
234
|
|
|
220
235
|
# Save launched jobs metadata
|
|
@@ -284,13 +299,13 @@ class LocalExecutor(BaseExecutor):
|
|
|
284
299
|
error_msg = f"Script for {name} exited with code {exit_code}"
|
|
285
300
|
raise RuntimeError(f"Job startup failed | {error_msg}")
|
|
286
301
|
|
|
287
|
-
print("\nCommands for real-time monitoring:")
|
|
302
|
+
print(bold(cyan("\nCommands for real-time monitoring:")))
|
|
288
303
|
for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
|
|
289
304
|
log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
|
|
290
305
|
print(f" tail -f {log_file}")
|
|
291
306
|
|
|
292
|
-
print("\nFollow all logs for this invocation:")
|
|
293
|
-
print(f" tail -f {output_dir}/*/logs/stdout.log")
|
|
307
|
+
print(bold(cyan("\nFollow all logs for this invocation:")))
|
|
308
|
+
print(f" tail -f {output_dir}/*/logs/stdout.log\n")
|
|
294
309
|
|
|
295
310
|
return invocation_id
|
|
296
311
|
|
|
@@ -40,6 +40,9 @@ else
|
|
|
40
40
|
# Create pre-start stage file
|
|
41
41
|
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
|
|
42
42
|
|
|
43
|
+
# Debug contents of the eval factory command's config
|
|
44
|
+
{{ task.eval_factory_command_debug_comment | indent(4) }}
|
|
45
|
+
|
|
43
46
|
# Docker run with eval factory command
|
|
44
47
|
(
|
|
45
48
|
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
|
|
@@ -51,7 +54,7 @@ else
|
|
|
51
54
|
{% endfor -%}
|
|
52
55
|
{{ task.eval_image }} \
|
|
53
56
|
bash -c '
|
|
54
|
-
{{ task.eval_factory_command }} ;
|
|
57
|
+
{{ task.eval_factory_command | indent(8) }} ;
|
|
55
58
|
exit_code=$?
|
|
56
59
|
chmod 777 -R /results;
|
|
57
60
|
if [ "$exit_code" -ne 0 ]; then
|
|
@@ -50,6 +50,7 @@ from nemo_evaluator_launcher.common.mapping import (
|
|
|
50
50
|
get_task_from_mapping,
|
|
51
51
|
load_tasks_mapping,
|
|
52
52
|
)
|
|
53
|
+
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
|
|
53
54
|
from nemo_evaluator_launcher.executors.base import (
|
|
54
55
|
BaseExecutor,
|
|
55
56
|
ExecutionState,
|
|
@@ -130,13 +131,13 @@ class SlurmExecutor(BaseExecutor):
|
|
|
130
131
|
remote_runsub_paths.append(remote_runsub_path)
|
|
131
132
|
|
|
132
133
|
if dry_run:
|
|
133
|
-
print("\n\n=============================================\n\n")
|
|
134
|
-
print("DRY RUN: SLURM scripts prepared")
|
|
134
|
+
print(bold("\n\n=============================================\n\n"))
|
|
135
|
+
print(bold(cyan("DRY RUN: SLURM scripts prepared")))
|
|
135
136
|
for idx, local_runsub_path in enumerate(local_runsub_paths):
|
|
136
|
-
print(f"\n\n
|
|
137
|
+
print(cyan(f"\n\n=========== Task {idx} =====================\n\n"))
|
|
137
138
|
with open(local_runsub_path, "r") as f:
|
|
138
|
-
print(f.read())
|
|
139
|
-
print("
|
|
139
|
+
print(grey(f.read()))
|
|
140
|
+
print(bold("To submit jobs") + ", run the executor without --dry-run")
|
|
140
141
|
return invocation_id
|
|
141
142
|
|
|
142
143
|
socket = str(Path(tmpdirname) / "socket")
|
|
@@ -589,7 +590,20 @@ def _create_slurm_sbatch_script(
|
|
|
589
590
|
):
|
|
590
591
|
evaluation_mounts_list.append(f"{source_mnt}:{target_mnt}")
|
|
591
592
|
|
|
593
|
+
eval_factory_command_struct = get_eval_factory_command(cfg, task, task_definition)
|
|
594
|
+
eval_factory_command = eval_factory_command_struct.cmd
|
|
595
|
+
# The debug comment for placing into the script and easy debug. Reason
|
|
596
|
+
# (see `CmdAndReadableComment`) is the current way of passing the command
|
|
597
|
+
# is base64-encoded config `echo`-ed into file.
|
|
598
|
+
# TODO(agronskiy): cleaner way is to encode everything with base64, not
|
|
599
|
+
# some parts (like ef_config.yaml) and just output as logs somewhere.
|
|
600
|
+
eval_factory_command_debug_comment = eval_factory_command_struct.debug
|
|
601
|
+
|
|
592
602
|
# add evaluation srun command
|
|
603
|
+
s += "# Debug contents of the eval factory command's config\n"
|
|
604
|
+
s += eval_factory_command_debug_comment
|
|
605
|
+
s += "\n\n"
|
|
606
|
+
|
|
593
607
|
s += "# evaluation client\n"
|
|
594
608
|
s += "srun --mpi pmix --overlap "
|
|
595
609
|
s += "--container-image {} ".format(eval_image)
|
|
@@ -600,10 +614,11 @@ def _create_slurm_sbatch_script(
|
|
|
600
614
|
s += "--container-env {} ".format(",".join(evaluation_env_var_names))
|
|
601
615
|
if not cfg.execution.get("mounts", {}).get("mount_home", True):
|
|
602
616
|
s += "--no-container-mount-home "
|
|
617
|
+
|
|
603
618
|
s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
|
|
604
619
|
s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
|
|
605
|
-
s += "bash -c '"
|
|
606
|
-
s +=
|
|
620
|
+
s += "bash -c '\n"
|
|
621
|
+
s += eval_factory_command
|
|
607
622
|
s += "'\n\n"
|
|
608
623
|
|
|
609
624
|
# terminate the server after all evaluation clients finish
|
{nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/RECORD
RENAMED
|
@@ -1,24 +1,25 @@
|
|
|
1
1
|
nemo_evaluator_launcher/__init__.py,sha256=2F703fttLaIyMHoVD54rptHMXt4AWnplHDrwWJ3e3PM,1930
|
|
2
|
-
nemo_evaluator_launcher/package_info.py,sha256=
|
|
2
|
+
nemo_evaluator_launcher/package_info.py,sha256=praNLExAuGkNXRb2vVR5Kn_A9aqyBy60Pwx9DiOg-Pk,1586
|
|
3
3
|
nemo_evaluator_launcher/api/__init__.py,sha256=U9q_MJK2vRsFaymanhyy0nD1SNAZQZC8oY45RXPX7ac,1024
|
|
4
4
|
nemo_evaluator_launcher/api/functional.py,sha256=T1HTIeiTXb-APWP7lPPTwFam4vFOApZCScRi6tMp538,27648
|
|
5
|
-
nemo_evaluator_launcher/api/types.py,sha256=
|
|
5
|
+
nemo_evaluator_launcher/api/types.py,sha256=W7ZQ9ZTPR6YxInxxsKE6NxuuQAg4pVYz6SRmFCFxY0A,3635
|
|
6
6
|
nemo_evaluator_launcher/api/utils.py,sha256=q5HArRj7PKgBfeH3bOX8q1U97yMyQQp72yRRA5JP9PE,818
|
|
7
7
|
nemo_evaluator_launcher/cli/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
8
|
-
nemo_evaluator_launcher/cli/debug.py,sha256=uJurq0yWzUnMy3S2hCX6msksPtQAQnon11GJlWfy_-4,16506
|
|
9
8
|
nemo_evaluator_launcher/cli/export.py,sha256=GRXxusKDq_1qjMKN6MKOIjZ8x4u5ERgXwHSAGrvsGCY,11211
|
|
9
|
+
nemo_evaluator_launcher/cli/info.py,sha256=2dZA2BqXpTG1wO_Wzt6Ol9ZNJzJJ0PibOB0hLFZpL14,20705
|
|
10
10
|
nemo_evaluator_launcher/cli/kill.py,sha256=C-4PWmMu8mIITo92o5AHxtq_s-8Cckbp7wAlG0I_ylw,1323
|
|
11
11
|
nemo_evaluator_launcher/cli/ls_runs.py,sha256=vJTwRdhVKLolnJuP8AnnQdJBE-BKZfCcCypLKSz5gqs,4942
|
|
12
12
|
nemo_evaluator_launcher/cli/ls_tasks.py,sha256=Pd2lBQOQBNHBWrjk4tZg0SQ9Ul9F2Ak-zOyh-G9x-DY,5293
|
|
13
|
-
nemo_evaluator_launcher/cli/main.py,sha256=
|
|
14
|
-
nemo_evaluator_launcher/cli/run.py,sha256=
|
|
15
|
-
nemo_evaluator_launcher/cli/status.py,sha256=
|
|
13
|
+
nemo_evaluator_launcher/cli/main.py,sha256=MQo-DcVF_f0aBgnVUic5PP1qlfXXs0Evkd1X4EXdOvA,7217
|
|
14
|
+
nemo_evaluator_launcher/cli/run.py,sha256=3-C_GvaIg9IxqpvC4P3h3lHcHdjB94Zpgq0ccOXVcpw,7503
|
|
15
|
+
nemo_evaluator_launcher/cli/status.py,sha256=ANdu0JYnfKNvd1gXmdu_0FrbPG-g0A_R4leOuNXzenQ,5947
|
|
16
16
|
nemo_evaluator_launcher/cli/version.py,sha256=puMwIvkmfD3HESjftdTSP6T3Nc8J4cbz8uXWHJcTemY,2030
|
|
17
17
|
nemo_evaluator_launcher/common/__init__.py,sha256=6-xb4KpG8-lZbWBI42c_Gax-Sq0kMSW8UG0Vn8dOBlo,744
|
|
18
18
|
nemo_evaluator_launcher/common/execdb.py,sha256=WPzg5Iu2ojvFpBuYahSt3voP_iEUpoO8NgqMLUBwFxA,9767
|
|
19
|
-
nemo_evaluator_launcher/common/helpers.py,sha256=
|
|
19
|
+
nemo_evaluator_launcher/common/helpers.py,sha256=ZB4gTJQLGD33Rw93phFxsFnS466SUyMZdcsiZ1pBlUk,8842
|
|
20
20
|
nemo_evaluator_launcher/common/logging_utils.py,sha256=8UMAQ22t5NAJRDZtI0gVbdKUlNAiG23WQwZZ0HwzOT4,11843
|
|
21
21
|
nemo_evaluator_launcher/common/mapping.py,sha256=tD3jWN7rm9-iJEFlENhYMt7adz8DKs67g3Xd43XIAMM,10731
|
|
22
|
+
nemo_evaluator_launcher/common/printing_utils.py,sha256=YICPY-KhxjL5QNEFJNvYfnj6_ArkZURDdP1pizqY-yU,2368
|
|
22
23
|
nemo_evaluator_launcher/configs/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
23
24
|
nemo_evaluator_launcher/configs/default.yaml,sha256=JHFjSl3KByhggRMTTo9nesQATVoz18PJIV6KM5Wng64,974
|
|
24
25
|
nemo_evaluator_launcher/configs/deployment/generic.yaml,sha256=8_Z0fcjZuH6GfV9jJkY_8CS18Tbsn0gV-nK1LfGr_Vg,1262
|
|
@@ -29,19 +30,19 @@ nemo_evaluator_launcher/configs/deployment/trtllm.yaml,sha256=nZF1ueCF9uMticElge
|
|
|
29
30
|
nemo_evaluator_launcher/configs/deployment/vllm.yaml,sha256=NOAfK7hALTeVxjNJ63zEkb0vnefeRuAQCh6OZwe18pA,1476
|
|
30
31
|
nemo_evaluator_launcher/configs/execution/local.yaml,sha256=0XtVXHeCK-zAoTURxnf4pr9RCe46HUCpKVzcfPlskz4,740
|
|
31
32
|
nemo_evaluator_launcher/configs/execution/lepton/default.yaml,sha256=SRSMxohEtafzb-QS_oz4kP-RBgigRCAZgYwKkXtjymY,2930
|
|
32
|
-
nemo_evaluator_launcher/configs/execution/slurm/default.yaml,sha256=
|
|
33
|
+
nemo_evaluator_launcher/configs/execution/slurm/default.yaml,sha256=zTevKSKch7yl4D0wGh4BjDBuI0YAiJSljVKwD7G8Cww,1328
|
|
33
34
|
nemo_evaluator_launcher/executors/__init__.py,sha256=mSU1op5r7R_vqOCLDP84z6utfFgXOIl_1vBzN7KOC6o,1042
|
|
34
35
|
nemo_evaluator_launcher/executors/base.py,sha256=4BOz2-jMG1OJ-5o5qCh-SJqLUE64YJWlnmB9hc1p4Pc,4040
|
|
35
36
|
nemo_evaluator_launcher/executors/registry.py,sha256=8QXSrsJyHeNi8iSttJ8KWQLXmZve1vxnnCNw_CkeopI,1409
|
|
36
37
|
nemo_evaluator_launcher/executors/lepton/__init__.py,sha256=F_7yuBaYQ6WWTcptADdkL3AIZ_jXJQHGgKag-Hm7BbQ,698
|
|
37
38
|
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py,sha256=AAIlHHn-WifevNosug0DlSDLN6NtjkclEu5LHyu1xq8,21799
|
|
38
|
-
nemo_evaluator_launcher/executors/lepton/executor.py,sha256=
|
|
39
|
+
nemo_evaluator_launcher/executors/lepton/executor.py,sha256=BeKZFflrooh_gbGoujY-cKOkarS-_VI0AoER91t5zvA,40240
|
|
39
40
|
nemo_evaluator_launcher/executors/lepton/job_helpers.py,sha256=6baTxcygfP1oFgAJ7I9EL4xRlcJDWqbqzZoE1CRrwSk,13528
|
|
40
41
|
nemo_evaluator_launcher/executors/local/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
41
|
-
nemo_evaluator_launcher/executors/local/executor.py,sha256=
|
|
42
|
-
nemo_evaluator_launcher/executors/local/run.template.sh,sha256=
|
|
42
|
+
nemo_evaluator_launcher/executors/local/executor.py,sha256=OUJ4B8qvE_mz9tXTYkwDqEjtOLtJzzbvOHDW35LYR4U,21431
|
|
43
|
+
nemo_evaluator_launcher/executors/local/run.template.sh,sha256=uPofo2g8f7LsS0uL0YK0Y5YFFCQ5fAeiAkYMGBTalpg,3966
|
|
43
44
|
nemo_evaluator_launcher/executors/slurm/__init__.py,sha256=lNC_skFLYTOt-arnY3ZQnZMWzHlrtD2wAoHvDcHddwM,673
|
|
44
|
-
nemo_evaluator_launcher/executors/slurm/executor.py,sha256=
|
|
45
|
+
nemo_evaluator_launcher/executors/slurm/executor.py,sha256=J_0sYzCcbcGzitf_oC9ncguoD2eG-K3osJxmZTW-9tE,40096
|
|
45
46
|
nemo_evaluator_launcher/exporters/__init__.py,sha256=mBXG9FG48FeYrs8sF0zA2mgo1eqBmRgoml7zjJrqDso,1323
|
|
46
47
|
nemo_evaluator_launcher/exporters/base.py,sha256=0BEqS-Zjez-KsrGE9yfo8S5w2uwMW3btBZve3SiiUp0,4307
|
|
47
48
|
nemo_evaluator_launcher/exporters/gsheets.py,sha256=hBOL3vaomCW2fPMDEOQWkZkFCgF4jCoS4U5ZlsNVENs,15911
|
|
@@ -51,9 +52,9 @@ nemo_evaluator_launcher/exporters/registry.py,sha256=XsPTv_SBAFjcErO6BJ3OHqs3EvX
|
|
|
51
52
|
nemo_evaluator_launcher/exporters/utils.py,sha256=XZVgTDmoa20tjEMwez0oUSpYpjt3ILV75D4KWuHtZ80,23119
|
|
52
53
|
nemo_evaluator_launcher/exporters/wandb.py,sha256=1qRUV_YE1Ury7rH7KH65AabR7gmEQ38kXBh2XrfiEpE,18082
|
|
53
54
|
nemo_evaluator_launcher/resources/mapping.toml,sha256=uOg4Y-gDXXskbbba2vuwJ5FLJ3W0kSZz7Fap_nJnFQc,11322
|
|
54
|
-
nemo_evaluator_launcher-0.1.
|
|
55
|
-
nemo_evaluator_launcher-0.1.
|
|
56
|
-
nemo_evaluator_launcher-0.1.
|
|
57
|
-
nemo_evaluator_launcher-0.1.
|
|
58
|
-
nemo_evaluator_launcher-0.1.
|
|
59
|
-
nemo_evaluator_launcher-0.1.
|
|
55
|
+
nemo_evaluator_launcher-0.1.18.dist-info/licenses/LICENSE,sha256=DyGb0fqHPZAsd_uXHA0DGcOCqsvrNsImuLC0Ts4s1zI,23413
|
|
56
|
+
nemo_evaluator_launcher-0.1.18.dist-info/METADATA,sha256=QnzvHFzKYVGHhjKGNp55ErRQbG_FIscLpOyAhwgTS4U,28725
|
|
57
|
+
nemo_evaluator_launcher-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
58
|
+
nemo_evaluator_launcher-0.1.18.dist-info/entry_points.txt,sha256=64z1T5GKSB9PW1fCENQuor6X6eqH1rcfg0NQGfKrEy8,130
|
|
59
|
+
nemo_evaluator_launcher-0.1.18.dist-info/top_level.txt,sha256=5PvawNm9TXKqPRjZita1xPOtFiMOipcoRf50FI1iY3s,24
|
|
60
|
+
nemo_evaluator_launcher-0.1.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nemo_evaluator_launcher-0.1.16.dist-info → nemo_evaluator_launcher-0.1.18.dist-info}/top_level.txt
RENAMED
|
File without changes
|