nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
|
@@ -17,6 +17,25 @@
|
|
|
17
17
|
# check if docker exists
|
|
18
18
|
command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }
|
|
19
19
|
|
|
20
|
+
# Initialize: remove killed jobs file from previous runs
|
|
21
|
+
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
22
|
+
killed_jobs_file="$script_dir/killed_jobs.txt"
|
|
23
|
+
rm -f "$killed_jobs_file"
|
|
24
|
+
|
|
25
|
+
# Create all directories and client_stdout.log files upfront before any container starts
|
|
26
|
+
{% for task in evaluation_tasks %}
|
|
27
|
+
task_dir="{{ task.output_dir }}"
|
|
28
|
+
artifacts_dir="$task_dir/artifacts"
|
|
29
|
+
logs_dir="$task_dir/logs"
|
|
30
|
+
|
|
31
|
+
mkdir -m 777 -p "$task_dir"
|
|
32
|
+
mkdir -m 777 -p "$artifacts_dir"
|
|
33
|
+
mkdir -m 777 -p "$logs_dir"
|
|
34
|
+
# Create client_stdout.log file upfront
|
|
35
|
+
touch "$logs_dir/client_stdout.log"
|
|
36
|
+
chmod 666 "$logs_dir/client_stdout.log"
|
|
37
|
+
{% endfor %}
|
|
38
|
+
|
|
20
39
|
{% for task in evaluation_tasks %}
|
|
21
40
|
# {{ task.job_id }} {{ task.name }}
|
|
22
41
|
|
|
@@ -28,21 +47,60 @@ mkdir -m 777 -p "$task_dir"
|
|
|
28
47
|
mkdir -m 777 -p "$artifacts_dir"
|
|
29
48
|
mkdir -m 777 -p "$logs_dir"
|
|
30
49
|
|
|
31
|
-
#
|
|
32
|
-
|
|
50
|
+
# Check if this job was killed
|
|
51
|
+
if [ -f "$killed_jobs_file" ] && grep -q "^{{ task.job_id }}$" "$killed_jobs_file"; then
|
|
52
|
+
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} ({{ task.name }}) was killed, skipping execution" | tee -a "$logs_dir/stdout.log"
|
|
53
|
+
else
|
|
54
|
+
# Create pre-start stage file
|
|
55
|
+
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
|
|
33
56
|
|
|
34
|
-
#
|
|
35
|
-
(
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
57
|
+
# Debug contents of the eval factory command's config
|
|
58
|
+
{{ task.eval_factory_command_debug_comment | indent(4) }}
|
|
59
|
+
|
|
60
|
+
# Docker run with eval factory command
|
|
61
|
+
(
|
|
62
|
+
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
|
|
63
|
+
{% if task.deployment %}
|
|
64
|
+
docker run --rm --shm-size=100g --gpus all {{ task.deployment.extra_docker_args }} \
|
|
65
|
+
--name {{ task.deployment.container_name }} --entrypoint '' \
|
|
66
|
+
-p {{ task.deployment.port }}:{{ task.deployment.port }} \
|
|
67
|
+
{% for env_var in task.deployment.env_vars -%}
|
|
68
|
+
-e {{ env_var }} \
|
|
69
|
+
{% endfor -%}
|
|
70
|
+
{% for mount in task.deployment.mounts -%}
|
|
71
|
+
-v {{ mount }} \
|
|
72
|
+
{% endfor -%}
|
|
73
|
+
{{ task.deployment.image }} \
|
|
74
|
+
{{ task.deployment.command }} > "$logs_dir/server_stdout.log" 2>&1 &
|
|
75
|
+
|
|
76
|
+
SERVER_PID=$!
|
|
77
|
+
SERVER_CONTAINER_NAME="{{ task.deployment.container_name }}"
|
|
78
|
+
|
|
79
|
+
date
|
|
80
|
+
# wait for the server to initialize
|
|
81
|
+
TIMEOUT=600
|
|
82
|
+
ELAPSED=0
|
|
83
|
+
while [[ "$(curl -s -o /dev/null -w "%{http_code}" {{ task.deployment.health_url }})" != "200" ]]; do
|
|
84
|
+
kill -0 $SERVER_PID 2>/dev/null || { echo "Server process $SERVER_PID died"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
|
|
85
|
+
[ $ELAPSED -ge $TIMEOUT ] && { echo "Health check timeout after ${TIMEOUT}s"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
|
|
86
|
+
sleep 5
|
|
87
|
+
ELAPSED=$((ELAPSED + 5))
|
|
88
|
+
done
|
|
89
|
+
date
|
|
90
|
+
|
|
91
|
+
{% endif %}
|
|
92
|
+
docker run --rm --shm-size=100g {{ extra_docker_args }} \
|
|
93
|
+
{% if task.deployment %}--network container:$SERVER_CONTAINER_NAME \{% endif %}--name {{ task.client_container_name }} \
|
|
39
94
|
--volume "$artifacts_dir":/results \
|
|
95
|
+
{% if task.dataset_mount_host and task.dataset_mount_container -%}
|
|
96
|
+
--volume "{{ task.dataset_mount_host }}:{{ task.dataset_mount_container }}" \
|
|
97
|
+
{% endif -%}
|
|
40
98
|
{% for env_var in task.env_vars -%}
|
|
41
99
|
-e {{ env_var }} \
|
|
42
100
|
{% endfor -%}
|
|
43
101
|
{{ task.eval_image }} \
|
|
44
102
|
bash -c '
|
|
45
|
-
{{ task.eval_factory_command }} ;
|
|
103
|
+
{{ task.eval_factory_command | indent(8) }} ;
|
|
46
104
|
exit_code=$?
|
|
47
105
|
chmod 777 -R /results;
|
|
48
106
|
if [ "$exit_code" -ne 0 ]; then
|
|
@@ -51,8 +109,14 @@ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
|
|
|
51
109
|
fi;
|
|
52
110
|
echo "Container completed successfully" >&2;
|
|
53
111
|
exit 0;
|
|
54
|
-
' > "$logs_dir/
|
|
112
|
+
' > "$logs_dir/client_stdout.log" 2>&1
|
|
55
113
|
exit_code=$?
|
|
114
|
+
|
|
115
|
+
{% if task.deployment %}
|
|
116
|
+
# Stop the server
|
|
117
|
+
docker stop $SERVER_CONTAINER_NAME 2>/dev/null || true
|
|
118
|
+
{% endif %}
|
|
119
|
+
|
|
56
120
|
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $exit_code" > "$logs_dir/stage.exit"
|
|
57
121
|
) >> "$logs_dir/stdout.log" 2>&1
|
|
58
122
|
|
|
@@ -85,4 +149,7 @@ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
|
|
|
85
149
|
)
|
|
86
150
|
|
|
87
151
|
{% endif %}
|
|
152
|
+
fi
|
|
153
|
+
|
|
154
|
+
|
|
88
155
|
{% endfor %}
|