nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. nemo_evaluator_launcher/api/functional.py +105 -1
  2. nemo_evaluator_launcher/cli/logs.py +102 -0
  3. nemo_evaluator_launcher/cli/main.py +12 -0
  4. nemo_evaluator_launcher/cli/run.py +73 -15
  5. nemo_evaluator_launcher/cli/version.py +26 -23
  6. nemo_evaluator_launcher/common/helpers.py +176 -43
  7. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  8. nemo_evaluator_launcher/common/printing_utils.py +7 -0
  9. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  10. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
  11. nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
  12. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +14 -0
  13. nemo_evaluator_launcher/executors/base.py +31 -1
  14. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
  15. nemo_evaluator_launcher/executors/lepton/executor.py +81 -1
  16. nemo_evaluator_launcher/executors/local/executor.py +377 -22
  17. nemo_evaluator_launcher/executors/local/run.template.sh +54 -2
  18. nemo_evaluator_launcher/executors/slurm/executor.py +422 -59
  19. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  20. nemo_evaluator_launcher/exporters/utils.py +32 -46
  21. nemo_evaluator_launcher/package_info.py +1 -1
  22. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  23. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/METADATA +3 -3
  24. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/RECORD +28 -26
  25. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  26. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  27. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  28. {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,20 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
22
  killed_jobs_file="$script_dir/killed_jobs.txt"
23
23
  rm -f "$killed_jobs_file"
24
24
 
25
+ # Create all directories and stdout.log files upfront before any container starts
26
+ {% for task in evaluation_tasks %}
27
+ task_dir="{{ task.output_dir }}"
28
+ artifacts_dir="$task_dir/artifacts"
29
+ logs_dir="$task_dir/logs"
30
+
31
+ mkdir -m 777 -p "$task_dir"
32
+ mkdir -m 777 -p "$artifacts_dir"
33
+ mkdir -m 777 -p "$logs_dir"
34
+ # Create stdout.log file upfront
35
+ touch "$logs_dir/client_stdout.log"
36
+ chmod 666 "$logs_dir/client_stdout.log"
37
+ {% endfor %}
38
+
25
39
  {% for task in evaluation_tasks %}
26
40
  # {{ task.job_id }} {{ task.name }}
27
41
 
@@ -46,9 +60,41 @@ else
46
60
  # Docker run with eval factory command
47
61
  (
48
62
  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
63
+ {% if task.deployment %}
64
+ docker run --rm --shm-size=100g --gpus all {{ task.deployment.extra_docker_args }} \
65
+ --name {{ task.deployment.container_name }} --entrypoint '' \
66
+ -p {{ task.deployment.port }}:{{ task.deployment.port }} \
67
+ {% for env_var in task.deployment.env_vars -%}
68
+ -e {{ env_var }} \
69
+ {% endfor -%}
70
+ {% for mount in task.deployment.mounts -%}
71
+ -v {{ mount }} \
72
+ {% endfor -%}
73
+ {{ task.deployment.image }} \
74
+ {{ task.deployment.command }} > "$logs_dir/server_stdout.log" 2>&1 &
75
+
76
+ SERVER_PID=$!
77
+ SERVER_CONTAINER_NAME="{{ task.deployment.container_name }}"
78
+
79
+ date
80
+ # wait for the server to initialize
81
+ TIMEOUT=600
82
+ ELAPSED=0
83
+ while [[ "$(curl -s -o /dev/null -w "%{http_code}" {{ task.deployment.health_url }})" != "200" ]]; do
84
+ kill -0 $SERVER_PID 2>/dev/null || { echo "Server process $SERVER_PID died"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
85
+ [ $ELAPSED -ge $TIMEOUT ] && { echo "Health check timeout after ${TIMEOUT}s"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
86
+ sleep 5
87
+ ELAPSED=$((ELAPSED + 5))
88
+ done
89
+ date
90
+
91
+ {% endif %}
49
92
  docker run --rm --shm-size=100g {{ extra_docker_args }} \
50
- --name {{ task.container_name }} \
93
+ {% if task.deployment %}--network container:$SERVER_CONTAINER_NAME \{% endif %}--name {{ task.client_container_name }} \
51
94
  --volume "$artifacts_dir":/results \
95
+ {% if task.dataset_mount_host and task.dataset_mount_container -%}
96
+ --volume "{{ task.dataset_mount_host }}:{{ task.dataset_mount_container }}" \
97
+ {% endif -%}
52
98
  {% for env_var in task.env_vars -%}
53
99
  -e {{ env_var }} \
54
100
  {% endfor -%}
@@ -63,8 +109,14 @@ else
63
109
  fi;
64
110
  echo "Container completed successfully" >&2;
65
111
  exit 0;
66
- ' > "$logs_dir/stdout.log" 2>&1
112
+ ' > "$logs_dir/client_stdout.log" 2>&1
67
113
  exit_code=$?
114
+
115
+ {% if task.deployment %}
116
+ # Stop the server
117
+ docker stop $SERVER_CONTAINER_NAME 2>/dev/null || true
118
+ {% endif %}
119
+
68
120
  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $exit_code" > "$logs_dir/stage.exit"
69
121
  ) >> "$logs_dir/stdout.log" 2>&1
70
122