nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,25 @@
17
17
  # check if docker exists
18
18
  command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }
19
19
 
20
+ # Initialize: remove killed jobs file from previous runs
21
+ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
+ killed_jobs_file="$script_dir/killed_jobs.txt"
23
+ rm -f "$killed_jobs_file"
24
+
25
+ # Create all directories and stdout.log files upfront before any container starts
26
+ {% for task in evaluation_tasks %}
27
+ task_dir="{{ task.output_dir }}"
28
+ artifacts_dir="$task_dir/artifacts"
29
+ logs_dir="$task_dir/logs"
30
+
31
+ mkdir -m 777 -p "$task_dir"
32
+ mkdir -m 777 -p "$artifacts_dir"
33
+ mkdir -m 777 -p "$logs_dir"
34
+ # Create stdout.log file upfront
35
+ touch "$logs_dir/client_stdout.log"
36
+ chmod 666 "$logs_dir/client_stdout.log"
37
+ {% endfor %}
38
+
20
39
  {% for task in evaluation_tasks %}
21
40
  # {{ task.job_id }} {{ task.name }}
22
41
 
@@ -28,21 +47,60 @@ mkdir -m 777 -p "$task_dir"
28
47
  mkdir -m 777 -p "$artifacts_dir"
29
48
  mkdir -m 777 -p "$logs_dir"
30
49
 
31
- # Create pre-start stage file
32
- echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
50
+ # Check if this job was killed
51
+ if [ -f "$killed_jobs_file" ] && grep -q "^{{ task.job_id }}$" "$killed_jobs_file"; then
52
+ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} ({{ task.name }}) was killed, skipping execution" | tee -a "$logs_dir/stdout.log"
53
+ else
54
+ # Create pre-start stage file
55
+ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
33
56
 
34
- # Docker run with eval factory command
35
- (
36
- echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
37
- docker run --rm --shm-size=100g \
38
- --name {{ task.container_name }} \
57
+ # Debug contents of the eval factory command's config
58
+ {{ task.eval_factory_command_debug_comment | indent(4) }}
59
+
60
+ # Docker run with eval factory command
61
+ (
62
+ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
63
+ {% if task.deployment %}
64
+ docker run --rm --shm-size=100g --gpus all {{ task.deployment.extra_docker_args }} \
65
+ --name {{ task.deployment.container_name }} --entrypoint '' \
66
+ -p {{ task.deployment.port }}:{{ task.deployment.port }} \
67
+ {% for env_var in task.deployment.env_vars -%}
68
+ -e {{ env_var }} \
69
+ {% endfor -%}
70
+ {% for mount in task.deployment.mounts -%}
71
+ -v {{ mount }} \
72
+ {% endfor -%}
73
+ {{ task.deployment.image }} \
74
+ {{ task.deployment.command }} > "$logs_dir/server_stdout.log" 2>&1 &
75
+
76
+ SERVER_PID=$!
77
+ SERVER_CONTAINER_NAME="{{ task.deployment.container_name }}"
78
+
79
+ date
80
+ # wait for the server to initialize
81
+ TIMEOUT=600
82
+ ELAPSED=0
83
+ while [[ "$(curl -s -o /dev/null -w "%{http_code}" {{ task.deployment.health_url }})" != "200" ]]; do
84
+ kill -0 $SERVER_PID 2>/dev/null || { echo "Server process $SERVER_PID died"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
85
+ [ $ELAPSED -ge $TIMEOUT ] && { echo "Health check timeout after ${TIMEOUT}s"; echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) 1" > "$logs_dir/stage.exit"; exit 1; }
86
+ sleep 5
87
+ ELAPSED=$((ELAPSED + 5))
88
+ done
89
+ date
90
+
91
+ {% endif %}
92
+ docker run --rm --shm-size=100g {{ extra_docker_args }} \
93
+ {% if task.deployment %}--network container:$SERVER_CONTAINER_NAME \{% endif %}--name {{ task.client_container_name }} \
39
94
  --volume "$artifacts_dir":/results \
95
+ {% if task.dataset_mount_host and task.dataset_mount_container -%}
96
+ --volume "{{ task.dataset_mount_host }}:{{ task.dataset_mount_container }}" \
97
+ {% endif -%}
40
98
  {% for env_var in task.env_vars -%}
41
99
  -e {{ env_var }} \
42
100
  {% endfor -%}
43
101
  {{ task.eval_image }} \
44
102
  bash -c '
45
- {{ task.eval_factory_command }} ;
103
+ {{ task.eval_factory_command | indent(8) }} ;
46
104
  exit_code=$?
47
105
  chmod 777 -R /results;
48
106
  if [ "$exit_code" -ne 0 ]; then
@@ -51,8 +109,14 @@ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
51
109
  fi;
52
110
  echo "Container completed successfully" >&2;
53
111
  exit 0;
54
- ' > "$logs_dir/stdout.log" 2>&1
112
+ ' > "$logs_dir/client_stdout.log" 2>&1
55
113
  exit_code=$?
114
+
115
+ {% if task.deployment %}
116
+ # Stop the server
117
+ docker stop $SERVER_CONTAINER_NAME 2>/dev/null || true
118
+ {% endif %}
119
+
56
120
  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $exit_code" > "$logs_dir/stage.exit"
57
121
  ) >> "$logs_dir/stdout.log" 2>&1
58
122
 
@@ -85,4 +149,7 @@ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
85
149
  )
86
150
 
87
151
  {% endif %}
152
+ fi
153
+
154
+
88
155
  {% endfor %}