nemo-evaluator-launcher 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (60) hide show
  1. nemo_evaluator_launcher/__init__.py +79 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +698 -0
  4. nemo_evaluator_launcher/api/types.py +98 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +267 -0
  8. nemo_evaluator_launcher/cli/info.py +512 -0
  9. nemo_evaluator_launcher/cli/kill.py +41 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +134 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
  12. nemo_evaluator_launcher/cli/main.py +226 -0
  13. nemo_evaluator_launcher/cli/run.py +200 -0
  14. nemo_evaluator_launcher/cli/status.py +164 -0
  15. nemo_evaluator_launcher/cli/version.py +55 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +283 -0
  18. nemo_evaluator_launcher/common/helpers.py +366 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +357 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/common/printing_utils.py +93 -0
  22. nemo_evaluator_launcher/configs/__init__.py +15 -0
  23. nemo_evaluator_launcher/configs/default.yaml +28 -0
  24. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  25. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  26. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  27. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  28. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
  29. nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
  30. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  31. nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
  32. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
  33. nemo_evaluator_launcher/executors/__init__.py +22 -0
  34. nemo_evaluator_launcher/executors/base.py +120 -0
  35. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  36. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
  37. nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
  38. nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
  39. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  40. nemo_evaluator_launcher/executors/local/executor.py +605 -0
  41. nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
  42. nemo_evaluator_launcher/executors/registry.py +38 -0
  43. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  44. nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
  45. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  46. nemo_evaluator_launcher/exporters/base.py +121 -0
  47. nemo_evaluator_launcher/exporters/gsheets.py +409 -0
  48. nemo_evaluator_launcher/exporters/local.py +502 -0
  49. nemo_evaluator_launcher/exporters/mlflow.py +619 -0
  50. nemo_evaluator_launcher/exporters/registry.py +40 -0
  51. nemo_evaluator_launcher/exporters/utils.py +624 -0
  52. nemo_evaluator_launcher/exporters/wandb.py +490 -0
  53. nemo_evaluator_launcher/package_info.py +38 -0
  54. nemo_evaluator_launcher/resources/mapping.toml +380 -0
  55. nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
  56. nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
  57. nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
  58. nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
  59. nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
  60. nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
@@ -0,0 +1,380 @@
1
+ # NOTE(agronskiy): checked parity
2
+ [lm-evaluation-harness]
3
+ container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"
4
+
5
+ [lm-evaluation-harness.tasks.chat.ifeval]
6
+ required_env_vars = []
7
+
8
+ [lm-evaluation-harness.tasks.chat.mmlu_prox]
9
+ required_env_vars = []
10
+
11
+ [lm-evaluation-harness.tasks.completions.mmlu]
12
+ required_env_vars = []
13
+
14
+ [lm-evaluation-harness.tasks.completions.mmlu_pro]
15
+
16
+ [lm-evaluation-harness.tasks.completions.global_mmlu]
17
+ [lm-evaluation-harness.tasks.completions.global_mmlu_ar]
18
+ [lm-evaluation-harness.tasks.completions.global_mmlu_bn]
19
+ [lm-evaluation-harness.tasks.completions.global_mmlu_de]
20
+ [lm-evaluation-harness.tasks.completions.global_mmlu_en]
21
+ [lm-evaluation-harness.tasks.completions.global_mmlu_es]
22
+ [lm-evaluation-harness.tasks.completions.global_mmlu_fr]
23
+ [lm-evaluation-harness.tasks.completions.global_mmlu_hi]
24
+ [lm-evaluation-harness.tasks.completions.global_mmlu_id]
25
+ [lm-evaluation-harness.tasks.completions.global_mmlu_it]
26
+ [lm-evaluation-harness.tasks.completions.global_mmlu_ja]
27
+ [lm-evaluation-harness.tasks.completions.global_mmlu_ko]
28
+ [lm-evaluation-harness.tasks.completions.global_mmlu_pt]
29
+ [lm-evaluation-harness.tasks.completions.global_mmlu_sw]
30
+ [lm-evaluation-harness.tasks.completions.global_mmlu_yo]
31
+ [lm-evaluation-harness.tasks.completions.global_mmlu_zh]
32
+
33
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full]
34
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_am]
35
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ar]
36
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_bn]
37
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_cs]
38
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_de]
39
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_el]
40
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_en]
41
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_es]
42
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_fa]
43
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_fil]
44
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_fr]
45
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ha]
46
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_he]
47
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_hi]
48
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_id]
49
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ig]
50
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_it]
51
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ja]
52
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ko]
53
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ky]
54
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_lt]
55
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_mg]
56
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ms]
57
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ne]
58
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_nl]
59
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ny]
60
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_pl]
61
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_pt]
62
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ro]
63
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_ru]
64
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_si]
65
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_sn]
66
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_so]
67
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_sr]
68
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_sv]
69
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_sw]
70
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_te]
71
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_tr]
72
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_uk]
73
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_vi]
74
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_yo]
75
+ [lm-evaluation-harness.tasks.completions.global_mmlu_full_zh]
76
+ [lm-evaluation-harness.tasks.completions.mmlu_logits]
77
+
78
+ [lm-evaluation-harness.tasks.chat.mmlu_instruct]
79
+
80
+ [lm-evaluation-harness.tasks.chat.mmlu_redux_instruct]
81
+
82
+ [lm-evaluation-harness.tasks.completions.gsm8k]
83
+ required_env_vars = []
84
+
85
+ [lm-evaluation-harness.tasks.chat.gsm8k_cot_instruct]
86
+ required_env_vars = []
87
+
88
+ [lm-evaluation-harness.tasks.chat.gsm8k_cot_llama]
89
+ required_env_vars = []
90
+
91
+ [lm-evaluation-harness.tasks.chat.mgsm_cot]
92
+
93
+ [lm-evaluation-harness.tasks.chat.gpqa_diamond_cot]
94
+
95
+ [lm-evaluation-harness.tasks.completions.winogrande]
96
+
97
+ [lm-evaluation-harness.tasks.completions.hellaswag]
98
+ [lm-evaluation-harness.tasks.completions.hellaswag_multilingual]
99
+
100
+ [lm-evaluation-harness.tasks.completions.commonsense_qa]
101
+
102
+ [lm-evaluation-harness.tasks.completions.openbookqa]
103
+
104
+ [lm-evaluation-harness.tasks.completions.piqa]
105
+
106
+ [lm-evaluation-harness.tasks.completions.adlr_race]
107
+
108
+ [lm-evaluation-harness.tasks.completions.social_iqa]
109
+
110
+ [lm-evaluation-harness.tasks.completions.adlr_truthfulqa_mc2]
111
+ [lm-evaluation-harness.tasks.completions.adlr_minerva_math_nemo]
112
+ [lm-evaluation-harness.tasks.completions.adlr_arc_challenge_llama]
113
+ [lm-evaluation-harness.tasks.completions.adlr_mmlu_pro_5_shot_base]
114
+ [lm-evaluation-harness.tasks.completions.adlr_mbpp_sanitized_3shot_greedy]
115
+ [lm-evaluation-harness.tasks.completions.adlr_mbppplus_greedy_sanitized]
116
+ [lm-evaluation-harness.tasks.completions.adlr_humaneval_greedy]
117
+ [lm-evaluation-harness.tasks.completions.adlr_humanevalplus_greedy]
118
+ [lm-evaluation-harness.tasks.chat.adlr_gsm8k_fewshot_cot]
119
+ required_env_vars = []
120
+
121
+ [lm-evaluation-harness.tasks.completions.arc_multilingual]
122
+
123
+
124
+ ###############################################################################
125
+ # NOTE(agronskiy): checked parity
126
+ [mtbench]
127
+ container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"
128
+
129
+ [mtbench.tasks.chat.mtbench]
130
+
131
+ [mtbench.tasks.chat.mtbench-cor1]
132
+
133
+
134
+ ###############################################################################
135
+ # NOTE(agronskiy): checked parity
136
+ [ifbench]
137
+ container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"
138
+
139
+ [ifbench.tasks.chat.ifbench]
140
+ required_env_vars = []
141
+
142
+
143
+ ###############################################################################
144
+ [simple_evals]
145
+ container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"
146
+
147
+ [simple_evals.tasks.chat.gpqa_diamond]
148
+ required_env_vars = ["HF_TOKEN"]
149
+
150
+ [simple_evals.tasks.chat.gpqa_diamond_aa_v2]
151
+ required_env_vars = ["HF_TOKEN"]
152
+
153
+ [simple_evals.tasks.chat.gpqa_diamond_aa_v2_llama_4]
154
+ required_env_vars = ["HF_TOKEN"]
155
+
156
+ [simple_evals.tasks.chat.gpqa_diamond_nemo]
157
+ required_env_vars = ["HF_TOKEN"]
158
+
159
+ [simple_evals.tasks.chat.AA_math_test_500]
160
+ required_env_vars = ["JUDGE_API_KEY"]
161
+
162
+ [simple_evals.tasks.chat.math_test_500_nemo]
163
+ required_env_vars = []
164
+
165
+ [simple_evals.tasks.chat.aime_2024_nemo]
166
+ required_env_vars = []
167
+
168
+ [simple_evals.tasks.chat.AA_AIME_2024]
169
+ required_env_vars = ["JUDGE_API_KEY"]
170
+
171
+ [simple_evals.tasks.chat.aime_2025_nemo]
172
+ required_env_vars = []
173
+
174
+ [simple_evals.tasks.chat.AIME_2025]
175
+ required_env_vars = ["JUDGE_API_KEY"]
176
+
177
+ [simple_evals.tasks.chat.humaneval]
178
+ required_env_vars = []
179
+
180
+ [simple_evals.tasks.chat.mgsm]
181
+ required_env_vars = []
182
+
183
+ [simple_evals.tasks.chat.mmlu_pro]
184
+ required_env_vars = []
185
+
186
+ [simple_evals.tasks.chat.mmlu]
187
+ required_env_vars = []
188
+
189
+ [simple_evals.tasks.chat.mmlu_llama_4]
190
+ required_env_vars = []
191
+
192
+ [simple_evals.tasks.chat.mmlu_pro_llama_4]
193
+ required_env_vars = []
194
+
195
+ [simple_evals.tasks.chat.mmlu_ar-lite]
196
+ [simple_evals.tasks.chat.mmlu_bn-lite]
197
+ [simple_evals.tasks.chat.mmlu_de-lite]
198
+ [simple_evals.tasks.chat.mmlu_en-lite]
199
+ [simple_evals.tasks.chat.mmlu_es-lite]
200
+ [simple_evals.tasks.chat.mmlu_fr-lite]
201
+ [simple_evals.tasks.chat.mmlu_hi-lite]
202
+ [simple_evals.tasks.chat.mmlu_id-lite]
203
+ [simple_evals.tasks.chat.mmlu_it-lite]
204
+ [simple_evals.tasks.chat.mmlu_ja-lite]
205
+ [simple_evals.tasks.chat.mmlu_ko-lite]
206
+ [simple_evals.tasks.chat.mmlu_my-lite]
207
+ [simple_evals.tasks.chat.mmlu_pt-lite]
208
+ [simple_evals.tasks.chat.mmlu_sw-lite]
209
+ [simple_evals.tasks.chat.mmlu_yo-lite]
210
+ [simple_evals.tasks.chat.mmlu_zh-lite]
211
+
212
+
213
+ ###############################################################################
214
+ # NOTE(agronskiy): checked parity
215
+ [bigcode-evaluation-harness]
216
+ container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"
217
+
218
+ [bigcode-evaluation-harness.tasks.chat.mbpp]
219
+ required_env_vars = []
220
+
221
+ [bigcode-evaluation-harness.tasks.chat.mbppplus]
222
+
223
+ [bigcode-evaluation-harness.tasks.chat.mbppplus_nemo]
224
+ required_env_vars = []
225
+
226
+ [bigcode-evaluation-harness.tasks.completions.humaneval]
227
+ required_env_vars = []
228
+
229
+ [bigcode-evaluation-harness.tasks.chat.humaneval_instruct]
230
+
231
+
232
+ ###############################################################################
233
+ [livecodebench]
234
+ container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"
235
+
236
+ [livecodebench.tasks.chat.livecodebench_0724_0125]
237
+ required_env_vars = []
238
+
239
+ [livecodebench.tasks.chat.livecodebench_0824_0225]
240
+ required_env_vars = []
241
+
242
+
243
+ ###############################################################################
244
+ [scicode]
245
+ container = "nvcr.io/nvidia/eval-factory/scicode:25.10"
246
+
247
+ [scicode.tasks.chat.aa_scicode]
248
+ required_env_vars = []
249
+
250
+
251
+ ###############################################################################
252
+ [hle]
253
+ container = "nvcr.io/nvidia/eval-factory/hle:25.10"
254
+
255
+ [hle.tasks.chat.hle]
256
+ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
257
+
258
+
259
+ ###############################################################################
260
+ [bfcl]
261
+ container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"
262
+
263
+ [bfcl.tasks.chat.bfclv2_ast_prompting]
264
+ required_env_vars = []
265
+
266
+ [bfcl.tasks.chat.bfclv3_ast_prompting]
267
+ required_env_vars = []
268
+
269
+
270
+ ###############################################################################
271
+ [profbench]
272
+ container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
273
+
274
+ [profbench.tasks.chat.llm_judge]
275
+ required_env_vars = []
276
+
277
+ [profbench.tasks.chat.report_generation]
278
+ required_env_vars = []
279
+
280
+
281
+ ###############################################################################
282
+ [vlmevalkit]
283
+ container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"
284
+
285
+ [vlmevalkit.tasks.vlm.ocrbench]
286
+ required_env_vars = []
287
+
288
+ [vlmevalkit.tasks.vlm.slidevqa]
289
+ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
290
+
291
+ [vlmevalkit.tasks.vlm.chartqa]
292
+ required_env_vars = []
293
+
294
+ [vlmevalkit.tasks.vlm.ai2d_judge]
295
+ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
296
+
297
+
298
+ ###############################################################################
299
+ [garak]
300
+ container = "nvcr.io/nvidia/eval-factory/garak:25.10"
301
+
302
+ [garak.tasks.chat.garak]
303
+ required_env_vars = []
304
+
305
+ ###############################################################################
306
+ # NOTE(wprazuch): TODO - verify whether these tasks require any environment variables to be set
307
+ [nemo_skills]
308
+ container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
309
+
310
+ [nemo_skills.tasks.chat.ns_aime2024]
311
+ required_env_vars = ["JUDGE_API_KEY"]
312
+
313
+ [nemo_skills.tasks.chat.ns_aime2025]
314
+ required_env_vars = []
315
+
316
+ [nemo_skills.tasks.chat.ns_bfcl_v3]
317
+ required_env_vars = []
318
+
319
+ [nemo_skills.tasks.chat.ns_gpqa]
320
+ required_env_vars = ["HF_TOKEN"]
321
+
322
+ [nemo_skills.tasks.chat.ns_hle]
323
+ required_env_vars = []
324
+
325
+ [nemo_skills.tasks.chat.ns_mmlu]
326
+ required_env_vars = ["HF_TOKEN"]
327
+
328
+ [nemo_skills.tasks.chat.ns_mmlu_pro]
329
+ required_env_vars = ["HF_TOKEN"]
330
+
331
+ ###############################################################################
332
+ [safety-harness]
333
+ container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"
334
+
335
+ [safety-harness.tasks.chat.aegis_v2]
336
+ required_env_vars = ["HF_TOKEN"]
337
+
338
+
339
+ ###############################################################################
340
+ # NOTE(agronskiy): checked parity
341
+ [helm]
342
+ container = "nvcr.io/nvidia/eval-factory/helm:25.10"
343
+
344
+ [helm.tasks.chat.medcalc_bench]
345
+
346
+ [helm.tasks.chat.medec]
347
+
348
+ [helm.tasks.chat.head_qa]
349
+
350
+ [helm.tasks.chat.medbullets]
351
+
352
+ [helm.tasks.chat.pubmed_qa]
353
+
354
+ [helm.tasks.chat.ehr_sql]
355
+
356
+ [helm.tasks.chat.race_based_med]
357
+
358
+ [helm.tasks.chat.medhallu]
359
+
360
+ [helm.tasks.chat.mtsamples_replicate]
361
+
362
+ [helm.tasks.chat.aci_bench]
363
+
364
+ [helm.tasks.chat.mtsamples_procedures]
365
+
366
+ [helm.tasks.chat.medication_qa]
367
+
368
+ [helm.tasks.chat.med_dialog_healthcaremagic]
369
+
370
+ [helm.tasks.chat.med_dialog_icliniq]
371
+
372
+ [helm.tasks.chat.medi_qa]
373
+
374
+
375
+ ###############################################################################
376
+ # NOTE(agronskiy): checked parity
377
+ [tooltalk]
378
+ container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"
379
+
380
+ [tooltalk.tasks.chat.tooltalk]