crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/reeval_runner.py ADDED
@@ -0,0 +1,355 @@
+ import dacite
+ import json
+ import os
+ import typing
+ from collections import Counter
+ from typing import Any, Dict, List, Optional
+ import torch
+
+ from tqdm import tqdm
+ from dataclasses import replace
+ from datasets import load_dataset
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+ from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.cache import cache_stats
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     create_scenario,
+     Instance,
+     get_scenario_cache_path,
+     with_instance_ids,
+ )
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
+ from helm.benchmark.run_spec import RunSpec
+ from helm.benchmark.data_preprocessor import DataPreprocessor
+ from helm.benchmark.executor import ExecutionSpec
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+ from helm.benchmark.runner import (
+     Runner,
+     remove_stats_nans,
+     remove_per_instance_stats_nans,
+ )
+
+ scenario_to_metric_name = {
+     "air_bench_2024": "air_score",
+     "babi_qa": "quasi_exact_match",
+     "bbq": "quasi_exact_match",
+     "blimp": "exact_match",
+     "boolq": "quasi_exact_match",
+     "civil_comments": "quasi_exact_match",
+     "dyck_language": "exact_match_indicator",
+     "entity_data_imputation": "quasi_exact_match",
+     "entity_matching": "quasi_exact_match",
+     "imdb": "quasi_exact_match",
+     "legal_support": "quasi_exact_match",
+     "raft": "quasi_exact_match",
+     "synthetic_reasoning": "quasi_exact_match",
+     "truthful_qa": "exact_match",
+     "wikifact": "quasi_exact_match",
+     "mmlu": "exact_match",
+     "commonsense": "exact_match",
+     "gsm": "final_number_exact_match",
+     # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+     "legalbench": "quasi_exact_match",
+     "math": "math_equiv_chain_of_thought",
+     "med_qa": "quasi_exact_match",
+     "thai_exam": "exact_match",
+ }
+
+
+ class REEvalRunner(Runner):
+     """
+     This runner implements the basic (non-amortized) method described in the paper
+     `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+     also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+     Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+     The difficulties of the questions are provided in a HuggingFace repository. In addition,
+     the authors of the paper will supply a Python package for calculating these difficulties.
+     At each iteration, the runner estimates the model's ability based on all previously
+     administered questions and their corresponding responses. It then selects the next question
+     whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+     eliciting the model's ability.
+     """
+
+     def __init__(
+         self,
+         execution_spec: ExecutionSpec,
+         output_path: str,
+         suite: str,
+         skip_instances: bool,
+         cache_instances: bool,
+         cache_instances_only: bool,
+         skip_completed_runs: bool,
+         exit_on_error: bool,
+     ):
+         super().__init__(
+             execution_spec=execution_spec,
+             output_path=output_path,
+             suite=suite,
+             skip_instances=skip_instances,
+             cache_instances=cache_instances,
+             cache_instances_only=cache_instances_only,
+             skip_completed_runs=skip_completed_runs,
+             exit_on_error=exit_on_error,
+         )
+
+     def _estimate_model_ability(
+         self,
+         old_ability: float,
+         response_correctness: List[float],
+         instance_difficulties: List[float],
+     ) -> float:
+         def closure():
+             optim.zero_grad()
+             probs = torch.sigmoid(ability + difficulties)
+             loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+             loss.backward()
+             return loss
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         responses = torch.tensor(response_correctness, device=device)
+         difficulties = torch.tensor(instance_difficulties, device=device)
+         ability = torch.tensor([old_ability], requires_grad=True, device=device)
+         optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+         for iteration in range(100):
+             loss = optim.step(closure)
+
+             if iteration > 0:
+                 prev_ability = ability.clone()
+                 prev_loss = loss
+                 d_loss = prev_loss - loss
+                 d_theta = torch.norm(prev_ability - ability, p=2)
+                 grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                 if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                     break
+
+         return ability.item()
+
+     def run_one(self, run_spec: RunSpec):
+         run_path: str = self._get_run_path(run_spec)
+         if self.skip_completed_runs and self._is_run_completed(run_path):
+             hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+             return
+         ensure_directory_exists(run_path)
+
+         # Load the scenario
+         scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+         # This 'output_path' will be used when the model's input instances are saved.
+         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+         input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+         input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+         instances: List[Instance]
+         if self.skip_instances:
+             instances = []
+         else:
+             if self.cache_instances and os.path.exists(input_instances_file_path):
+                 with open(input_instances_file_path) as f:
+                     json_instances: List[Dict[str, Any]] = json.load(f)
+                 instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+             else:
+                 # Create the instances of the scenario
+                 scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                 with htrack_block("scenario.get_instances"):
+                     instances = scenario.get_instances(scenario_output_path)
+             if self.cache_instances and not os.path.exists(input_instances_file_path):
+                 # Save instances to file
+                 ensure_directory_exists(input_instances_output_path)
+                 write(
+                     os.path.join(input_instances_file_path),
+                     json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+                 )
+         if self.cache_instances_only:
+             return  # Exit after saving the instances.
+
+         # Give each instance a unique ID
+         if any([instance.id is None for instance in instances]):
+             instances = with_instance_ids(instances)
+
+         # Data preprocessing
+         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # Adapt (convert to requests)
+         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+         unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # Load difficulty
+         split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+         try:
+             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+         except ValueError:
+             hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+             return
+
+         unasked_request_states: List[RequestState] = []
+         for request_state in unasked_request_states_without_difficulty:
+             prompt = request_state.request.prompt
+             if prompt in prompt_to_difficulty:
+                 difficulty = prompt_to_difficulty[prompt]
+                 current_extra_data = request_state.instance.extra_data or {}
+                 if "difficulty" in current_extra_data:
+                     raise Exception("Extra_data already contains a 'difficulty' key.")
+                 new_extra_data = current_extra_data.copy()
+                 new_extra_data["difficulty"] = difficulty
+                 new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                 new_request_state = replace(request_state, instance=new_instance)
+                 unasked_request_states.append(new_request_state)
+         assert unasked_request_states
+
+         # Execute the requests in a reeval manner
+         assert run_spec.adapter_spec.reeval_parameters is not None
+         model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+         scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+         asked_request_states: List[RequestState] = []
+         reeval_trajectory: Dict[str, List[float]] = {
+             "model_ability": [],
+             "response_correctness": [],
+             "instance_difficulties": [],
+         }
+
+         assert run_spec.adapter_spec.max_eval_instances is not None
+         for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+             if not unasked_request_states:
+                 break
+
+             selected_item: Optional[RequestState] = None
+             min_diff = float("inf")
+             for item in unasked_request_states:
+                 assert item.instance.extra_data is not None
+                 diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                 if diff < min_diff:
+                     min_diff = diff
+                     selected_item = item
+             assert selected_item is not None
+             unasked_request_states.remove(selected_item)
+
+             # Execute the request
+             single_scenario_state: ScenarioState = ScenarioState(
+                 adapter_spec=run_spec.adapter_spec,
+                 request_states=[selected_item],
+                 annotator_specs=run_spec.annotators,
+             )
+
+             # Execute (fill up results)
+             single_scenario_state = self.executor.execute(single_scenario_state)
+
+             # Annotate (post-process the results)
+             single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+             # Apply the metrics
+             # When performing a dry run, only estimate the number of tokens instead
+             # of calculating the metrics.
+             metrics: List[MetricInterface] = (
+                 [DryRunMetric()]
+                 if self.dry_run
+                 else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+             )
+
+             temp_per_instance_stats: List[PerInstanceStats] = []
+             with htrack_block(f"{len(metrics)} metrics"):
+                 for metric in metrics:
+                     with htrack_block(metric):
+                         temp_metric_result: MetricResult = metric.evaluate(
+                             single_scenario_state,
+                             self.metric_service,
+                             self.eval_cache_path,
+                             self.executor.execution_spec.parallelism,
+                         )
+                         temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+             # Update the reeval request states
+             asked_request_states.extend(single_scenario_state.request_states)
+
+             # Update the reeval trajectory
+             reeval_trajectory["model_ability"].append(model_ability)
+             scenario_metric_value = [
+                 s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+             ][0].mean
+
+             assert scenario_metric_value is not None
+             reeval_trajectory["response_correctness"].append(scenario_metric_value)
+             assert selected_item.instance.extra_data is not None
+             reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+             # Estimate the model ability
+             model_ability = self._estimate_model_ability(
+                 old_ability=model_ability,
+                 response_correctness=reeval_trajectory["response_correctness"],
+                 instance_difficulties=reeval_trajectory["instance_difficulties"],
+             )
+
+         # Create the scenario state
+         scenario_state: ScenarioState = ScenarioState(
+             adapter_spec=run_spec.adapter_spec,
+             request_states=asked_request_states,
+             annotator_specs=run_spec.annotators,
+         )
+
+         stats: List[Stat] = []
+         per_instance_stats: List[PerInstanceStats] = []
+         with htrack_block(f"{len(metrics)} metrics"):
+             for metric in metrics:
+                 with htrack_block(metric):
+                     metric_result: MetricResult = metric.evaluate(
+                         scenario_state,
+                         self.metric_service,
+                         self.eval_cache_path,
+                         self.executor.execution_spec.parallelism,
+                     )
+                     stats.extend(metric_result.aggregated_stats)
+                     per_instance_stats.extend(metric_result.per_instance_stats)
+
+         # Check that there aren't duplicate `Stat`s
+         # Note: doesn't catch near misses.
+         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+         for metric_name, count in metric_counts.items():
+             if count > 1:
+                 hlog(f"WARNING: duplicate metric name {metric_name}")
+
+         # Print out the number of stats
+         hlog(f"Generated {len(stats)} stats.")
+
+         if self.skip_instances:
+             hlog("skip_instances was True. Skipping writing results out.")
+             return
+
+         # Output benchmarking information and results to files
+         write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+         # Write out scenario
+         write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+         # Write scenario state
+         write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+         write(
+             os.path.join(run_path, "stats.json"),
+             json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+         )
+         write(
+             os.path.join(run_path, "per_instance_stats.json"),
+             json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+         )
+
+         write(
+             os.path.join(run_path, "reeval_trajectory.json"),
+             json.dumps(reeval_trajectory, indent=2),
+         )
+
+         cache_stats.print_status()
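
In IRT terms, REEvalRunner fits a one-parameter (Rasch-style) model in which the probability of a correct response is sigmoid(ability + difficulty), with per-item difficulties ("z") taken from the stair-lab/reeval-difficulty dataset, and it always asks next the item whose difficulty is closest to the negated ability estimate. A rough, self-contained sketch of those two steps follows; plain gradient ascent stands in for the LBFGS fit above, and the helper names are illustrative, not HELM's:

import math
from typing import List, Tuple

def estimate_ability(correct: List[float], difficulties: List[float], lr: float = 0.5, steps: int = 200) -> float:
    """Maximum-likelihood ability under P(correct) = sigmoid(ability + difficulty)."""
    assert correct and len(correct) == len(difficulties)
    ability = 0.0
    for _ in range(steps):
        # The gradient of the mean Bernoulli log-likelihood w.r.t. ability is mean(y - p).
        grad = sum(y - 1.0 / (1.0 + math.exp(-(ability + z))) for y, z in zip(correct, difficulties))
        ability += lr * grad / len(correct)
    return ability

def select_next(pool: List[Tuple[str, float]], ability: float) -> Tuple[str, float]:
    """Pick the unasked (item_id, difficulty) pair minimizing abs(difficulty + ability),
    mirroring the selection loop in run_one above."""
    return min(pool, key=lambda item: abs(item[1] + ability))

After each answered item, the runner re-fits the ability over the full history of (correctness, difficulty) pairs, warm-started from the previous estimate, and records the whole trajectory in reeval_trajectory.json.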
helm/benchmark/run.py CHANGED
@@ -266,13 +266,6 @@ def main():
          default=None,
          help="Full class name of the Runner class to use. If unset, uses the default Runner.",
      )
-     parser.add_argument(
-         "--openvino",
-         action="store_true",
-         default=False,
-         help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
-         "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
-     )
      add_run_args(parser)
      args = parser.parse_args()
      validate_args(args)
@@ -284,19 +277,13 @@ def main():
          from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
          for huggingface_model_name in args.enable_huggingface_models:
-             if args.openvino:
-                 register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
-             else:
-                 register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+             register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
      if args.enable_local_huggingface_models:
          from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
          for huggingface_model_path in args.enable_local_huggingface_models:
-             if args.openvino:
-                 register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
-             else:
-                 register_huggingface_local_model_from_flag_value(huggingface_model_path)
+             register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
      run_entries: List[RunEntry] = []
      if args.conf_paths:
@@ -323,12 +310,16 @@ def main():
          if model_to_run not in all_models:
              raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
      else:
-         model_expander_pattern = re.compile(
+         model_expander_wildcard_pattern = re.compile(
              r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
          )
-         if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+         if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
              raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+         model_expander_pattern = re.compile(r"\bmodel=\b")
+         if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+             raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
      run_specs = run_entries_to_run_specs(
          run_entries=run_entries,
          max_eval_instances=args.max_eval_instances,
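
Net effect of the hunk above: run entries that expand to a wildcard model group (model=all, model=text, and so on) still require --models-to-run, and run entries with no model= clause at all are now rejected too. A quick sketch of what the new guard matches, using hypothetical run-entry strings:

import re

model_expander_pattern = re.compile(r"\bmodel=\b")

assert model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt2") is not None
# Entries like the next one contain no model= clause; if none of the run
# entries match, the new exception is raised.
assert model_expander_pattern.search("mmlu:subject=anatomy") is None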
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
3
3
  from dataclasses import replace
4
4
  from typing import Any, List, Dict, Optional, Tuple, Type
5
5
 
6
+ from helm.benchmark.metrics.metric import MetricSpec
6
7
  from helm.benchmark.model_metadata_registry import (
7
8
  get_all_instruction_following_models,
8
9
  get_all_code_models,
@@ -11,19 +12,21 @@ from helm.benchmark.model_metadata_registry import (
     get_model_metadata,
     get_model_names_with_tag,
     DEPRECATED_MODEL_TAG,
+    UNSUPPORTED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .run_spec import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
-from .augmentations.perturbation import PerturbationSpec
-from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.augmentations.perturbation import PerturbationSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
@@ -347,6 +350,29 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]
 
 
+class NovaRunExpander(RunExpander):
+    """
+    Custom prompt for Amazon Nova models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "amazon-nova"
+
+    PROMPT = "Do not provide any additional explanation. Follow the format shown in the provided examples strictly."
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(run_spec.adapter_spec, global_prefix=NovaRunExpander.PROMPT + "\n\n"),
+            ),
+        ]
+
+
 class FollowFormatInstructionsRunExpander(RunExpander):
     """Adds more explicit instructions about following the format to prompts.
 
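Reviewer note: NovaRunExpander follows the usual expander pattern, returning a copy of the RunSpec whose adapter_spec has its global_prefix overwritten with the fixed instruction. A rough sketch of the effect:

    expander = NovaRunExpander()
    expanded = expander.expand(run_spec)[0]
    # expanded.adapter_spec.global_prefix is now the PROMPT string followed by a blank line;
    # every other RunSpec field, including the name, is unchanged.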
@@ -588,6 +614,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
         "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
         "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
         "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
+        "audiolm": get_model_names_with_tag(AUDIO_LANGUAGE_MODEL_TAG),
     }
 
     # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
@@ -604,8 +631,10 @@ class ModelRunExpander(ReplaceValueRunExpander):
 
     # For each of the keys above, filter out deprecated models.
     deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+    unsupported_models = set(get_model_names_with_tag(UNSUPPORTED_MODEL_TAG))
+    excluded_models = deprecated_models | unsupported_models
     for family_name in values_dict.keys():
-        values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+        values_dict[family_name] = [model for model in values_dict[family_name] if model not in excluded_models]
 
     return values_dict
 
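Reviewer note: the comment above now understates the behavior slightly, since both deprecated and unsupported models are excluded via a single set union. A self-contained sketch of the same pattern (model names invented for illustration):

    deprecated_models = {"example-org/legacy-model"}
    unsupported_models = {"example-org/unsupported-model"}
    excluded_models = deprecated_models | unsupported_models

    models = ["example-org/legacy-model", "example-org/current-model", "example-org/unsupported-model"]
    kept = [model for model in models if model not in excluded_models]
    assert kept == ["example-org/current-model"]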
@@ -1424,14 +1453,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
 
     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"
 
     def __init__(self, scenario: str):
+        self.suffix = False
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        else:
-            self.scenario = scenario
-            self.suffix = False
+
+        self.no_prefix = False
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
@@ -1441,6 +1476,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1452,6 +1489,8 @@ class OutputFormatInstructions(RunExpander):
                 )
             elif self.scenario == "natural_qa":
                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "natural_qa_short_answer":
+                instructions = "Answer with a short answer."
             elif self.scenario == "legalbench":
                 if output_noun != "Answer":
                     instructions = f"Answer with the {output_noun.lower()}."
@@ -1483,6 +1522,11 @@ class OutputFormatInstructions(RunExpander):
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
 
+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+
         if self.suffix:
             return [
                 replace(
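Reviewer note: combined with the constructor changes earlier in this hunk series, option flags are peeled off the scenario name right to left, "_suffix" first and then "_no_prefix", so both can be stacked. A sketch of the parsing and of the extra sentence it produces (the output_prefix value is an assumed example):

    expander = OutputFormatInstructions("mcqa_no_prefix_suffix")
    assert (expander.scenario, expander.suffix, expander.no_prefix) == ("mcqa", True, True)

    # With no_prefix set and an adapter whose output_prefix is "Answer: ",
    # the instructions string gains: "Do not include 'Answer:' in your answer."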
@@ -1506,6 +1550,31 @@ class OutputFormatInstructions(RunExpander):
         ]
 
 
+class ProcessOutputRunExpander(RunExpander):
+    name = "process_output"
+
+    def __init__(self, processor: str):
+        self.processor = processor
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        output_processing_metric_spec = MetricSpec(
+            class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
+            args={
+                "processor": self.processor,
+                "metric_specs": [
+                    {"class_name": metric_spec.class_name, "args": metric_spec.args}
+                    for metric_spec in run_spec.metric_specs
+                ],
+            },
+        )
+        return [
+            replace(
+                run_spec,
+                metric_specs=[output_processing_metric_spec],
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
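Reviewer note: ProcessOutputRunExpander adds no metric of its own; it wraps the run's existing metric specs inside a single OutputProcessingMetric spec, serializing each original spec to a plain dict. A rough before/after sketch (the wrapped metric class name is an assumed example):

    # Before: run_spec.metric_specs == [MetricSpec(class_name="some.module.ExactMatchMetric", args={})]
    # After ProcessOutputRunExpander(processor="my_processor").expand(run_spec),
    # the single remaining spec is:
    # MetricSpec(
    #     class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
    #     args={
    #         "processor": "my_processor",
    #         "metric_specs": [{"class_name": "some.module.ExactMatchMetric", "args": {}}],
    #     },
    # )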
@@ -1532,6 +1601,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     TemperatureRunExpander,
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
+    ProcessOutputRunExpander,
 ]
 
 
@@ -37,6 +37,7 @@ from helm.benchmark.run_expander import (
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
     LlavaRunExpander,
+    ModelRunExpander,
     OpenFlamingoRunExpander,
     StopRunExpander,
 )
@@ -61,6 +62,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
     args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
 
+    # If no model run expander was specified, add the model=all run expander
+    if not any([expander for expander in expanders if isinstance(expander, ModelRunExpander)]):
+        expanders.append(ModelRunExpander("all"))
+
     run_specs: List[RunSpec] = [run_spec_function(**args)]
 
     # Apply expanders
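Reviewer note: a conf entry that names no model= expander now falls back to expanding over all models, which is what makes the new bare model= check in main() meaningful: such entries only proceed when --models-to-run narrows the expansion. Condensed sketch of the fallback:

    if not any(isinstance(expander, ModelRunExpander) for expander in expanders):
        expanders.append(ModelRunExpander("all"))  # same effect as writing model=all in the entry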
@@ -138,6 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
+    if model.name == "openai/o1-2024-12-17":
+        # From https://platform.openai.com/docs/guides/reasoning,
+        # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        # prompts require, you can adjust this buffer accordingly."
+        run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
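Reviewer note: the quoted OpenAI guidance motivates the 25,000-token bump. Assuming IncreaseMaxTokensRunExpander adds its value to the adapter's max_tokens, which its name and the value=1 use just above suggest, the intended effect looks like:

    # Suppose run_spec.adapter_spec.max_tokens == 1_000 before expansion.
    run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
    # run_spec.adapter_spec.max_tokens == 26_000, reserving headroom for reasoning tokens.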
@@ -1,3 +1,5 @@
+from typing import Dict, Optional
+
 from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.metric import MetricSpec
@@ -6,7 +8,10 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
 @run_spec_function("air_bench_2024")
-def get_air_bench_2024_spec() -> RunSpec:
+def get_air_bench_2024_spec(
+    annotator_model: Optional[str] = None, annotator_model_deployment: Optional[str] = None
+) -> RunSpec:
+    run_spec_name = "air_bench_2024"
     adapter_spec = AdapterSpec(
         method=ADAPT_GENERATION,
         global_prefix="",
@@ -24,14 +29,27 @@ def get_air_bench_2024_spec() -> RunSpec:
         stop_sequences=[],
     )
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    annotator_args: Dict[str, str] = {}
+    if annotator_model:
+        annotator_args["model"] = annotator_model
+        annotator_args["model_deployment"] = annotator_model_deployment or annotator_model
+        run_spec_name = (
+            "air_bench_2024:"
+            f"annotator_model={annotator_args['model']},"
+            f"annotator_model_deployment={annotator_args['model_deployment']}"
+        )
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator", args=annotator_args
+        )
+    ]
     metric_specs = [
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
     ]
     return RunSpec(
-        name="air_bench_2024",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,