crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_medhelm.yaml
@@ -0,0 +1,1081 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+ - name: f1_score
74
+ display_name: F1
75
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
76
+ lower_is_better: false
77
+ - name: live_qa_score
78
+ display_name: Judge Score
79
+ description: LLM-as-judge score
80
+ lower_is_better: false
81
+ - name: medication_qa_score
82
+ display_name: Judge Score
83
+ description: LLM-as-judge score
84
+ lower_is_better: false
85
+ - name: quasi_exact_match
86
+ display_name: Quasi-exact match
87
+ short_display_name: EM
88
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
89
+ lower_is_better: false
90
+ - name: prefix_exact_match
91
+ display_name: Prefix exact match
92
+ short_display_name: PEM
93
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
94
+ lower_is_better: false
95
+ - name: quasi_prefix_exact_match
96
+ # TODO: should call this prefix_quasi_exact_match
97
+ display_name: Prefix quasi-exact match
98
+ short_display_name: PEM
99
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
100
+ lower_is_better: false
101
+ - name: logprob
102
+ display_name: Log probability
103
+ short_display_name: Logprob
104
+ description: Predicted output's average log probability (input's log prob for language modeling).
105
+ lower_is_better: false
106
+ - name: logprob_per_byte
107
+ display_name: Log probability / byte
108
+ short_display_name: Logprob/byte
109
+ description: Predicted output's average log probability normalized by the number of bytes.
110
+ lower_is_better: false
111
+ - name: bits_per_byte
112
+ display_name: Bits/byte
113
+ short_display_name: BPB
114
+ lower_is_better: true
115
+ description: Average number of bits per byte according to model probabilities.
116
+ - name: perplexity
117
+ display_name: Perplexity
118
+ short_display_name: PPL
119
+ lower_is_better: true
120
+ description: Perplexity of the output completion (effective branching factor per output token).
121
+ - name: rouge_1
122
+ display_name: ROUGE-1
123
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
124
+ lower_is_better: false
125
+ - name: rouge_2
126
+ display_name: ROUGE-2
127
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
128
+ lower_is_better: false
129
+ - name: rouge_l
130
+ display_name: ROUGE-L
131
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
132
+ lower_is_better: false
133
+ - name: bleu_1
134
+ display_name: BLEU-1
135
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
136
+ lower_is_better: false
137
+ - name: bleu_4
138
+ display_name: BLEU-4
139
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
140
+ lower_is_better: false
141
+ - name: medec_error_flag_accuracy
142
+ display_name: Medical Error Flag Accuracy
143
+ short_display_name: MedecFlagAcc
144
+ description: Measures how accurately the model identifies whether a clinical note contains an error (binary classification of correct/incorrect).
145
+ lower_is_better: false
146
+ - name: medec_error_sentence_accuracy
147
+ display_name: Medical Error Sentence Accuracy
148
+ short_display_name: MedecSentenceAcc
149
+ description: Measures how accurately the model identifies the specific erroneous sentence within a clinical note.
150
+ lower_is_better: false
151
+ - name: ehr_sql_precision_answerable
152
+ display_name: Precision for Answerable Questions
153
+ short_display_name: EHRSQLPreAns
154
+ description: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.
155
+ lower_is_better: false
156
+ - name: ehr_sql_recall_answerable
157
+ display_name: Recall for Answerable Questions
158
+ short_display_name: EHRSQLReAns
159
+ description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
160
+ lower_is_better: false
161
+ - name: mimiciv_billing_code_precision
162
+ display_name: Precision for MIMIC Billing Codes
163
+ short_display_name: MIMICBillingPre
164
+ description: Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by the model.
165
+ lower_is_better: false
166
+ - name: mimiciv_billing_code_recall
167
+ display_name: Recall for MIMIC Billing Codes
168
+ short_display_name: MIMICBillingRec
169
+ description: Measures the proportion of correctly predicted ICD codes among all ICD codes present in the gold standard.
170
+ lower_is_better: false
171
+ - name: mimiciv_billing_code_f1
172
+ display_name: F1 Score for MIMIC Billing Codes
173
+ short_display_name: MIMICBillingF1
174
+ description: Measures the harmonic mean of precision and recall for ICD codes, providing a balanced evaluation of the model's performance.
175
+ lower_is_better: false
176
+ - name: exact_match@5
177
+ display_name: Exact match @5
178
+ short_display_name: EM@5
179
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
180
+ lower_is_better: false
181
+ - name: quasi_exact_match@5
182
+ display_name: Quasi-exact match @5
183
+ short_display_name: EM@5
184
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
185
+ lower_is_better: false
186
+ - name: prefix_exact_match@5
187
+ display_name: Prefix exact match @5
188
+ short_display_name: PEM@5
189
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
190
+ lower_is_better: false
191
+ - name: quasi_prefix_exact_match@5
192
+ display_name: Prefix quasi-exact match @5
193
+ short_display_name: PEM@5
194
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
195
+ lower_is_better: false
196
+ - name: ehr_sql_execution_accuracy
197
+ display_name: Execution accuracy for Generated Query
198
+ short_display_name: EHRSQLExeAcc
199
+ description: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.
200
+ lower_is_better: false
201
+ - name: ehr_sql_query_validity
202
+ display_name: Validity of Generated Query
203
+ short_display_name: EHRSQLQueryValid
204
+ description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
205
+ lower_is_better: false
206
+ - name: aci_bench_accuracy
207
+ display_name: ACI-Bench Accuracy
208
+ short_display_name: Accuracy
209
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
210
+ lower_is_better: false
211
+ - name: mtsamples_replicate_accuracy
212
+ display_name: MTSamples Replicate Accuracy
213
+ short_display_name: Accuracy
214
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
215
+ lower_is_better: false
216
+ - name: medalign_accuracy
217
+ display_name: Medalign Accuracy
218
+ short_display_name: Accuracy
219
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
220
+ lower_is_better: false
221
+ - name: dischargeme_accuracy
222
+ display_name: DischargeMe Accuracy
223
+ short_display_name: Accuracy
224
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
225
+ lower_is_better: false
226
+ - name: mtsamples_procedures_accuracy
227
+ display_name: MTSamples Procedures Accuracy
228
+ short_display_name: Accuracy
229
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
230
+ lower_is_better: false
231
+ - name: mimic_rrs_accuracy
232
+ display_name: MIMIC-RRS Accuracy
233
+ short_display_name: Accuracy
234
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
235
+ lower_is_better: false
236
+ - name: chw_care_plan_accuracy
237
+ display_name: NoteExtract Accuracy
238
+ short_display_name: Accuracy
239
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
240
+ lower_is_better: false
241
+ - name: medication_qa_accuracy
242
+ display_name: MedicationQA Accuracy
243
+ short_display_name: Accuracy
244
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
245
+ lower_is_better: false
246
+ - name: starr_patient_instructions_accuracy
247
+ display_name: PatientInstruct Accuracy
248
+ short_display_name: Accuracy
249
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
250
+ lower_is_better: false
251
+ - name: med_dialog_accuracy
252
+ display_name: MedDialog Accuracy
253
+ short_display_name: Accuracy
254
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
255
+ lower_is_better: false
256
+ - name: medi_qa_accuracy
257
+ display_name: MediQA Accuracy
258
+ short_display_name: Accuracy
259
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
260
+ lower_is_better: false
261
+ - name: mental_health_accuracy
262
+ display_name: MentalHealth Accuracy
263
+ short_display_name: Accuracy
264
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
265
+ lower_is_better: false
266
+
267
+ # Summariazation metrics
268
+ - name: summac
269
+ display_name: SummaC
270
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
271
+ lower_is_better: false
272
+ - name: QAFactEval
273
+ display_name: QAFactEval
274
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
275
+ lower_is_better: false
276
+ - name: summarization_coverage
277
+ display_name: Coverage
278
+ description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
279
+ - name: summarization_density
280
+ display_name: Density
281
+ description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
282
+ - name: summarization_compression
283
+ display_name: Compression
284
+ description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
285
+ - name: BERTScore-P
286
+ display_name: BERTScore (P)
287
+ description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
288
+ lower_is_better: false
289
+ - name: BERTScore-R
290
+ display_name: BERTScore (R)
291
+ description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
292
+ lower_is_better: false
293
+ - name: BERTScore-F
294
+ display_name: BERTScore (F1)
295
+ description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
296
+ lower_is_better: false
297
+ - name: HumanEval-faithfulness
298
+ display_name: HumanEval-faithfulness
299
+ description: Human evaluation score for faithfulness.
300
+ lower_is_better: false
301
+ - name: HumanEval-relevance
302
+ display_name: HumanEval-relevance
303
+ description: Human evaluation score for relevance.
304
+ lower_is_better: false
305
+ - name: HumanEval-coherence
306
+ display_name: HumanEval-coherence
307
+ description: Human evaluation score for coherence.
308
+ lower_is_better: false
309
+
310
+ # Toxicity metrics
311
+ - name: expected_max_toxicity
312
+ display_name: Average max toxicity
313
+ short_display_name: Avg. max toxicity
314
+ lower_is_better: true
315
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
316
+ - name: max_toxicity_probability
317
+ display_name: max toxicity
318
+ lower_is_better: true
319
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
320
+ - name: toxic_frac
321
+ display_name: Toxic fraction
322
+ lower_is_better: true
323
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
324
+
325
+ # Efficiency metrics
326
+ - name: training_co2_cost
327
+ display_name: Estimated training emissions (kg CO2)
328
+ short_display_name: Training emissions (kg CO2)
329
+ lower_is_better: true
330
+ description: Estimate of the CO2 emissions from training the model.
331
+ - name: training_energy_cost
332
+ display_name: Estimated training energy cost (MWh)
333
+ short_display_name: Training energy (MWh)
334
+ lower_is_better: true
335
+ description: Estimate of the amount of energy used to train the model.
336
+ - name: inference_runtime
337
+ display_name: Observed inference runtime (s)
338
+ short_display_name: Observed inference time (s)
339
+ lower_is_better: true
340
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
341
+ - name: inference_idealized_runtime
342
+ display_name: Idealized inference runtime (s)
343
+ short_display_name: Idealized inference time (s)
344
+ lower_is_better: true
345
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
346
+ - name: inference_denoised_runtime
347
+ display_name: Denoised inference runtime (s)
348
+ short_display_name: Denoised inference time (s)
349
+ lower_is_better: true
350
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
351
+ - name: batch_size
352
+ display_name: Batch size
353
+ description: For batch jobs, how many requests are in a batch.
354
+
355
+ # Calibration metrics:
+ - name: max_prob
+ display_name: Max prob
+ description: Model's average confidence in its prediction (only computed for classification tasks)
+ lower_is_better: false
+ - name: ece_10_bin
+ display_name: 10-bin expected calibration error
+ short_display_name: ECE (10-bin)
+ lower_is_better: true
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+ - name: ece_1_bin
+ display_name: 1-bin expected calibration error
+ short_display_name: ECE (1-bin)
+ lower_is_better: true
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
+ - name: selective_cov_acc_area
+ display_name: Selective coverage-accuracy area
+ short_display_name: Selective Acc
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+ lower_is_better: false
+ - name: selective_acc@10
+ display_name: Accuracy at 10% coverage
+ short_display_name: Acc@10%
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+ lower_is_better: false
+ - name: platt_ece_10_bin
+ display_name: 10-bin Expected Calibration Error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (10-bin)
+ lower_is_better: true
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_ece_1_bin
+ display_name: 1-bin expected calibration error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (1-bin)
+ lower_is_better: true
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_coef
+ display_name: Platt Scaling Coefficient
+ short_display_name: Platt Coef
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+ - name: platt_intercept
+ display_name: Platt Scaling Intercept
+ short_display_name: Platt Intercept
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+
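The binned calibration and selective-prediction metrics above follow standard definitions. A rough illustration, assuming per-instance top-choice confidences and 0/1 correctness (not the package's implementation; the 1-bin ECE is simply the absolute gap between mean confidence and mean accuracy):

```python
# Illustrative sketch of equal-mass 10-bin ECE and accuracy at 10% coverage.
import numpy as np

def ece_equal_mass(confidences: np.ndarray, correct: np.ndarray, num_bins: int = 10) -> float:
    order = np.argsort(confidences)
    bins = np.array_split(order, num_bins)  # each bin holds roughly the same number of points
    total = len(confidences)
    ece = 0.0
    for idx in bins:
        if len(idx) == 0:
            continue
        gap = abs(confidences[idx].mean() - correct[idx].mean())
        ece += (len(idx) / total) * gap
    return ece

def selective_acc_at(confidences: np.ndarray, correct: np.ndarray, coverage: float = 0.1) -> float:
    k = max(1, int(round(coverage * len(confidences))))
    top = np.argsort(-confidences)[:k]  # keep only the most confident predictions
    return float(correct[top].mean())

conf = np.array([0.9, 0.8, 0.55, 0.95, 0.6])
corr = np.array([1, 1, 0, 1, 0], dtype=float)
print(ece_equal_mass(conf, corr), abs(conf.mean() - corr.mean()), selective_acc_at(conf, corr))
```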
+ - name: ehr_sql_total_predicted_answerable
+ display_name: Total Predicted Answerable
+ short_display_name: Total Pred Ans
+ description: Total number of questions predicted to be answerable by the model.
+ lower_is_better: false
+
+ - name: ehr_sql_total_ground_truth_answerable
+ display_name: Total Ground Truth Answerable
+ short_display_name: Total GT Ans
+ description: Total number of answerable questions in the ground truth.
+ lower_is_better: false
+
+ - name: medcalc_bench_accuracy
+ display_name: MedCalc Accuracy
+ short_display_name: MedCalc Accuracy
+ description: Category-dependent comparison. Exact match for the risk, severity, and diagnosis categories; for all other categories, the answer must fall within the allowed range.
+ lower_is_better: false
+
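The `medcalc_bench_accuracy` entry above describes a category-dependent comparison. A hedged sketch of that rule, where the range representation for numeric categories is an assumption made for illustration:

```python
# Illustrative sketch of the category-dependent comparison described for
# medcalc_bench_accuracy (not the package's implementation).
EXACT_MATCH_CATEGORIES = {"risk", "severity", "diagnosis"}

def medcalc_correct(category: str, prediction: str, truth) -> bool:
    if category.lower() in EXACT_MATCH_CATEGORIES:
        # Exact match (case-insensitive here for illustration).
        return prediction.strip().lower() == str(truth).strip().lower()
    lower, upper = truth  # assumed (lower, upper) range for numeric categories
    try:
        value = float(prediction)
    except ValueError:
        return False
    return lower <= value <= upper

print(medcalc_correct("risk", "High", "high"))          # True
print(medcalc_correct("dosage", "12.5", (12.0, 13.0)))  # True (hypothetical category)
```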
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ - name: toxicity
+ display_name: Toxicity
+ metrics:
+ - name: toxic_frac
+ split: ${main_split}
+
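The `${main_name}` and `${main_split}` placeholders in the metric groups above are resolved per run group from its `environment` block further down. A rough sketch of that substitution on plain dictionaries (illustrative only; the actual resolution happens inside HELM's summarization code):

```python
# Rough sketch of how ${main_name}/${main_split} could be filled in from a
# run group's environment block once the YAML has been loaded into dicts.
from string import Template

metric_groups = {
    "accuracy": [{"name": "${main_name}", "split": "${main_split}"}],
    "efficiency": [{"name": "inference_runtime", "split": "${main_split}"}],
}
run_group_env = {"main_name": "medcalc_bench_accuracy", "main_split": "test"}

resolved = {
    group: [
        {key: Template(value).substitute(run_group_env) for key, value in metric.items()}
        for metric in metrics
    ]
    for group, metrics in metric_groups.items()
}
print(resolved["accuracy"])  # [{'name': 'medcalc_bench_accuracy', 'split': 'test'}]
```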
+ ############################################################
+ run_groups:
+ - name: medhelm_scenarios
+ display_name: MedHELM Scenarios
+ description: Scenarios for the medical domain
+ category: All scenarios
+ subgroups:
+ - clinical_decision_support
+ - clinical_note_generation
+ - patient_communication
+ - medical_research
+ - administration_and_workflow
+
+ - name: clinical_decision_support
+ display_name: Clinical Decision Support
+ description: Scenarios for clinical decision support
+ category: Healthcare Task Categories
+ subgroups:
+ - medcalc_bench
+ - clear
+ - mtsamples_replicate
+ - medec
+ - ehrshot
+ - head_qa
+ - medbullets
+ - medalign
+ - shc_ptbm_med
+ - shc_sei_med
+
+ - name: clinical_note_generation
+ display_name: Clinical Note Generation
+ description: Scenarios for clinical note generation
+ category: Healthcare Task Categories
+ subgroups:
+ - dischargeme
+ - aci_bench
+ - mtsamples_procedures
+ - mimic_rrs
+ - mimic_bhc
+ - chw_care_plan
+
+ - name: patient_communication
+ display_name: Patient Communication and Education
+ description: Scenarios for patient communication and education
+ category: Healthcare Task Categories
+ subgroups:
+ - medication_qa
+ - starr_patient_instructions
+ - med_dialog
+ - shc_conf_med
+ - medi_qa
+ - mental_health
+
+ - name: medical_research
+ display_name: Medical Research Assistance
+ description: Scenarios for medical research assistance
+ category: Healthcare Task Categories
+ subgroups:
+ - pubmed_qa
+ - ehr_sql
+ - shc_bmt_med
+ - race_based_med
+ - n2c2_ct_matching
+
+ - name: administration_and_workflow
+ display_name: Administration and Workflow
+ description: Scenarios for administration and workflow
+ category: Healthcare Task Categories
+ subgroups:
+ - shc_gip_med
+ - mimiciv_billing_code
+ - shc_sequoia_med
+ - shc_cdi_med
+ - shc_ent_med
+
+ - name: medcalc_bench
+ display_name: MedCalc-Bench
+ description: A dataset that consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medcalc_bench_accuracy
+ main_split: test
+ taxonomy:
+ task: Computational reasoning
+ what: "Compute a specific medical value from a patient note"
+ who: "Clinician, Researcher"
+ when: "Any"
+ language: English
+
+ - name: medalign
+ display_name: MedAlign
+ short_display_name: MedAlign
+ description: A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medalign_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: "Answer questions and follow instructions over longitudinal EHR"
+ who: "Clinician, Researcher"
+ when: "Any"
+ language: English
+
+ - name: mtsamples_replicate
+ display_name: MTSamples
+ short_display_name: MTSamples
+ description: A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for the patient [(MTSamples, 2025)](https://mtsamples.com).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: mtsamples_replicate_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: "Generate treatment plans based on clinical notes"
+ who: "Clinician"
+ when: "Post-diagnosis"
+ language: English
+
+ - name: ehrshot
+ display_name: EHRSHOT
+ description: A dataset in which, given a patient record of EHR codes, the model classifies whether an event will occur at a future date [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: "Predict whether a medical event will occur in the future based on EHR codes"
+ who: "Clinician, Insurer"
+ when: "Future prediction"
+ language: English
+
+ - name: starr_patient_instructions
+ display_name: PatientInstruct
+ description: A dataset containing case details used to generate customized post-procedure patient instructions.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: starr_patient_instructions_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Generate customized post-procedure patient instructions
+ who: Clinician
+ when: Post-procedure
+ language: English
+
+ - name: clear
+ display_name: CLEAR
+ description: "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1)."
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Classify medical condition presence from patient notes
+ who: Clinician
+ when: Any
+ language: English
+
+ - name: race_based_med
+ display_name: RaceBias
+ description: A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Identify race-based bias in LLM-generated medical responses
+ who: Researcher
+ when: Any
+ language: English
+
+ - name: n2c2_ct_matching
+ display_name: N2C2-CT Matching
+ short_display_name: N2C2-CT
+ description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
+ who: Researcher
+ when: Pre-Trial
+ language: English
+
+ - name: med_dialog
+ display_name: MedDialog
+ short_display_name: MedDialog
+ description: A collection of doctor-patient conversations with corresponding summaries.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: med_dialog_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Generate summaries of doctor-patient conversations
+ who: Clinician
+ when: Any
+ language: English
+
+ - name: medi_qa
+ display_name: MEDIQA
+ description: A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medi_qa_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Retrieve and rank answers based on medical question understanding
+ who: Clinician, Medical Student
+ when: Any
+ language: English
+
+ - name: mental_health
+ display_name: MentalHealth
+ description: A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: mental_health_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Generate empathetic counseling responses in mental health conversations
+ who: Counselors, Patients
+ when: Any
+ language: English
+
+ - name: mimic_rrs
+ display_name: MIMIC-RRS
+ short_display_name: MIMIC-RRS
+ description: A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: mimic_rrs_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Generate radiology report summaries from findings sections
+ who: Radiologist
+ when: Post-imaging
+ language: English
+
+ - name: mimic_bhc
+ display_name: MIMIC-IV-BHC
+ short_display_name: MIMIC-BHC
+ description: A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: BERTScore-F
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Summarize the clinical note into a brief hospital course
+ who: Clinician
+ when: Upon hospital discharge
+ language: English
+
+ - name: mimiciv_billing_code
+ display_name: MIMIC-IV Billing Code
+ description: A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: mimiciv_billing_code_f1
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Predict ICD-10 billing codes from clinical discharge notes
+ who: Hospital Administrator
+ when: During or after patient discharge
+ language: English
+
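`mimiciv_billing_code_f1` above scores predicted ICD-10 codes against the gold codes. One plausible reading is a set-based F1 per instance; the sketch below is illustrative, and the exact scoring in the package may differ:

```python
# Hedged sketch of a set-based F1 over ICD-10 codes (assumed scoring, for illustration).
def billing_code_f1(predicted: set[str], gold: set[str]) -> float:
    if not predicted and not gold:
        return 1.0
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

print(billing_code_f1({"I10", "E11.9"}, {"I10", "E11.9", "N17.9"}))  # 0.8
```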
+ - name: dischargeme
+ display_name: DischargeMe
+ short_display_name: DischargeMe
+ description: DischargeMe is a discharge instruction and brief hospital course generation dataset collected from MIMIC-IV data, using only the discharge text and the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: dischargeme_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Generate discharge instructions from hospital notes
+ who: Clinician
+ when: Upon hospital discharge
+ language: English
+
+ - name: pubmed_qa
+ display_name: PubMedQA
+ description: A dataset that provides PubMed abstracts and asks associated yes/no/maybe questions.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Answer questions based on PubMed abstracts
+ who: Researcher
+ when: Any
+ language: English
+
+ - name: medec
+ display_name: Medec
+ description: A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medec_error_flag_accuracy
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Detect and correct errors in medical narratives
+ who: Researcher, Clinician
+ when: Any
+ language: English
+
+ - name: aci_bench
+ display_name: ACI-Bench
+ description: A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: aci_bench_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Extract and structure information from patient-doctor conversations
+ who: Clinician
+ when: Any
+ language: English
+
+ - name: chw_care_plan
+ display_name: NoteExtract
+ description: A dataset containing free-form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: chw_care_plan_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Convert general text care plans into structured formats
+ who: Clinician, Researcher
+ when: Any
+ language: English
+
+ - name: ehr_sql
+ display_name: EHRSQL
+ description: Given a natural language instruction, generate an SQL query that would be used in clinical research.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: ehr_sql_execution_accuracy
+ main_split: test
+ taxonomy:
+ task: Code generation
+ what: Generate SQL queries from natural language for clinical research
+ who: Researcher
+ when: Any
+ language: English
+
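The `ehr_sql_execution_accuracy` main metric above suggests an execution-based comparison. A hedged sketch of one such check over SQLite, comparing the rows returned by the predicted and reference queries (an illustration, not the annotator shipped in this package):

```python
# Hedged sketch of an execution-accuracy style check for generated SQL.
import sqlite3

def execution_match(db_path: str, predicted_sql: str, reference_sql: str) -> bool:
    with sqlite3.connect(db_path) as conn:
        try:
            predicted_rows = set(conn.execute(predicted_sql).fetchall())
        except sqlite3.Error:
            return False  # an unexecutable prediction counts as incorrect
        reference_rows = set(conn.execute(reference_sql).fetchall())
    return predicted_rows == reference_rows
```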
+ - name: head_qa
+ display_name: HeadQA
+ description: A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, Researcher
+ when: Any
+ language: English
+
+ - name: medbullets
+ display_name: Medbullets
+ description: A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, Researcher
+ when: Any
+ language: English
+
+ - name: mtsamples_procedures
+ display_name: MTSamples Procedures
+ description: A dataset that provides a patient note regarding an operation, with the objective of documenting the procedure.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: mtsamples_procedures_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: Document and extract information about medical procedures
+ who: Clinician, Researcher
+ when: Post-procedure
+ language: English
+
+ - name: medication_qa
+ display_name: MedicationQA
+ description: Consumer medication questions with reference answers.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medication_qa_accuracy
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Answer consumer medication-related questions
+ who: Patient, Pharmacist
+ when: Any
+ language: English
+
+ - name: shc_bmt_med
+ display_name: BMT-Status
+ description: A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Answer bone marrow transplant questions
+ who: Researcher
+ when: Any
+ language: English
+
+ - name: shc_gip_med
+ display_name: HospiceReferral
+ description: A dataset evaluating performance in identifying appropriate patient referrals to hospice care.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Assess hospice referral appropriateness
+ who: Hospital Administrator
+ when: End-of-care
+ language: English
+
+ - name: shc_cdi_med
+ display_name: CDI-QA
+ description: A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Answer verification questions from CDI notes
+ who: Hospital Administrator
+ when: Any
+ language: English
+
+ - name: shc_ent_med
+ display_name: ENT-Referral
+ description: A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Identify referrals for ENT specialists
+ who: Hospital Administrator
+ when: Any
+ language: English
+
+ - name: shc_sequoia_med
+ display_name: ClinicReferral
+ description: A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Provide answers on clinic referrals
+ who: Hospital Administrator
+ when: Pre-referral
+ language: English
+
+ - name: shc_conf_med
+ display_name: MedConfInfo
+ description: A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Identify sensitive health info in adolescent notes
+ who: Clinician
+ when: Any
+ language: English
+
+ - name: shc_ptbm_med
+ display_name: ADHD-Behavior
+ description: A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Classify clinician recommendations for ADHD behavior management
+ who: Clinician, Researcher
+ when: During Treatment
+ language: English
+
+ - name: shc_sei_med
+ display_name: ADHD-MedEffects
+ description: A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Detect ADHD medication side effect monitoring
+ who: Clinician, Caregiver
+ when: Early Intervention
+ language: English