crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
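To sanity-check a file-level listing like the one above locally, the following is a minimal sketch (not the registry's own tooling) that compares the file lists of two wheels using only Python's standard library. The local wheel filenames are assumptions; they would first be fetched with, e.g., `pip download crfm-helm==0.5.4 --no-deps` and likewise for 0.5.6.

```python
# Sketch: compare the file lists of two locally downloaded wheels (zip archives).
# Assumes both .whl files are already in the working directory (e.g. via pip download).
import zipfile

OLD_WHEEL = "crfm_helm-0.5.4-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "crfm_helm-0.5.6-py3-none-any.whl"  # assumed local filename


def wheel_files(path: str) -> set[str]:
    """Return the set of file paths stored inside a wheel."""
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())


old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print("Added:", len(new_files - old_files))
print("Removed:", len(old_files - new_files))
for name in sorted(new_files - old_files):
    print("+", name)
```

This reproduces only the added/removed file lists, not per-file line counts, which require diffing the archive members themselves.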
helm/benchmark/static/schema_medhelm.yaml (new file)
@@ -0,0 +1,1140 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+ - name: f1_score
74
+ display_name: F1
75
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
76
+ lower_is_better: false
77
+ - name: live_qa_score
78
+ display_name: Judge Score
79
+ description: LLM-as-judge score
80
+ lower_is_better: false
81
+ - name: medication_qa_score
82
+ display_name: Judge Score
83
+ description: LLM-as-judge score
84
+ lower_is_better: false
85
+ - name: quasi_exact_match
86
+ display_name: Quasi-exact match
87
+ short_display_name: EM
88
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
89
+ lower_is_better: false
90
+ - name: prefix_exact_match
91
+ display_name: Prefix exact match
92
+ short_display_name: PEM
93
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
94
+ lower_is_better: false
95
+ - name: quasi_prefix_exact_match
96
+ # TODO: should call this prefix_quasi_exact_match
97
+ display_name: Prefix quasi-exact match
98
+ short_display_name: PEM
99
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
100
+ lower_is_better: false
101
+ - name: logprob
102
+ display_name: Log probability
103
+ short_display_name: Logprob
104
+ description: Predicted output's average log probability (input's log prob for language modeling).
105
+ lower_is_better: false
106
+ - name: logprob_per_byte
107
+ display_name: Log probability / byte
108
+ short_display_name: Logprob/byte
109
+ description: Predicted output's average log probability normalized by the number of bytes.
110
+ lower_is_better: false
111
+ - name: bits_per_byte
112
+ display_name: Bits/byte
113
+ short_display_name: BPB
114
+ lower_is_better: true
115
+ description: Average number of bits per byte according to model probabilities.
116
+ - name: perplexity
117
+ display_name: Perplexity
118
+ short_display_name: PPL
119
+ lower_is_better: true
120
+ description: Perplexity of the output completion (effective branching factor per output token).
121
+ - name: rouge_1
122
+ display_name: ROUGE-1
123
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
124
+ lower_is_better: false
125
+ - name: rouge_2
126
+ display_name: ROUGE-2
127
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
128
+ lower_is_better: false
129
+ - name: rouge_l
130
+ display_name: ROUGE-L
131
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
132
+ lower_is_better: false
133
+ - name: bleu_1
134
+ display_name: BLEU-1
135
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
136
+ lower_is_better: false
137
+ - name: bleu_4
138
+ display_name: BLEU-4
139
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
140
+ lower_is_better: false
141
+ - name: medec_error_flag_accuracy
142
+ display_name: Medical Error Flag Accuracy
143
+ short_display_name: MedecFlagAcc
144
+ description: Measures how accurately the model identifies whether a clinical note contains an error (binary classification of correct/incorrect).
145
+ lower_is_better: false
146
+ - name: medec_error_sentence_accuracy
147
+ display_name: Medical Error Sentence Accuracy
148
+ short_display_name: MedecSentenceAcc
149
+ description: Measures how accurately the model identifies the specific erroneous sentence within a clinical note.
150
+ lower_is_better: false
151
+ - name: ehr_sql_precision_answerable
152
+ display_name: Precision for Answerable Questions
153
+ short_display_name: EHRSQLPreAns
154
+ description: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.
155
+ lower_is_better: false
156
+ - name: ehr_sql_recall_answerable
157
+ display_name: Recall for Answerable Questions
158
+ short_display_name: EHRSQLReAns
159
+ description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
160
+ lower_is_better: false
161
+ - name: mimiciv_billing_code_precision
162
+ display_name: Precision for MIMIC Billing Codes
163
+ short_display_name: MIMICBillingPre
164
+ description: Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by the model.
165
+ lower_is_better: false
166
+ - name: mimiciv_billing_code_recall
167
+ display_name: Recall for MIMIC Billing Codes
168
+ short_display_name: MIMICBillingRec
169
+ description: Measures the proportion of correctly predicted ICD codes among all ICD codes present in the gold standard.
170
+ lower_is_better: false
171
+ - name: mimiciv_billing_code_f1
172
+ display_name: F1 Score for MIMIC Billing Codes
173
+ short_display_name: MIMICBillingF1
174
+ description: Measures the harmonic mean of precision and recall for ICD codes, providing a balanced evaluation of the model's performance.
175
+ lower_is_better: false
176
+ - name: exact_match@5
177
+ display_name: Exact match @5
178
+ short_display_name: EM@5
179
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
180
+ lower_is_better: false
181
+ - name: quasi_exact_match@5
182
+ display_name: Quasi-exact match @5
183
+ short_display_name: EM@5
184
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
185
+ lower_is_better: false
186
+ - name: prefix_exact_match@5
187
+ display_name: Prefix exact match @5
188
+ short_display_name: PEM@5
189
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
190
+ lower_is_better: false
191
+ - name: quasi_prefix_exact_match@5
192
+ display_name: Prefix quasi-exact match @5
193
+ short_display_name: PEM@5
194
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
195
+ lower_is_better: false
196
+ - name: ehr_sql_execution_accuracy
197
+ display_name: Execution accuracy for Generated Query
198
+ short_display_name: EHRSQLExeAcc
199
+ description: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.
200
+ lower_is_better: false
201
+ - name: ehr_sql_query_validity
202
+ display_name: Validity of Generated Query
203
+ short_display_name: EHRSQLQueryValid
204
+ description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
205
+ lower_is_better: false
206
+ - name: aci_bench_accuracy
207
+ display_name: ACI-Bench Jury Score
208
+ short_display_name: Jury Score
209
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
210
+ lower_is_better: false
211
+ - name: mtsamples_replicate_accuracy
212
+ display_name: MTSamples Replicate Jury Score
213
+ short_display_name: Jury Score
214
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
215
+ lower_is_better: false
216
+ - name: medalign_accuracy
217
+ display_name: Medalign Jury Score
218
+ short_display_name: Jury Score
219
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
220
+ lower_is_better: false
221
+ - name: dischargeme_accuracy
222
+ display_name: DischargeMe Jury Score
223
+ short_display_name: Jury Score
224
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
225
+ lower_is_better: false
226
+ - name: mtsamples_procedures_accuracy
227
+ display_name: MTSamples Procedures Jury Score
228
+ short_display_name: Jury Score
229
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
230
+ lower_is_better: false
231
+ - name: mimic_rrs_accuracy
232
+ display_name: MIMIC-RRS Jury Score
233
+ short_display_name: Jury Score
234
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
235
+ lower_is_better: false
236
+ - name: mimic_bhc_accuracy
237
+ display_name: MIMIC-BHC Jury Score
238
+ short_display_name: Jury Score
239
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
240
+ lower_is_better: false
241
+ - name: chw_care_plan_accuracy
242
+ display_name: NoteExtract Jury Score
243
+ short_display_name: Jury Score
244
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
245
+ lower_is_better: false
246
+ - name: medication_qa_accuracy
247
+ display_name: MedicationQA Jury Score
248
+ short_display_name: Jury Score
249
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
250
+ lower_is_better: false
251
+ - name: starr_patient_instructions_accuracy
252
+ display_name: PatientInstruct Jury Score
253
+ short_display_name: Jury Score
254
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
255
+ lower_is_better: false
256
+ - name: med_dialog_accuracy
257
+ display_name: MedDialog Jury Score
258
+ short_display_name: Jury Score
259
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
260
+ lower_is_better: false
261
+ - name: medi_qa_accuracy
262
+ display_name: MediQA Jury Score
263
+ short_display_name: Jury Score
264
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
265
+ lower_is_better: false
266
+ - name: mental_health_accuracy
267
+ display_name: MentalHealth Jury Score
268
+ short_display_name: Jury Score
269
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
270
+ lower_is_better: false
271
+
272
+ # Summariazation metrics
273
+ - name: summac
274
+ display_name: SummaC
275
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
276
+ lower_is_better: false
277
+ - name: QAFactEval
278
+ display_name: QAFactEval
279
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
280
+ lower_is_better: false
281
+ - name: summarization_coverage
282
+ display_name: Coverage
283
+ description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
284
+ - name: summarization_density
285
+ display_name: Density
286
+ description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
287
+ - name: summarization_compression
288
+ display_name: Compression
289
+ description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
290
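+ # Note (per Grusky et al., 2018): coverage is the fraction of summary words that
+ # lie inside extractive fragments shared with the source, density is the average
+ # squared fragment length per summary word, and compression is the ratio of source
+ # length to summary length; e.g., a summary copied verbatim from the source has
+ # coverage 1.0 and density equal to its own length.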
+ - name: BERTScore-P
291
+ display_name: BERTScore (P)
292
+ description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
293
+ lower_is_better: false
294
+ - name: BERTScore-R
295
+ display_name: BERTScore (R)
296
+ description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
297
+ lower_is_better: false
298
+ - name: BERTScore-F
299
+ display_name: BERTScore (F1)
300
+ description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
301
+ lower_is_better: false
302
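+ # Note: BERTScore greedily matches each token's contextual embedding to the most
+ # similar token on the other side; precision averages these similarities over
+ # generated tokens, recall over reference tokens, and F1 is their harmonic mean.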
+ - name: HumanEval-faithfulness
303
+ display_name: HumanEval-faithfulness
304
+ description: Human evaluation score for faithfulness.
305
+ lower_is_better: false
306
+ - name: HumanEval-relevance
307
+ display_name: HumanEval-relevance
308
+ description: Human evaluation score for relevance.
309
+ lower_is_better: false
310
+ - name: HumanEval-coherence
311
+ display_name: HumanEval-coherence
312
+ description: Human evaluation score for coherence.
313
+ lower_is_better: false
314
+
315
+ # Toxicity metrics
316
+ - name: expected_max_toxicity
317
+ display_name: Average max toxicity
318
+ short_display_name: Avg. max toxicity
319
+ lower_is_better: true
320
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
321
+ - name: max_toxicity_probability
322
+ display_name: Max toxicity probability
323
+ lower_is_better: true
324
+ description: Fraction of instances where the most toxic completion has toxicity above 0.5 (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
325
+ - name: toxic_frac
326
+ display_name: Toxic fraction
327
+ lower_is_better: true
328
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
329
+
330
+ # Efficiency metrics
331
+ - name: training_co2_cost
332
+ display_name: Estimated training emissions (kg CO2)
333
+ short_display_name: Training emissions (kg CO2)
334
+ lower_is_better: true
335
+ description: Estimate of the CO2 emissions from training the model.
336
+ - name: training_energy_cost
337
+ display_name: Estimated training energy cost (MWh)
338
+ short_display_name: Training energy (MWh)
339
+ lower_is_better: true
340
+ description: Estimate of the amount of energy used to train the model.
341
+ - name: inference_runtime
342
+ display_name: Observed inference runtime (s)
343
+ short_display_name: Observed inference time (s)
344
+ lower_is_better: true
345
+ description: Average observed time to process a request to the model (via an API, and thus depends on the particular deployment).
346
+ - name: inference_idealized_runtime
347
+ display_name: Idealized inference runtime (s)
348
+ short_display_name: Idealized inference time (s)
349
+ lower_is_better: true
350
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
351
+ - name: inference_denoised_runtime
352
+ display_name: Denoised inference runtime (s)
353
+ short_display_name: Denoised inference time (s)
354
+ lower_is_better: true
355
+ description: Average time to process a request to the model, with performance contention removed by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
356
+ - name: batch_size
357
+ display_name: Batch size
358
+ description: For batch jobs, how many requests are in a batch.
359
+
360
+ # Calibration metrics:
361
+ - name: max_prob
362
+ display_name: Max prob
363
+ description: Model's average confidence in its prediction (only computed for classification tasks).
364
+ lower_is_better: false
365
+ - name: ece_10_bin
366
+ display_name: 10-bin expected calibration error
367
+ short_display_name: ECE (10-bin)
368
+ lower_is_better: true
369
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
370
+ - name: ece_1_bin
371
+ display_name: 1-bin expected calibration error
372
+ short_display_name: ECE (1-bin)
373
+ lower_is_better: true
374
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
375
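+ # Worked example of the definitions above: if average confidence is 0.85 and
+ # accuracy is 0.70, the 1-bin ECE is |0.85 - 0.70| = 0.15; the 10-bin variant
+ # computes the same gap within 10 equal-mass confidence bins and averages them.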
+ - name: selective_cov_acc_area
376
+ display_name: Selective coverage-accuracy area
377
+ short_display_name: Selective Acc
378
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
379
+ lower_is_better: false
380
+ - name: selective_acc@10
381
+ display_name: Accuracy at 10% coverage
382
+ short_display_name: Acc@10%
383
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
384
+ lower_is_better: false
385
+ - name: platt_ece_10_bin
386
+ display_name: 10-bin expected calibration error (after Platt scaling)
387
+ short_display_name: Platt-scaled ECE (10-bin)
388
+ lower_is_better: true
389
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
390
+ - name: platt_ece_1_bin
391
+ display_name: 1-bin expected calibration error (after Platt scaling)
392
+ short_display_name: Platt-scaled ECE (1-bin)
393
+ lower_is_better: true
394
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
395
+ - name: platt_coef
396
+ display_name: Platt Scaling Coefficient
397
+ short_display_name: Platt Coef
398
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
399
+ lower_is_better: false
400
+ - name: platt_intercept
401
+ display_name: Platt Scaling Intercept
402
+ short_display_name: Platt Intercept
403
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
404
+ lower_is_better: false
405
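+ # Note (illustrative, not the exact implementation): Platt scaling fits a logistic
+ # map p' = sigmoid(a * x + b) from the model's confidence x to a recalibrated
+ # probability p'; platt_coef and platt_intercept report the fitted a and b, and
+ # the platt_ece_* metrics recompute ECE on p'.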
+
406
+ - name: ehr_sql_total_predicted_answerable
407
+ display_name: Total Predicted Answerable
408
+ short_display_name: Total Pred Ans
409
+ description: Total number of questions predicted to be answerable by the model.
410
+ lower_is_better: false
411
+
412
+ - name: ehr_sql_total_ground_truth_answerable
413
+ display_name: Total Ground Truth Answerable
414
+ short_display_name: Total GT Ans
415
+ description: Total number of answerable questions in the ground truth.
416
+ lower_is_better: false
417
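+ # Note: together with ehr_sql_execution_accuracy (the main EHRSQL metric configured
+ # below), these answerability counts allow comparing how often the model predicts a
+ # question is answerable with how often it actually is.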
+
418
+ - name: medcalc_bench_accuracy
419
+ display_name: MedCalc Accuracy
420
+ short_display_name: MedCalc Accuracy
421
+ description: Category-dependent comparison; exact match for the risk, severity, and diagnosis categories, and a within-range check for the other categories.
422
+ lower_is_better: false
423
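+ # Illustrative reading of the rule above: a predicted risk, severity, or diagnosis
+ # label must exactly match the reference, while a predicted numeric value counts as
+ # correct if it falls within the ground-truth range (assumed to accompany each instance).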
+
424
+ ############################################################
425
+ perturbations: []
426
+
427
+ ############################################################
428
+ metric_groups:
429
+ - name: accuracy
430
+ display_name: Accuracy
431
+ metrics:
432
+ - name: ${main_name}
433
+ split: ${main_split}
434
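+ # Note: ${main_name} and ${main_split} are placeholders resolved per run group from
+ # its environment block below; e.g., the medcalc_bench run group sets
+ # main_name: medcalc_bench_accuracy and main_split: test.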
+
435
+ - name: efficiency
436
+ display_name: Efficiency
437
+ metrics:
438
+ - name: inference_runtime
439
+ split: ${main_split}
440
+
441
+ - name: general_information
442
+ display_name: General information
443
+ hide_win_rates: true
444
+ metrics:
445
+ - name: num_instances
446
+ split: ${main_split}
447
+ - name: num_train_instances
448
+ split: ${main_split}
449
+ - name: prompt_truncated
450
+ split: ${main_split}
451
+ - name: num_prompt_tokens
452
+ split: ${main_split}
453
+ - name: num_output_tokens
454
+ split: ${main_split}
455
+
456
+ - name: toxicity
457
+ display_name: Toxicity
458
+ metrics:
459
+ - name: toxic_frac
460
+ split: ${main_split}
461
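+ # Note: the run groups below opt into these metric groups by listing their names
+ # under metric_groups (most MedHELM run groups use accuracy, efficiency, and
+ # general_information).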
+
462
+ ############################################################
463
+ run_groups:
464
+ - name: medhelm_scenarios
465
+ display_name: MedHELM Scenarios
466
+ description: Scenarios for the medical domain
467
+ category: All scenarios
468
+ subgroups:
469
+ - clinical_decision_support
470
+ - clinical_note_generation
471
+ - patient_communication
472
+ - medical_research
473
+ - administration_and_workflow
474
+
475
+ - name: clinical_decision_support
476
+ display_name: Clinical Decision Support
477
+ description: Scenarios for clinical decision support
478
+ category: Healthcare Task Categories
479
+ subgroups:
480
+ - medcalc_bench
481
+ - clear
482
+ - mtsamples_replicate
483
+ - medec
484
+ - ehrshot
485
+ - head_qa
486
+ - medbullets
487
+ - medalign
488
+ - shc_ptbm_med
489
+ - shc_sei_med
490
+
491
+ - name: clinical_note_generation
492
+ display_name: Clinical Note Generation
493
+ description: Scenarios for clinical note generation
494
+ category: Healthcare Task Categories
495
+ subgroups:
496
+ - dischargeme
497
+ - aci_bench
498
+ - mtsamples_procedures
499
+ - mimic_rrs
500
+ - mimic_bhc
501
+ - chw_care_plan
502
+
503
+ - name: patient_communication
504
+ display_name: Patient Communication and Education
505
+ description: Scenarios for patient communication and education
506
+ category: Healthcare Task Categories
507
+ subgroups:
508
+ - medication_qa
509
+ - starr_patient_instructions
510
+ - med_dialog
511
+ - shc_conf_med
512
+ - medi_qa
513
+ - mental_health
514
+ - shc_proxy_med
515
+ - shc_privacy_med
516
+
517
+ - name: medical_research
518
+ display_name: Medical Research Assistance
519
+ description: Scenarios for medical research assistance
520
+ category: Healthcare Task Categories
521
+ subgroups:
522
+ - pubmed_qa
523
+ - ehr_sql
524
+ - shc_bmt_med
525
+ - race_based_med
526
+ - n2c2_ct_matching
527
+ - medhallu
528
+
529
+ - name: administration_and_workflow
530
+ display_name: Administration and Workflow
531
+ description: Scenarios for administration and workflow
532
+ category: Healthcare Task Categories
533
+ subgroups:
534
+ - shc_gip_med
535
+ - mimiciv_billing_code
536
+ - shc_sequoia_med
537
+ - shc_cdi_med
538
+ - shc_ent_med
539
+
540
+ - name: medcalc_bench
541
+ display_name: MedCalc-Bench
542
+ description: MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute clinically relevant values from patient notes. Each instance consists of a clinical note describing the patient's condition, a diagnostic question targeting a specific medical value, and a ground truth response. [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
543
+ metric_groups:
544
+ - accuracy
545
+ - efficiency
546
+ - general_information
547
+ environment:
548
+ main_name: medcalc_bench_accuracy
549
+ main_split: test
550
+ taxonomy:
551
+ task: Computational reasoning
552
+ what: "Compute a specific medical value from a patient note"
553
+ who: "Clinician, Researcher"
554
+ when: "Any"
555
+ language: English
556
+
557
+ - name: clear
558
+ display_name: CLEAR
559
+ description: CLEAR is a benchmark designed to evaluate models on their ability to detect medical conditions from patient notes using categorical responses. Each instance consists of a clinical note and a target condition, requiring the model to classify the patient's history as affirmative, negative, or uncertain [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).
560
+ metric_groups:
561
+ - accuracy
562
+ - efficiency
563
+ - general_information
564
+ environment:
565
+ main_name: exact_match
566
+ main_split: test
567
+ taxonomy:
568
+ task: Classification
569
+ what: Classify medical condition presence from patient notes
570
+ who: Clinician
571
+ when: Any
572
+ language: English
573
+
574
+ - name: mtsamples_replicate
575
+ display_name: MTSamples
576
+ short_display_name: MTSamples
577
+ description: MTSamples Replicate is a benchmark that provides transcribed medical reports from various specialties. It is used to evaluate a model's ability to generate clinically appropriate treatment plans based on unstructured patient documentation [(MTSamples, 2025)](https://mtsamples.com).
578
+ metric_groups:
579
+ - accuracy
580
+ - efficiency
581
+ - general_information
582
+ environment:
583
+ main_name: mtsamples_replicate_accuracy
584
+ main_split: test
585
+ taxonomy:
586
+ task: Text generation
587
+ what: "Generate treatment plans based on clinical notes"
588
+ who: "Clinician"
589
+ when: "Post-diagnosis"
590
+ language: English
591
+
592
+ - name: medec
593
+ display_name: Medec
594
+ description: Medec is a benchmark composed of clinical narratives that include either correct documentation or medical errors. Each entry includes sentence-level identifiers and an associated correction task. The model must review the narrative and either identify the erroneous sentence and correct it, or confirm that the text is entirely accurate [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
595
+ metric_groups:
596
+ - accuracy
597
+ - efficiency
598
+ - general_information
599
+ environment:
600
+ main_name: medec_error_flag_accuracy
601
+ main_split: test
602
+ taxonomy:
603
+ task: Classification
604
+ what: Detect and correct errors in medical narratives
605
+ who: Researcher, Clinician
606
+ when: Any
607
+ language: English
608
+
609
+ - name: ehrshot
610
+ display_name: EHRSHOT
611
+ description: EHRSHOT is a benchmark designed to evaluate a model's ability to predict future clinical events using structured EHR code sequences. Each instance contains a patient's historical EHR data and a forward-looking clinical question about whether a particular diagnosis, lab result, or hospital event will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
612
+ metric_groups:
613
+ - accuracy
614
+ - efficiency
615
+ - general_information
616
+ environment:
617
+ main_name: exact_match
618
+ main_split: test
619
+ taxonomy:
620
+ task: Classification
621
+ what: "Predict whether a medical event will occur in the future based on EHR codes"
622
+ who: "Clinician, Insurer"
623
+ when: "Future prediction"
624
+ language: English
625
+
626
+ - name: head_qa
627
+ display_name: HeadQA
628
+ description: HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to evaluate a model's medical knowledge and reasoning. Each instance presents a clinical or scientific question with four answer options, requiring the model to select the most appropriate answer [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
629
+ metric_groups:
630
+ - accuracy
631
+ - efficiency
632
+ - general_information
633
+ environment:
634
+ main_name: exact_match
635
+ main_split: test
636
+ taxonomy:
637
+ task: Question answering
638
+ what: Medical knowledge testing
639
+ who: Medical student, Researcher
640
+ when: Any
641
+ language: English
642
+
643
+ - name: medbullets
644
+ display_name: Medbullets
645
+ description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
646
+ metric_groups:
647
+ - accuracy
648
+ - efficiency
649
+ - general_information
650
+ environment:
651
+ main_name: exact_match
652
+ main_split: test
653
+ taxonomy:
654
+ task: Question answering
655
+ what: Medical knowledge testing
656
+ who: Medical student, Researcher
657
+ when: Any
658
+ language: English
659
+
660
+ - name: medalign
661
+ display_name: MedAlign
662
+ short_display_name: MedAlign
663
+ description: MedAlign is a benchmark that evaluates a model's ability to interpret and follow instructions grounded in longitudinal electronic health records (EHR). Each instance includes an event-stream style patient record and a natural language question or task, requiring clinically informed reading comprehension and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
664
+ metric_groups:
665
+ - accuracy
666
+ - efficiency
667
+ - general_information
668
+ environment:
669
+ main_name: medalign_accuracy
670
+ main_split: test
671
+ taxonomy:
672
+ task: Text generation
673
+ what: "Answer questions and follow instructions over longitudinal EHR"
674
+ who: "Clinician, Researcher"
675
+ when: "Any"
676
+ language: English
677
+
678
+ - name: shc_ptbm_med
679
+ display_name: ADHD-Behavior
680
+ description: ADHD-Behavior is a benchmark that evaluates a model's ability to detect whether a clinician recommends parent training in behavior management, an evidence-based first-line treatment for young children diagnosed with ADHD. Each instance includes a clinical note from a pediatric visit and a binary classification task [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
681
+ metric_groups:
682
+ - accuracy
683
+ - efficiency
684
+ - general_information
685
+ environment:
686
+ main_name: exact_match
687
+ main_split: test
688
+ taxonomy:
689
+ task: Classification
690
+ what: Classify clinician recommendations for ADHD behavior management
691
+ who: Clinician, Caregiver
692
+ when: Early Intervention
693
+ language: English
694
+
695
+ - name: shc_sei_med
696
+ display_name: ADHD-MedEffects
697
+ description: ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for pediatric ADHD visits document medication side effect monitoring, which is a key recommendation in clinical practice guidelines. The dataset supports binary classification to detect presence or absence of side effect inquiries (SEI) within notes [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
698
+ metric_groups:
699
+ - accuracy
700
+ - efficiency
701
+ - general_information
702
+ environment:
703
+ main_name: exact_match
704
+ main_split: test
705
+ taxonomy:
706
+ task: Classification
707
+ what: Detect ADHD medication side effect monitoring
708
+ who: Clinician, Researcher
709
+ when: During Treatment
710
+ language: English
711
+
712
+ - name: dischargeme
713
+ display_name: DischargeMe
714
+ short_display_name: DischargeMe
715
+ description: DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs discharge summaries and radiology reports from MIMIC-IV with generation tasks such as writing discharge instructions or summarizing the brief hospital course. The benchmark assesses a model's ability to generate patient-facing documentation that is complete, empathetic, and clinically accurate [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
716
+ metric_groups:
717
+ - accuracy
718
+ - efficiency
719
+ - general_information
720
+ environment:
721
+ main_name: dischargeme_accuracy
722
+ main_split: test
723
+ taxonomy:
724
+ task: Text generation
725
+ what: Generate discharge instructions from hospital notes
726
+ who: Clinician
727
+ when: Upon hospital discharge
728
+ language: English
729
+
730
+ - name: aci_bench
731
+ display_name: ACI-Bench
732
+ description: ACI-Bench is a benchmark of real-world patient-doctor conversations paired with structured clinical notes. The benchmark evaluates a model's ability to understand spoken medical dialogue and convert it into formal clinical documentation, covering sections such as history of present illness, physical exam findings, results, and assessment and plan [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
733
+ metric_groups:
734
+ - accuracy
735
+ - efficiency
736
+ - general_information
737
+ environment:
738
+ main_name: aci_bench_accuracy
739
+ main_split: test
740
+ taxonomy:
741
+ task: Text generation
742
+ what: Extract and structure information from patient-doctor conversations
743
+ who: Clinician
744
+ when: Any
745
+ language: English
746
+
747
+ - name: mtsamples_procedures
748
+ display_name: MTSamples Procedures
749
+ description: MTSamples Procedures is a benchmark composed of transcribed operative notes, focused on documenting surgical procedures. Each example presents a brief patient case involving a surgical intervention, and the model is tasked with generating a coherent and clinically accurate procedural summary or treatment plan.
750
+ metric_groups:
751
+ - accuracy
752
+ - efficiency
753
+ - general_information
754
+ environment:
755
+ main_name: mtsamples_procedures_accuracy
756
+ main_split: test
757
+ taxonomy:
758
+ task: Text generation
759
+ what: Document and extract information about medical procedures
760
+ who: Clinician, Researcher
761
+ when: Post-procedure
762
+ language: English
763
+
764
+ - name: mimic_rrs
765
+ display_name: MIMIC-RRS
766
+ short_display_name: MIMIC-RRS
767
+ description: MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III database. It contains pairs of "Findings" and "Impression" sections, enabling evaluation of a model's ability to summarize diagnostic imaging observations into concise, clinically relevant conclusions [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
768
+ metric_groups:
769
+ - accuracy
770
+ - efficiency
771
+ - general_information
772
+ environment:
773
+ main_name: mimic_rrs_accuracy
774
+ main_split: test
775
+ taxonomy:
776
+ task: Text generation
777
+ what: Generate radiology report summaries from findings sections
778
+ who: Radiologist
779
+ when: Post-imaging
780
+ language: English
781
+
782
+ - name: mimic_bhc
783
+ display_name: MIMIC-IV-BHC
784
+ short_display_name: MIMIC-BHC
785
+ description: MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark evaluates a model's ability to condense detailed clinical information into accurate, concise summaries that reflect the patient's hospital stay [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
786
+ metric_groups:
787
+ - accuracy
788
+ - efficiency
789
+ - general_information
790
+ environment:
791
+ main_name: mimic_bhc_accuracy
792
+ main_split: test
793
+ taxonomy:
794
+ task: Text generation
795
+ what: Summarize the clinical note into a brief hospital course
796
+ who: Clinician
797
+ when: Upon hospital discharge
798
+ language: English
799
+
800
+ - name: chw_care_plan
801
+ display_name: NoteExtract
802
+ description: NoteExtract is a benchmark that focuses on the structured extraction of information from free-form clinical text. It provides care plan notes authored by health workers and evaluates a model's ability to convert them into a predefined structured format, such as fields for Chief Complaint and History of Present Illness. The benchmark emphasizes faithful extraction without hallucination or inference.
803
+ metric_groups:
804
+ - accuracy
805
+ - efficiency
806
+ - general_information
807
+ environment:
808
+ main_name: chw_care_plan_accuracy
809
+ main_split: test
810
+ taxonomy:
811
+ task: Text generation
812
+ what: Convert general text care plans into structured formats
813
+ who: Clinician, Researcher
814
+ when: Any
815
+ language: English
816
+
817
+ - name: medication_qa
818
+ display_name: MedicationQA
819
+ description: MedicationQA is a benchmark composed of open-ended consumer health questions specifically focused on medications. Each example consists of a free-form question and a corresponding medically grounded answer. The benchmark evaluates a model's ability to provide accurate, accessible, and informative medication-related responses for a lay audience.
820
+ metric_groups:
821
+ - accuracy
822
+ - efficiency
823
+ - general_information
824
+ environment:
825
+ main_name: medication_qa_accuracy
826
+ main_split: test
827
+ taxonomy:
828
+ task: Question answering
829
+ what: Answer consumer medication-related questions
830
+ who: Patient, Pharmacist
831
+ when: Any
832
+ language: English
833
+
834
+ - name: starr_patient_instructions
835
+ display_name: PatientInstruct
836
+ description: PatientInstruct is a benchmark designed to evaluate models on generating personalized post-procedure instructions for patients. It includes real-world clinical case details, such as diagnosis, planned procedures, and history and physical notes, from which models must produce clear, actionable instructions appropriate for patients recovering from medical interventions.
837
+ metric_groups:
838
+ - accuracy
839
+ - efficiency
840
+ - general_information
841
+ environment:
842
+ main_name: starr_patient_instructions_accuracy
843
+ main_split: test
844
+ taxonomy:
845
+ task: Text generation
846
+ what: Generate customized post-procedure patient instructions
847
+ who: Clinician
848
+ when: Post-procedure
849
+ language: English
850
+
851
+ - name: med_dialog
852
+ display_name: MedDialog
853
+ short_display_name: MedDialog
854
+ description: MedDialog is a benchmark of real-world doctor-patient conversations focused on health-related concerns and advice. Each dialogue is paired with a one-sentence summary that reflects the core patient question or exchange. The benchmark evaluates a model's ability to condense medical dialogue into concise, informative summaries.
855
+ metric_groups:
856
+ - accuracy
857
+ - efficiency
858
+ - general_information
859
+ environment:
860
+ main_name: med_dialog_accuracy
861
+ main_split: test
862
+ taxonomy:
863
+ task: Text generation
864
+ what: Generate summaries of doctor-patient conversations
865
+ who: Clinician
866
+ when: Any
867
+ language: English
868
+
869
+ - name: shc_conf_med
870
+ display_name: MedConfInfo
871
+ description: MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is used to evaluate whether the content contains sensitive protected health information (PHI) that should be restricted from parental access, in accordance with adolescent confidentiality policies in clinical care [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
872
+ metric_groups:
873
+ - accuracy
874
+ - efficiency
875
+ - general_information
876
+ environment:
877
+ main_name: exact_match
878
+ main_split: test
879
+ taxonomy:
880
+ task: Classification
881
+ what: Identify sensitive health info in adolescent notes
882
+ who: Clinician
883
+ when: Any
884
+ language: English
885
+
886
+ - name: medi_qa
887
+ display_name: MEDIQA
888
+ description: MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate medically accurate answers to patient-generated questions. Each instance includes a consumer health question, a set of candidate answers (used in ranking tasks), relevance annotations, and optionally, additional context. The benchmark focuses on supporting patient understanding and accessibility in health communication.
889
+ metric_groups:
890
+ - accuracy
891
+ - efficiency
892
+ - general_information
893
+ environment:
894
+ main_name: medi_qa_accuracy
895
+ main_split: test
896
+ taxonomy:
897
+ task: Text generation
898
+ what: Retrieve and rank answers based on medical question understanding
899
+ who: Clinician, Medical Student
900
+ when: Any
901
+ language: English
902
+
903
+ - name: mental_health
904
+ display_name: MentalHealth
905
+ description: MentalHealth is a benchmark focused on evaluating empathetic communication in mental health counseling. It includes real or simulated conversations between patients and counselors, where the task is to generate compassionate and appropriate counselor responses. The benchmark assesses a model's ability to support patients emotionally and meaningfully engage in therapeutic conversations.
906
+ metric_groups:
907
+ - accuracy
908
+ - efficiency
909
+ - general_information
910
+ environment:
911
+ main_name: mental_health_accuracy
912
+ main_split: test
913
+ taxonomy:
914
+ task: Text generation
915
+ what: Generate empathetic counseling responses in mental health conversations
916
+ who: Counselors, Patients
917
+ when: Any
918
+ language: English
919
+
920
+ - name: shc_proxy_med
921
+ display_name: ProxySender
922
+ description: ProxySender is a benchmark composed of patient portal messages received by clinicians. It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent, spouse), which is critical for understanding who is communicating with healthcare providers [(Tse et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
923
+ metric_groups:
924
+ - accuracy
925
+ - efficiency
926
+ - general_information
927
+ environment:
928
+ main_name: exact_match
929
+ main_split: test
930
+ taxonomy:
931
+ task: Classification
932
+ what: Classify if a document was sent by a proxy user
933
+ who: Clinician, Caregiver
934
+ when: Any
935
+ language: English
936
+
937
+ - name: shc_privacy_med
938
+ display_name: PrivacyDetection
939
+ description: PrivacyDetection is a benchmark composed of patient portal messages submitted by patients or caregivers. The task is to determine whether the message contains any confidential or privacy-leaking information that should be protected [(Tse et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
940
+ metric_groups:
941
+ - accuracy
942
+ - efficiency
943
+ - general_information
944
+ environment:
945
+ main_name: exact_match
946
+ main_split: test
947
+ taxonomy:
948
+ task: Classification
949
+ what: Classify if a document leaks private information
950
+ who: Clinician, Caregiver
951
+ when: Any
952
+ language: English
953
+
954
+ - name: pubmed_qa
955
+ display_name: PubMedQA
956
+ description: PubMedQA is a biomedical question-answering dataset that evaluates a model's ability to interpret scientific literature. It consists of PubMed abstracts paired with yes/no/maybe questions derived from the content. The benchmark assesses a model's capability to reason over biomedical texts and provide factually grounded answers.
957
+ metric_groups:
958
+ - accuracy
959
+ - efficiency
960
+ - general_information
961
+ environment:
962
+ main_name: exact_match
963
+ main_split: test
964
+ taxonomy:
965
+ task: Question answering
966
+ what: Answer questions based on PubMed abstracts
967
+ who: Researcher
968
+ when: Any
969
+ language: English
970
+
971
+ - name: ehr_sql
972
+ display_name: EHRSQL
973
+ description: EHRSQL is a benchmark designed to evaluate models on generating structured queries for clinical research. Each example includes a natural language question and a database schema, and the task is to produce an SQL query that would return the correct result for a biomedical research objective. This benchmark assesses a model's understanding of medical terminology, data structures, and query construction.
974
+ metric_groups:
975
+ - accuracy
976
+ - efficiency
977
+ - general_information
978
+ environment:
979
+ main_name: ehr_sql_execution_accuracy
980
+ main_split: test
981
+ taxonomy:
982
+ task: Code generation
983
+ what: Generate SQL queries from natural language for clinical research
984
+ who: Researcher
985
+ when: Any
986
+ language: English
987
+
988
+ - name: shc_bmt_med
989
+ display_name: BMT-Status
990
+ description: BMT-Status is a benchmark composed of clinical notes and associated binary questions related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is to determine whether the patient received a subsequent transplant based on the provided clinical documentation.
991
+ metric_groups:
992
+ - accuracy
993
+ - efficiency
994
+ - general_information
995
+ environment:
996
+ main_name: exact_match
997
+ main_split: test
998
+ taxonomy:
999
+ task: Question answering
1000
+ what: Answer bone marrow transplant questions
1001
+ who: Researcher
1002
+ when: Any
1003
+ language: English
1004
+
1005
+ - name: race_based_med
1006
+ display_name: RaceBias
1007
+ description: RaceBias is a benchmark used to evaluate language models for racially biased or inappropriate content in medical question-answering scenarios. Each instance consists of a medical question and a model-generated response. The task is to classify whether the response contains race-based, harmful, or inaccurate content. This benchmark supports research into bias detection and fairness in clinical AI systems.
1008
+ metric_groups:
1009
+ - accuracy
1010
+ - efficiency
1011
+ - general_information
1012
+ environment:
1013
+ main_name: exact_match
1014
+ main_split: test
1015
+ taxonomy:
1016
+ task: Classification
1017
+ what: Identify race-based bias in LLM-generated medical responses
1018
+ who: Researcher
1019
+ when: Any
1020
+ language: English
1021
+
1022
+ - name: n2c2_ct_matching
1023
+ display_name: N2C2-CT Matching
1024
+ short_display_name: N2C2-CT
1025
+ description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
1026
+ metric_groups:
1027
+ - accuracy
1028
+ - efficiency
1029
+ - general_information
1030
+ environment:
1031
+ main_name: exact_match
1032
+ main_split: test
1033
+ taxonomy:
1034
+ task: Classification
1035
+ what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
1036
+ who: Researcher
1037
+ when: Pre-Trial
1038
+ language: English
1039
+
1040
+ - name: medhallu
1041
+ display_name: MedHallu
1042
+ description: MedHallu is a benchmark focused on evaluating factual correctness in biomedical question answering. Each instance contains a PubMed-derived knowledge snippet, a biomedical question, and a model-generated answer. The task is to classify whether the answer is factually correct or contains hallucinated (non-grounded) information. This benchmark is designed to assess the factual reliability of medical language models.
1043
+ metric_groups:
1044
+ - accuracy
1045
+ - efficiency
1046
+ - general_information
1047
+ environment:
1048
+ main_name: exact_match
1049
+ main_split: test
1050
+ taxonomy:
1051
+ task: Classification
1052
+ what: Verify whether answers to questions from PubMed articles are factual or hallucinated
1053
+ who: Researcher
1054
+ when: Any
1055
+ language: English
1056
+
1057
+ - name: shc_gip_med
1058
+ display_name: HospiceReferral
1059
+ description: HospiceReferral is a benchmark that evaluates model performance in identifying whether patients are eligible for hospice care based on palliative care clinical notes. The benchmark focuses on end-of-life care referral decisions.
1060
+ metric_groups:
1061
+ - accuracy
1062
+ - efficiency
1063
+ - general_information
1064
+ environment:
1065
+ main_name: exact_match
1066
+ main_split: test
1067
+ taxonomy:
1068
+ task: Classification
1069
+ what: Assess hospice referral appropriateness
1070
+ who: Hospital Administrator
1071
+ when: End-of-care
1072
+ language: English
1073
+
1074
+ - name: mimiciv_billing_code
1075
+ display_name: MIMIC-IV Billing Code
1076
+ description: MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task requires models to extract structured billing codes based on free-text clinical notes, reflecting real-world hospital coding tasks for financial reimbursement.
1077
+ metric_groups:
1078
+ - accuracy
1079
+ - efficiency
1080
+ - general_information
1081
+ environment:
1082
+ main_name: mimiciv_billing_code_f1
1083
+ main_split: test
1084
+ taxonomy:
1085
+ task: Classification
1086
+ what: Predict ICD-10 billing codes from clinical discharge notes
1087
+ who: Hospital Administrator
1088
+ when: During or after patient discharge
1089
+ language: English
1090
+
1091
+ - name: shc_sequoia_med
1092
+ display_name: ClinicReferral
1093
+ description: ClinicReferral is a benchmark that determines patient eligibility for referral to the Sequoia Clinic based on information from palliative care notes. The dataset provides curated decisions on referral appropriateness to assist in automating clinic workflows.
1094
+ metric_groups:
1095
+ - accuracy
1096
+ - efficiency
1097
+ - general_information
1098
+ environment:
1099
+ main_name: exact_match
1100
+ main_split: test
1101
+ taxonomy:
1102
+ task: Classification
1103
+ what: Provide answers on clinic referrals
1104
+ who: Hospital Administrator
1105
+ when: Pre-referral
1106
+ language: English
1107
+
1108
+ - name: shc_cdi_med
1109
+ display_name: CDI-QA
1110
+ description: CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) notes. It is used to evaluate a model's ability to verify clinical conditions based on documented evidence in patient records.
1111
+ metric_groups:
1112
+ - accuracy
1113
+ - efficiency
1114
+ - general_information
1115
+ environment:
1116
+ main_name: exact_match
1117
+ main_split: test
1118
+ taxonomy:
1119
+ task: Classification
1120
+ what: Answer verification questions from CDI notes
1121
+ who: Hospital Administrator
1122
+ when: Any
1123
+ language: English
1124
+
1125
+ - name: shc_ent_med
1126
+ display_name: ENT-Referral
1127
+ description: ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess models' abilities to make referral decisions based on unstructured clinical text.
1128
+ metric_groups:
1129
+ - accuracy
1130
+ - efficiency
1131
+ - general_information
1132
+ environment:
1133
+ main_name: exact_match
1134
+ main_split: test
1135
+ taxonomy:
1136
+ task: Classification
1137
+ what: Identify referrals for ENT specialists
1138
+ who: Hospital Administrator
1139
+ when: Any
1140
+ language: English