crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
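For context, this kind of wheel-to-wheel comparison can be reproduced locally. Below is a minimal sketch using only the Python standard library; the local wheel filenames and the `read_text` helper are illustrative assumptions, not part of the package (the wheels can be fetched beforehand, e.g. with `pip download crfm-helm==0.5.5 --no-deps`).

```python
# Minimal sketch: diff one file between two locally downloaded wheels (zip archives).
import difflib
import zipfile

OLD_WHEEL = "crfm_helm-0.5.5-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "crfm_helm-0.5.7-py3-none-any.whl"  # assumed local filename

def read_text(wheel_path: str, member: str) -> list:
    """Return the decoded lines of one member file inside a wheel."""
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

member = "helm/benchmark/static/schema_medhelm.yaml"
diff = difflib.unified_diff(
    read_text(OLD_WHEEL, member),
    read_text(NEW_WHEEL, member),
    fromfile=f"0.5.5/{member}",
    tofile=f"0.5.7/{member}",
)
print("".join(diff))
```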

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -204,63 +204,68 @@ metrics:
  description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
  lower_is_better: false
  - name: aci_bench_accuracy
- display_name: ACI-Bench Accuracy
- short_display_name: Accuracy
+ display_name: ACI-Bench Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mtsamples_replicate_accuracy
- display_name: MTSamples Replicate Accuracy
- short_display_name: Accuracy
+ display_name: MTSamples Replicate Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medalign_accuracy
- display_name: Medalign Accuracy
- short_display_name: Accuracy
+ display_name: Medalign Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: dischargeme_accuracy
- display_name: DischargeMe Accuracy
- short_display_name: Accuracy
+ display_name: DischargeMe Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mtsamples_procedures_accuracy
- display_name: MTSamples Procedures Accuracy
- short_display_name: Accuracy
+ display_name: MTSamples Procedures Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mimic_rrs_accuracy
- display_name: MIMIC-RRS Accuracy
- short_display_name: Accuracy
+ display_name: MIMIC-RRS Jury Score
+ short_display_name: Jury Score
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
+ lower_is_better: false
+ - name: mimic_bhc_accuracy
+ display_name: MIMIC-BHC Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: chw_care_plan_accuracy
- display_name: NoteExtract Accuracy
- short_display_name: Accuracy
+ display_name: NoteExtract Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medication_qa_accuracy
- display_name: MedicationQA Accuracy
- short_display_name: Accuracy
+ display_name: MedicationQA Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: starr_patient_instructions_accuracy
- display_name: PatientInstruct Accuracy
- short_display_name: Accuracy
+ display_name: PatientInstruct Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: med_dialog_accuracy
- display_name: MedDialog Accuracy
- short_display_name: Accuracy
+ display_name: MedDialog Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medi_qa_accuracy
- display_name: MediQA Accuracy
- short_display_name: Accuracy
+ display_name: MediQA Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mental_health_accuracy
- display_name: MentalHealth Accuracy
- short_display_name: Accuracy
+ display_name: MentalHealth Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
 
@@ -506,6 +511,8 @@ run_groups:
  - shc_conf_med
  - medi_qa
  - mental_health
+ - shc_proxy_med
+ - shc_privacy_med
 
  - name: medical_research
  display_name: Medical Research Assistance
@@ -517,6 +524,7 @@ run_groups:
  - shc_bmt_med
  - race_based_med
  - n2c2_ct_matching
+ - medhallu
 
  - name: administration_and_workflow
  display_name: Administration and Workflow
@@ -531,7 +539,7 @@ run_groups:
 
  - name: medcalc_bench
  display_name: MedCalc-Bench
- description: A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
+ description: MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute clinically relevant values from patient notes. Each instance consists of a clinical note describing the patient's condition, a diagnostic question targeting a specific medical value, and a ground truth response. [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
  metric_groups:
  - accuracy
  - efficiency
@@ -546,28 +554,27 @@ run_groups:
  when: "Any"
  language: English
 
- - name: medalign
- display_name: MedAlign
- short_display_name: MedAlign
- description: A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+ - name: clear
+ display_name: CLEAR
+ description: CLEAR is a benchmark designed to evaluate models on their ability to detect medical conditions from patient notes using categorical responses. Each instance consists of a clinical note and a target condition, requiring the model to classify the patient's history as either affirmative, negative, or uncertain [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medalign_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Text generation
- what: "Answer questions and follow instructions over longitudinal EHR"
- who: "Clinician, Researcher"
- when: "Any"
+ task: Classification
+ what: Classify medical condition presence from patient notes
+ who: Clinician
+ when: Any
  language: English
 
  - name: mtsamples_replicate
  display_name: MTSamples
  short_display_name: MTSamples
- description: A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).
+ description: MTSamples Replicate is a benchmark that provides transcribed medical reports from various specialties. It is used to evaluate a model's ability to generate clinically appropriate treatment plans based on unstructured patient documentation [(MTSamples, 2025)](https://mtsamples.com).
  metric_groups:
  - accuracy
  - efficiency
@@ -582,9 +589,26 @@ run_groups:
  when: "Post-diagnosis"
  language: English
 
+ - name: medec
+ display_name: Medec
+ description: Medec is a benchmark composed of clinical narratives that include either correct documentation or medical errors. Each entry includes sentence-level identifiers and an associated correction task. The model must review the narrative and either identify the erroneous sentence and correct it, or confirm that the text is entirely accurate [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medec_error_flag_accuracy
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Detect and correct errors in medical narratives
+ who: Researcher, Clinician
+ when: Any
+ language: English
+
  - name: ehrshot
  display_name: EHRSHOT
- description: A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
+ description: EHRSHOT is a benchmark designed to evaluate a model's ability to predict future clinical events using structured EHR code sequences. Each instance contains a patient's historical EHR data and a forward-looking clinical question about whether a particular diagnosis, lab result, or hospital event will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
  metric_groups:
  - accuracy
  - efficiency
@@ -598,27 +622,27 @@ run_groups:
  who: "Clinician, Insurer"
  when: "Future prediction"
  language: English
-
- - name: starr_patient_instructions
- display_name: PatientInstruct
- description: A dataset containing case details used to generate customized post-procedure patient instructions.
+
+ - name: head_qa
+ display_name: HeadQA
+ description: HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to evaluate a model's medical knowledge and reasoning. Each instance presents a clinical or scientific question with four answer options, requiring the model to select the most appropriate answer [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: starr_patient_instructions_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Text generation
- what: Generate customized post-procedure patient instructions
- who: Clinician
- when: Post-procedure
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, Researcher
+ when: Any
  language: English
-
- - name: clear
- display_name: CLEAR
- description: "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1)."
+
+ - name: medbullets
+ display_name: Medbullets
+ description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
  metric_groups:
  - accuracy
  - efficiency
@@ -627,15 +651,33 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: Classification
- what: Classify medical condition presence from patient notes
- who: Clinician
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, . Researcher
  when: Any
  language: English
 
- - name: race_based_med
- display_name: RaceBias
- description: A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.
+ - name: medalign
+ display_name: MedAlign
+ short_display_name: MedAlign
+ description: MedAlign is a benchmark that evaluates a model's ability to interpret and follow instructions grounded in longitudinal electronic health records (EHR). Each instance includes an event-stream style patient record and a natural language question or task, requiring clinically informed reading comprehension and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medalign_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: "Answer questions and follow instructions over longitudinal EHR"
+ who: "Clinician, Researcher"
+ when: "Any"
+ language: English
+
+ - name: shc_ptbm_med
+ display_name: ADHD-Behavior
+ description: ADHD-Behavior is a benchmark that evaluates a model's ability to detect whether a clinician recommends parent training in behavior management, an evidence-based first-line treatment for young children diagnosed with ADHD. Each instance includes a clinical note from a pediatric visit and a binary classification task [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
  metric_groups:
  - accuracy
  - efficiency
@@ -645,15 +687,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify race-based bias in LLM-generated medical responses
- who: Researcher
- when: Any
+ what: Detect ADHD medication side effect monitoring
+ who: Clinician, Researcher
+ when: During Treatment
  language: English
 
- - name: n2c2_ct_matching
- display_name: N2C2-CT Matching
- short_display_name: N2C2-CT
- description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
+ - name: shc_sei_med
+ display_name: ADHD-MedEffects
+ description: ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for pediatric ADHD visits document medication side effect monitoring, which is a key recommendation in clinical practice guidelines. The dataset supports binary classification to detect presence or absence of side effect inquiries (SEI) within notes [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
  metric_groups:
  - accuracy
  - efficiency
@@ -663,67 +704,67 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
- who: Researcher
- when: Pre-Trial
+ what: Classify clinician recommendations for ADHD behavior management
+ who: Clinician, Caregiver
+ when: Early Intervention
  language: English
 
- - name: med_dialog
- display_name: MedDialog
- short_display_name: MedDialog
- description: A collection of doctor-patient conversations with corresponding summaries.
+ - name: dischargeme
+ display_name: DischargeMe
+ short_display_name: DischargeMe
+ description: DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs discharge summaries and radiology reports from MIMIC-IV with generation tasks such as writing discharge instructions or summarizing the brief hospital course. The benchmark assesses a model's ability to generate patient-facing documentation that is complete, empathetic, and clinically accurate [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: med_dialog_accuracy
+ main_name: dischargeme_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate summaries of doctor-patient conversations
+ what: Generate discharge instructions from hospital notes
  who: Clinician
- when: Any
+ when: Upon hospital discharge
  language: English
-
- - name: medi_qa
- display_name: MEDIQA
- description: A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.
+
+ - name: aci_bench
+ display_name: ACI-Bench
+ description: ACI-Bench is a benchmark of real-world patient-doctor conversations paired with structured clinical notes. The benchmark evaluates a model's ability to understand spoken medical dialogue and convert it into formal clinical documentation, covering sections such as history of present illness, physical exam findings, results, and assessment and plan [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medi_qa_accuracy
+ main_name: aci_bench_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Retrieve and rank answers based on medical question understanding
- who: Clinician, Medical Student
+ what: Extract and structure information from patient-doctor conversations
+ who: Clinician
  when: Any
  language: English
 
- - name: mental_health
- display_name: MentalHealth
- description: A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.
+ - name: mtsamples_procedures
+ display_name: MTSamples Procedures
+ description: MTSamples Procedures is a benchmark composed of transcribed operative notes, focused on documenting surgical procedures. Each example presents a brief patient case involving a surgical intervention, and the model is tasked with generating a coherent and clinically accurate procedural summary or treatment plan.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mental_health_accuracy
+ main_name: mtsamples_procedures_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate empathetic counseling responses in mental health conversations
- who: Counselors, Patients
- when: Any
+ what: Document and extract information about medical procedures
+ who: Clinician, Researcher
+ when: Post-procedure
  language: English
 
  - name: mimic_rrs
  display_name: MIMIC-RRS
  short_display_name: MIMIC-RRS
- description: A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
+ description: MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling evaluation of a model's ability to summarize diagnostic imaging observations into concise, clinically relevant conclusions [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
  metric_groups:
  - accuracy
  - efficiency
@@ -741,13 +782,13 @@ run_groups:
  - name: mimic_bhc
  display_name: MIMIC-IV-BHC
  short_display_name: MIMIC-BHC
- description: A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
+ description: MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark evaluates a model's ability to condense detailed clinical information into accurate, concise summaries that reflect the patient's hospital stay [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: BERTScore-F
+ main_name: mimic_bhc_accuracy
  main_split: test
  taxonomy:
  task: Text generation
@@ -755,130 +796,147 @@ run_groups:
  who: Clinician
  when: Upon hospital discharge
  language: English
-
- - name: mimiciv_billing_code
- display_name: MIMIC-IV Billing Code
- description: A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.
+
+ - name: chw_care_plan
+ display_name: NoteExtract
+ description: NoteExtract is a benchmark that focuses on the structured extraction of information from free-form clinical text. It provides care plan notes authored by health workers and evaluates a model's ability to convert them into a predefined structured format, such as fields for Chief Complaint and History of Present Illness. The benchmark emphasizes faithful extraction without hallucination or inference.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mimiciv_billing_code_f1
+ main_name: chw_care_plan_accuracy
  main_split: test
  taxonomy:
- task: Classification
- what: Predict ICD-10 billing codes from clinical discharge notes
- who: Hospital Admistrator
- when: During or after patient discharge
+ task: Text generation
+ what: Convert general text care plans into structured formats
+ who: Clinician, Researcher
+ when: Any
  language: English
-
- - name: dischargeme
- display_name: DischargeMe
- short_display_name: DischargeMe
- description: DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
+
+ - name: medication_qa
+ display_name: MedicationQA
+ description: MedicationQA is a benchmark composed of open-ended consumer health questions specifically focused on medications. Each example consists of a free-form question and a corresponding medically grounded answer. The benchmark evaluates a model's ability to provide accurate, accessible, and informative medication-related responses for a lay audience.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: dischargeme_accuracy
+ main_name: medication_qa_accuracy
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Answer consumer medication-related questions
+ who: Patient, Pharmacist
+ when: Any
+ language: English
+
+ - name: starr_patient_instructions
+ display_name: PatientInstruct
+ description: PatientInstruct is a benchmark designed to evaluate models on generating personalized post-procedure instructions for patients. It includes real-world clinical case details, such as diagnosis, planned procedures, and history and physical notes, from which models must produce clear, actionable instructions appropriate for patients recovering from medical interventions.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: starr_patient_instructions_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate discharge instructions from hospital notes
+ what: Generate customized post-procedure patient instructions
  who: Clinician
- when: Upon hospital discharge
+ when: Post-procedure
  language: English
 
- - name: pubmed_qa
- display_name: PubMedQA
- description: A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.
+ - name: med_dialog
+ display_name: MedDialog
+ short_display_name: MedDialog
+ description: MedDialog is a benchmark of real-world doctor-patient conversations focused on health-related concerns and advice. Each dialogue is paired with a one-sentence summary that reflects the core patient question or exchange. The benchmark evaluates a model's ability to condense medical dialogue into concise, informative summaries.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: med_dialog_accuracy
  main_split: test
  taxonomy:
- task: Question answering
- what: Answer questions based on PubMed abstracts
- who: Researcher
+ task: Text generation
+ what: Generate summaries of doctor-patient conversations
+ who: Clinician
  when: Any
  language: English
 
- - name: medec
- display_name: Medec
- description: A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+ - name: shc_conf_med
+ display_name: MedConfInfo
+ description: MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is used to evaluate whether the content contains sensitive protected health information (PHI) that should be restricted from parental access, in accordance with adolescent confidentiality policies in clinical care. [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medec_error_flag_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
  task: Classification
- what: Detect and correct errors in medical narratives
- who: Researcher, Clinician
+ what: Identify sensitive health info in adolescent notes
+ who: Clinician
  when: Any
  language: English
 
- - name: aci_bench
- display_name: ACI-Bench
- description: A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
+ - name: medi_qa
+ display_name: MEDIQA
+ description: MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate medically accurate answers to patient-generated questions. Each instance includes a consumer health question, a set of candidate answers (used in ranking tasks), relevance annotations, and optionally, additional context. The benchmark focuses on supporting patient understanding and accessibility in health communication.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: aci_bench_accuracy
+ main_name: medi_qa_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Extract and structure information from patient-doctor conversations
- who: Clinician
+ what: Retrieve and rank answers based on medical question understanding
+ who: Clinician, Medical Student
  when: Any
  language: English
 
- - name: chw_care_plan
- display_name: NoteExtract
- description: A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.
+ - name: mental_health
+ display_name: MentalHealth
+ description: MentalHealth is a benchmark focused on evaluating empathetic communication in mental health counseling. It includes real or simulated conversations between patients and counselors, where the task is to generate compassionate and appropriate counselor responses. The benchmark assesses a model's ability to support patients emotionally and meaningfully engage in therapeutic conversations.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: chw_care_plan_accuracy
+ main_name: mental_health_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Convert general text care plans into structured formats
- who: Clinician, Researcher
+ what: Generate empathetic counseling responses in mental health conversations
+ who: Counselors, Patients
  when: Any
  language: English
 
- - name: ehr_sql
- display_name: EHRSQL
- description: Given a natural language instruction, generate an SQL query that would be used in clinical research.
+ - name: shc_proxy_med
+ display_name: ProxySender
+ description: ProxySender is a benchmark composed of patient portal messages received by clinicians. It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent, spouse), which is critical for understanding who is communicating with healthcare providers. [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: ehr_sql_execution_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Code generation
- what: Generate SQL queries from natural language for clinical research
- who: Researcher
+ task: Classification
+ what: Classify if a document was sent by a proxy user
+ who: Clinician, Caregiver
  when: Any
  language: English
 
- - name: head_qa
- display_name: HeadQA
- description: A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
+ - name: shc_privacy_med
+ display_name: PrivacyDetection
+ description: PrivacyDetection is a benchmark composed of patient portal messages submitted by patients or caregivers. The task is to determine whether the message contains any confidential or privacy-leaking information that should be protected [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
  metric_groups:
  - accuracy
  - efficiency
@@ -887,15 +945,15 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: Question answering
- what: Medical knowledge testing
- who: Medical student, Researcher
+ task: Classification
+ what: Classify if a document leaks private information
+ who: Clinician, Caregiver
  when: Any
  language: English
 
- - name: medbullets
- display_name: Medbullets
- description: A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).
+ - name: pubmed_qa
+ display_name: PubMedQA
+ description: PubMedQA is a biomedical question-answering dataset that evaluates a model's ability to interpret scientific literature. It consists of PubMed abstracts paired with yes/no/maybe questions derived from the content. The benchmark assesses a model's capability to reason over biomedical texts and provide factually grounded answers.
  metric_groups:
  - accuracy
  - efficiency
@@ -905,48 +963,48 @@ run_groups:
  main_split: test
  taxonomy:
  task: Question answering
- what: Medical knowledge testing
- who: Medical student, . Researcher
+ what: Answer questions based on PubMed abstracts
+ who: Researcher
  when: Any
  language: English
 
- - name: mtsamples_procedures
- display_name: MTSamples Procedures
- description: A dataset that provides a patient note regarding an operation, with the objective to document the procedure.
+ - name: ehr_sql
+ display_name: EHRSQL
+ description: EHRSQL is a benchmark designed to evaluate models on generating structured queries for clinical research. Each example includes a natural language question and a database schema, and the task is to produce an SQL query that would return the correct result for a biomedical research objective. This benchmark assesses a model's understanding of medical terminology, data structures, and query construction.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mtsamples_procedures_accuracy
+ main_name: ehr_sql_execution_accuracy
  main_split: test
  taxonomy:
- task: Text generation
- what: Document and extract information about medical procedures
- who: Clinician, Researcher
- when: Post-procedure
+ task: Code generation
+ what: Generate SQL queries from natural language for clinical research
+ who: Researcher
+ when: Any
  language: English
 
- - name: medication_qa
- display_name: MedicationQA
- description: Consumer medication questions with reference answers.
+ - name: shc_bmt_med
+ display_name: BMT-Status
+ description: BMT-Status is a benchmark composed of clinical notes and associated binary questions related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is to determine whether the patient received a subsequent transplant based on the provided clinical documentation.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medication_qa_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Question answering
- what: Answer consumer medication-related questions
- who: Patient, Pharmacist
+ task: question answering
+ what: Answer bone marrow transplant questions
+ who: Researcher
  when: Any
  language: English
 
- - name: shc_bmt_med
- display_name: BMT-Status
- description: A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.
+ - name: race_based_med
+ display_name: RaceBias
+ description: RaceBias is a benchmark used to evaluate language models for racially biased or inappropriate content in medical question-answering scenarios. Each instance consists of a medical question and a model-generated response. The task is to classify whether the response contains race-based, harmful, or inaccurate content. This benchmark supports research into bias detection and fairness in clinical AI systems.
  metric_groups:
  - accuracy
  - efficiency
@@ -955,15 +1013,16 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: question answering
- what: Answer bone marrow transplant questions
+ task: Classification
+ what: Identify race-based bias in LLM-generated medical responses
  who: Researcher
  when: Any
  language: English
-
- - name: shc_gip_med
- display_name: HospiceReferral
- description: A dataset evaluating performance in identifying appropriate patient referrals to hospice care.
+
+ - name: n2c2_ct_matching
+ display_name: N2C2-CT Matching
+ short_display_name: N2C2-CT
+ description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
  metric_groups:
  - accuracy
  - efficiency
@@ -973,14 +1032,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Assess hospice referral appropriateness
- who: Hospital Admistrator
- when: End-of-care
+ what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
+ who: Researcher
+ when: Pre-Trial
  language: English
 
- - name: shc_cdi_med
- display_name: CDI-QA
- description: A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.
+ - name: medhallu
+ display_name: MedHallu
+ description: MedHallu is a benchmark focused on evaluating factual correctness in biomedical question answering. Each instance contains a PubMed-derived knowledge snippet, a biomedical question, and a model-generated answer. The task is to classify whether the answer is factually correct or contains hallucinated (non-grounded) information. This benchmark is designed to assess the factual reliability of medical language models.
  metric_groups:
  - accuracy
  - efficiency
@@ -990,14 +1049,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Answer verification questions from CDI notes
- who: Hospital Admistrator
+ what: Verify whether answers to questions from PubMed articles are factual or hallucinated
+ who: Researcher
  when: Any
  language: English
-
- - name: shc_ent_med
- display_name: ENT-Referral
- description: A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.
+
+ - name: shc_gip_med
+ display_name: HospiceReferral
+ description: HospiceReferral is a benchmark that evaluates model performance in identifying whether patients are eligible for hospice care based on palliative care clinical notes. The benchmark focuses on end-of-life care referral decisions.
  metric_groups:
  - accuracy
  - efficiency
@@ -1007,31 +1066,31 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify referrals for ENT specialists
+ what: Assess hospice referral appropriateness
  who: Hospital Admistrator
- when: Any
+ when: End-of-care
  language: English
 
- - name: shc_sequoia_med
- display_name: ClinicReferral
- description: A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.
+ - name: mimiciv_billing_code
+ display_name: MIMIC-IV Billing Code
+ description: MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task requires models to extract structured billing codes based on free-text clinical notes, reflecting real-world hospital coding tasks for financial reimbursement.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: mimiciv_billing_code_f1
  main_split: test
  taxonomy:
  task: Classification
- what: Provide answers on clinic referrals
+ what: Predict ICD-10 billing codes from clinical discharge notes
  who: Hospital Admistrator
- when: Pre-referral
+ when: During or after patient discharge
  language: English
-
- - name: shc_conf_med
- display_name: MedConfInfo
- description: A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
+
+ - name: shc_sequoia_med
+ display_name: ClinicReferral
+ description: ClinicReferral is a benchmark that determines patient eligibility for referral to the Sequoia Clinic based on information from palliative care notes. The dataset provides curated decisions on referral appropriateness to assist in automating clinic workflows.
  metric_groups:
  - accuracy
  - efficiency
@@ -1041,14 +1100,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify sensitive health info in adolescent notes
- who: Clinician
- when: Any
+ what: Provide answers on clinic referrals
+ who: Hospital Admistrator
+ when: Pre-referral
  language: English
 
- - name: shc_ptbm_med
- display_name: ADHD-Behavior
- description: A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
+ - name: shc_cdi_med
+ display_name: CDI-QA
+ description: CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) notes. It is used to evaluate a model's ability to verify clinical conditions based on documented evidence in patient records.
  metric_groups:
  - accuracy
  - efficiency
@@ -1058,14 +1117,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Detect ADHD medication side effect monitoring
- who: Clinician, Researcher
- when: During Treatment
+ what: Answer verification questions from CDI notes
+ who: Hospital Admistrator
+ when: Any
  language: English
-
- - name: shc_sei_med
- display_name: ADHD-MedEffects
- description: A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
+
+ - name: shc_ent_med
+ display_name: ENT-Referral
+ description: ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess models' abilities to make referral decisions based on unstructured clinical text
  metric_groups:
  - accuracy
  - efficiency
@@ -1075,7 +1134,7 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Classify clinician recommendations for ADHD behavior management
- who: Clinician, Caregiver
- when: Early Intervention
+ what: Identify referrals for ENT specialists
+ who: Hospital Admistrator
+ when: Any
  language: English