crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -204,63 +204,68 @@ metrics:
     description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
     lower_is_better: false
   - name: aci_bench_accuracy
-    display_name: ACI-Bench
-    short_display_name:
+    display_name: ACI-Bench Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: mtsamples_replicate_accuracy
-    display_name: MTSamples Replicate
-    short_display_name:
+    display_name: MTSamples Replicate Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: medalign_accuracy
-    display_name: Medalign
-    short_display_name:
+    display_name: Medalign Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: dischargeme_accuracy
-    display_name: DischargeMe
-    short_display_name:
+    display_name: DischargeMe Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: mtsamples_procedures_accuracy
-    display_name: MTSamples Procedures
-    short_display_name:
+    display_name: MTSamples Procedures Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: mimic_rrs_accuracy
-    display_name: MIMIC-RRS
-    short_display_name:
+    display_name: MIMIC-RRS Jury Score
+    short_display_name: Jury Score
+    description: Measures the average score assigned by an LLM-based jury evaluating task performance.
+    lower_is_better: false
+  - name: mimic_bhc_accuracy
+    display_name: MIMIC-BHC Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: chw_care_plan_accuracy
-    display_name: NoteExtract
-    short_display_name:
+    display_name: NoteExtract Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: medication_qa_accuracy
-    display_name: MedicationQA
-    short_display_name:
+    display_name: MedicationQA Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: starr_patient_instructions_accuracy
-    display_name: PatientInstruct
-    short_display_name:
+    display_name: PatientInstruct Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: med_dialog_accuracy
-    display_name: MedDialog
-    short_display_name:
+    display_name: MedDialog Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: medi_qa_accuracy
-    display_name: MediQA
-    short_display_name:
+    display_name: MediQA Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false
   - name: mental_health_accuracy
-    display_name: MentalHealth
-    short_display_name:
+    display_name: MentalHealth Jury Score
+    short_display_name: Jury Score
     description: Measures the average score assigned by an LLM-based jury evaluating task performance.
     lower_is_better: false

@@ -506,6 +511,8 @@ run_groups:
       - shc_conf_med
       - medi_qa
      - mental_health
+      - shc_proxy_med
+      - shc_privacy_med

   - name: medical_research
     display_name: Medical Research Assistance

@@ -517,6 +524,7 @@ run_groups:
       - shc_bmt_med
       - race_based_med
       - n2c2_ct_matching
+      - medhallu

   - name: administration_and_workflow
     display_name: Administration and Workflow

@@ -531,7 +539,7 @@ run_groups:

   - name: medcalc_bench
     display_name: MedCalc-Bench
-    description:
+    description: MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute clinically relevant values from patient notes. Each instance consists of a clinical note describing the patient's condition, a diagnostic question targeting a specific medical value, and a ground truth response. [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
     metric_groups:
       - accuracy
       - efficiency

@@ -546,28 +554,27 @@ run_groups:
       when: "Any"
       language: English

-  - name:
-    display_name:
-
-    description: A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+  - name: clear
+    display_name: CLEAR
+    description: CLEAR is a benchmark designed to evaluate models on their ability to detect medical conditions from patient notes using categorical responses. Each instance consists of a clinical note and a target condition, requiring the model to classify the patient's history as either affirmative, negative, or uncertain [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
-      when:
+      task: Classification
+      what: Classify medical condition presence from patient notes
+      who: Clinician
+      when: Any
       language: English

   - name: mtsamples_replicate
     display_name: MTSamples
     short_display_name: MTSamples
-    description:
+    description: MTSamples Replicate is a benchmark that provides transcribed medical reports from various specialties. It is used to evaluate a model's ability to generate clinically appropriate treatment plans based on unstructured patient documentation [(MTSamples, 2025)](https://mtsamples.com).
     metric_groups:
       - accuracy
       - efficiency

@@ -582,9 +589,26 @@ run_groups:
       when: "Post-diagnosis"
       language: English

+  - name: medec
+    display_name: Medec
+    description: Medec is a benchmark composed of clinical narratives that include either correct documentation or medical errors. Each entry includes sentence-level identifiers and an associated correction task. The model must review the narrative and either identify the erroneous sentence and correct it, or confirm that the text is entirely accurate [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: medec_error_flag_accuracy
+      main_split: test
+    taxonomy:
+      task: Classification
+      what: Detect and correct errors in medical narratives
+      who: Researcher, Clinician
+      when: Any
+      language: English
+
   - name: ehrshot
     display_name: EHRSHOT
-    description:
+    description: EHRSHOT is a benchmark designed to evaluate a model's ability to predict future clinical events using structured EHR code sequences. Each instance contains a patient's historical EHR data and a forward-looking clinical question about whether a particular diagnosis, lab result, or hospital event will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
     metric_groups:
       - accuracy
       - efficiency

@@ -598,27 +622,27 @@ run_groups:
       who: "Clinician, Insurer"
       when: "Future prediction"
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: head_qa
+    display_name: HeadQA
+    description: HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to evaluate a model's medical knowledge and reasoning. Each instance presents a clinical or scientific question with four answer options, requiring the model to select the most appropriate answer [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
-      when:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, Researcher
+      when: Any
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: medbullets
+    display_name: Medbullets
+    description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
     metric_groups:
       - accuracy
       - efficiency

@@ -627,15 +651,33 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, . Researcher
       when: Any
       language: English

-  - name:
-    display_name:
-
+  - name: medalign
+    display_name: MedAlign
+    short_display_name: MedAlign
+    description: MedAlign is a benchmark that evaluates a model's ability to interpret and follow instructions grounded in longitudinal electronic health records (EHR). Each instance includes an event-stream style patient record and a natural language question or task, requiring clinically informed reading comprehension and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: medalign_accuracy
+      main_split: test
+    taxonomy:
+      task: Text generation
+      what: "Answer questions and follow instructions over longitudinal EHR"
+      who: "Clinician, Researcher"
+      when: "Any"
+      language: English
+
+  - name: shc_ptbm_med
+    display_name: ADHD-Behavior
+    description: ADHD-Behavior is a benchmark that evaluates a model's ability to detect whether a clinician recommends parent training in behavior management, an evidence-based first-line treatment for young children diagnosed with ADHD. Each instance includes a clinical note from a pediatric visit and a binary classification task [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
     metric_groups:
       - accuracy
       - efficiency

@@ -645,15 +687,14 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who: Researcher
-      when:
+      what: Detect ADHD medication side effect monitoring
+      who: Clinician, Researcher
+      when: During Treatment
       language: English

-  - name:
-    display_name:
-
-    description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
+  - name: shc_sei_med
+    display_name: ADHD-MedEffects
+    description: ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for pediatric ADHD visits document medication side effect monitoring, which is a key recommendation in clinical practice guidelines. The dataset supports binary classification to detect presence or absence of side effect inquiries (SEI) within notes [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
     metric_groups:
       - accuracy
       - efficiency

@@ -663,67 +704,67 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what: Classify
-      who:
-      when:
+      what: Classify clinician recommendations for ADHD behavior management
+      who: Clinician, Caregiver
+      when: Early Intervention
       language: English

-  - name:
-    display_name:
-    short_display_name:
-    description:
+  - name: dischargeme
+    display_name: DischargeMe
+    short_display_name: DischargeMe
+    description: DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs discharge summaries and radiology reports from MIMIC-IV with generation tasks such as writing discharge instructions or summarizing the brief hospital course. The benchmark assesses a model's ability to generate patient-facing documentation that is complete, empathetic, and clinically accurate [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: dischargeme_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what: Generate
+      what: Generate discharge instructions from hospital notes
       who: Clinician
-      when:
+      when: Upon hospital discharge
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: aci_bench
+    display_name: ACI-Bench
+    description: ACI-Bench is a benchmark of real-world patient-doctor conversations paired with structured clinical notes. The benchmark evaluates a model's ability to understand spoken medical dialogue and convert it into formal clinical documentation, covering sections such as history of present illness, physical exam findings, results, and assessment and plan [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: aci_bench_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what:
-      who: Clinician
+      what: Extract and structure information from patient-doctor conversations
+      who: Clinician
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: mtsamples_procedures
+    display_name: MTSamples Procedures
+    description: MTSamples Procedures is a benchmark composed of transcribed operative notes, focused on documenting surgical procedures. Each example presents a brief patient case involving a surgical intervention, and the model is tasked with generating a coherent and clinically accurate procedural summary or treatment plan.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: mtsamples_procedures_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what:
-      who:
-      when:
+      what: Document and extract information about medical procedures
+      who: Clinician, Researcher
+      when: Post-procedure
       language: English

   - name: mimic_rrs
     display_name: MIMIC-RRS
     short_display_name: MIMIC-RRS
-    description:
+    description: MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling evaluation of a model's ability to summarize diagnostic imaging observations into concise, clinically relevant conclusions [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
     metric_groups:
       - accuracy
       - efficiency

@@ -741,13 +782,13 @@ run_groups:
   - name: mimic_bhc
     display_name: MIMIC-IV-BHC
     short_display_name: MIMIC-BHC
-    description:
+    description: MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark evaluates a model's ability to condense detailed clinical information into accurate, concise summaries that reflect the patient's hospital stay [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: mimic_bhc_accuracy
       main_split: test
     taxonomy:
       task: Text generation

@@ -755,130 +796,147 @@ run_groups:
       who: Clinician
       when: Upon hospital discharge
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: chw_care_plan
+    display_name: NoteExtract
+    description: NoteExtract is a benchmark that focuses on the structured extraction of information from free-form clinical text. It provides care plan notes authored by health workers and evaluates a model's ability to convert them into a predefined structured format, such as fields for Chief Complaint and History of Present Illness. The benchmark emphasizes faithful extraction without hallucination or inference.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: chw_care_plan_accuracy
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
-      when:
+      task: Text generation
+      what: Convert general text care plans into structured formats
+      who: Clinician, Researcher
+      when: Any
       language: English
-
-  - name:
-    display_name:
-
-    description: DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
+
+  - name: medication_qa
+    display_name: MedicationQA
+    description: MedicationQA is a benchmark composed of open-ended consumer health questions specifically focused on medications. Each example consists of a free-form question and a corresponding medically grounded answer. The benchmark evaluates a model's ability to provide accurate, accessible, and informative medication-related responses for a lay audience.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: medication_qa_accuracy
+      main_split: test
+    taxonomy:
+      task: Question answering
+      what: Answer consumer medication-related questions
+      who: Patient, Pharmacist
+      when: Any
+      language: English
+
+  - name: starr_patient_instructions
+    display_name: PatientInstruct
+    description: PatientInstruct is a benchmark designed to evaluate models on generating personalized post-procedure instructions for patients. It includes real-world clinical case details, such as diagnosis, planned procedures, and history and physical notes, from which models must produce clear, actionable instructions appropriate for patients recovering from medical interventions.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: starr_patient_instructions_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what: Generate
+      what: Generate customized post-procedure patient instructions
       who: Clinician
-      when:
+      when: Post-procedure
       language: English

-  - name:
-    display_name:
-
+  - name: med_dialog
+    display_name: MedDialog
+    short_display_name: MedDialog
+    description: MedDialog is a benchmark of real-world doctor-patient conversations focused on health-related concerns and advice. Each dialogue is paired with a one-sentence summary that reflects the core patient question or exchange. The benchmark evaluates a model's ability to condense medical dialogue into concise, informative summaries.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: med_dialog_accuracy
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: Text generation
+      what: Generate summaries of doctor-patient conversations
+      who: Clinician
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: shc_conf_med
+    display_name: MedConfInfo
+    description: MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is used to evaluate whether the content contains sensitive protected health information (PHI) that should be restricted from parental access, in accordance with adolescent confidentiality policies in clinical care. [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
+      what: Identify sensitive health info in adolescent notes
+      who: Clinician
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: medi_qa
+    display_name: MEDIQA
+    description: MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate medically accurate answers to patient-generated questions. Each instance includes a consumer health question, a set of candidate answers (used in ranking tasks), relevance annotations, and optionally, additional context. The benchmark focuses on supporting patient understanding and accessibility in health communication.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: medi_qa_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what:
-      who: Clinician
+      what: Retrieve and rank answers based on medical question understanding
+      who: Clinician, Medical Student
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: mental_health
+    display_name: MentalHealth
+    description: MentalHealth is a benchmark focused on evaluating empathetic communication in mental health counseling. It includes real or simulated conversations between patients and counselors, where the task is to generate compassionate and appropriate counselor responses. The benchmark assesses a model's ability to support patients emotionally and meaningfully engage in therapeutic conversations.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: mental_health_accuracy
       main_split: test
     taxonomy:
       task: Text generation
-      what:
-      who:
+      what: Generate empathetic counseling responses in mental health conversations
+      who: Counselors, Patients
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: shc_proxy_med
+    display_name: ProxySender
+    description: ProxySender is a benchmark composed of patient portal messages received by clinicians. It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent, spouse), which is critical for understanding who is communicating with healthcare providers. [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: Classification
+      what: Classify if a document was sent by a proxy user
+      who: Clinician, Caregiver
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: shc_privacy_med
+    display_name: PrivacyDetection
+    description: PrivacyDetection is a benchmark composed of patient portal messages submitted by patients or caregivers. The task is to determine whether the message contains any confidential or privacy-leaking information that should be protected [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
     metric_groups:
       - accuracy
       - efficiency

@@ -887,15 +945,15 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: Classification
+      what: Classify if a document leaks private information
+      who: Clinician, Caregiver
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: pubmed_qa
+    display_name: PubMedQA
+    description: PubMedQA is a biomedical question-answering dataset that evaluates a model's ability to interpret scientific literature. It consists of PubMed abstracts paired with yes/no/maybe questions derived from the content. The benchmark assesses a model's capability to reason over biomedical texts and provide factually grounded answers.
     metric_groups:
       - accuracy
       - efficiency

@@ -905,48 +963,48 @@ run_groups:
       main_split: test
     taxonomy:
       task: Question answering
-      what:
-      who:
+      what: Answer questions based on PubMed abstracts
+      who: Researcher
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: ehr_sql
+    display_name: EHRSQL
+    description: EHRSQL is a benchmark designed to evaluate models on generating structured queries for clinical research. Each example includes a natural language question and a database schema, and the task is to produce an SQL query that would return the correct result for a biomedical research objective. This benchmark assesses a model's understanding of medical terminology, data structures, and query construction.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: ehr_sql_execution_accuracy
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
-      when:
+      task: Code generation
+      what: Generate SQL queries from natural language for clinical research
+      who: Researcher
+      when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: shc_bmt_med
+    display_name: BMT-Status
+    description: BMT-Status is a benchmark composed of clinical notes and associated binary questions related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is to determine whether the patient received a subsequent transplant based on the provided clinical documentation.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what: Answer
-      who:
+      task: question answering
+      what: Answer bone marrow transplant questions
+      who: Researcher
       when: Any
       language: English

-  - name:
-    display_name:
-    description:
+  - name: race_based_med
+    display_name: RaceBias
+    description: RaceBias is a benchmark used to evaluate language models for racially biased or inappropriate content in medical question-answering scenarios. Each instance consists of a medical question and a model-generated response. The task is to classify whether the response contains race-based, harmful, or inaccurate content. This benchmark supports research into bias detection and fairness in clinical AI systems.
     metric_groups:
       - accuracy
       - efficiency

@@ -955,15 +1013,16 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
+      task: Classification
+      what: Identify race-based bias in LLM-generated medical responses
       who: Researcher
       when: Any
       language: English
-
-  - name:
-    display_name:
-
+
+  - name: n2c2_ct_matching
+    display_name: N2C2-CT Matching
+    short_display_name: N2C2-CT
+    description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
     metric_groups:
       - accuracy
       - efficiency

@@ -973,14 +1032,14 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
-      when:
+      what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
+      who: Researcher
+      when: Pre-Trial
       language: English

-  - name:
-    display_name:
-    description:
+  - name: medhallu
+    display_name: MedHallu
+    description: MedHallu is a benchmark focused on evaluating factual correctness in biomedical question answering. Each instance contains a PubMed-derived knowledge snippet, a biomedical question, and a model-generated answer. The task is to classify whether the answer is factually correct or contains hallucinated (non-grounded) information. This benchmark is designed to assess the factual reliability of medical language models.
     metric_groups:
       - accuracy
       - efficiency

@@ -990,14 +1049,14 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
+      what: Verify whether answers to questions from PubMed articles are factual or hallucinated
+      who: Researcher
       when: Any
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: shc_gip_med
+    display_name: HospiceReferral
+    description: HospiceReferral is a benchmark that evaluates model performance in identifying whether patients are eligible for hospice care based on palliative care clinical notes. The benchmark focuses on end-of-life care referral decisions.
     metric_groups:
       - accuracy
       - efficiency

@@ -1007,31 +1066,31 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
+      what: Assess hospice referral appropriateness
       who: Hospital Admistrator
-      when:
+      when: End-of-care
       language: English

-  - name:
-    display_name:
-    description:
+  - name: mimiciv_billing_code
+    display_name: MIMIC-IV Billing Code
+    description: MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task requires models to extract structured billing codes based on free-text clinical notes, reflecting real-world hospital coding tasks for financial reimbursement.
     metric_groups:
       - accuracy
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: mimiciv_billing_code_f1
       main_split: test
     taxonomy:
       task: Classification
-      what:
+      what: Predict ICD-10 billing codes from clinical discharge notes
       who: Hospital Admistrator
-      when:
+      when: During or after patient discharge
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: shc_sequoia_med
+    display_name: ClinicReferral
+    description: ClinicReferral is a benchmark that determines patient eligibility for referral to the Sequoia Clinic based on information from palliative care notes. The dataset provides curated decisions on referral appropriateness to assist in automating clinic workflows.
     metric_groups:
       - accuracy
       - efficiency

@@ -1041,14 +1100,14 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
-      when:
+      what: Provide answers on clinic referrals
+      who: Hospital Admistrator
+      when: Pre-referral
       language: English

-  - name:
-    display_name:
-    description:
+  - name: shc_cdi_med
+    display_name: CDI-QA
+    description: CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) notes. It is used to evaluate a model's ability to verify clinical conditions based on documented evidence in patient records.
     metric_groups:
       - accuracy
       - efficiency

@@ -1058,14 +1117,14 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
-      when:
+      what: Answer verification questions from CDI notes
+      who: Hospital Admistrator
+      when: Any
       language: English
-
-  - name:
-    display_name:
-    description:
+
+  - name: shc_ent_med
+    display_name: ENT-Referral
+    description: ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess models' abilities to make referral decisions based on unstructured clinical text
     metric_groups:
       - accuracy
       - efficiency

@@ -1075,7 +1134,7 @@ run_groups:
       main_split: test
     taxonomy:
       task: Classification
-      what:
-      who:
-      when:
+      what: Identify referrals for ENT specialists
+      who: Hospital Admistrator
+      when: Any
       language: English