crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
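For context, this kind of wheel-to-wheel comparison can be reproduced locally. Below is a minimal sketch using only the Python standard library; the local wheel filenames and the `read_text` helper are illustrative assumptions, not part of the package (the wheels can be fetched beforehand, e.g. with `pip download crfm-helm==0.5.5 --no-deps`).

```python
# Minimal sketch: diff one file between two locally downloaded wheels (zip archives).
import difflib
import zipfile

OLD_WHEEL = "crfm_helm-0.5.5-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "crfm_helm-0.5.7-py3-none-any.whl"  # assumed local filename

def read_text(wheel_path: str, member: str) -> list:
    """Return the decoded lines of one member file inside a wheel."""
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

member = "helm/benchmark/static/schema_medhelm.yaml"
diff = difflib.unified_diff(
    read_text(OLD_WHEEL, member),
    read_text(NEW_WHEEL, member),
    fromfile=f"0.5.5/{member}",
    tofile=f"0.5.7/{member}",
)
print("".join(diff))
```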

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -204,63 +204,68 @@ metrics:
  description: Measures the proportion of correctly predicted answerable questions among all answerable questions in the dataset.
  lower_is_better: false
  - name: aci_bench_accuracy
- display_name: ACI-Bench Accuracy
- short_display_name: Accuracy
+ display_name: ACI-Bench Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mtsamples_replicate_accuracy
- display_name: MTSamples Replicate Accuracy
- short_display_name: Accuracy
+ display_name: MTSamples Replicate Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medalign_accuracy
- display_name: Medalign Accuracy
- short_display_name: Accuracy
+ display_name: Medalign Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: dischargeme_accuracy
- display_name: DischargeMe Accuracy
- short_display_name: Accuracy
+ display_name: DischargeMe Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mtsamples_procedures_accuracy
- display_name: MTSamples Procedures Accuracy
- short_display_name: Accuracy
+ display_name: MTSamples Procedures Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mimic_rrs_accuracy
- display_name: MIMIC-RRS Accuracy
- short_display_name: Accuracy
+ display_name: MIMIC-RRS Jury Score
+ short_display_name: Jury Score
+ description: Measures the average score assigned by an LLM-based jury evaluating task performance.
+ lower_is_better: false
+ - name: mimic_bhc_accuracy
+ display_name: MIMIC-BHC Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: chw_care_plan_accuracy
- display_name: NoteExtract Accuracy
- short_display_name: Accuracy
+ display_name: NoteExtract Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medication_qa_accuracy
- display_name: MedicationQA Accuracy
- short_display_name: Accuracy
+ display_name: MedicationQA Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: starr_patient_instructions_accuracy
- display_name: PatientInstruct Accuracy
- short_display_name: Accuracy
+ display_name: PatientInstruct Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: med_dialog_accuracy
- display_name: MedDialog Accuracy
- short_display_name: Accuracy
+ display_name: MedDialog Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: medi_qa_accuracy
- display_name: MediQA Accuracy
- short_display_name: Accuracy
+ display_name: MediQA Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
  - name: mental_health_accuracy
- display_name: MentalHealth Accuracy
- short_display_name: Accuracy
+ display_name: MentalHealth Jury Score
+ short_display_name: Jury Score
  description: Measures the average score assigned by an LLM-based jury evaluating task performance.
  lower_is_better: false
 
@@ -506,6 +511,8 @@ run_groups:
  - shc_conf_med
  - medi_qa
  - mental_health
+ - shc_proxy_med
+ - shc_privacy_med
 
  - name: medical_research
  display_name: Medical Research Assistance
@@ -517,6 +524,7 @@ run_groups:
  - shc_bmt_med
  - race_based_med
  - n2c2_ct_matching
+ - medhallu
 
  - name: administration_and_workflow
  display_name: Administration and Workflow
@@ -531,7 +539,7 @@ run_groups:
 
  - name: medcalc_bench
  display_name: MedCalc-Bench
- description: A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
+ description: MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute clinically relevant values from patient notes. Each instance consists of a clinical note describing the patient's condition, a diagnostic question targeting a specific medical value, and a ground truth response. [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).
  metric_groups:
  - accuracy
  - efficiency
@@ -546,28 +554,27 @@ run_groups:
  when: "Any"
  language: English
 
- - name: medalign
- display_name: MedAlign
- short_display_name: MedAlign
- description: A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+ - name: clear
+ display_name: CLEAR
+ description: CLEAR is a benchmark designed to evaluate models on their ability to detect medical conditions from patient notes using categorical responses. Each instance consists of a clinical note and a target condition, requiring the model to classify the patient's history as either affirmative, negative, or uncertain [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medalign_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Text generation
- what: "Answer questions and follow instructions over longitudinal EHR"
- who: "Clinician, Researcher"
- when: "Any"
+ task: Classification
+ what: Classify medical condition presence from patient notes
+ who: Clinician
+ when: Any
  language: English
 
  - name: mtsamples_replicate
  display_name: MTSamples
  short_display_name: MTSamples
- description: A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).
+ description: MTSamples Replicate is a benchmark that provides transcribed medical reports from various specialties. It is used to evaluate a model's ability to generate clinically appropriate treatment plans based on unstructured patient documentation [(MTSamples, 2025)](https://mtsamples.com).
  metric_groups:
  - accuracy
  - efficiency
@@ -582,9 +589,26 @@ run_groups:
  when: "Post-diagnosis"
  language: English
 
+ - name: medec
+ display_name: Medec
+ description: Medec is a benchmark composed of clinical narratives that include either correct documentation or medical errors. Each entry includes sentence-level identifiers and an associated correction task. The model must review the narrative and either identify the erroneous sentence and correct it, or confirm that the text is entirely accurate [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medec_error_flag_accuracy
+ main_split: test
+ taxonomy:
+ task: Classification
+ what: Detect and correct errors in medical narratives
+ who: Researcher, Clinician
+ when: Any
+ language: English
+
  - name: ehrshot
  display_name: EHRSHOT
- description: A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
+ description: EHRSHOT is a benchmark designed to evaluate a model's ability to predict future clinical events using structured EHR code sequences. Each instance contains a patient's historical EHR data and a forward-looking clinical question about whether a particular diagnosis, lab result, or hospital event will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).
  metric_groups:
  - accuracy
  - efficiency
@@ -598,27 +622,27 @@ run_groups:
  who: "Clinician, Insurer"
  when: "Future prediction"
  language: English
-
- - name: starr_patient_instructions
- display_name: PatientInstruct
- description: A dataset containing case details used to generate customized post-procedure patient instructions.
+
+ - name: head_qa
+ display_name: HeadQA
+ description: HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to evaluate a model's medical knowledge and reasoning. Each instance presents a clinical or scientific question with four answer options, requiring the model to select the most appropriate answer [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: starr_patient_instructions_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Text generation
- what: Generate customized post-procedure patient instructions
- who: Clinician
- when: Post-procedure
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, Researcher
+ when: Any
  language: English
-
- - name: clear
- display_name: CLEAR
- description: "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1)."
+
+ - name: medbullets
+ display_name: Medbullets
+ description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
  metric_groups:
  - accuracy
  - efficiency
@@ -627,15 +651,33 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: Classification
- what: Classify medical condition presence from patient notes
- who: Clinician
+ task: Question answering
+ what: Medical knowledge testing
+ who: Medical student, . Researcher
  when: Any
  language: English
 
- - name: race_based_med
- display_name: RaceBias
- description: A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.
+ - name: medalign
+ display_name: MedAlign
+ short_display_name: MedAlign
+ description: MedAlign is a benchmark that evaluates a model's ability to interpret and follow instructions grounded in longitudinal electronic health records (EHR). Each instance includes an event-stream style patient record and a natural language question or task, requiring clinically informed reading comprehension and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medalign_accuracy
+ main_split: test
+ taxonomy:
+ task: Text generation
+ what: "Answer questions and follow instructions over longitudinal EHR"
+ who: "Clinician, Researcher"
+ when: "Any"
+ language: English
+
+ - name: shc_ptbm_med
+ display_name: ADHD-Behavior
+ description: ADHD-Behavior is a benchmark that evaluates a model's ability to detect whether a clinician recommends parent training in behavior management, an evidence-based first-line treatment for young children diagnosed with ADHD. Each instance includes a clinical note from a pediatric visit and a binary classification task [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
  metric_groups:
  - accuracy
  - efficiency
@@ -645,15 +687,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify race-based bias in LLM-generated medical responses
- who: Researcher
- when: Any
+ what: Detect ADHD medication side effect monitoring
+ who: Clinician, Researcher
+ when: During Treatment
  language: English
 
- - name: n2c2_ct_matching
- display_name: N2C2-CT Matching
- short_display_name: N2C2-CT
- description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
+ - name: shc_sei_med
+ display_name: ADHD-MedEffects
+ description: ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for pediatric ADHD visits document medication side effect monitoring, which is a key recommendation in clinical practice guidelines. The dataset supports binary classification to detect presence or absence of side effect inquiries (SEI) within notes [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
  metric_groups:
  - accuracy
  - efficiency
@@ -663,67 +704,67 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
- who: Researcher
- when: Pre-Trial
+ what: Classify clinician recommendations for ADHD behavior management
+ who: Clinician, Caregiver
+ when: Early Intervention
  language: English
 
- - name: med_dialog
- display_name: MedDialog
- short_display_name: MedDialog
- description: A collection of doctor-patient conversations with corresponding summaries.
+ - name: dischargeme
+ display_name: DischargeMe
+ short_display_name: DischargeMe
+ description: DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs discharge summaries and radiology reports from MIMIC-IV with generation tasks such as writing discharge instructions or summarizing the brief hospital course. The benchmark assesses a model's ability to generate patient-facing documentation that is complete, empathetic, and clinically accurate [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: med_dialog_accuracy
+ main_name: dischargeme_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate summaries of doctor-patient conversations
+ what: Generate discharge instructions from hospital notes
  who: Clinician
- when: Any
+ when: Upon hospital discharge
  language: English
-
- - name: medi_qa
- display_name: MEDIQA
- description: A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.
+
+ - name: aci_bench
+ display_name: ACI-Bench
+ description: ACI-Bench is a benchmark of real-world patient-doctor conversations paired with structured clinical notes. The benchmark evaluates a model's ability to understand spoken medical dialogue and convert it into formal clinical documentation, covering sections such as history of present illness, physical exam findings, results, and assessment and plan [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medi_qa_accuracy
+ main_name: aci_bench_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Retrieve and rank answers based on medical question understanding
- who: Clinician, Medical Student
+ what: Extract and structure information from patient-doctor conversations
+ who: Clinician
  when: Any
  language: English
 
- - name: mental_health
- display_name: MentalHealth
- description: A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.
+ - name: mtsamples_procedures
+ display_name: MTSamples Procedures
+ description: MTSamples Procedures is a benchmark composed of transcribed operative notes, focused on documenting surgical procedures. Each example presents a brief patient case involving a surgical intervention, and the model is tasked with generating a coherent and clinically accurate procedural summary or treatment plan.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mental_health_accuracy
+ main_name: mtsamples_procedures_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate empathetic counseling responses in mental health conversations
- who: Counselors, Patients
- when: Any
+ what: Document and extract information about medical procedures
+ who: Clinician, Researcher
+ when: Post-procedure
  language: English
 
  - name: mimic_rrs
  display_name: MIMIC-RRS
  short_display_name: MIMIC-RRS
- description: A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
+ description: MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling evaluation of a model's ability to summarize diagnostic imaging observations into concise, clinically relevant conclusions [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).
  metric_groups:
  - accuracy
  - efficiency
@@ -741,13 +782,13 @@ run_groups:
  - name: mimic_bhc
  display_name: MIMIC-IV-BHC
  short_display_name: MIMIC-BHC
- description: A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
+ description: MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark evaluates a model's ability to condense detailed clinical information into accurate, concise summaries that reflect the patient's hospital stay [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: BERTScore-F
+ main_name: mimic_bhc_accuracy
  main_split: test
  taxonomy:
  task: Text generation
@@ -755,130 +796,147 @@ run_groups:
  who: Clinician
  when: Upon hospital discharge
  language: English
-
- - name: mimiciv_billing_code
- display_name: MIMIC-IV Billing Code
- description: A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.
+
+ - name: chw_care_plan
+ display_name: NoteExtract
+ description: NoteExtract is a benchmark that focuses on the structured extraction of information from free-form clinical text. It provides care plan notes authored by health workers and evaluates a model's ability to convert them into a predefined structured format, such as fields for Chief Complaint and History of Present Illness. The benchmark emphasizes faithful extraction without hallucination or inference.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mimiciv_billing_code_f1
+ main_name: chw_care_plan_accuracy
  main_split: test
  taxonomy:
- task: Classification
- what: Predict ICD-10 billing codes from clinical discharge notes
- who: Hospital Admistrator
- when: During or after patient discharge
+ task: Text generation
+ what: Convert general text care plans into structured formats
+ who: Clinician, Researcher
+ when: Any
  language: English
-
- - name: dischargeme
- display_name: DischargeMe
- short_display_name: DischargeMe
- description: DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).
+
+ - name: medication_qa
+ display_name: MedicationQA
+ description: MedicationQA is a benchmark composed of open-ended consumer health questions specifically focused on medications. Each example consists of a free-form question and a corresponding medically grounded answer. The benchmark evaluates a model's ability to provide accurate, accessible, and informative medication-related responses for a lay audience.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: dischargeme_accuracy
+ main_name: medication_qa_accuracy
+ main_split: test
+ taxonomy:
+ task: Question answering
+ what: Answer consumer medication-related questions
+ who: Patient, Pharmacist
+ when: Any
+ language: English
+
+ - name: starr_patient_instructions
+ display_name: PatientInstruct
+ description: PatientInstruct is a benchmark designed to evaluate models on generating personalized post-procedure instructions for patients. It includes real-world clinical case details, such as diagnosis, planned procedures, and history and physical notes, from which models must produce clear, actionable instructions appropriate for patients recovering from medical interventions.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: starr_patient_instructions_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Generate discharge instructions from hospital notes
+ what: Generate customized post-procedure patient instructions
  who: Clinician
- when: Upon hospital discharge
+ when: Post-procedure
  language: English
 
- - name: pubmed_qa
- display_name: PubMedQA
- description: A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.
+ - name: med_dialog
+ display_name: MedDialog
+ short_display_name: MedDialog
+ description: MedDialog is a benchmark of real-world doctor-patient conversations focused on health-related concerns and advice. Each dialogue is paired with a one-sentence summary that reflects the core patient question or exchange. The benchmark evaluates a model's ability to condense medical dialogue into concise, informative summaries.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: med_dialog_accuracy
  main_split: test
  taxonomy:
- task: Question answering
- what: Answer questions based on PubMed abstracts
- who: Researcher
+ task: Text generation
+ what: Generate summaries of doctor-patient conversations
+ who: Clinician
  when: Any
  language: English
 
- - name: medec
- display_name: Medec
- description: A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).
+ - name: shc_conf_med
+ display_name: MedConfInfo
+ description: MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is used to evaluate whether the content contains sensitive protected health information (PHI) that should be restricted from parental access, in accordance with adolescent confidentiality policies in clinical care. [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medec_error_flag_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
  task: Classification
- what: Detect and correct errors in medical narratives
- who: Researcher, Clinician
+ what: Identify sensitive health info in adolescent notes
+ who: Clinician
  when: Any
  language: English
 
- - name: aci_bench
- display_name: ACI-Bench
- description: A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).
+ - name: medi_qa
+ display_name: MEDIQA
+ description: MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate medically accurate answers to patient-generated questions. Each instance includes a consumer health question, a set of candidate answers (used in ranking tasks), relevance annotations, and optionally, additional context. The benchmark focuses on supporting patient understanding and accessibility in health communication.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: aci_bench_accuracy
+ main_name: medi_qa_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Extract and structure information from patient-doctor conversations
- who: Clinician
+ what: Retrieve and rank answers based on medical question understanding
+ who: Clinician, Medical Student
  when: Any
  language: English
 
- - name: chw_care_plan
- display_name: NoteExtract
- description: A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.
+ - name: mental_health
+ display_name: MentalHealth
+ description: MentalHealth is a benchmark focused on evaluating empathetic communication in mental health counseling. It includes real or simulated conversations between patients and counselors, where the task is to generate compassionate and appropriate counselor responses. The benchmark assesses a model's ability to support patients emotionally and meaningfully engage in therapeutic conversations.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: chw_care_plan_accuracy
+ main_name: mental_health_accuracy
  main_split: test
  taxonomy:
  task: Text generation
- what: Convert general text care plans into structured formats
- who: Clinician, Researcher
+ what: Generate empathetic counseling responses in mental health conversations
+ who: Counselors, Patients
  when: Any
  language: English
 
- - name: ehr_sql
- display_name: EHRSQL
- description: Given a natural language instruction, generate an SQL query that would be used in clinical research.
+ - name: shc_proxy_med
+ display_name: ProxySender
+ description: ProxySender is a benchmark composed of patient portal messages received by clinicians. It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent, spouse), which is critical for understanding who is communicating with healthcare providers. [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: ehr_sql_execution_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Code generation
- what: Generate SQL queries from natural language for clinical research
- who: Researcher
+ task: Classification
+ what: Classify if a document was sent by a proxy user
+ who: Clinician, Caregiver
  when: Any
  language: English
 
- - name: head_qa
- display_name: HeadQA
- description: A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).
+ - name: shc_privacy_med
+ display_name: PrivacyDetection
+ description: PrivacyDetection is a benchmark composed of patient portal messages submitted by patients or caregivers. The task is to determine whether the message contains any confidential or privacy-leaking information that should be protected [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).
  metric_groups:
  - accuracy
  - efficiency
@@ -887,15 +945,15 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: Question answering
- what: Medical knowledge testing
- who: Medical student, Researcher
+ task: Classification
+ what: Classify if a document leaks private information
+ who: Clinician, Caregiver
  when: Any
  language: English
 
- - name: medbullets
- display_name: Medbullets
- description: A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).
+ - name: pubmed_qa
+ display_name: PubMedQA
+ description: PubMedQA is a biomedical question-answering dataset that evaluates a model's ability to interpret scientific literature. It consists of PubMed abstracts paired with yes/no/maybe questions derived from the content. The benchmark assesses a model's capability to reason over biomedical texts and provide factually grounded answers.
  metric_groups:
  - accuracy
  - efficiency
@@ -905,48 +963,48 @@ run_groups:
  main_split: test
  taxonomy:
  task: Question answering
- what: Medical knowledge testing
- who: Medical student, . Researcher
+ what: Answer questions based on PubMed abstracts
+ who: Researcher
  when: Any
  language: English
 
- - name: mtsamples_procedures
- display_name: MTSamples Procedures
- description: A dataset that provides a patient note regarding an operation, with the objective to document the procedure.
+ - name: ehr_sql
+ display_name: EHRSQL
+ description: EHRSQL is a benchmark designed to evaluate models on generating structured queries for clinical research. Each example includes a natural language question and a database schema, and the task is to produce an SQL query that would return the correct result for a biomedical research objective. This benchmark assesses a model's understanding of medical terminology, data structures, and query construction.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: mtsamples_procedures_accuracy
+ main_name: ehr_sql_execution_accuracy
  main_split: test
  taxonomy:
- task: Text generation
- what: Document and extract information about medical procedures
- who: Clinician, Researcher
- when: Post-procedure
+ task: Code generation
+ what: Generate SQL queries from natural language for clinical research
+ who: Researcher
+ when: Any
  language: English
 
- - name: medication_qa
- display_name: MedicationQA
- description: Consumer medication questions with reference answers.
+ - name: shc_bmt_med
+ display_name: BMT-Status
+ description: BMT-Status is a benchmark composed of clinical notes and associated binary questions related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is to determine whether the patient received a subsequent transplant based on the provided clinical documentation.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: medication_qa_accuracy
+ main_name: exact_match
  main_split: test
  taxonomy:
- task: Question answering
- what: Answer consumer medication-related questions
- who: Patient, Pharmacist
+ task: question answering
+ what: Answer bone marrow transplant questions
+ who: Researcher
  when: Any
  language: English
 
- - name: shc_bmt_med
- display_name: BMT-Status
- description: A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.
+ - name: race_based_med
+ display_name: RaceBias
+ description: RaceBias is a benchmark used to evaluate language models for racially biased or inappropriate content in medical question-answering scenarios. Each instance consists of a medical question and a model-generated response. The task is to classify whether the response contains race-based, harmful, or inaccurate content. This benchmark supports research into bias detection and fairness in clinical AI systems.
  metric_groups:
  - accuracy
  - efficiency
@@ -955,15 +1013,16 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: question answering
- what: Answer bone marrow transplant questions
+ task: Classification
+ what: Identify race-based bias in LLM-generated medical responses
  who: Researcher
  when: Any
  language: English
-
- - name: shc_gip_med
- display_name: HospiceReferral
- description: A dataset evaluating performance in identifying appropriate patient referrals to hospice care.
+
+ - name: n2c2_ct_matching
+ display_name: N2C2-CT Matching
+ short_display_name: N2C2-CT
+ description: A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.
  metric_groups:
  - accuracy
  - efficiency
@@ -973,14 +1032,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Assess hospice referral appropriateness
- who: Hospital Admistrator
- when: End-of-care
+ what: Classify whether a patient is a valid candidate for a clinical trial based on clinical notes
+ who: Researcher
+ when: Pre-Trial
  language: English
 
- - name: shc_cdi_med
- display_name: CDI-QA
- description: A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.
+ - name: medhallu
+ display_name: MedHallu
+ description: MedHallu is a benchmark focused on evaluating factual correctness in biomedical question answering. Each instance contains a PubMed-derived knowledge snippet, a biomedical question, and a model-generated answer. The task is to classify whether the answer is factually correct or contains hallucinated (non-grounded) information. This benchmark is designed to assess the factual reliability of medical language models.
  metric_groups:
  - accuracy
  - efficiency
@@ -990,14 +1049,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Answer verification questions from CDI notes
- who: Hospital Admistrator
+ what: Verify whether answers to questions from PubMed articles are factual or hallucinated
+ who: Researcher
  when: Any
  language: English
-
- - name: shc_ent_med
- display_name: ENT-Referral
- description: A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.
+
+ - name: shc_gip_med
+ display_name: HospiceReferral
+ description: HospiceReferral is a benchmark that evaluates model performance in identifying whether patients are eligible for hospice care based on palliative care clinical notes. The benchmark focuses on end-of-life care referral decisions.
  metric_groups:
  - accuracy
  - efficiency
@@ -1007,31 +1066,31 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify referrals for ENT specialists
+ what: Assess hospice referral appropriateness
  who: Hospital Admistrator
- when: Any
+ when: End-of-care
  language: English
 
- - name: shc_sequoia_med
- display_name: ClinicReferral
- description: A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.
+ - name: mimiciv_billing_code
+ display_name: MIMIC-IV Billing Code
+ description: MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task requires models to extract structured billing codes based on free-text clinical notes, reflecting real-world hospital coding tasks for financial reimbursement.
  metric_groups:
  - accuracy
  - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: mimiciv_billing_code_f1
  main_split: test
  taxonomy:
  task: Classification
- what: Provide answers on clinic referrals
+ what: Predict ICD-10 billing codes from clinical discharge notes
  who: Hospital Admistrator
- when: Pre-referral
+ when: During or after patient discharge
  language: English
-
- - name: shc_conf_med
- display_name: MedConfInfo
- description: A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).
+
+ - name: shc_sequoia_med
+ display_name: ClinicReferral
+ description: ClinicReferral is a benchmark that determines patient eligibility for referral to the Sequoia Clinic based on information from palliative care notes. The dataset provides curated decisions on referral appropriateness to assist in automating clinic workflows.
  metric_groups:
  - accuracy
  - efficiency
@@ -1041,14 +1100,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Identify sensitive health info in adolescent notes
- who: Clinician
- when: Any
+ what: Provide answers on clinic referrals
+ who: Hospital Admistrator
+ when: Pre-referral
  language: English
 
- - name: shc_ptbm_med
- display_name: ADHD-Behavior
- description: A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).
+ - name: shc_cdi_med
+ display_name: CDI-QA
+ description: CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) notes. It is used to evaluate a model's ability to verify clinical conditions based on documented evidence in patient records.
  metric_groups:
  - accuracy
  - efficiency
@@ -1058,14 +1117,14 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Detect ADHD medication side effect monitoring
- who: Clinician, Researcher
- when: During Treatment
+ what: Answer verification questions from CDI notes
+ who: Hospital Admistrator
+ when: Any
  language: English
-
- - name: shc_sei_med
- display_name: ADHD-MedEffects
- description: A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).
+
+ - name: shc_ent_med
+ display_name: ENT-Referral
+ description: ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess models' abilities to make referral decisions based on unstructured clinical text
  metric_groups:
  - accuracy
  - efficiency
@@ -1075,7 +1134,7 @@ run_groups:
  main_split: test
  taxonomy:
  task: Classification
- what: Classify clinician recommendations for ADHD behavior management
- who: Clinician, Caregiver
- when: Early Intervention
+ what: Identify referrals for ENT specialists
+ who: Hospital Admistrator
+ when: Any
  language: English