crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of crfm-helm has been flagged as a potentially problematic release.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/med_qa_scenario.py

@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -103,3 +105,21 @@ class MedQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="med_qa",
+            display_name="MedQA",
+            description="MedQA is an open domain question answering dataset composed of questions from "
+            "professional medical board exams ([Jin et al. "
+            "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="US medical licensing exams",
+                when="before 2020",
+                who="problem setters",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
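
The same get_metadata() pattern recurs in every scenario file below: each scenario now reports a ScenarioMetadata record that carries a TaxonomyInfo plus a headline metric and split. The actual definitions live in helm/benchmark/presentation/taxonomy_info.py (+20) and helm/benchmark/scenarios/scenario.py (+31), which this diff does not expand, so the following is only a hedged sketch of what those records plausibly look like, with field names taken from the calls shown in the hunks.

# Illustrative sketch only, not code from the package: plausible shapes for the
# TaxonomyInfo and ScenarioMetadata records, inferred from the keyword arguments
# used by the get_metadata() methods in this diff.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None
    what: Optional[str] = None
    when: Optional[str] = None
    who: Optional[str] = None
    language: Optional[str] = None


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    description: str
    display_name: Optional[str] = None
    short_display_name: Optional[str] = None
    taxonomy: Optional[TaxonomyInfo] = None
    main_metric: Optional[str] = None
    main_split: Optional[str] = None


# Presentation code (for example the expanded summarize.py, +180 lines in this
# release) could then read a scenario's headline metric directly:
metadata = ScenarioMetadata(
    name="med_qa",
    description="MedQA is an open domain question answering dataset ...",
    display_name="MedQA",
    taxonomy=TaxonomyInfo(task="multiple-choice question answering", language="English"),
    main_metric="quasi_exact_match",
    main_split="test",
)
print(metadata.main_metric)  # quasi_exact_match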

helm/benchmark/scenarios/medalign_scenario.py

@@ -1,5 +1,6 @@
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.medalign_scenario_helper import return_dataset_dataframe  # type: ignore
 
@@ -92,3 +94,24 @@ class MedalignScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = return_dataset_dataframe(self.max_length, self.data_path)
         return self.process_tsv(dataset)
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medalign",
+            display_name="MedAlign",
+            short_display_name="MedAlign",
+            description="MedAlign is a benchmark that evaluates a model's ability to interpret and "
+            "follow instructions grounded in longitudinal electronic health records (EHR). "
+            "Each instance includes an event-stream style patient record and a natural "
+            "language question or task, requiring clinically informed reading comprehension "
+            "and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Answer questions and follow instructions over longitudinal EHR",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medalign_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medbullets_scenario.py

@@ -3,6 +3,7 @@ import csv
 import sys
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -143,3 +145,23 @@ class MedBulletsScenario(Scenario):
             csv_path = self.download_csv(output_path, split_suffix)
             instances.extend(self.process_csv(csv_path, split))
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medbullets",
+            display_name="Medbullets",
+            description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
+            "a model's ability to understand and apply clinical knowledge. Each question is "
+            "accompanied by a patient scenario and five multiple-choice options, similar to "
+            "those found on Step 2 and Step 3 board exams [(MedBullets, "
+            "2025)](https://step2.medbullets.com).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, . Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/medcalc_bench_scenario.py

@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -125,3 +127,23 @@ class MedCalcBenchScenario(Scenario):
             instances.extend(self.process_csv(data, split))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medcalc_bench",
+            display_name="MedCalc-Bench",
+            description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
+            "compute clinically relevant values from patient notes. Each instance consists "
+            "of a clinical note describing the patient's condition, a diagnostic question "
+            "targeting a specific medical value, and a ground truth response. [(Khandekar "
+            "et al., 2024)](https://arxiv.org/abs/2406.12036).",
+            taxonomy=TaxonomyInfo(
+                task="Computational reasoning",
+                what="Compute a specific medical value from a patient note",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medcalc_bench_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medec_scenario.py

@@ -1,6 +1,7 @@
 import csv
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -123,3 +125,24 @@ class MedecScenario(Scenario):
         instances.extend(self.process_csv(test_csv, TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medec",
+            display_name="Medec",
+            description="Medec is a benchmark composed of clinical narratives that include either "
+            "correct documentation or medical errors. Each entry includes sentence-level "
+            "identifiers and an associated correction task. The model must review the "
+            "narrative and either identify the erroneous sentence and correct it, or "
+            "confirm that the text is entirely accurate [(Abacha et al., "
+            "2025)](https://arxiv.org/abs/2412.19260).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect and correct errors in medical narratives",
+                when="Any",
+                who="Researcher, Clinician",
+                language="English",
+            ),
+            main_metric="medec_error_flag_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medhallu_scenario.py

@@ -1,6 +1,7 @@
 from typing import List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -70,3 +72,24 @@ Answer: {answer}
             )
             instances.append(hallucinated_instance)
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medhallu",
+            display_name="MedHallu",
+            description="MedHallu is a benchmark focused on evaluating factual correctness in "
+            "biomedical question answering. Each instance contains a PubMed-derived "
+            "knowledge snippet, a biomedical question, and a model-generated answer. The "
+            "task is to classify whether the answer is factually correct or contains "
+            "hallucinated (non-grounded) information. This benchmark is designed to assess "
+            "the factual reliability of medical language models.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Verify whether answers to questions from PubMed articles are " "factual or hallucinated",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/medhelm/__init__.py

File without content changes.

helm/benchmark/scenarios/medhelm/judges.yaml (new file)

@@ -0,0 +1,14 @@
+# The judges to be used for evaluating the note summary scenario.
+# name: The short name for the judge.
+# model: The field value matching the 'model_name' field under model_deployments.yaml
+# model_deployment: The field value matching the 'name' under model_deployments.yaml.
+judges:
+  - name: "gpt"
+    model: "openai/gpt-4o-2024-05-13"
+    model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13"
+  - name: "llama"
+    model: "meta/llama-3.3-70b-instruct"
+    model_deployment: "stanfordhealthcare/llama-3.3-70b-instruct"
+  - name: "claude"
+    model: "anthropic/claude-3-7-sonnet-20250219"
+    model_deployment: "stanfordhealthcare/claude-3-7-sonnet-20250219"
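
judges.yaml declares the roster of judge models used by the refactored MedHELM annotators (see the model_as_judge.py and *_annotator.py changes above); each entry maps a short judge name to a model and a model_deployment from helm/config/model_deployments.yaml. The diff does not expand the code that consumes this file, so the following is only a minimal sketch of reading it, assuming PyYAML:

# Hedged sketch: shows how judges.yaml could be read; the actual loading logic
# lives in the annotator modules, which this diff does not expand.
import yaml

with open("helm/benchmark/scenarios/medhelm/judges.yaml") as f:
    config = yaml.safe_load(f)

for judge in config["judges"]:
    # Each entry pairs a short name ("gpt", "llama", "claude") with a model and
    # a model_deployment that must match entries in model_deployments.yaml.
    print(judge["name"], judge["model"], judge["model_deployment"])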

helm/benchmark/scenarios/medhelm_configurable_scenario.py (new file)

@@ -0,0 +1,101 @@
+import string
+import json
+import pandas as pd
+from typing import List
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.run_specs.medhelm.benchmark_config import get_benchmark_config_from_path
+from helm.common.general import check_file_exists
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    CORRECT_TAG,
+    Reference,
+    Input,
+    Output,
+    TEST_SPLIT,
+    ScenarioMetadata,
+)
+
+
+class MedHELMConfigurableScenario(Scenario):
+    """
+    MedHELM configuratble scenario
+    """
+
+    tags = ["biomedical"]
+
+    def __init__(self, name: str, config_path: str):
+        super().__init__()
+        self.benchmark_config = get_benchmark_config_from_path(config_path)
+        self.name = name
+        self.description = self.benchmark_config.description
+
+    def get_columns_in_template(self, template: str) -> List[str]:
+        """
+        Extract field names from a template string using Python's Formatter.
+        Example: "Name: {name}, Age: {age}" → ["name", "age"]
+        """
+        formatter = string.Formatter()
+        return [fname for _, fname, _, _ in formatter.parse(template) if fname]
+
+    def populate_template(self, template: str, row: pd.Series, fields: List[str]) -> str:
+        """
+        Populate the template with values from the row using format_map.
+        Missing fields default to empty string.
+        """
+        mapping = {field: row.get(field, "") for field in fields}
+        return template.format_map(mapping)
+
+    def get_references(self, row: pd.Series) -> List[Reference]:
+        references: List[Reference] = []
+        if "correct_answer" in row:
+            references.append(Reference(Output(text=row["correct_answer"]), tags=[CORRECT_TAG]))
+        if "incorrect_answers" in row:
+            for incorrect_answer in row["incorrect_answers"]:
+                references.append(Reference(Output(text=incorrect_answer), tags=[]))
+        return references
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.benchmark_config.prompt_file, msg=f"Prompt file for {self.name} does not exist")
+        check_file_exists(self.benchmark_config.dataset_file, msg=f"Dataset file for {self.name} does not exist")
+        instances: List[Instance] = []
+        df = pd.read_csv(self.benchmark_config.dataset_file)
+        if "correct_answer" not in df.columns:
+            if not self._is_llm_as_judge() or len(self.benchmark_config.metrics) > 1:
+                raise ValueError(
+                    "Dataset must contain 'correct_answer' column unless using jury_score as the only metric."
+                )
+        if "incorrect_answers" in df.columns:
+            df["incorrect_answers"] = df["incorrect_answers"].apply(json.loads)
+        with open(self.benchmark_config.prompt_file, "r") as f:
+            template = f.read()
+        fields = self.get_columns_in_template(template)
+        for _, row in df.iterrows():
+            filled = self.populate_template(template, row, fields)
+            prompt = Input(text=filled)
+            instances.append(Instance(input=prompt, references=self.get_references(row), split=TEST_SPLIT))
+        return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name=self.name,
+            display_name=self.name,
+            description=self.description,
+            taxonomy=TaxonomyInfo(
+                task="",
+                what="",
+                when="",
+                who="",
+                language="",
+            ),
+            main_metric=self.benchmark_config.main_metric.name,
+            main_split="test",
+        )
+
+    def _is_llm_as_judge(self) -> bool:
+        for metric in self.benchmark_config.metrics:
+            if metric.name == "jury_score":
+                return True
+        return False
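
The new MedHELMConfigurableScenario builds its prompts from a user-supplied template file: get_columns_in_template() pulls the placeholder names out of the template with the standard library's string.Formatter, and populate_template() fills them from a dataset row with str.format_map, defaulting missing columns to an empty string. A standalone sketch of that same mechanism (illustrative only; the template text and row values are made up for the example):

# Standalone illustration of the template handling used above.
import string

template = "Patient note:\n{note}\n\nQuestion: {question}\nAnswer:"

# Extract placeholder names, as get_columns_in_template() does.
fields = [fname for _, fname, _, _ in string.Formatter().parse(template) if fname]
print(fields)  # ['note', 'question']

# Fill the template from a row, defaulting missing columns to "", as populate_template() does.
row = {"note": "45-year-old with chest pain", "question": "What is the most likely diagnosis?"}
mapping = {field: row.get(field, "") for field in fields}
print(template.format_map(mapping))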

helm/benchmark/scenarios/medi_qa_scenario.py

@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -49,7 +51,7 @@ class MediQAScenario(Scenario):
 
     name = "medi_qa"
     description = (
-        "MEDIQA is a benchmark designed to evaluate a model's ability to
+        "MEDIQA is a benchmark designed to evaluate a model's ability to generate"
         "medically accurate answers to patient-generated questions. Each instance includes a"
         "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
         "annotations, and optionally, additional context. The benchmark focuses on supporting"
@@ -109,3 +111,24 @@ class MediQAScenario(Scenario):
             instances.extend(self.process_csv(data, split))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medi_qa",
+            display_name="MEDIQA",
+            description="MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and "
+            "generate medically accurate answers to patient-generated questions. Each "
+            "instance includes a consumer health question, a set of candidate answers (used "
+            "in ranking tasks), relevance annotations, and optionally, additional context. "
+            "The benchmark focuses on supporting patient understanding and accessibility in "
+            "health communication.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate medically accurate answers to patient-generated questions.",
+                when="Any",
+                who="Clinician, Medical Student",
+                language="English",
+            ),
+            main_metric="medi_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medication_qa_scenario.py

@@ -3,9 +3,19 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
 
 
 class MedicationQAScenario(Scenario):
@@ -64,3 +74,23 @@ class MedicationQAScenario(Scenario):
         ]
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medication_qa",
+            display_name="MedicationQA",
+            description="MedicationQA is a benchmark composed of open-ended consumer health questions "
+            "specifically focused on medications. Each example consists of a free-form "
+            "question and a corresponding medically grounded answer. The benchmark "
+            "evaluates a model's ability to provide accurate, accessible, and informative "
+            "medication-related responses for a lay audience.",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer consumer medication-related questions",
+                when="Any",
+                who="Patient, Pharmacist",
+                language="English",
+            ),
+            main_metric="medication_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mental_health_scenario.py

@@ -1,5 +1,6 @@
 import pandas as pd
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -121,3 +123,24 @@ class MentalHealthScenario(Scenario):
         instances = self.process_dialogue_data(dialogue_data)
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mental_health",
+            display_name="MentalHealth",
+            description="MentalHealth is a benchmark focused on evaluating empathetic communication in "
+            "mental health counseling. It includes real or simulated conversations between "
+            "patients and counselors, where the task is to generate compassionate and "
+            "appropriate counselor responses. The benchmark assesses a model's ability to "
+            "support patients emotionally and meaningfully engage in therapeutic "
+            "conversations.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate empathetic counseling responses in mental health " "conversations",
+                when="Any",
+                who="Counselors, Patients",
+                language="English",
+            ),
+            main_metric="mental_health_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_bhc_scenario.py

@@ -1,6 +1,7 @@
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -101,3 +103,25 @@ class MIMICBHCScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_bhc",
+            display_name="MIMIC-IV-BHC",
+            short_display_name="MIMIC-BHC",
+            description="MIMIC-BHC is a benchmark focused on summarization of discharge notes into "
+            "Brief Hospital Course (BHC) sections. It consists of curated discharge notes "
+            "from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark "
+            "evaluates a model's ability to condense detailed clinical information into "
+            "accurate, concise summaries that reflect the patient's hospital stay [(Aali et "
+            "al., 2024)](https://doi.org/10.1093/jamia/ocae312).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Summarize the clinical note into a brief hospital course",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mimic_bhc_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_rrs_scenario.py

@@ -1,6 +1,7 @@
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -96,3 +98,24 @@ class MIMICRRSScenario(Scenario):
             lines = file.readlines()
         lines = [line.strip() for line in lines]
         return lines
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_rrs",
+            display_name="MIMIC-RRS",
+            short_display_name="MIMIC-RRS",
+            description="MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III "
+            "database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling "
+            "evaluation of a model's ability to summarize diagnostic imaging observations "
+            "into concise, clinically relevant conclusions [(Chen et al., "
+            "2023)](https://arxiv.org/abs/2211.08584).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate radiology report summaries from findings sections",
+                when="Post-imaging",
+                who="Radiologist",
+                language="English",
+            ),
+            main_metric="mimic_rrs_accuracy",
+            main_split="test",
+        )
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.common.general import check_file_exists
|
|
6
7
|
from helm.benchmark.scenarios.scenario import (
|
|
7
8
|
Input,
|
|
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
11
12
|
CORRECT_TAG,
|
|
12
13
|
Reference,
|
|
13
14
|
Output,
|
|
15
|
+
ScenarioMetadata,
|
|
14
16
|
)
|
|
15
17
|
|
|
16
18
|
|
|
@@ -75,3 +77,23 @@ class MIMICIVBillingCodeScenario(Scenario):
|
|
|
75
77
|
continue
|
|
76
78
|
|
|
77
79
|
return instances
|
|
80
|
+
|
|
81
|
+
def get_metadata(self):
|
|
82
|
+
return ScenarioMetadata(
|
|
83
|
+
name="mimiciv_billing_code",
|
|
84
|
+
display_name="MIMIC-IV Billing Code",
|
|
85
|
+
description="MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the "
|
|
86
|
+
"MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The "
|
|
87
|
+
"task requires models to extract structured billing codes based on free-text "
|
|
88
|
+
"clinical notes, reflecting real-world hospital coding tasks for financial "
|
|
89
|
+
"reimbursement.",
|
|
90
|
+
taxonomy=TaxonomyInfo(
|
|
91
|
+
task="Classification",
|
|
92
|
+
what="Predict ICD-10 billing codes from clinical discharge notes",
|
|
93
|
+
when="During or after patient discharge",
|
|
94
|
+
who="Hospital Admistrator",
|
|
95
|
+
language="English",
|
|
96
|
+
),
|
|
97
|
+
main_metric="mimiciv_billing_code_f1",
|
|
98
|
+
main_split="test",
|
|
99
|
+
)
|
|
@@ -2,6 +2,7 @@ from typing import Dict, List
|
|
|
2
2
|
from datasets import Dataset, load_dataset
|
|
3
3
|
|
|
4
4
|
from helm.common.hierarchical_logger import hlog
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Scenario,
|
|
7
8
|
Instance,
|
|
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
11
12
|
CORRECT_TAG,
|
|
12
13
|
Input,
|
|
13
14
|
Output,
|
|
15
|
+
ScenarioMetadata,
|
|
14
16
|
)
|
|
15
17
|
|
|
16
18
|
|
|
@@ -93,3 +95,19 @@ class MMLUProScenario(Scenario):
|
|
|
93
95
|
instances.extend(self.process_dataset(data, split))
|
|
94
96
|
|
|
95
97
|
return instances
|
|
98
|
+
|
|
99
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
100
|
+
return ScenarioMetadata(
|
|
101
|
+
name=self.name,
|
|
102
|
+
display_name="MMLU-Pro",
|
|
103
|
+
description=self.description,
|
|
104
|
+
main_metric="chain_of_thought_correctness",
|
|
105
|
+
main_split="test",
|
|
106
|
+
taxonomy=TaxonomyInfo(
|
|
107
|
+
task="question answering",
|
|
108
|
+
what="graduate-level questions in biology, physics, and chemistry",
|
|
109
|
+
who="domain experts",
|
|
110
|
+
when="2023",
|
|
111
|
+
language="English",
|
|
112
|
+
),
|
|
113
|
+
)
|