crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
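The largest single addition in this release is the new MedHELM run-spec module, helm/benchmark/run_specs/medhelm_run_specs.py (+1155 lines); its added contents are reproduced below. As a minimal, illustrative sketch (not part of the diff), one of the run-spec functions registered there could be imported and inspected as follows. The module path, function name, and field values are taken from the code shown below; the attribute access assumes the usual dataclass fields of RunSpec and AdapterSpec and a working crfm-helm 0.5.5 installation.

# Illustrative sketch only; not part of the package diff.
# Assumes crfm-helm 0.5.5 is installed; names mirror the code shown below.
from helm.benchmark.run_specs.medhelm_run_specs import get_medcalc_bench_spec

run_spec = get_medcalc_bench_spec()
print(run_spec.name)                      # "medcalc_bench"
print(run_spec.scenario_spec.class_name)  # MedCalcBench scenario class path
print(run_spec.adapter_spec.max_tokens)   # 10, as set via get_generation_adapter_spec
print(run_spec.groups)                    # ["medcalc_bench"]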
|
@@ -0,0 +1,1155 @@
|
|
|
1
|
+
"""Run spec functions for the MedHELM leaderboard.
|
|
2
|
+
|
|
3
|
+
Website: https://crfm.stanford.edu/helm/medhelm/
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
from helm.benchmark.adaptation.adapter_spec import (
|
|
9
|
+
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.adaptation.common_adapter_specs import (
|
|
12
|
+
get_generation_adapter_spec,
|
|
13
|
+
get_multiple_choice_adapter_spec,
|
|
14
|
+
)
|
|
15
|
+
from helm.benchmark.annotation.annotator import AnnotatorSpec
|
|
16
|
+
from helm.benchmark.metrics.common_metric_specs import (
|
|
17
|
+
get_basic_metric_specs,
|
|
18
|
+
get_exact_match_metric_specs,
|
|
19
|
+
get_open_ended_generation_metric_specs,
|
|
20
|
+
get_summarization_metric_specs,
|
|
21
|
+
get_generic_metric_specs,
|
|
22
|
+
)
|
|
23
|
+
from helm.benchmark.metrics.metric import MetricSpec
|
|
24
|
+
from helm.benchmark.run_spec import RunSpec, run_spec_function
|
|
25
|
+
from helm.benchmark.scenarios.scenario import ScenarioSpec
|
|
26
|
+
from helm.common.gpu_utils import get_torch_device_name
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@run_spec_function("medcalc_bench")
|
|
30
|
+
def get_medcalc_bench_spec() -> RunSpec:
|
|
31
|
+
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario")
|
|
32
|
+
|
|
33
|
+
adapter_spec = get_generation_adapter_spec(
|
|
34
|
+
instructions="Given a patient note and a clinical question, compute the requested medical value.",
|
|
35
|
+
input_noun=None,
|
|
36
|
+
newline_after_input_noun=False,
|
|
37
|
+
output_noun="Answer only the requested quantity without units. No explanation needed",
|
|
38
|
+
max_tokens=10,
|
|
39
|
+
max_train_instances=0,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
metric_specs = [
|
|
43
|
+
MetricSpec(
|
|
44
|
+
class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric",
|
|
45
|
+
args={},
|
|
46
|
+
)
|
|
47
|
+
] + get_exact_match_metric_specs()
|
|
48
|
+
|
|
49
|
+
return RunSpec(
|
|
50
|
+
name="medcalc_bench",
|
|
51
|
+
scenario_spec=scenario_spec,
|
|
52
|
+
adapter_spec=adapter_spec,
|
|
53
|
+
metric_specs=metric_specs,
|
|
54
|
+
groups=["medcalc_bench"],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@run_spec_function("clear")
|
|
59
|
+
def get_clear_spec(condition: str) -> RunSpec:
|
|
60
|
+
scenario_spec = ScenarioSpec(
|
|
61
|
+
class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario", args={"condition": condition}
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
condition_display = condition.replace("_", " ")
|
|
65
|
+
|
|
66
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
67
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
68
|
+
instructions=(
|
|
69
|
+
f"Answer 'A' for 'Has a history of {condition_display}', "
|
|
70
|
+
f"'B' for 'Does not have a history of {condition_display}', or "
|
|
71
|
+
f"'C' for 'Uncertain'"
|
|
72
|
+
),
|
|
73
|
+
input_noun=None,
|
|
74
|
+
output_noun="Respond only with 'A', 'B', or 'C'. Do not add any other text, punctuation, or symbols",
|
|
75
|
+
max_train_instances=0,
|
|
76
|
+
max_tokens=1,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
return RunSpec(
|
|
80
|
+
name=f"clear:condition={condition}",
|
|
81
|
+
scenario_spec=scenario_spec,
|
|
82
|
+
adapter_spec=adapter_spec,
|
|
83
|
+
metric_specs=get_exact_match_metric_specs(),
|
|
84
|
+
groups=["clear"],
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@run_spec_function("mtsamples_replicate")
|
|
89
|
+
def get_mtsamples_spec() -> RunSpec:
|
|
90
|
+
scenario_spec = ScenarioSpec(
|
|
91
|
+
class_name="helm.benchmark.scenarios.mtsamples_replicate_scenario.MTSamplesReplicateScenario"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
adapter_spec = get_generation_adapter_spec(
|
|
95
|
+
instructions="Given various information about a patient, return a reasonable treatment plan for the patient.",
|
|
96
|
+
input_noun=None,
|
|
97
|
+
newline_after_input_noun=False,
|
|
98
|
+
output_noun="Answer",
|
|
99
|
+
max_tokens=512,
|
|
100
|
+
max_train_instances=0,
|
|
101
|
+
stop_sequences=[],
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
annotator_specs = [
|
|
105
|
+
AnnotatorSpec(class_name="helm.benchmark.annotation.mtsamples_replicate_annotator.MTSamplesReplicateAnnotator")
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
metric_args = {
|
|
109
|
+
"task": "mtsamples_replicate",
|
|
110
|
+
"device": get_torch_device_name(),
|
|
111
|
+
"bertscore_model": "distilbert-base-uncased",
|
|
112
|
+
"rescale_with_baseline": False,
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
metric_specs = get_summarization_metric_specs(metric_args) + [
|
|
116
|
+
MetricSpec(class_name="helm.benchmark.metrics.mtsamples_replicate_metrics.MTSamplesReplicateMetric", args={})
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
return RunSpec(
|
|
120
|
+
name="mtsamples_replicate",
|
|
121
|
+
scenario_spec=scenario_spec,
|
|
122
|
+
adapter_spec=adapter_spec,
|
|
123
|
+
annotators=annotator_specs,
|
|
124
|
+
metric_specs=metric_specs,
|
|
125
|
+
groups=["mtsamples_replicate"],
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@run_spec_function("medec")
|
|
130
|
+
def get_medec_run_spec() -> RunSpec:
|
|
131
|
+
"""
|
|
132
|
+
RunSpec for the MEDEC dataset.
|
|
133
|
+
This configuration evaluates the model's ability to summarize doctor-patient
|
|
134
|
+
dialogues into structured clinical notes.
|
|
135
|
+
"""
|
|
136
|
+
# Define the scenario
|
|
137
|
+
scenario_spec = ScenarioSpec(
|
|
138
|
+
class_name="helm.benchmark.scenarios.medec_scenario.MedecScenario",
|
|
139
|
+
args={},
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# Define the adapter
|
|
143
|
+
adapter_spec = get_generation_adapter_spec(
|
|
144
|
+
instructions=(
|
|
145
|
+
"The following is a medical narrative about a patient. "
|
|
146
|
+
"You are a skilled medical doctor reviewing the clinical text. "
|
|
147
|
+
"The text is either correct or contains one error. "
|
|
148
|
+
"The text has a sentence per line. Each line starts with the "
|
|
149
|
+
"sentence ID, followed by a space character then the sentence to check. "
|
|
150
|
+
"Check every sentence of the text. "
|
|
151
|
+
"If the text is correct return the following output: CORRECT. "
|
|
152
|
+
"If the text has a medical error, return the sentence ID of the "
|
|
153
|
+
"sentence containing the error, followed by a space, "
|
|
154
|
+
"and a corrected version of the sentence."
|
|
155
|
+
),
|
|
156
|
+
input_noun="Clinical Note",
|
|
157
|
+
output_noun="Answer",
|
|
158
|
+
max_tokens=256,
|
|
159
|
+
max_train_instances=0,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Define the metrics
|
|
163
|
+
metric_specs = [
|
|
164
|
+
MetricSpec(
|
|
165
|
+
class_name="helm.benchmark.metrics.medec_metrics.MedecMetric",
|
|
166
|
+
args={},
|
|
167
|
+
)
|
|
168
|
+
] + get_basic_metric_specs([])
|
|
169
|
+
|
|
170
|
+
# Return the RunSpec
|
|
171
|
+
return RunSpec(
|
|
172
|
+
name="medec",
|
|
173
|
+
scenario_spec=scenario_spec,
|
|
174
|
+
adapter_spec=adapter_spec,
|
|
175
|
+
metric_specs=metric_specs,
|
|
176
|
+
groups=["clinical", "medec"],
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@run_spec_function("ehrshot")
|
|
181
|
+
def get_ehrshot_spec(subject: str, max_length: int = 100000) -> RunSpec:
|
|
182
|
+
scenario_spec = ScenarioSpec(
|
|
183
|
+
class_name="helm.benchmark.scenarios.ehrshot_scenario.EHRSHOTScenario",
|
|
184
|
+
args={"subject": subject, "max_length": max_length},
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
188
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
189
|
+
instructions="Answer A for yes, B for no.",
|
|
190
|
+
input_noun="",
|
|
191
|
+
output_noun="Respond with only 'A' for yes or 'B' for no. Do not add any other text, punctuation, or symbols",
|
|
192
|
+
max_train_instances=0,
|
|
193
|
+
max_tokens=1,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
return RunSpec(
|
|
197
|
+
name=f"ehrshot:subject={subject}",
|
|
198
|
+
scenario_spec=scenario_spec,
|
|
199
|
+
adapter_spec=adapter_spec,
|
|
200
|
+
metric_specs=get_exact_match_metric_specs(),
|
|
201
|
+
groups=["ehrshot"],
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@run_spec_function("head_qa")
|
|
206
|
+
def get_head_qa_run_spec(language: str = "en", category: Union[str, None] = None) -> RunSpec:
|
|
207
|
+
"""
|
|
208
|
+
RunSpec for the HEAD-QA dataset.
|
|
209
|
+
This configuration evaluates the model's ability to answer challenging multiple-choice biomedical questions.
|
|
210
|
+
"""
|
|
211
|
+
# Define the scenario
|
|
212
|
+
scenario_spec = ScenarioSpec(
|
|
213
|
+
class_name="helm.benchmark.scenarios.headqa_scenario.HeadQAScenario",
|
|
214
|
+
args={"language": language, "category": category},
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Define the adapter
|
|
218
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
219
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
220
|
+
instructions=(
|
|
221
|
+
"You are a highly knowledgeable AI assistant specializing in biomedical sciences. Your task is to answer "
|
|
222
|
+
"multiple-choice questions accurately based on the options provided. "
|
|
223
|
+
"Each question will relate to biomedical concepts, "
|
|
224
|
+
"and you will be asked to choose the most appropriate answer.\n\n"
|
|
225
|
+
"Select the correct answer by outputting only the letter corresponding to your choice (A, B, C, or D)."
|
|
226
|
+
),
|
|
227
|
+
input_noun="Question",
|
|
228
|
+
output_noun="Answer",
|
|
229
|
+
max_tokens=1,
|
|
230
|
+
max_train_instances=0,
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Define the metrics
|
|
234
|
+
metric_specs = get_exact_match_metric_specs()
|
|
235
|
+
|
|
236
|
+
# Return the RunSpec
|
|
237
|
+
return RunSpec(
|
|
238
|
+
name=f"head_qa:language={language},category={category}",
|
|
239
|
+
scenario_spec=scenario_spec,
|
|
240
|
+
adapter_spec=adapter_spec,
|
|
241
|
+
metric_specs=metric_specs,
|
|
242
|
+
groups=["biomedical", "head_qa"],
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
@run_spec_function("medbullets")
|
|
247
|
+
def get_medbullets_run_spec() -> RunSpec:
|
|
248
|
+
"""
|
|
249
|
+
RunSpec for the MedBullets dataset.
|
|
250
|
+
This configuration evaluates the model's ability to answer challenging multiple-choice clinical questions.
|
|
251
|
+
"""
|
|
252
|
+
# Define the scenario
|
|
253
|
+
scenario_spec = ScenarioSpec(
|
|
254
|
+
class_name="helm.benchmark.scenarios.medbullets_scenario.MedBulletsScenario",
|
|
255
|
+
args={},
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Define the adapter
|
|
259
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
260
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
261
|
+
instructions=(
|
|
262
|
+
"You are a highly knowledgeable AI assistant specializing in medicine. "
|
|
263
|
+
"Your task is to answer medical questions similar to those found on the USMLE Step 2/3 exams. "
|
|
264
|
+
"You will be provided with a clinical scenario followed by several multiple-choice options.\n\n"
|
|
265
|
+
"Select the correct answer by outputting only the letter corresponding to your choice (A, B, C, D, or E)."
|
|
266
|
+
),
|
|
267
|
+
input_noun="Clinical Scenario",
|
|
268
|
+
output_noun="Answer",
|
|
269
|
+
max_tokens=1,
|
|
270
|
+
max_train_instances=0,
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
# Define the metrics
|
|
274
|
+
metric_specs = get_exact_match_metric_specs()
|
|
275
|
+
|
|
276
|
+
# Return the RunSpec
|
|
277
|
+
return RunSpec(
|
|
278
|
+
name="medbullets",
|
|
279
|
+
scenario_spec=scenario_spec,
|
|
280
|
+
adapter_spec=adapter_spec,
|
|
281
|
+
metric_specs=metric_specs,
|
|
282
|
+
groups=["clinical", "medbullets"],
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@run_spec_function("medbullets_freetext")
|
|
287
|
+
def get_medbullets_freetext_run_spec() -> RunSpec:
|
|
288
|
+
"""RunSpec for the MedBullets Free-text dataset."""
|
|
289
|
+
# Define the scenario
|
|
290
|
+
scenario_spec = ScenarioSpec(
|
|
291
|
+
class_name="helm.benchmark.scenarios.medbullets_scenario.MedBulletsFreeTextScenario",
|
|
292
|
+
args={},
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Define the adapter
|
|
296
|
+
adapter_spec = get_generation_adapter_spec(
|
|
297
|
+
instructions=(
|
|
298
|
+
"You are a helpful and highly knowledgeable AI assistant specializing in medicine. "
|
|
299
|
+
"Your task is to answer medical questions similar to those found on the USMLE Step 2/3 exams. "
|
|
300
|
+
"You will be provided with a clinical scenario, "
|
|
301
|
+
"and for each question, you must:\n"
|
|
302
|
+
"- Provide an answer to the question.\n"
|
|
303
|
+
"- Give a concise explanation for why that answer is correct, based on the clinical scenario provided."
|
|
304
|
+
),
|
|
305
|
+
input_noun="Clinical Scenario",
|
|
306
|
+
output_noun="Answer",
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
# Define the metrics
|
|
310
|
+
metric_specs = get_open_ended_generation_metric_specs()
|
|
311
|
+
|
|
312
|
+
# Return the RunSpec
|
|
313
|
+
return RunSpec(
|
|
314
|
+
name="medbullets-freetext",
|
|
315
|
+
scenario_spec=scenario_spec,
|
|
316
|
+
adapter_spec=adapter_spec,
|
|
317
|
+
metric_specs=metric_specs,
|
|
318
|
+
groups=["clinical", "medbullets-freetext"],
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
@run_spec_function("medalign")
|
|
323
|
+
def get_medalign_spec(max_length: int = 40000) -> RunSpec:
|
|
324
|
+
scenario_spec = ScenarioSpec(
|
|
325
|
+
class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario", args={"max_length": max_length}
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
adapter_spec = get_generation_adapter_spec(
|
|
329
|
+
instructions="",
|
|
330
|
+
input_noun=None,
|
|
331
|
+
newline_after_input_noun=False,
|
|
332
|
+
output_noun=None,
|
|
333
|
+
max_tokens=256,
|
|
334
|
+
stop_sequences=[],
|
|
335
|
+
max_train_instances=0,
|
|
336
|
+
)
|
|
337
|
+
|
|
338
|
+
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.medalign_annotator.MedalignAnnotator")]
|
|
339
|
+
|
|
340
|
+
metric_args = {
|
|
341
|
+
"task": "medalign",
|
|
342
|
+
"device": get_torch_device_name(),
|
|
343
|
+
"bertscore_model": "distilbert-base-uncased",
|
|
344
|
+
"rescale_with_baseline": False,
|
|
345
|
+
}
|
|
346
|
+
metric_specs = get_summarization_metric_specs(metric_args) + [
|
|
347
|
+
MetricSpec(class_name="helm.benchmark.metrics.medalign_metrics.MedalignMetric", args={})
|
|
348
|
+
]
|
|
349
|
+
|
|
350
|
+
return RunSpec(
|
|
351
|
+
name="medalign",
|
|
352
|
+
scenario_spec=scenario_spec,
|
|
353
|
+
adapter_spec=adapter_spec,
|
|
354
|
+
annotators=annotator_specs,
|
|
355
|
+
metric_specs=metric_specs,
|
|
356
|
+
groups=["medalign"],
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
@run_spec_function("shc_ptbm_med")
|
|
361
|
+
def get_shc_ptbm_spec() -> RunSpec:
|
|
362
|
+
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_ptbm_scenario.SHCPTBMMedScenario", args={})
|
|
363
|
+
|
|
364
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
365
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
366
|
+
instructions="Answer A or B.",
|
|
367
|
+
input_noun="",
|
|
368
|
+
output_noun="",
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
return RunSpec(
|
|
372
|
+
name="shc_ptbm_med",
|
|
373
|
+
scenario_spec=scenario_spec,
|
|
374
|
+
adapter_spec=adapter_spec,
|
|
375
|
+
metric_specs=get_exact_match_metric_specs(),
|
|
376
|
+
groups=["shc_ptbm_med"],
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
@run_spec_function("shc_sei_med")
|
|
381
|
+
def get_shc_sei_spec() -> RunSpec:
|
|
382
|
+
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_sei_scenario.SHCSEIMedScenario", args={})
|
|
383
|
+
|
|
384
|
+
adapter_spec = get_multiple_choice_adapter_spec(
|
|
385
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
386
|
+
instructions="Answer A or B.",
|
|
387
|
+
input_noun="",
|
|
388
|
+
output_noun="",
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
return RunSpec(
|
|
392
|
+
name="shc_sei_med",
|
|
393
|
+
scenario_spec=scenario_spec,
|
|
394
|
+
adapter_spec=adapter_spec,
|
|
395
|
+
metric_specs=get_exact_match_metric_specs(),
|
|
396
|
+
groups=["shc_sei_med"],
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
@run_spec_function("dischargeme")
|
|
401
|
+
def get_dischargeme_spec() -> RunSpec:
|
|
402
|
+
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario")
|
|
403
|
+
|
|
404
|
+
adapter_spec = get_generation_adapter_spec(
|
|
405
|
+
instructions=(
|
|
406
|
+
"Given a discharge text, a radiology report text, and a target "
|
|
407
|
+
"document of either discharge instructions or a brief hospital course, "
|
|
408
|
+
"return the generated target document from the context provided."
|
|
409
|
+
),
|
|
410
|
+
input_noun=None,
|
|
411
|
+
newline_after_input_noun=False,
|
|
412
|
+
output_noun="Answer",
|
|
413
|
+
max_tokens=300,
|
|
414
|
+
stop_sequences=[],
|
|
415
|
+
max_train_instances=0,
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.dischargeme_annotator.DischargeMeAnnotator")]
|
|
419
|
+
|
|
420
|
+
metric_args = {
|
|
421
|
+
"task": "dischargeme",
|
|
422
|
+
"device": get_torch_device_name(),
|
|
423
|
+
"bertscore_model": "distilbert-base-uncased",
|
|
424
|
+
"rescale_with_baseline": False,
|
|
425
|
+
}
|
|
426
|
+
metric_specs = get_summarization_metric_specs(metric_args) + [
|
|
427
|
+
MetricSpec(class_name="helm.benchmark.metrics.dischargeme_metrics.DischargeMeMetric", args={})
|
|
428
|
+
]
|
|
429
|
+
return RunSpec(
|
|
430
|
+
name="dischargeme",
|
|
431
|
+
scenario_spec=scenario_spec,
|
|
432
|
+
adapter_spec=adapter_spec,
|
|
433
|
+
annotators=annotator_specs,
|
|
434
|
+
metric_specs=metric_specs,
|
|
435
|
+
groups=["dischargeme"],
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
@run_spec_function("aci_bench")
|
|
440
|
+
def get_aci_bench_run_spec() -> RunSpec:
|
|
441
|
+
"""
|
|
442
|
+
RunSpec for the ACI-Bench dataset.
|
|
443
|
+
This configuration evaluates the model's ability to summarize
|
|
444
|
+
doctor-patient dialogues into structured clinical notes.
|
|
445
|
+
"""
|
|
446
|
+
# Define the scenario
|
|
447
|
+
scenario_spec = ScenarioSpec(
|
|
448
|
+
class_name="helm.benchmark.scenarios.aci_bench_scenario.ACIBenchScenario",
|
|
449
|
+
args={},
|
|
450
|
+
)
|
|
451
|
+
|
|
452
|
+
# Define the adapter
|
|
453
|
+
adapter_spec = get_generation_adapter_spec(
|
|
454
|
+
instructions=(
|
|
455
|
+
"Summarize the conversation to generate a clinical note with four sections:\n"
|
|
456
|
+
"1. HISTORY OF PRESENT ILLNESS\n"
|
|
457
|
+
"2. PHYSICAL EXAM\n"
|
|
458
|
+
"3. RESULTS\n"
|
|
459
|
+
"4. ASSESSMENT AND PLAN\n\n"
|
|
460
|
+
"The conversation is:"
|
|
461
|
+
),
|
|
462
|
+
input_noun="Conversation",
|
|
463
|
+
output_noun="Clinical Note",
|
|
464
|
+
max_tokens=768, # avg tokens in response is 618.9
|
|
465
|
+
max_train_instances=0,
|
|
466
|
+
stop_sequences=[],
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.aci_bench_annotator.ACIBenchAnnotator")]
|
|
470
|
+
|
|
471
|
+
# Define the metrics
|
|
472
|
+
metric_args = {
|
|
473
|
+
"task": "aci_bench",
|
|
474
|
+
"device": get_torch_device_name(),
|
|
475
|
+
"bertscore_model": "distilbert-base-uncased",
|
|
476
|
+
"rescale_with_baseline": False,
|
|
477
|
+
}
|
|
478
|
+
metric_specs = get_summarization_metric_specs(metric_args) + [
|
|
479
|
+
MetricSpec(class_name="helm.benchmark.metrics.aci_bench_metrics.ACIBenchMetric", args={})
|
|
480
|
+
]
|
|
481
|
+
|
|
482
|
+
# Return the RunSpec
|
|
483
|
+
return RunSpec(
|
|
484
|
+
name="aci_bench",
|
|
485
|
+
scenario_spec=scenario_spec,
|
|
486
|
+
adapter_spec=adapter_spec,
|
|
487
|
+
annotators=annotator_specs,
|
|
488
|
+
metric_specs=metric_specs,
|
|
489
|
+
groups=["clinical", "aci_bench"],
|
|
490
|
+
)

@run_spec_function("mtsamples_procedures")
def get_mtsamples_procedures_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.mtsamples_procedures_scenario.MTSamplesProceduresScenario"
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="Here are information about a patient, return a reasonable treatment plan for the patient.",
        input_noun="Patient Notes",
        newline_after_input_noun=False,
        output_noun="Answer",
        max_tokens=512,
        max_train_instances=0,
        stop_sequences=[],
    )

    annotator_specs = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.mtsamples_procedures_annotator.MTSamplesProceduresAnnotator"
        )
    ]

    metric_args = {
        "task": "mtsamples_procedures",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }

    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.mtsamples_procedures_metrics.MTSamplesProceduresMetric", args={})
    ]

    return RunSpec(
        name="mtsamples_procedures",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["mtsamples_procedures"],
    )

@run_spec_function("mimic_rrs")
def get_mimic_rrs_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario", args={})

    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "Generate the impression section of the radiology report based on its findings. "
            "This will not be used to diagnose nor treat any patients. Be as concise as possible."
        ),
        input_noun="Findings",
        output_noun="Impression",
        newline_after_input_noun=True,
        newline_after_output_noun=True,
        max_tokens=128,
        max_train_instances=0,
        stop_sequences=[],
    )
    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_rrs_annotator.MIMICRRSAnnotator")]

    metric_args = {
        "task": "mimic_rrs",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.mimic_rrs_metrics.MIMICRRSMetric", args={})
    ]
    return RunSpec(
        name="mimic_rrs",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["mimic_rrs"],
    )

@run_spec_function("mimic_bhc")
def get_mimic_bhc_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario", args={})

    adapter_spec = get_generation_adapter_spec(
        instructions=("Summarize the clinical note into a brief hospital course."),
        input_noun="Clinical Note",
        output_noun="Brief Hospital Course",
        newline_after_input_noun=True,
        newline_after_output_noun=True,
        max_tokens=1024,
        max_train_instances=0,
        stop_sequences=[],
    )
    metric_args = {
        "task": "mimic_bhc",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    return RunSpec(
        name="mimic_bhc",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_summarization_metric_specs(metric_args),
        groups=["mimic_bhc"],
    )
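Unlike its neighbors, mimic_bhc registers no annotators and scores with the summarization metrics alone. For orientation (editorial, not part of the diff), a RunSpec returned by any of these functions is a plain record whose fields can be inspected directly; the import path below is an assumption about where this module lives in the wheel.

# Hypothetical usage; the module path is an assumption.
from helm.benchmark.run_specs.medhelm_run_specs import get_mimic_bhc_spec

spec = get_mimic_bhc_spec()
print(spec.name)                     # mimic_bhc
print(spec.adapter_spec.max_tokens)  # 1024
print(len(spec.metric_specs))        # number of summarization metrics configured above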

@run_spec_function("chw_care_plan")
def get_chw_care_plan_run_spec() -> RunSpec:
    """
    RunSpec for the chw_care_plan dataset.
    This configuration evaluates the model's ability to convert
    a patient note into a structured care plan format.
    """
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.chw_care_plan_scenario.CHWCarePlanScenario",
        args={},
    )

    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "Follow the instructions provided regarding conversion of a patient note into a specified format."
        ),
        input_noun="",
        output_noun="",
        max_tokens=768,
        max_train_instances=0,
        stop_sequences=[],
    )
    annotator_specs = [
        AnnotatorSpec(class_name="helm.benchmark.annotation.chw_care_plan_annotator.CHWCarePlanAnnotator")
    ]

    metric_args = {
        "task": "chw_care_plan",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.chw_care_plan_metrics.CHWCarePlanMetric", args={})
    ]
    # Return the RunSpec
    return RunSpec(
        name="chw_care_plan",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["clinical", "chw_care_plan"],
    )

@run_spec_function("medication_qa")
def get_medication_qa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

    adapter_spec = get_generation_adapter_spec(
        instructions="Please answer the following consumer health question.",
        input_noun="Question",
        output_noun="Answer",
        max_train_instances=0,
        max_tokens=512,
        stop_sequences=[],
    )
    annotator_specs = [
        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
    ]
    metric_args = {
        "task": "medication_qa",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAMetric", args={})
    ]
    return RunSpec(
        name="medication_qa",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["medication_qa"],
    )

@run_spec_function("starr_patient_instructions")
def get_starr_patient_instructions_run_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
        args={},
    )

    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "You are a medical professional tasked with generating personalized post-procedure "
            "patient instructions. Given the following case details which include the patient's "
            "diagnosis, the planned procedure, the history & physical note, and the operative report, "
            "generate clear and actionable instructions for the patient to follow after their procedure. "
            "Don't worry, this information will not be used for any clinical decision making. "
            "This will not be used to diagnose nor treat any patients."
        ),
        input_noun="Case Details",
        output_noun="Patient Instructions",
        max_tokens=256,
        max_train_instances=0,
        stop_sequences=[],
    )
    annotator_specs = [
        AnnotatorSpec(
            class_name=(
                "helm.benchmark.annotation.starr_patient_instructions_annotator.StarrPatientInstructionsAnnotator"
            )
        )
    ]

    metric_args = {
        "task": "starr_patient_instructions",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = (
        get_summarization_metric_specs(metric_args)
        + [
            MetricSpec(
                class_name="helm.benchmark.metrics.starr_patient_instructions_metrics.StarrPatientInstructionsMetric",
                args={},
            )
        ]
        + get_basic_metric_specs([])
    )
    return RunSpec(
        name="starr_patient_instructions",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["starr_patient_instructions"],
    )

@run_spec_function("med_dialog")
def get_med_dialog_spec(subset: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="Generate a one sentence summary of this patient-doctor conversation.",
        input_noun="Patient-Doctor",
        output_noun="Summary",
        max_tokens=80,
        max_train_instances=0,
    )
    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator")]

    metric_args = {
        "task": "med_dialog",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.med_dialog_metrics.MedDialogMetric", args={})
    ]
    return RunSpec(
        name=f"med_dialog,subset={subset}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["med_dialog"],
    )
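Because get_med_dialog_spec takes a subset argument, the run entry carries that parameter and it is echoed into the RunSpec name. A small sketch (editorial, not part of the diff; the import path and the subset value are assumptions, MedDialog is commonly split into HealthCareMagic and iCliniq subsets):

# Hypothetical usage; module path and subset value are assumptions.
from helm.benchmark.run_specs.medhelm_run_specs import get_med_dialog_spec

run_spec = get_med_dialog_spec(subset="healthcaremagic")
print(run_spec.name)    # med_dialog,subset=healthcaremagic
print(run_spec.groups)  # ['med_dialog']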

@run_spec_function("shc_conf_med")
def get_shc_conf_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_conf_scenario.SHCCONFMedScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A or B.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_conf_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_conf_med"],
    )

@run_spec_function("medi_qa")
def get_medi_qa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})

    adapter_spec = get_generation_adapter_spec(
        instructions="Answer the following consumer health question.",
        input_noun="Question",
        output_noun="Answer",
        max_tokens=1024,
        max_train_instances=0,
        stop_sequences=[],
    )
    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.medi_qa_annotator.MediQAAnnotator")]

    metric_args = {
        "task": "medi_qa",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.medi_qa_metrics.MediQAMetric", args={})
    ]
    return RunSpec(
        name="medi_qa",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["medi_qa"],
    )

@run_spec_function("mental_health")
def get_mental_health_spec() -> RunSpec:
    """
    Returns the run specification for the mental health counseling scenario.
    This scenario evaluates a model's ability to generate appropriate counseling responses
    in mental health conversations.
    """
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mental_health_scenario.MentalHealthScenario")

    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "Given a mental health conversation history, generate an empathetic and appropriate counselor response."
        ),
        input_noun=None,  # No specific input noun needed as format is defined in scenario
        newline_after_input_noun=False,
        output_noun="Counselor response",
        max_tokens=512,
    )
    annotator_specs = [
        AnnotatorSpec(class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator")
    ]

    metric_args = {
        "task": "mental_health",
        "device": get_torch_device_name(),
        "bertscore_model": "distilbert-base-uncased",
        "rescale_with_baseline": False,
    }
    metric_specs = get_summarization_metric_specs(metric_args) + [
        MetricSpec(class_name="helm.benchmark.metrics.mental_health_metrics.MentalHealthMetric", args={})
    ]

    return RunSpec(
        name="mental_health",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["mental_health"],
    )

@run_spec_function("pubmed_qa")
def get_pubmed_qa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A for yes, B for no or C for maybe.",
        input_noun="Question",
        output_noun="Answer",
        max_train_instances=0,
    )

    return RunSpec(
        name="pubmed_qa",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["pubmed_qa"],
    )
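For the multiple-choice run specs in this file, ADAPT_MULTIPLE_CHOICE_JOINT lists all answer options in a single prompt and asks for a letter, which is why get_exact_match_metric_specs() suffices for scoring. A PubMedQA request looks roughly like the sketch below (editorial, not part of the diff; the exact layout is produced by HELM's multiple-choice joint adapter):

# Approximate prompt shape only; placeholder text stands in for the real instance.
example_pubmed_qa_prompt = (
    "Answer A for yes, B for no or C for maybe.\n\n"
    "Question: <abstract and question text>\n"
    "A. Yes\n"
    "B. No\n"
    "C. Maybe\n"
    "Answer:"
)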

@run_spec_function("ehr_sql")
def get_ehr_sql_run_spec() -> RunSpec:
    """
    RunSpec for the EHR SQL dataset.
    This configuration evaluates the model's ability to generate accurate SQL queries from natural language questions.
    """

    # Define the scenario
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.ehr_sql_scenario.EhrSqlScenario",
        args={},
    )

    # Define the adapter
    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "You are a highly skilled AI specializing in medical SQL queries. "
            "Given a database schema and a medical question, generate a valid SQL query "
            "that retrieves the required information from the database. "
            "Output only the SQL query without explanations.\n\n"
            "Input: A database schema followed by a natural language question.\n"
            "Output: A valid SQL query ending with ;. Only return SQL query, don't add additional text.\n\n"
            "If the question is unanswerable, return an empty string without additional text or comments."
        ),
        input_noun="Medical Question + Schema",
        output_noun="SQL Query",
        max_tokens=1024,
        temperature=0,
        max_train_instances=0,
        stop_sequences=[],
    )

    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.ehr_sql_annotator.EhrSqlAnnotator")]

    # Define the metrics
    metric_specs = [
        MetricSpec(class_name="helm.benchmark.metrics.ehr_sql_metrics.EhrSqlMetric", args={})
    ] + get_exact_match_metric_specs()

    # Return the RunSpec
    return RunSpec(
        name="ehr_sql",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["ehr_sql"],
    )
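Per the instructions above, a compliant completion is a bare SQL statement ending in a semicolon, or an empty string when the question is unanswerable; the EhrSqlAnnotator and EhrSqlMetric registered here then evaluate it against the reference. An illustrative completion only (editorial, not part of the diff, and not drawn from the dataset):

# Example of the kind of output the adapter asks for; purely illustrative.
example_ehr_sql_completion = "SELECT COUNT(*) FROM admissions WHERE admission_type = 'EMERGENCY';"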

@run_spec_function("shc_bmt_med")
def get_shc_bmt_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A or B.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_bmt_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_bmt_med"],
    )

@run_spec_function("race_based_med")
def get_race_based_med_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.race_based_med_scenario.RaceBasedMedScenario", args={}
    )

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A for yes, B for no.",
        input_noun=None,
        output_noun="Respond with only 'A' for yes or 'B' for no. Do not add any other text, punctuation, or symbols",
        max_train_instances=0,
    )

    return RunSpec(
        name="race_based_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["race_based_med"],
    )

@run_spec_function("medhallu")
def get_medhallu_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medhallu_scenario.MedHalluScenario", args={})

    adapter_spec = get_generation_adapter_spec(
        instructions="""You are an AI assistant with extensive knowledge in the field of medicine. I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.

You are trying to determine if there is a factual contradiction between the answer and the world knowledge. Some information in the answer might be fabricated. Answer '0' if the answer is factual and '1' if the answer is hallucinated. You can categorize the answer as hallucinated if it falls in either of the category:
Type: Misinterpretation of #Question#: These are hallucinated answers that misunderstands the question, leading to an off-topic or irrelevant response.
Type: Incomplete Information: These are hallucinated answers that Point out what is not true without providing correct information.
Type: Mechanism and Pathway Misattribution - These are hallucinated answer that falsely attribution of biological mechanisms, molecular pathways, or disease processes that contradicts established medical knowledge
Type: Methodological and Evidence Fabrication - Inventing false research methods, statistical data, or specific clinical outcomes

Do not return anything else, just the answer.
Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""",  # noqa: E501
        input_noun=None,
        output_noun=(
            """Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated.
No letter or word, just the integer value.

Your Judgment"""  # noqa: E501
        ),
        max_train_instances=0,
    )

    return RunSpec(
        name="medhallu",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["medhallu"],
    )

@run_spec_function("n2c2_ct_matching")
def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.n2c2_ct_matching_scenario.N2C2CTMatchingScenario",
        args={"subject": subject},
    )

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A for yes, B for no.",
        input_noun="",
        output_noun="Answer A for yes, B for no",
        max_train_instances=0,
    )

    return RunSpec(
        name=f"n2c2_ct_matching:subject={subject}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["n2c2_ct_matching"],
    )
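Like med_dialog, this run spec is parametrized: the clinical-trial matching criterion travels in the run entry and shows up in the RunSpec name. A small sketch (editorial, not part of the diff; the import path and the subject value are assumptions):

# Hypothetical usage; module path and subject value are assumptions.
from helm.benchmark.run_specs.medhelm_run_specs import get_n2c2_ct_matching_spec

run_spec = get_n2c2_ct_matching_spec(subject="ABDOMINAL")
print(run_spec.name)  # n2c2_ct_matching:subject=ABDOMINAL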

@run_spec_function("shc_gip_med")
def get_shc_gip_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_gip_scenario.SHCGIPMedScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A or B.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_gip_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_gip_med"],
    )

@run_spec_function("mimiciv_billing_code")
def get_mimiciv_billing_code_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.mimiciv_billing_code_scenario.MIMICIVBillingCodeScenario",
        args={
            "data_file": "/share/pi/nigam/data/medhelm/mimiciv_billing_codes/mimiciv_icd10.feather",
        },
    )
    adapter_spec = get_generation_adapter_spec(
        instructions="Given the following clinical note, identify all relevant ICD-10 codes.",
        input_noun="Note",
        output_noun="Predicted ICD-10 Codes",
        newline_after_input_noun=True,
        newline_after_output_noun=True,
        max_tokens=256,
        max_train_instances=0,
        stop_sequences=[],
    )
    # Define the metrics
    metric_specs = [
        MetricSpec(
            class_name="helm.benchmark.metrics.mimiciv_billing_code_metrics.MIMICIVBillingCodeMetric",
            args={},
        )
    ] + get_generic_metric_specs()

    # Return the RunSpec
    return RunSpec(
        name="mimiciv_billing_code",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=["mimiciv_billing_code"],
    )
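Note that this scenario hard-codes data_file to a path on one shared filesystem, so the run only works where that MIMIC-IV extract is staged. As an illustrative sketch (editorial, not part of the diff; the local path below is a placeholder), the same scenario can be pointed elsewhere by constructing the ScenarioSpec with a different argument:

# Hypothetical override; the path is a placeholder, not a real file.
local_mimiciv_scenario = ScenarioSpec(
    class_name="helm.benchmark.scenarios.mimiciv_billing_code_scenario.MIMICIVBillingCodeScenario",
    args={"data_file": "/path/to/local/mimiciv_icd10.feather"},
)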

@run_spec_function("shc_sequoia_med")
def get_shc_sequoia_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={}
    )

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A or B.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_sequoia_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_sequoia_med"],
    )

@run_spec_function("shc_cdi_med")
def get_shc_cdi_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCCDIMedScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A or B.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_cdi_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_cdi_med"],
    )

@run_spec_function("shc_ent_med")
def get_shc_ent_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_ent_scenario.SHCENTMedScenario", args={})

    adapter_spec = get_multiple_choice_adapter_spec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions="Answer A, B, or C.",
        input_noun="",
        output_noun="",
    )

    return RunSpec(
        name="shc_ent_med",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["shc_ent_med"],
    )