crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
This release has been flagged as potentially problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/annotation/live_qa_annotator.py
@@ -5,7 +5,7 @@ from typing import Any
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.benchmark.annotation.
+from helm.benchmark.annotation.model_as_judge import score_with_reasoning
 from helm.clients.auto_client import AutoClient

 PROMPT_TEMPLATE = """You are a medical judge will be given a user request and a model response to that request.
@@ -28,11 +28,16 @@ Determine if the model is correct with respect to one or more of the <correct_re
 # 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
 # 0.3, if the model response contains some correct information but is incomplete and/or incorrect
 # 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
-
+Please output your one-sentence concise reasoning within the "reasoning" tags and your score within the "score" tags.
 Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
-
-
-
+Only output a tag-delimited object with the following format:
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<score>
+INSERT_YOUR_SCORE_HERE
+</score>
+"""


 class LiveQAAnnotator(Annotator):
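
The updated LiveQA judge prompt asks the grader to wrap its output in <reasoning> and <score> tags. A minimal sketch of how such tagged output could be parsed downstream (illustrative only; this helper is not part of the package, and HELM's own parsing may differ):

import re
from typing import Optional, Tuple


def parse_tagged_judgment(completion: str) -> Tuple[Optional[str], Optional[float]]:
    """Extract the <reasoning> and <score> values from a judge completion."""
    reasoning_match = re.search(r"<reasoning>(.*?)</reasoning>", completion, re.DOTALL)
    score_match = re.search(r"<score>(.*?)</score>", completion, re.DOTALL)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else None
    try:
        score = float(score_match.group(1).strip()) if score_match else None
    except ValueError:
        score = None  # the judge emitted a non-numeric score
    return reasoning, score


# Example: parse_tagged_judgment("<reasoning>Matches gold.</reasoning><score>1</score>")
# returns ("Matches gold.", 1.0).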

helm/benchmark/annotation/med_dialog_annotator.py (new file)
@@ -0,0 +1,99 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of medical dialogue summaries.
+
+Your goal is to assess how well the summary captures the patient-doctor conversation,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The patient-doctor conversation will be provided in these tags:
+<conversation>
+{{QUESTION}}
+</conversation>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> and compare it to the <gold_response> when needed.
+
+Please evaluate the generated summary on a scale of 1-5 (1 = poor, 5 = excellent)
+for each of these three key dimensions:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the summary correctly capture the main medical issue and clinical details from the conversation?
+
+Completeness (1-5)
+- Does the summary include all important medical information from the conversation?
+
+Clarity (1-5)
+- Is the summary easy to understand for clinical use?
+
+Output Format:
+Output your evaluation as a single valid JSON object matching the following structure:
+{
+  "accuracy": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  },
+  "completeness": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  },
+  "clarity": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedDialogAnnotator(LLMAsJuryAnnotator):
+    """The MedDialog autograder."""
+
+    name = "med_dialog"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
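
Several of the new MedHELM annotators in this diff (medalign, medi_qa, medication_qa, mental_health, and others listed above) follow the pattern shown here: a prompt template that asks the judge for a JSON object scoring accuracy, completeness, and clarity, an ANNOTATION_CRITERIA dict naming the required keys, and a three-model jury (GPT-4o, Llama 3.3 70B Instruct, Claude 3.7 Sonnet) served through stanfordhealthcare deployments. A rough sketch of how such jury output might be validated and combined (the mean aggregation below is an assumption for illustration, not necessarily what LLMAsJuryAnnotator actually does):

import json
from statistics import mean
from typing import Dict, Set

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


def validate_annotation(raw_json: str) -> Dict[str, dict]:
    """Parse one judge's reply and check that every criterion has the required fields."""
    parsed = json.loads(raw_json)
    for criterion, fields in ANNOTATION_CRITERIA.items():
        missing = fields - set(parsed.get(criterion, {}))
        if missing:
            raise ValueError(f"{criterion} is missing fields: {missing}")
    return parsed


def aggregate_jury(replies: Dict[str, str]) -> Dict[str, float]:
    """Average each criterion's score across the jury models (illustrative aggregation)."""
    parsed = {judge: validate_annotation(raw) for judge, raw in replies.items()}
    return {
        criterion: mean(judgment[criterion]["score"] for judgment in parsed.values())
        for criterion in ANNOTATION_CRITERIA
    }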

helm/benchmark/annotation/medalign_annotator.py (new file)
@@ -0,0 +1,100 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert responsible for evaluating the quality of a
+response to a clinical instruction, given an associated Electronic Health Record (EHR).
+
+Your goal is to assess how well the response interprets the EHR, follows clinical best practices,
+and aligns with the gold response in terms of accuracy, completeness, and clarity.
+
+The instruction and EHR pair will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The gold response (reference answer) will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> based on the <user_request> and compare it to
+the <gold_response> when needed.
+For each category below, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent)
+and provide a brief explanation for your score:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the response provide correct medical information based on the EHR and clinical guidelines?
+
+Completeness (1-5)
+- Does the response include all important patient information and address the clinical question?
+
+Clarity (1-5)
+- Is the response written clearly so clinicians can easily understand it?
+
+Output Format:
+Generate a valid JSON object with your evaluation:
+{
+  "accuracy": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  },
+  "completeness": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  },
+  "clarity": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedalignAnnotator(LLMAsJuryAnnotator):
+    """The Medalign autograder."""
+
+    name = "medalign"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )

helm/benchmark/annotation/medi_qa_annotator.py (new file)
@@ -0,0 +1,98 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
+response to a consumer health question.
+Your goal is to assess how well the generated response answers the medical question,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The question will be provided in these tags:
+<question>
+{{QUESTION}}
+</question>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference answer will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response> compared to the <gold_response> and the original <question>.
+For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+and provide a short justification for your score.
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Is the medical information correct?
+
+Completeness (1-5)
+- Does the response fully answer the question and include important medical details?
+
+Clarity (1-5)
+- Is the response easy for consumers to understand?
+
+Output Format:
+Output the evaluation as a single valid JSON object matching the following structure:
+{
+  "accuracy": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  },
+  "completeness": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  },
+  "clarity": {
+    "score": 0,
+    "explanation": "Explain why this score was given."
+  }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MediQAAnnotator(LLMAsJuryAnnotator):
+    """The MediQA autograder."""
+
+    name = "medi_qa"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )

helm/benchmark/annotation/medication_qa_annotator.py
@@ -1,68 +1,97 @@
-import
-
-from
-
-from helm.common.general import ensure_directory_exists
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.annotation.annotator import Annotator
-from helm.benchmark.annotation.model_as_judge import score_with_reasoning
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
 from helm.clients.auto_client import AutoClient

-
-
-
-
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating responses to consumer medication questions.
+
+Your goal is to assess how well the response captures the information asked,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The question provided in these tags:
+<medication_question>
 {{QUESTION}}
-</
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+</medication_question>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> and compare it to the <gold_response> when needed.
+Please evaluate the response on a scale of 1-5 (1 = poor, 5 = excellent) for each of these three key dimensions:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Is the information about the medication factually correct?
+
+Completeness (1-5)
+- Does the response fully answer the question, covering all important points?
+
+Clarity (1-5)
+- Is the response easy for a consumer to understand?
+
+Output Format:
+Output your evaluation as a single valid JSON object matching the following structure:
+{
+  "accuracy": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  },
+  "completeness": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  },
+  "clarity": {
+    "score": 0,
+    "explanation": "Brief explanation of why this score was given."
+  }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedicationQAAnnotator(LLMAsJuryAnnotator):
+    """The MedicationQA autograder."""

     name = "medication_qa"

-    def __init__(self, auto_client: AutoClient,
-
-
-
-
-
-
-    def annotate(self, request_state: RequestState) -> Any:
-        assert request_state.result
-        assert len(request_state.result.completions) == 1
-        model_input_text = request_state.request.prompt
-        model_output_text = request_state.result.completions[0].text
-        if not model_output_text.strip():
-            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
-        correct_response = request_state.instance.references[0].output.text
-        annotator_prompt = (
-            PROMPT_TEMPLATE.strip()
-            .replace("{{QUESTION}}", model_input_text)
-            .replace("{{ANSWER}}", model_output_text)
-            .replace("{{CORRECT_RESPONSE}}", correct_response)
-        )
-        result = score_with_reasoning(
-            self._auto_client, annotator_prompt, "openai/gpt-4o-2024-05-13", "openai/gpt-4o-2024-05-13"
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
         )
-        reasoning = result["reasoning"]
-        score = result["score"]
-        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
@@ -0,0 +1,98 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are a mental health expert tasked with evaluating the quality
+ of a counselor response in a therapy conversation.
+ Your goal is to assess how well the generated response demonstrates therapeutic skills,
+ and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+ The conversation history and question will be provided in these tags:
+ <conversation>
+ {{QUESTION}}
+ </conversation>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ The reference response will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully analyze the <response> compared to the <gold_response> and the original <conversation>.
+ For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+ and provide a short justification for your score.
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Does the response use appropriate therapeutic techniques and show understanding of the client's concerns?
+
+ Completeness (1-5)
+ - Does the response address the client's key concerns and include important context from the conversation?
+
+ Clarity (1-5)
+ - Is the response easy to understand and communicated in an empathetic, therapeutic way?
+
+ Output Format:
+ Output the evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class MentalHealthAnnotator(LLMAsJuryAnnotator):
+     """The MentalHealth autograder."""
+
+     name = "mental_health"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
+         )
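The MentalHealth prompt above addresses the judged conversation through the {{QUESTION}}, {{RESPONSE}}, and {{GOLD_RESPONSE}} placeholders. How LLMAsJuryAnnotator performs the substitution is not shown in this diff; the removed MedicationQA code earlier in this diff used plain str.replace, and the sketch below assumes the same mechanism. build_judge_prompt and the toy template are illustrative only.

# Toy template with the same placeholder scheme as the MentalHealth prompt above.
PROMPT_TEMPLATE = """<conversation>
{{QUESTION}}
</conversation>
<response>
{{RESPONSE}}
</response>
<gold_response>
{{GOLD_RESPONSE}}
</gold_response>"""


def build_judge_prompt(question: str, response: str, gold_response: str) -> str:
    # Assumed mechanism: plain str.replace, as in the removed MedicationQA code.
    return (
        PROMPT_TEMPLATE.strip()
        .replace("{{QUESTION}}", question)
        .replace("{{RESPONSE}}", response)
        .replace("{{GOLD_RESPONSE}}", gold_response)
    )


if __name__ == "__main__":
    print(
        build_judge_prompt(
            "I feel anxious before every shift at work.",
            "That sounds exhausting. What usually runs through your mind beforehand?",
            "Validate the feeling first, then explore the triggers together.",
        )
    )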
@@ -0,0 +1,100 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are tasked with evaluating the quality of the generated impression section
+ of a radiology report based on the provided findings.
+ Your goal is to assess how well the impression section captures all the clinical findings and
+ how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+ The user's request will be provided in these tags:
+ <user_request>
+ {{QUESTION}}
+ </user_request>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ Some potential correct responses will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully analyze the <response>.
+ For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+ and provide a short justification for your score.
+
+ Your evaluation should focus on the following criteria:
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Does the impression correctly reflect the key findings from the radiology report?
+
+ Completeness (1-5)
+ - Does the impression include all important findings and address the clinical question?
+
+ Clarity (1-5)
+ - Is the impression easy for referring clinicians to understand?
+
+ Output Format:
+ Output the evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class MIMICRRSAnnotator(LLMAsJuryAnnotator):
+     """The MIMICRRS autograder."""
+
+     name = "mimic_rrs"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
+         )
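All three new annotators send the same prompt to the gpt, llama, and claude judges declared in ANNOTATOR_MODELS. The diff does not show how LLMAsJuryAnnotator combines their verdicts, so the sketch below assumes a simple per-criterion mean; mean_scores_by_criterion and the sample verdicts are hypothetical, not HELM APIs.

# Hypothetical sketch: averaging per-criterion 1-5 ratings across the jury.
from statistics import mean
from typing import Dict

# judge name -> criterion -> {"score": int, "explanation": str}
JudgeVerdicts = Dict[str, Dict[str, dict]]


def mean_scores_by_criterion(verdicts: JudgeVerdicts) -> Dict[str, float]:
    """Collapse the jury's ratings into one mean score per criterion (assumed aggregation)."""
    criteria = {criterion for verdict in verdicts.values() for criterion in verdict}
    return {
        criterion: mean(
            verdict[criterion]["score"] for verdict in verdicts.values() if criterion in verdict
        )
        for criterion in sorted(criteria)
    }


if __name__ == "__main__":
    verdicts = {
        "gpt": {"accuracy": {"score": 4, "explanation": "..."}, "clarity": {"score": 5, "explanation": "..."}},
        "llama": {"accuracy": {"score": 3, "explanation": "..."}, "clarity": {"score": 4, "explanation": "..."}},
        "claude": {"accuracy": {"score": 4, "explanation": "..."}, "clarity": {"score": 4, "explanation": "..."}},
    }
    print(mean_scores_by_criterion(verdicts))  # e.g. {'accuracy': 3.666..., 'clarity': 4.333...}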