crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/annotation/chw_care_plan_annotator.py
@@ -0,0 +1,98 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
+generated response of a clinical scenario.
+Your goal is to assess how well the generated response captures the necessary information and
+how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response>.
+For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+and provide a short justification for your score.
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Is all the information in the response factually correct?
+
+Completeness (1-5)
+- Does the response include all necessary information from the gold response?
+
+Clarity (1-5)
+- Is the response easy to understand for a clinician?
+
+Output Format:
+Output the evaluation as a single valid JSON object matching the following structure:
+{
+"accuracy": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"completeness": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"clarity": {
+"score": 0,
+"explanation": "Explain why this score was given."
+}
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class CHWCarePlanAnnotator(LLMAsJuryAnnotator):
+    """The CHWCarePlan autograder."""
+
+    name = "chw_care_plan"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
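The `chw_care_plan_annotator.py` module above, like several other new annotators in this release, follows an LLM-as-jury pattern: the `{{QUESTION}}`, `{{RESPONSE}}`, and `{{GOLD_RESPONSE}}` placeholders are substituted into `PROMPT_TEMPLATE`, each jury model returns a JSON verdict, and the verdict is checked against `ANNOTATION_CRITERIA`. The snippet below is a minimal standalone sketch of that fill-and-validate step, not the HELM implementation; `fill_prompt` and `validate_verdict` are hypothetical helper names and the shortened template is made up for illustration.

```python
import json
from typing import Dict, Set

# Hypothetical, abbreviated stand-in for the full template shown in the diff above.
PROMPT_TEMPLATE = (
    "<user_request>{{QUESTION}}</user_request> "
    "<response>{{RESPONSE}}</response> "
    "<gold_response>{{GOLD_RESPONSE}}</gold_response>"
)

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


def fill_prompt(question: str, response: str, gold: str) -> str:
    """Substitute the double-brace placeholders used by the template."""
    return (
        PROMPT_TEMPLATE.replace("{{QUESTION}}", question)
        .replace("{{RESPONSE}}", response)
        .replace("{{GOLD_RESPONSE}}", gold)
    )


def validate_verdict(raw_json: str) -> Dict[str, Dict[str, object]]:
    """Parse a judge's JSON verdict and check every criterion has the expected fields."""
    verdict = json.loads(raw_json)
    for criterion, fields in ANNOTATION_CRITERIA.items():
        if criterion not in verdict or not fields.issubset(verdict[criterion]):
            raise ValueError(f"Malformed verdict for criterion: {criterion}")
    return verdict


print(fill_prompt("Write a care plan.", "Generated plan...", "Gold plan..."))
print(validate_verdict(
    '{"accuracy": {"score": 4, "explanation": "ok"},'
    ' "completeness": {"score": 3, "explanation": "ok"},'
    ' "clarity": {"score": 5, "explanation": "ok"}}'
))
```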
helm/benchmark/annotation/czech_bank_qa_annotator.py
@@ -0,0 +1,78 @@
+import os
+import sqlite3
+import threading
+from typing import Any, Optional, Tuple
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class CzechBankQAAnnotator(Annotator):
+    """The CzechBankQA autograder.
+
+    MUST BE RUN WITH --num-threads 1 FOR SOME REASON"""
+
+    name = "czech_bank_qa"
+
+    DATABASE_SOURCE_URL = (
+        "https://huggingface.co/datasets/yifanmai/czech_bank_qa/resolve/main/czech_bank.db?download=true"
+    )
+
+    def __init__(self, file_storage_path: str):
+        super().__init__()
+
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        file_name = "czech_bank.db"
+        file_path = os.path.join(cache_dir, file_name)
+        ensure_file_downloaded(source_url=CzechBankQAAnnotator.DATABASE_SOURCE_URL, target_path=file_path)
+        database = sqlite3.connect(file_path)
+
+        # csv_files_dir = "/home/yifanmai/oss/helm-scenarios/1999-czech-bank"
+        # # table_name_to_file_name = {
+        # # "account": "account.csv",
+        # # "client": "client.csv",
+        # # "disposition": "disp.csv",
+        # # "permenant_order": "order.csv",
+        # # "transaction": "trans.csv",
+        # # "loan": "loan.csv",
+        # # "credit_card": "card.csv",
+        # # "demographic_data": "district.csv"
+        # # }
+        # for file_name in os.listdir(csv_files_dir):
+        # file_path = os.path.join(csv_files_dir, file_name)
+        # df = pd.read_csv(file_path)
+        # table_name = file_name.removesuffix(".csv")
+        # df.to_sql(table_name, database, if_exists="append", index=False)
+        # print("Commited to SQL")
+        # # df.to_sql(table_name, conn, if_exists='append', index=False)
+
+        self.database = database
+        self.lock = threading.Lock()
+
+    def get_result(self, query: str) -> Tuple[Optional[str], Optional[str]]:
+        result: Optional[str] = None
+        error: Optional[str] = None
+        try:
+            cursor = self.database.cursor()
+            cursor.execute("PRAGMA query_only = TRUE")
+            cursor.execute(query)
+            result = str(cursor.fetchall())
+            cursor.close()
+        except (sqlite3.DatabaseError, sqlite3.Warning) as e:
+            error = str(e)
+        return (result, error)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+
+        assert len(request_state.instance.references) == 1
+        gold_query = request_state.instance.references[0].output.text
+        query = request_state.result.completions[0].text
+        query = query.replace("```sql", "").replace("```", "")
+        result, error = self.get_result(query)
+        gold_result, gold_error = self.get_result(gold_query)
+
+        return {"query": query, "result": result, "error": error, "gold_result": gold_result, "gold_error": gold_error}
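`CzechBankQAAnnotator` compares the generated SQL against the gold SQL by executing both on a local SQLite copy of the Czech bank database, with `PRAGMA query_only = TRUE` preventing the model's query from modifying the data. Below is a minimal standalone sketch of that read-only execution pattern; it uses an in-memory database and a made-up `account` table, not the actual dataset.

```python
import sqlite3
from typing import Optional, Tuple


def run_read_only(conn: sqlite3.Connection, query: str) -> Tuple[Optional[str], Optional[str]]:
    """Execute a query with writes disabled; return (result, error), one of which is None."""
    result: Optional[str] = None
    error: Optional[str] = None
    try:
        cursor = conn.cursor()
        cursor.execute("PRAGMA query_only = TRUE")  # reject INSERT/UPDATE/DELETE on this connection
        cursor.execute(query)
        result = str(cursor.fetchall())
        cursor.close()
    except (sqlite3.DatabaseError, sqlite3.Warning) as e:
        error = str(e)
    return result, error


# Toy usage with an in-memory database and a hypothetical table.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE account (id INTEGER, balance REAL)")
conn.execute("INSERT INTO account VALUES (1, 100.0)")
print(run_read_only(conn, "SELECT balance FROM account"))  # ('[(100.0,)]', None)
print(run_read_only(conn, "DELETE FROM account"))          # (None, 'attempt to write a readonly database') or similar
```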
helm/benchmark/annotation/dischargeme_annotator.py
@@ -0,0 +1,107 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert responsible for evaluating a hospital document.
+The task requires generating either discharge instructions or a brief hospital course based
+on the provided discharge summary and radiology report.
+
+Your goal is to assess whether the generated text is clinically accurate, complete, and clear
+for the intended document type. The evaluation should ensure the document aligns with the
+gold response in terms of accuracy, completeness, and clarity.
+
+The target task of either generating a discharge instruction or brief hospital course along with
+the patient discharge text and radiology report will be provided in these tags:
+<patient_information>
+{{QUESTION}}
+</patient_information>
+
+
+The document will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The gold standard target document (either discharge instructions or a brief hospital course)
+will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response> based on the <patient_information> and compare
+it to the <gold_response> when necessary.
+
+For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent)
+and provide a brief justification for the score.
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the document provide correct medical information based on the patient's condition and source materials?
+
+Completeness (1-5)
+- Does the document include all important information needed for the specific document type?
+
+Clarity (1-5)
+- Is the document easy to understand for the right audience — patients for discharge
+instructions or clinicians for the hospital course?
+
+Output Format:
+Generate a valid JSON object with the following structure:
+{
+"accuracy": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"completeness": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"clarity": {
+"score": 0,
+"explanation": "Explain why this score was given."
+}
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class DischargeMeAnnotator(LLMAsJuryAnnotator):
+    """The DischargeMe autograder."""
+
+    name = "dischargeme"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
helm/benchmark/annotation/ehr_sql_annotator.py
@@ -0,0 +1,87 @@
+from typing import Any, List, Optional
+import os
+import re
+import sqlite3
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.runner import get_benchmark_output_path
+
+
+class EhrSqlAnnotator(Annotator):
+    """
+    Executes both ground truth and generated SQL queries on the eicu.sqlite database.
+    """
+
+    name = "ehr_sql"
+
+    def annotate(self, request_state: RequestState) -> Any:
+        """Evaluate SQL execution accuracy by running queries against the eicu.sqlite database."""
+
+        databases_root_path = os.path.join(get_benchmark_output_path(), "scenarios", "ehr_sql")
+        database_path = os.path.join(databases_root_path, "eicu.sqlite")
+
+        assert len(request_state.instance.references) == 1
+        ground_truth_sql = request_state.instance.references[0].output.text.strip()
+        ground_truth_result: List[str] = []
+
+        # Execute the ground truth query
+        try:
+            with sqlite3.connect(database_path) as conn:
+                cursor = conn.cursor()
+                cursor.execute(ground_truth_sql)
+                ground_truth_result = cursor.fetchall()
+        except (sqlite3.OperationalError, sqlite3.Warning) as e:
+            hlog(f"WARNING: Ground truth SQL failed with error: {e}")
+
+        # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
+        if not ground_truth_result and request_state.instance.extra_data is not None:
+            if "value" in request_state.instance.extra_data:
+                extra_values = list(request_state.instance.extra_data["value"].values())
+
+                # Try inferring types from the database schema if possible
+                with sqlite3.connect(database_path) as conn:
+                    cursor = conn.cursor()
+                    try:
+                        cursor.execute(ground_truth_sql)
+                        fetched_result = cursor.fetchone()
+                        if fetched_result:
+                            # Convert extra_values to match SQLite's expected types
+                            converted_values = [
+                                type(fetched_result[i])(extra_values[i]) for i in range(len(extra_values))
+                            ]
+                            ground_truth_result = converted_values
+                        else:
+                            # If no rows were fetched, use `extra_values` as-is
+                            ground_truth_result = extra_values
+                    except sqlite3.OperationalError:
+                        # If query fails (syntax error, etc.), just use `extra_values` as-is
+                        ground_truth_result = extra_values
+
+        assert request_state.result is not None
+        assert len(request_state.result.completions) == 1
+        predicted_text = request_state.result.completions[0].text.strip()
+
+        predicted_sql_match = re.search(r"<\s*sql\s*>(.*?)<\/?\s*sql\s*>", predicted_text, re.DOTALL | re.IGNORECASE)
+        predicted_sql = predicted_sql_match.group(1).strip() if predicted_sql_match else predicted_text.strip()
+
+        predicted_result: List[str] = []
+        query_error: Optional[str] = None
+        predicted_sql = predicted_sql.replace("`", "").strip()
+        predicted_sql = re.sub(r"^sql\n", "", predicted_sql, flags=re.MULTILINE)
+        if not predicted_sql:
+            query_error = "No query generated"
+        else:
+            try:
+                with sqlite3.connect(database_path) as conn:
+                    cursor = conn.cursor()
+                    cursor.execute(predicted_sql)
+                    predicted_result = cursor.fetchall()
+            except (sqlite3.OperationalError, sqlite3.Warning) as e:
+                query_error = str(e)
+
+        return {
+            "predicted_result": predicted_result,
+            "ground_truth_result": ground_truth_result,
+            "query_error": query_error,
+        }
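`EhrSqlAnnotator` extracts the predicted SQL from optional `<sql>...</sql>` tags (tolerating different casing and a missing slash in the closing tag) and strips Markdown code-fence artifacts before executing it. A small standalone sketch of just that cleanup step, with made-up example completions:

```python
import re


def extract_sql(completion: str) -> str:
    """Pull SQL out of <sql> tags when present, otherwise use the raw completion,
    then strip backticks and a leading 'sql' code-fence label."""
    match = re.search(r"<\s*sql\s*>(.*?)<\/?\s*sql\s*>", completion, re.DOTALL | re.IGNORECASE)
    sql = match.group(1).strip() if match else completion.strip()
    sql = sql.replace("`", "").strip()
    sql = re.sub(r"^sql\n", "", sql, flags=re.MULTILINE)
    return sql


print(extract_sql("<SQL>SELECT 1<SQL>"))  # SELECT 1  (missing slash in the closing tag is tolerated)
print(extract_sql("\u0060\u0060\u0060sql\nSELECT count(*) FROM patient\n\u0060\u0060\u0060"))  # SELECT count(*) FROM patient
```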
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py
@@ -0,0 +1,131 @@
+import re
+from typing import Any, Dict, Optional, Union
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import Request
+from helm.proxy.retry import NonRetriableException
+
+
+class AnnotatorResponseParseFailure(NonRetriableException):
+    def __init__(self, response_text: str, **kwargs):
+        self.response_text = response_text
+        super().__init__(kwargs)
+
+
+class HelpdeskCallSummarizationAnnotator(Annotator):
+    """The Helpdesk Call Summarization autograder."""
+
+    name = "helpdesk_call_center_summarization"
+
+    SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+        "gpt": AnnotatorModelInfo(
+            model_name="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+        ),
+        "llama": AnnotatorModelInfo(
+            model_name="meta/llama-3.1-405b-instruct-turbo",
+            model_deployment="together/llama-3.1-405b-instruct-turbo",
+        ),
+        "claude": AnnotatorModelInfo(
+            model_name="anthropic/claude-3-5-sonnet-20241022",
+            model_deployment="anthropic/claude-3-5-sonnet-20241022",
+        ),
+    }
+
+    # Template loosely based on these references:
+    # - https://ieeexplore.ieee.org/abstract/document/9946852
+    # - https://arxiv.org/abs/2409.02413v1
+    PROMPT_TEMPLATE = """You are an expert evaluator. Your task is to evaluate the quality of a model-generated summary of a helpdesk call transcript.
+
+The helpdesk call transcript and summary are provided below, delineated with start and end tags:
+
+<call_transcript>
+{{QUESTION}}
+</call_transcript>
+<summary>
+{{PRED}}
+</summary>
+
+Evaluate the summary based on the following criteria:
+
+- Conciseness: A high-quality summary should effectively convey the most important information from the call transcript while keeping the length brief.
+- Relevance: The information presented in the summary should be relevant to the main topic.
+- Coherence: A good summary should have a clear structure and flow of ideas that make it easy to understand and follow.
+- Accuracy: The summary's information should be factually correct and should not contain false or misleading information.
+- Completeness: The summary should a fair description of the main problems (concerns) and resolutions based on the original call transcripts.
+
+Think step by step, then score the summary. Your reasoning should be less than 200 words. The score should be a single number between 1 to 10 inclusive.
+
+Please respond with your output and reasoning in the following format, your reasoning within <reasoning></reasoning> tags and your score within <score></score> tags, without any other output:
+
+<reasoning>INSERT_YOUR_REASONING_HERE</reasoning>
+<score>INSERT_YOUR_SCORE_HERE</score>
+"""  # noqa: E501
+
+    PATTERN = r"^\s*reason:(.*)##(.*)"
+
+    def __init__(self, auto_client: AutoClient):
+        self._auto_client = auto_client
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        prediction_text = request_state.result.completions[0].text
+
+        question_text = request_state.instance.input.text
+
+        annotator_prompt = self.PROMPT_TEMPLATE.replace("{{QUESTION}}", question_text).replace(
+            "{{PRED}}", prediction_text
+        )
+        annotations: Dict[str, Union[Optional[str], Optional[float]]] = {"prompt_text": annotator_prompt}
+        for annotator_name, annotator_model_info in self.SHORT_NAME_TO_MODEL_INFO.items():
+            annotator_request = Request(
+                model=annotator_model_info.model_name,
+                model_deployment=annotator_model_info.model_deployment,
+                prompt=annotator_prompt,
+                temperature=0.0,
+                max_tokens=512,
+            )
+            annotator_response = self._auto_client.make_request(annotator_request)
+            if not annotator_response.success:
+                raise Exception(f"Annotation request failed: {annotator_response.error}")
+            assert len(annotator_response.completions) == 1
+            annotator_response_text = annotator_response.completions[0].text
+            # fuzzy match regex check, allows for different casing, or forgetting / in end tag
+            reasoning_match = re.search(
+                r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
+            )
+            score_match = re.search(
+                r"<\s*score\s*>(.*?)<\/?\s*score\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
+            )
+            reasoning: Optional[str] = None
+            score: Optional[float] = None
+            if reasoning_match:
+                reasoning = reasoning_match.group(1).strip()
+            else:
+                hlog(
+                    "WARNING: HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
+                    f"{annotator_model_info.model_name}: {annotator_response_text}"
+                )
+
+            if score_match:
+                try:
+                    score = float(score_match.group(1).strip())
+                except ValueError:
+                    hlog(
+                        "WARNING: HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
+                        f"{annotator_model_info.model_name}: {annotator_response_text}"
+                    )
+            else:
+                hlog(
+                    "WARNING: HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
+                    f"{annotator_model_info.model_name}: {annotator_response_text}"
+                )
+
+            annotations[f"{annotator_name}_reasoning"] = reasoning
+            annotations[f"{annotator_name}_score"] = score
+        return annotations
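`HelpdeskCallSummarizationAnnotator` asks each judge model to answer inside `<reasoning>` and `<score>` tags, then parses them with lenient regexes and converts the score to a float, logging a warning when parsing fails. A minimal sketch of that parsing step in isolation; `parse_judge_output` is a hypothetical helper and the warning logging is omitted:

```python
import re
from typing import Optional, Tuple


def parse_judge_output(text: str) -> Tuple[Optional[str], Optional[float]]:
    """Extract the reasoning text and numeric score from a judge's tagged output."""
    reasoning_match = re.search(r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", text, re.DOTALL | re.IGNORECASE)
    score_match = re.search(r"<\s*score\s*>(.*?)<\/?\s*score\s*>", text, re.DOTALL | re.IGNORECASE)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else None
    score: Optional[float] = None
    if score_match:
        try:
            score = float(score_match.group(1).strip())
        except ValueError:
            pass  # malformed score; the annotator logs a warning and keeps None
    return reasoning, score


print(parse_judge_output("<reasoning>Concise and accurate.</reasoning>\n<score>8</score>"))
# ('Concise and accurate.', 8.0)
```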
helm/benchmark/annotation/image2struct/image_compiler_annotator.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from threading import Lock
 from typing import Any, Dict, List, Tuple, Callable
 
 from dacite import from_dict
@@ -17,6 +18,9 @@ except ModuleNotFoundError as e:
     handle_module_not_found_error(e, suggestions=["images"])
 
 
+_compilation_lock = Lock()
+
+
 def retry_if_compilation_failed(result: Dict[str, Any]) -> bool:
     """Retries when the compilation fails."""
     return "unknown_error" in result
@@ -78,7 +82,8 @@ class ImageCompilerAnnotator(Annotator, ABC):
            except Exception as e:
                return {"unknown_error": str(e)}
 
-        raw_response = compile()
+        with _compilation_lock:
+            raw_response = compile()
         response = {**raw_response}
         if "media_object" in response:
             response["media_object"] = from_dict(MediaObject, response["media_object"])
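The `image_compiler_annotator.py` change guards the `compile()` call with a module-level `threading.Lock`, so compilation is serialized even when the benchmark runner uses many worker threads. Below is a generic sketch of that pattern, with a hypothetical `render()` function standing in for the non-thread-safe compiler step:

```python
import threading
import time
from concurrent.futures import ThreadPoolExecutor

_compilation_lock = threading.Lock()


def render(doc_id: int) -> str:
    """Hypothetical stand-in for a compiler step that is not safe to run concurrently."""
    time.sleep(0.1)  # simulate compilation work
    return f"compiled-{doc_id}"


def annotate(doc_id: int) -> str:
    # Everything else may run in parallel; only the compile step is serialized.
    with _compilation_lock:
        return render(doc_id)


with ThreadPoolExecutor(max_workers=4) as pool:
    print(list(pool.map(annotate, range(4))))
```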
helm/benchmark/annotation/live_qa_annotator.py
@@ -5,7 +5,7 @@ from typing import Any
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.benchmark.annotation.
+from helm.benchmark.annotation.model_as_judge import score_with_reasoning
 from helm.clients.auto_client import AutoClient
 
 PROMPT_TEMPLATE = """You are a medical judge will be given a user request and a model response to that request.
helm/benchmark/annotation/med_dialog_annotator.py
@@ -0,0 +1,99 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of medical dialogue summaries.
+
+Your goal is to assess how well the summary captures the patient-doctor conversation,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The patient-doctor conversation will be provided in these tags:
+<conversation>
+{{QUESTION}}
+</conversation>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> and compare it to the <gold_response> when needed.
+
+Please evaluate the generated summary on a scale of 1-5 (1 = poor, 5 = excellent)
+for each of these three key dimensions:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the summary correctly capture the main medical issue and clinical details from the conversation?
+
+Completeness (1-5)
+- Does the summary include all important medical information from the conversation?
+
+Clarity (1-5)
+- Is the summary easy to understand for clinical use?
+
+Output Format:
+Output your evaluation as a single valid JSON object matching the following structure:
+{
+"accuracy": {
+"score": 0,
+"explanation": "Brief explanation of why this score was given."
+},
+"completeness": {
+"score": 0,
+"explanation": "Brief explanation of why this score was given."
+},
+"clarity": {
+"score": 0,
+"explanation": "Brief explanation of why this score was given."
+}
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedDialogAnnotator(LLMAsJuryAnnotator):
+    """The MedDialog autograder."""
+
+    name = "med_dialog"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )