crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/metrics/med_dialog_metrics.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedDialogMetric(Metric):
+    """Score metrics for MedDialog."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["med_dialog"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("med_dialog_accuracy")).add(score),
+        ]
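The judge-based MedHELM metrics added in this release (MedDialog, MedAlign, MediQA, MentalHealth, MIMIC-RRS, MedicationQA) all share the aggregation shown above: collect every per-criterion score emitted by the recognized annotator models and report their mean. A minimal standalone sketch of that aggregation, using hypothetical judge names and score dicts rather than the real ANNOTATOR_MODELS registry:

from typing import Any, Dict, List

# Hypothetical placeholder registry; in HELM this is imported from the annotator module.
ANNOTATOR_MODELS = {"judge-model-a": None, "judge-model-b": None}

# Hypothetical annotations, shaped like request_state.annotations["med_dialog"]:
# one entry per judge, each mapping criteria to {"score": ...} dicts.
annotations: Dict[str, Any] = {
    "judge-model-a": {"accuracy": {"score": 1}, "completeness": {"score": 0}},
    "judge-model-b": {"accuracy": {"score": 1}, "completeness": {"score": 1}},
}

scores: List[int] = []
for annotation_key, annotation_dict in annotations.items():
    if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
        for val in annotation_dict.values():
            scores.append(int(val["score"]))

score = sum(scores) / len(scores) if scores else 0.0
print(score)  # 0.75: the mean of all judge scores across criteria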
helm/benchmark/metrics/medalign_metrics.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedalignMetric(Metric):
+    """Score metrics for Medalign."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["medalign"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medalign_accuracy")).add(score),
+        ]
helm/benchmark/metrics/medcalc_bench_metrics.py ADDED
@@ -0,0 +1,124 @@
+import re
+
+from datetime import datetime
+from typing import List, Dict, Any
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+
+
+class MedCalcBenchMetric(Metric):
+    """
+    Metric for evaluating the MedCalc Bench dataset, assessing the model's ability to
+    be a clinical calculator.
+
+    Exact match based on category:
+    1. Normal exact match: for categories "risk", "severity" or "diagnosis".
+    2. Variant exact match: for other categories, if the number calculated by the model falls between the values
+       in the Lower limit and Upper limit columns, we mark it as accurate.
+    """
+
+    def parse_duration(self, duration_str) -> int:
+        """Parses a duration tuple (weeks, days) from a string format like ('14 weeks', '2 days')."""
+        match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
+        if match:
+            weeks, days = map(int, match.groups())
+            return weeks * 7 + days  # Convert to total days
+        else:
+            raise ValueError(f"Invalid format: {duration_str}")
+
+    def is_within_range(self, lower_bound, upper_bound, prediction) -> int:
+        """
+        Checks if a predicted duration falls within the given range.
+
+        Args:
+            lower_bound (str): The lower bound in format "('X weeks', 'Y days')".
+            upper_bound (str): The upper bound in format "('X weeks', 'Y days')".
+            prediction (str): The predicted duration in the same format.
+
+        Returns:
+            int: 1 if within range (inclusive), 0 otherwise.
+        """
+        lower_days = self.parse_duration(lower_bound)
+        upper_days = self.parse_duration(upper_bound)
+        prediction_days = self.parse_duration(prediction)
+        return 1 if lower_days <= prediction_days <= upper_days else 0
+
+    def check_date(self, prediction: str, reference: str, extra_data: Dict[str, Any]) -> int:
+        """Checks if prediction date is withing limits"""
+        if re.match(r"\('(\d+) weeks', '(\d+) days'\)", reference):
+            exact_match = self.is_within_range(extra_data["lower_limit"], extra_data["upper_limit"], prediction)
+        else:
+            prediction_date = self._str_to_date(prediction)
+            upper_limit_date = self._str_to_date(extra_data["upper_limit"])
+            lower_limit_date = self._str_to_date(extra_data["lower_limit"])
+            exact_match = 1 if lower_limit_date <= prediction_date <= upper_limit_date else 0
+        return exact_match
+
+    def _str_to_date(self, date_str: str) -> datetime:
+        """Convert string to datetime object."""
+        return datetime.strptime(date_str, "%m/%d/%Y")
+
+    def check_in_range(self, prediction: str, reference: str, extra_data: Dict[str, Any], category: str) -> int:
+        """Check if the prediction falls within the range specified by the reference."""
+        try:
+            if category == "date":
+                exact_match = self.check_date(prediction, reference, extra_data)
+            elif category in ["dosage conversion", "physical"]:
+                lower_limit = float(extra_data["lower_limit"])
+                upper_limit = float(extra_data["upper_limit"])
+                float_prediction = float(prediction)
+                exact_match = 1 if lower_limit <= float_prediction <= upper_limit else 0
+            else:
+                raise ValueError(f"Category {category} not supported")
+        except ValueError:
+            return 0
+
+        return exact_match
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single generation against reference labels.
+        """
+        # Extract predictions
+        assert request_state.result, "request_state.result is unexpectedly None"
+        predictions = [completion.text.strip() for completion in request_state.result.completions]
+
+        if not predictions:
+            hlog("Warning: No predictions found in completions")
+            return []
+
+        # Get the first prediction
+        prediction = predictions[0]
+
+        # Get references
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        reference = references[0].output.text
+
+        # Extract category, upper limit and lower limit
+        assert request_state.instance.extra_data, "Extra data dict was expected but got None"
+        category = request_state.instance.extra_data["category"]
+
+        if category in ["risk", "severity", "diagnosis"]:
+            exact_match = 1 if prediction == reference else 0
+        else:
+            exact_match = self.check_in_range(prediction, reference, request_state.instance.extra_data, category)
+
+        return [
+            Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
+        ]
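The date-category check hinges on MedCalc-Bench's string encoding of durations as week/day tuples. A quick standalone sketch of how parse_duration and the inclusive range check behave on that format, mirroring the regex above with made-up values:

import re

def parse_duration(duration_str: str) -> int:
    # Same pattern as MedCalcBenchMetric.parse_duration: "('X weeks', 'Y days')" -> total days
    match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
    if not match:
        raise ValueError(f"Invalid format: {duration_str}")
    weeks, days = map(int, match.groups())
    return weeks * 7 + days

# "('14 weeks', '2 days')" -> 14 * 7 + 2 = 100 days
assert parse_duration("('14 weeks', '2 days')") == 100

# Inclusive range check, as in is_within_range: 91 <= 100 <= 105 -> scored as a match (1)
lower = parse_duration("('13 weeks', '0 days')")   # 91
upper = parse_duration("('15 weeks', '0 days')")   # 105
print(1 if lower <= parse_duration("('14 weeks', '2 days')") <= upper else 0)  # 1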
helm/benchmark/metrics/medec_metrics.py ADDED
@@ -0,0 +1,101 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+import re
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+class MedecMetric(Metric):
+    """
+    Metric for evaluating the MEDEC dataset, assessing medical error detection and correction.
+
+    - **Error Flag Accuracy**: Whether the model correctly identifies if a medical note contains an error.
+    - **Error Sentence Detection Accuracy**: Whether the model correctly identifies the erroneous
+      sentence when an error is present.
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single LLM generation against the ground truth labels.
+        """
+
+        # Extract predictions from the model output
+        if request_state.result is not None:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        else:
+            predictions = []
+
+        if not predictions:
+            raise ValueError("No predictions found in the completions.")
+
+        prediction = predictions[0]  # Use the first completion
+
+        # Extract references (ground truth)
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        ground_truth_reference = next((ref for ref in references if CORRECT_TAG in ref.tags), None)
+
+        if not ground_truth_reference:
+            hlog(f"Warning: No ground truth reference with CORRECT_TAG for instance {request_state.instance}")
+            return []
+
+        # Extract the ground truth error flag and sentence ID
+        # ground_truth_text_pre = ground_truth_reference.output.text.strip()
+        ground_truth_text = ground_truth_reference.output.text.replace(".", "").strip()
+
+        # Determine ground truth values
+        if ground_truth_text == "CORRECT":
+            ground_truth_flag = 0  # No medical error
+            ground_truth_sentence_id = -1  # No sentence should be flagged
+        else:
+            ground_truth_flag = 1  # Medical error exists
+            try:
+                ground_truth_sentence_id = int(ground_truth_text)
+            except ValueError:
+                ground_truth_sentence_id = -1
+
+        # Determine model predictions
+        prediction = prediction.strip()
+
+        if prediction == "CORRECT":
+            predicted_flag = 0
+            predicted_sentence_id = -1
+        else:
+            match = re.match(r"^(\d+)", prediction)  # Ensure first token is a number
+            if match:
+                predicted_sentence_id = int(match.group(1))
+                predicted_flag = 1  # A valid sentence ID implies the model thinks there's an error
+            else:
+                predicted_flag = 1  # The model attempted to correct something, but in an invalid format
+                predicted_sentence_id = -2  # Mark invalid outputs
+
+        # Calculate Error Flag Accuracy
+        flag_accuracy = 1 if predicted_flag == ground_truth_flag else 0
+
+        # Calculate Error Sentence Accuracy
+        if ground_truth_flag == 1 and predicted_flag == 1:
+            sentence_accuracy = 1 if predicted_sentence_id == ground_truth_sentence_id else 0
+        else:
+            sentence_accuracy = (
+                1 if ground_truth_flag == 0 and predicted_flag == 0 else 0
+            )  # Both must agree it's "CORRECT"
+
+        return [
+            Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
+            Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
+        ]
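The MEDEC convention is that a completion either equals "CORRECT" or begins with the ID of the erroneous sentence. A small standalone sketch of just that prediction-parsing step, applied to made-up model outputs:

import re

def parse_medec_prediction(prediction: str):
    # Mirrors the flag / sentence-ID logic in MedecMetric.evaluate_generation above.
    prediction = prediction.strip()
    if prediction == "CORRECT":
        return 0, -1                      # no error flagged
    match = re.match(r"^(\d+)", prediction)
    if match:
        return 1, int(match.group(1))     # error flagged at this sentence ID
    return 1, -2                          # error flagged, but in an invalid format

print(parse_medec_prediction("CORRECT"))                       # (0, -1)
print(parse_medec_prediction("4 Replace aspirin with ..."))    # (1, 4)
print(parse_medec_prediction("The fourth sentence is wrong"))  # (1, -2)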
helm/benchmark/metrics/medi_qa_metrics.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MediQAMetric(Metric):
+    """Score metrics for MediQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["medi_qa"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medi_qa_accuracy")).add(score),
+        ]
helm/benchmark/metrics/medication_qa_metrics.py CHANGED
@@ -1,14 +1,15 @@
-from typing import List
+from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
 from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 
 
-class MedicationQAScoreMetric(Metric):
+class MedicationQAMetric(Metric):
     """Score metrics for MedicationQA."""
 
     def evaluate_generation(
@@ -19,5 +20,15 @@ class MedicationQAScoreMetric(Metric):
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-
-
+        annotations: Dict[str, Any] = request_state.annotations["medication_qa"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medication_qa_accuracy")).add(score),
+        ]
helm/benchmark/metrics/mental_health_metrics.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MentalHealthMetric(Metric):
+    """Score metrics for MentalHealth."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mental_health"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mental_health_accuracy")).add(score),
+        ]
helm/benchmark/metrics/metric.py CHANGED
@@ -15,9 +15,9 @@ from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.scenario import Instance
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 
 
 @dataclass(frozen=True)
helm/benchmark/metrics/mimic_rrs_metrics.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MIMICRRSMetric(Metric):
+    """Score metrics for MIMICRRS."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mimic_rrs"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mimic_rrs_accuracy")).add(score),
+        ]
@@ -0,0 +1,96 @@
+ from typing import List
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.common.hierarchical_logger import hlog
+ import re
+ from sklearn.metrics import precision_score, recall_score, f1_score
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+
+ class MIMICIVBillingCodeMetric(Metric):
+     """
+     Metric for evaluating the MIMIC Billing Code dataset, assessing the model's ability to match the
+     reference ICD codes. Handles cases where raw prediction output contains additional text.
+
+     Calculates:
+     1. Precision: proportion of correctly predicted ICD codes among all predicted codes
+     2. Recall: proportion of correctly predicted ICD codes among all reference codes
+     3. F1 score: harmonic mean of precision and recall
+
+     ICD codes format: letter followed by 1-3 digits, optional period, optional additional digits
+     Examples: "J18.9", "J45.909", "J47.1", "J96.01"
+     """
+
+     def extract_icd_codes(self, text: str) -> List[str]:
+         """Extract ICD codes from text, handling markdown and standardizing format."""
+         if not text:
+             return []
+
+         # Remove markdown bold formatting
+         cleaned_text = re.sub(r"\*\*", "", text)
+         # Match ICD code pattern with optional period and trailing digits
+         pattern = r"\b[A-Z]\d{1,3}(?:\.\d{1,4})?\.?\b"
+         codes = re.findall(pattern, cleaned_text)
+         # Standardize by removing trailing periods
+         return [code.rstrip(".").strip() for code in codes]
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate a single generation against reference labels.
+         """
+         # Extract predictions
+         if request_state.result is None:
+             predictions = []
+         else:
+             predictions = [completion.text.strip() for completion in request_state.result.completions]
+         if not predictions:
+             hlog("Warning: No predictions found in completions")
+             return []
+
+         # Get the first prediction
+         prediction = predictions[0]
+
+         # Get references
+         references = getattr(request_state.instance, "references", None)
+
+         if not references or len(references) == 0:
+             hlog(f"Warning: Missing references for instance {request_state.instance}")
+             return []
+
+         # Extract codes from reference and prediction
+         ref_codes = []
+         for ref in references:
+             if ref.output.text:
+                 ref_codes.extend(self.extract_icd_codes(ref.output.text))
+         ref_codes = list(set(ref_codes))  # Remove duplicates
+
+         pred_codes = self.extract_icd_codes(prediction)
+         pred_codes = list(set(pred_codes))  # Remove duplicates
+
+         # Convert to binary format for metrics
+         all_codes = sorted(list(set(ref_codes + pred_codes)))
+         mlb = MultiLabelBinarizer(classes=all_codes)
+
+         y_true_bin = mlb.fit_transform([ref_codes])
+         y_pred_bin = mlb.transform([pred_codes])
+
+         # Calculate metrics
+         precision = precision_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+         recall = recall_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+         f1 = f1_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+
+         return [
+             Stat(MetricName("mimiciv_billing_code_precision")).add(precision),
+             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
+             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
+         ]
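For reference (not part of the diff): a minimal, self-contained sketch of how this metric turns free-form text into a code set and scores it with micro-averaged precision/recall/F1. The regex and scoring calls mirror the new file above; the sample reference and prediction strings are invented for illustration.

# Standalone walk-through of the ICD extraction and micro-averaged scoring.
import re
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

def extract_icd_codes(text: str):
    # Same pattern as the metric: strip markdown bold, match letter + digits + optional ".digits".
    cleaned = re.sub(r"\*\*", "", text)
    codes = re.findall(r"\b[A-Z]\d{1,3}(?:\.\d{1,4})?\.?\b", cleaned)
    return sorted(set(code.rstrip(".") for code in codes))

reference = "Discharge diagnoses: J18.9, J96.01"                      # invented example
prediction = "The patient likely has **J18.9** (pneumonia) and J45.909."  # invented example

ref_codes = extract_icd_codes(reference)    # ['J18.9', 'J96.01']
pred_codes = extract_icd_codes(prediction)  # ['J18.9', 'J45.909']

# One multi-label "sample" per instance; micro-averaging over the label columns
# gives TP=1, FP=1, FN=1, so precision = recall = F1 = 0.5 here.
mlb = MultiLabelBinarizer(classes=sorted(set(ref_codes + pred_codes)))
y_true = mlb.fit_transform([ref_codes])
y_pred = mlb.transform([pred_codes])

print(precision_score(y_true, y_pred, average="micro", zero_division=0))  # 0.5
print(recall_score(y_true, y_pred, average="micro", zero_division=0))     # 0.5
print(f1_score(y_true, y_pred, average="micro", zero_division=0))         # 0.5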
@@ -0,0 +1,34 @@
+ from typing import Any, Dict, List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class MTSamplesProceduresMetric(Metric):
+     """Score metrics for MTSamplesProcedures."""
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         annotations: Dict[str, Any] = request_state.annotations["mtsamples_procedures"]
+         scores: List[int] = []
+         score = 0.0
+         for annotation_key, annotation_dict in annotations.items():
+             if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                 for val in annotation_dict.values():
+                     scores.append(int(val["score"]))
+         if scores:
+             score = sum(scores) / len(scores)
+         return [
+             Stat(MetricName("mtsamples_procedures_accuracy")).add(score),
+         ]
@@ -0,0 +1,34 @@
+ from typing import Any, Dict, List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class MTSamplesReplicateMetric(Metric):
+     """Score metrics for MTSamplesReplicate."""
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         annotations: Dict[str, Any] = request_state.annotations["mtsamples_replicate"]
+         scores: List[int] = []
+         score = 0.0
+         for annotation_key, annotation_dict in annotations.items():
+             if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                 for val in annotation_dict.values():
+                     scores.append(int(val["score"]))
+         if scores:
+             score = sum(scores) / len(scores)
+         return [
+             Stat(MetricName("mtsamples_replicate_accuracy")).add(score),
+         ]
@@ -0,0 +1,32 @@
+ import nltk
+ from importlib.metadata import version
+
+
+ def install_nltk_resources():
+     """Install resources for nltk tokenizers, which are required for bleu and rouge scores."""
+     # Install "punkt_tab" for nltk>=3.9.1 or "punkt" for nltk<=3.8.1
+     #
+     # Note that nltk 3.9.0 is disallowed due to https://github.com/nltk/nltk/issues/3308
+     #
+     # "punkt" is no longer supported for newer versions of nltk due to a security issue
+     # and has been replaced by "punkt_tab". For more information, see:
+     #
+     # - https://github.com/stanford-crfm/helm/issues/2926
+     # - https://github.com/nltk/nltk/issues/3293
+     # - https://github.com/nltk/nltk/issues/3266
+     # - https://nvd.nist.gov/vuln/detail/CVE-2024-39705
+     #
+     # TODO: Remove support for nltk<=3.8.1 and only install "punkt_tab"
+     nltk_major_version, nltk_minor_version = [int(v) for v in version("nltk").split(".")[0:2]]
+     if nltk_major_version < 3:
+         raise Exception("nltk version <3 is not supported")
+     if nltk_major_version == 3 and nltk_minor_version <= 8:
+         try:
+             nltk.data.find("tokenizers/punkt")
+         except LookupError:
+             nltk.download("punkt")
+     else:
+         try:
+             nltk.data.find("tokenizers/punkt_tab")
+         except LookupError:
+             nltk.download("punkt_tab")
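For reference (not part of the diff): the helper above keys its behavior on the installed nltk version. The sketch below isolates that version gate in a small hypothetical function (`required_punkt_resource` is not a HELM API); it only parses and compares the version string and does not download anything.

# Minimal sketch of the nltk version gate, assuming nltk is installed.
from importlib.metadata import version

def required_punkt_resource() -> str:
    # Parse "major.minor" from the installed nltk distribution, e.g. "3.9.1" -> (3, 9).
    major, minor = (int(v) for v in version("nltk").split(".")[:2])
    if major < 3:
        raise Exception("nltk version <3 is not supported")
    # nltk <= 3.8 still ships "punkt"; newer releases replaced it with "punkt_tab".
    return "punkt" if (major == 3 and minor <= 8) else "punkt_tab"

print(required_punkt_resource())  # e.g. "punkt_tab" when nltk 3.9.1 is installed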
@@ -12,10 +12,10 @@ from helm.benchmark.scenarios.numeracy_scenario import ( # noqa
      distance_plane,
      distance_paraboloid,
  )
- from .metric import Metric
- from .metric_name import MetricName
- from .metric_service import MetricService
- from .statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat


  class DistanceMetric(Metric):