crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff shows the changes between publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/kpi_edgar_metrics.py
@@ -0,0 +1,121 @@
+from dataclasses import dataclass
+from typing import Iterable, List, Dict, Set, Tuple
+import re
+
+import numpy as np
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.kpi_edgar_scenario import KPIEDGARScenario
+
+
+@dataclass
+class _Entity:
+    phrase: str
+    tag: str
+
+
+@dataclass
+class _Counts:
+    num_overlap: int
+    num_pred: int
+    num_gold: int
+
+
+@dataclass
+class _Rates:
+    tp: float
+    fn: float
+    fp: float
+
+
+def _get_all_tags() -> Iterable[str]:
+    return KPIEDGARScenario.TAG_DICT.keys()
+
+
+def _parse_entities(text: str) -> List[_Entity]:
+    all_matches = re.findall(r"(?:^|(?<=\],))([^\[\]]+)\[([0-9a-z]+)\](?:$|(?=,))", text.strip())
+    return [_Entity(phrase=match[0].strip(), tag=match[1].strip()) for match in all_matches]
+
+
+def _compute_tag_to_words(entities: List[_Entity]) -> Dict[str, Set[str]]:
+    tag_to_words: Dict[str, Set[str]] = {tag: set() for tag in _get_all_tags()}
+    for entity in entities:
+        for word in entity.phrase.split():
+            tag_to_words[entity.tag].add(word)
+    return tag_to_words
+
+
+def _compute_tag_to_counts(pred_entities: List[_Entity], gold_entities: List[_Entity]) -> Dict[str, _Counts]:
+    tag_to_counts: Dict[str, _Counts] = {}
+    pred_tag_to_words = _compute_tag_to_words(pred_entities)
+    gold_tag_to_words = _compute_tag_to_words(gold_entities)
+    for tag in _get_all_tags():
+        tag_to_counts[tag] = _Counts(
+            num_overlap=len(pred_tag_to_words[tag] & gold_tag_to_words[tag]),
+            num_pred=len(pred_tag_to_words[tag]),
+            num_gold=len(gold_tag_to_words[tag]),
+        )
+    return tag_to_counts
+
+
+def _counts_to_rates(counts: _Counts, adjust: bool) -> _Rates:
+    if adjust:
+        return _Rates(
+            tp=counts.num_overlap / counts.num_gold if counts.num_gold > 0 else 0.0,
+            fn=1 - (counts.num_overlap / counts.num_gold) if counts.num_gold > 0 else 0.0,
+            fp=(counts.num_pred - counts.num_overlap) / counts.num_pred if counts.num_pred > 0 else 0.0,
+        )
+    else:
+        return _Rates(
+            tp=counts.num_overlap,
+            fn=counts.num_gold - counts.num_overlap,
+            fp=counts.num_pred - counts.num_overlap,
+        )
+
+
+def _compute_f1_score(rates: _Rates) -> float:
+    return (2 * rates.tp) / (2 * rates.tp + rates.fp + rates.fn) if rates.tp + rates.fp + rates.fn > 0 else 0.0
+
+
+def _compute_stats(pred_gold_pairs: List[Tuple[str, str]]) -> List[Stat]:
+    tag_to_counts: Dict[str, _Counts] = {tag: _Counts(0, 0, 0) for tag in _get_all_tags()}
+    for pred_text, gold_text in pred_gold_pairs:
+        pred_entities = _parse_entities(pred_text)
+        gold_entities = _parse_entities(gold_text)
+        instance_tag_to_counts = _compute_tag_to_counts(pred_entities=pred_entities, gold_entities=gold_entities)
+        for tag, instance_counts in instance_tag_to_counts.items():
+            tag_to_counts[tag].num_overlap += instance_counts.num_overlap
+            tag_to_counts[tag].num_pred += instance_counts.num_pred
+            tag_to_counts[tag].num_gold += instance_counts.num_gold
+    tag_word_f1_scores: List[float] = [
+        _compute_f1_score(_counts_to_rates(counts, adjust=False)) for counts in tag_to_counts.values()
+    ]
+    tag_adjusted_f1_scores: List[float] = [
+        _compute_f1_score(_counts_to_rates(counts, adjust=True)) for counts in tag_to_counts.values()
+    ]
+    return [
+        Stat(MetricName("word_macro_f1_score")).add(np.mean(tag_word_f1_scores)),
+        Stat(MetricName("adjusted_macro_f1_score")).add(np.mean(tag_adjusted_f1_scores)),
+    ]
+
+
+def _request_states_to_pred_gold_pairs(request_states: List[RequestState]) -> List[Tuple[str, str]]:
+    pred_gold_pairs: List[Tuple[str, str]] = []
+    for request_state in request_states:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        assert len(request_state.instance.references) == 1
+        pred_gold_pairs.append(
+            (request_state.instance.references[0].output.text, request_state.result.completions[0].text)
+        )
+    return pred_gold_pairs
+
+
+class KPIEdgarMetric(EvaluateInstancesMetric):
+    """Word-level entity type classification F1 score, macro-averaged across entity types."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        return _compute_stats(_request_states_to_pred_gold_pairs(request_states))
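The expected model output for KPI-EDGAR is a comma-separated list of `phrase[tag]` spans, which `_parse_entities` splits apart before a word-overlap F1 is computed per tag and macro-averaged. A minimal, self-contained sketch of that computation, using a toy tag set in place of `KPIEDGARScenario.TAG_DICT` (tags and strings below are illustrative only):

# Standalone sketch of the word-overlap macro-F1 above (toy tag set,
# not KPIEDGARScenario.TAG_DICT; illustration only).
import re
from typing import Dict, List, Set, Tuple

TAGS = ["kpi", "cy"]  # hypothetical tag set

def parse_entities(text: str) -> List[Tuple[str, str]]:
    # "revenue increased[kpi], 42.0[cy]" -> [("revenue increased", "kpi"), ("42.0", "cy")]
    matches = re.findall(r"(?:^|(?<=\],))([^\[\]]+)\[([0-9a-z]+)\](?:$|(?=,))", text.strip())
    return [(phrase.strip(), tag.strip()) for phrase, tag in matches]

def tag_to_words(entities: List[Tuple[str, str]]) -> Dict[str, Set[str]]:
    out: Dict[str, Set[str]] = {tag: set() for tag in TAGS}
    for phrase, tag in entities:
        out[tag].update(phrase.split())
    return out

def f1(tp: float, fp: float, fn: float) -> float:
    return (2 * tp) / (2 * tp + fp + fn) if tp + fp + fn > 0 else 0.0

pred = tag_to_words(parse_entities("revenue increased[kpi], 42.0[cy]"))
gold = tag_to_words(parse_entities("revenue[kpi], 42.0[cy]"))
scores = []
for tag in TAGS:
    overlap = len(pred[tag] & gold[tag])
    scores.append(f1(tp=overlap, fp=len(pred[tag]) - overlap, fn=len(gold[tag]) - overlap))
print(sum(scores) / len(scores))  # ~0.833: "kpi" scores 2/3, "cy" scores 1.0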
helm/benchmark/metrics/language_modeling_metrics.py
@@ -11,10 +11,10 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import MetricInterface, MetricResult, PerInstanceStats, add_context
-from .metric_name import MetricContext, MetricName
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat


 class LanguageModelingMetric(MetricInterface):
helm/benchmark/metrics/llm_jury_metrics.py
@@ -0,0 +1,46 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class LLMJuryMetric(Metric):
+    """Score metrics for LLM Jury."""
+
+    def __init__(
+        self,
+        metric_name: str,
+        scenario_name: str,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        default_score: float = 0.0,
+    ):
+        self.metric_name = metric_name
+        self.scenario_name = scenario_name
+        self.annotator_models = annotator_models
+        self.default_score = default_score
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
+        scores: List[int] = []
+        score = self.default_score
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName(self.metric_name)).add(score),
+        ]
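`LLMJuryMetric` averages whatever integer scores it finds for the configured jurors and falls back to `default_score` when none are present. A sketch of that aggregation on a made-up annotations payload (the key names below are assumptions for illustration, not the real annotator output schema):

# Made-up payload; only the nesting mirrors the loop in evaluate_generation.
annotations = {
    "gpt_judge": {"accuracy": {"score": 1}},    # juror listed in annotator_models
    "llama_judge": {"accuracy": {"score": 0}},  # juror listed in annotator_models
    "prompt_text": None,                        # skipped: None / not a juror
}
annotator_models = {"gpt_judge", "llama_judge"}  # stand-in for the real dict
default_score = 1.0

scores = [
    int(val["score"])
    for key, annotation_dict in annotations.items()
    if key in annotator_models and annotation_dict is not None
    for val in annotation_dict.values()
]
score = sum(scores) / len(scores) if scores else default_score
print(score)  # 0.5 -- with no juror scores it would fall back to 1.0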
helm/benchmark/metrics/machine_translation_metrics.py
@@ -3,8 +3,8 @@ from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat

 try:
     from sacrebleu.metrics import BLEU
helm/benchmark/metrics/med_dialog_metrics.py
@@ -0,0 +1,14 @@
+from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
+
+
+class MedDialogMetric(LLMJuryMetric):
+    """Score metrics for MedDialog."""
+
+    def __init__(self):
+        super().__init__(
+            metric_name="med_dialog_accuracy",
+            scenario_name="med_dialog",
+            annotator_models=ANNOTATOR_MODELS,
+            default_score=1.0,
+        )
helm/benchmark/metrics/medalign_metrics.py
@@ -0,0 +1,14 @@
+from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
+
+
+class MedalignMetric(LLMJuryMetric):
+    """Score metrics for Medalign."""
+
+    def __init__(self):
+        super().__init__(
+            metric_name="medalign_accuracy",
+            scenario_name="medalign",
+            annotator_models=ANNOTATOR_MODELS,
+            default_score=1.0,
+        )
helm/benchmark/metrics/medcalc_bench_metrics.py
@@ -0,0 +1,124 @@
+import re
+
+from datetime import datetime
+from typing import List, Dict, Any
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+
+
+class MedCalcBenchMetric(Metric):
+    """
+    Metric for evaluating the MedCalc Bench dataset, assessing the model's ability to
+    be a clinical calculator.
+
+    Exact match based on category:
+    1. Normal exact match: for categories "risk", "severity" or "diagnosis".
+    2. Variant exact match: for other categories, if the number calculated by the model falls between the values
+       in the Lower limit and Upper limit columns, we mark it as accurate.
+    """
+
+    def parse_duration(self, duration_str) -> int:
+        """Parses a duration tuple (weeks, days) from a string format like ('14 weeks', '2 days')."""
+        match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
+        if match:
+            weeks, days = map(int, match.groups())
+            return weeks * 7 + days  # Convert to total days
+        else:
+            raise ValueError(f"Invalid format: {duration_str}")
+
+    def is_within_range(self, lower_bound, upper_bound, prediction) -> int:
+        """
+        Checks if a predicted duration falls within the given range.
+
+        Args:
+            lower_bound (str): The lower bound in format "('X weeks', 'Y days')".
+            upper_bound (str): The upper bound in format "('X weeks', 'Y days')".
+            prediction (str): The predicted duration in the same format.
+
+        Returns:
+            int: 1 if within range (inclusive), 0 otherwise.
+        """
+        lower_days = self.parse_duration(lower_bound)
+        upper_days = self.parse_duration(upper_bound)
+        prediction_days = self.parse_duration(prediction)
+        return 1 if lower_days <= prediction_days <= upper_days else 0
+
+    def check_date(self, prediction: str, reference: str, extra_data: Dict[str, Any]) -> int:
+        """Checks if the prediction date is within limits."""
+        if re.match(r"\('(\d+) weeks', '(\d+) days'\)", reference):
+            exact_match = self.is_within_range(extra_data["lower_limit"], extra_data["upper_limit"], prediction)
+        else:
+            prediction_date = self._str_to_date(prediction)
+            upper_limit_date = self._str_to_date(extra_data["upper_limit"])
+            lower_limit_date = self._str_to_date(extra_data["lower_limit"])
+            exact_match = 1 if lower_limit_date <= prediction_date <= upper_limit_date else 0
+        return exact_match
+
+    def _str_to_date(self, date_str: str) -> datetime:
+        """Convert string to datetime object."""
+        return datetime.strptime(date_str, "%m/%d/%Y")
+
+    def check_in_range(self, prediction: str, reference: str, extra_data: Dict[str, Any], category: str) -> int:
+        """Check if the prediction falls within the range specified by the reference."""
+        try:
+            if category == "date":
+                exact_match = self.check_date(prediction, reference, extra_data)
+            elif category in ["dosage conversion", "physical"]:
+                lower_limit = float(extra_data["lower_limit"])
+                upper_limit = float(extra_data["upper_limit"])
+                float_prediction = float(prediction)
+                exact_match = 1 if lower_limit <= float_prediction <= upper_limit else 0
+            else:
+                raise ValueError(f"Category {category} not supported")
+        except ValueError:
+            return 0
+
+        return exact_match
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single generation against reference labels.
+        """
+        # Extract predictions
+        assert request_state.result, "request_state.result is unexpectedly None"
+        predictions = [completion.text.strip() for completion in request_state.result.completions]
+
+        if not predictions:
+            hlog("Warning: No predictions found in completions")
+            return []
+
+        # Get the first prediction
+        prediction = predictions[0]
+
+        # Get references
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        reference = references[0].output.text
+
+        # Extract category, upper limit and lower limit
+        assert request_state.instance.extra_data, "Extra data dict was expected but got None"
+        category = request_state.instance.extra_data["category"]
+
+        if category in ["risk", "severity", "diagnosis"]:
+            exact_match = 1 if prediction == reference else 0
+        else:
+            exact_match = self.check_in_range(prediction, reference, request_state.instance.extra_data, category)

+        return [
+            Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
+        ]
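A quick worked example of the duration arithmetic behind `parse_duration` and `is_within_range` (values are illustrative):

import re

def parse_duration(duration_str: str) -> int:
    # Mirrors MedCalcBenchMetric.parse_duration: "('X weeks', 'Y days')" -> total days.
    match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
    if not match:
        raise ValueError(f"Invalid format: {duration_str}")
    weeks, days = map(int, match.groups())
    return weeks * 7 + days

assert parse_duration("('14 weeks', '2 days')") == 100
# A prediction of ('14 weeks', '0 days') is 98 days, inside the inclusive
# range [('13 weeks', '5 days'), ('14 weeks', '2 days')] = [96, 100] days,
# so is_within_range scores it 1.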
helm/benchmark/metrics/medec_metrics.py
@@ -0,0 +1,101 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+import re
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+class MedecMetric(Metric):
+    """
+    Metric for evaluating the MEDEC dataset, assessing medical error detection and correction.
+
+    - **Error Flag Accuracy**: Whether the model correctly identifies if a medical note contains an error.
+    - **Error Sentence Detection Accuracy**: Whether the model correctly identifies the erroneous
+      sentence when an error is present.
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single LLM generation against the ground truth labels.
+        """
+
+        # Extract predictions from the model output
+        if request_state.result is not None:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        else:
+            predictions = []
+
+        if not predictions:
+            raise ValueError("No predictions found in the completions.")
+
+        prediction = predictions[0]  # Use the first completion
+
+        # Extract references (ground truth)
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        ground_truth_reference = next((ref for ref in references if CORRECT_TAG in ref.tags), None)
+
+        if not ground_truth_reference:
+            hlog(f"Warning: No ground truth reference with CORRECT_TAG for instance {request_state.instance}")
+            return []
+
+        # Extract the ground truth error flag and sentence ID
+        # ground_truth_text_pre = ground_truth_reference.output.text.strip()
+        ground_truth_text = ground_truth_reference.output.text.replace(".", "").strip()
+
+        # Determine ground truth values
+        if ground_truth_text == "CORRECT":
+            ground_truth_flag = 0  # No medical error
+            ground_truth_sentence_id = -1  # No sentence should be flagged
+        else:
+            ground_truth_flag = 1  # Medical error exists
+            try:
+                ground_truth_sentence_id = int(ground_truth_text)
+            except ValueError:
+                ground_truth_sentence_id = -1
+
+        # Determine model predictions
+        prediction = prediction.strip()
+
+        if prediction == "CORRECT":
+            predicted_flag = 0
+            predicted_sentence_id = -1
+        else:
+            match = re.match(r"^(\d+)", prediction)  # Ensure first token is a number
+            if match:
+                predicted_sentence_id = int(match.group(1))
+                predicted_flag = 1  # A valid sentence ID implies the model thinks there's an error
+            else:
+                predicted_flag = 1  # The model attempted to correct something, but in an invalid format
+                predicted_sentence_id = -2  # Mark invalid outputs
+
+        # Calculate Error Flag Accuracy
+        flag_accuracy = 1 if predicted_flag == ground_truth_flag else 0
+
+        # Calculate Error Sentence Accuracy
+        if ground_truth_flag == 1 and predicted_flag == 1:
+            sentence_accuracy = 1 if predicted_sentence_id == ground_truth_sentence_id else 0
+        else:
+            sentence_accuracy = (
+                1 if ground_truth_flag == 0 and predicted_flag == 0 else 0
+            )  # Both must agree it's "CORRECT"
+
+        return [
+            Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
+            Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
+        ]
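The branching above reduces each model output to an (error flag, sentence id) pair before the two accuracies are computed; a standalone sketch with illustrative strings:

import re
from typing import Tuple

def parse_prediction(prediction: str) -> Tuple[int, int]:
    # Mirrors the branching in MedecMetric.evaluate_generation.
    prediction = prediction.strip()
    if prediction == "CORRECT":
        return 0, -1                   # no error flagged
    match = re.match(r"^(\d+)", prediction)
    if match:
        return 1, int(match.group(1))  # error flagged at sentence N
    return 1, -2                       # error flagged, but ID unparseable

assert parse_prediction("CORRECT") == (0, -1)
assert parse_prediction("4 The patient has type 2 diabetes.") == (1, 4)
assert parse_prediction("The note looks wrong.") == (1, -2)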
helm/benchmark/metrics/medi_qa_metrics.py
@@ -0,0 +1,14 @@
+from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
+
+
+class MediQAMetric(LLMJuryMetric):
+    """Score metrics for MediQA."""
+
+    def __init__(self):
+        super().__init__(
+            metric_name="medi_qa_accuracy",
+            scenario_name="medi_qa",
+            annotator_models=ANNOTATOR_MODELS,
+            default_score=1.0,
+        )
helm/benchmark/metrics/medication_qa_metrics.py
@@ -1,23 +1,14 @@
-from typing import List
+from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric

-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
-from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.metrics.statistic import Stat

-
-class MedicationQAScoreMetric(Metric):
+class MedicationQAMetric(LLMJuryMetric):
     """Score metrics for MedicationQA."""

-    def evaluate_generation(
-        self,
-        adapter_spec: AdapterSpec,
-        request_state: RequestState,
-        metric_service: MetricService,
-        eval_cache_path: str,
-    ) -> List[Stat]:
-        assert request_state.annotations
-        score = request_state.annotations["medication_qa"]["score"]
-        return [Stat(MetricName("medication_qa_score")).add(score)]
+    def __init__(self):
+        super().__init__(
+            metric_name="medication_qa_accuracy",
+            scenario_name="medication_qa",
+            annotator_models=ANNOTATOR_MODELS,
+            default_score=1.0,
+        )
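With this refactor the scenario metrics become thin declarative wrappers around LLMJuryMetric; constructing one just pins the jury parameters (a sketch, assuming crfm-helm 0.5.6 is installed):

from helm.benchmark.metrics.medication_qa_metrics import MedicationQAMetric

metric = MedicationQAMetric()
assert metric.metric_name == "medication_qa_accuracy"  # stat reported per instance
assert metric.scenario_name == "medication_qa"         # key into request_state.annotations
assert metric.default_score == 1.0                     # used when no juror returns a score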