crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/omni_math_metrics.py
@@ -0,0 +1,32 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class OmniMATHMetric(Metric):
+    """Score metrics for Omni-MATH."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["omni_math"]
+        scores: List[int] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_equivalence_judgement") and annotation_value is not None:
+                scores.append(int(annotation_value))
+        if not scores:
+            raise ValueError("Could not compute Omni-MATH accuracy because all annotators failed.")
+        score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("omni_math_accuracy")).add(score),
+        ]
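For context, OmniMATHMetric averages binary equivalence judgements across annotators. Below is a minimal sketch of the annotation dict it consumes; the annotator key names are illustrative assumptions, since only the "omni_math" key and the "_equivalence_judgement" suffix are fixed by the code above.

# Hypothetical annotator output stored under request_state.annotations["omni_math"].
# Key names are assumed for illustration; only the "_equivalence_judgement" suffix matters.
annotations = {
    "gpt_equivalence_judgement": True,     # this annotator judged the answer equivalent
    "llama_equivalence_judgement": False,  # this annotator disagreed
    "gpt_justification": "...",            # ignored: no matching suffix
}

scores = [
    int(value)
    for key, value in annotations.items()
    if key.endswith("_equivalence_judgement") and value is not None
]
print(sum(scores) / len(scores))  # 0.5, reported as omni_math_accuracy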
helm/benchmark/metrics/output_processing_metric.py
@@ -0,0 +1,60 @@
+import dataclasses
+from typing import Any, Dict, List, TypedDict
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.metric import (
+    create_metric,
+    Metric,
+    MetricInterface,
+    MetricResult,
+    MetricSpec,
+    PerInstanceStats,
+)
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.object_spec import get_class_by_name
+from helm.common.request import GeneratedOutput
+
+
+class _MetricSpecDict(TypedDict):
+    class_name: str
+    args: Dict[str, Any]
+
+
+def _dict_to_metric_spec(metric_spec_dict: _MetricSpecDict) -> MetricSpec:
+    return MetricSpec(metric_spec_dict["class_name"], metric_spec_dict["args"])
+
+
+class OutputProcessingMetric(MetricInterface):
+    def __init__(self, processor: str, metric_specs: List[_MetricSpecDict]):
+        self.processor = get_class_by_name(processor)  # actually a function, not a class
+        self.metrics: List[Metric] = [create_metric(_dict_to_metric_spec(metric_spec)) for metric_spec in metric_specs]
+
+    def _process_request_state(self, request_state: RequestState) -> RequestState:
+        if not request_state.result:
+            return request_state
+        processed_completions: List[GeneratedOutput] = []
+        for completion in request_state.result.completions:
+            processed_completions.append(dataclasses.replace(completion, text=self.processor(completion.text)))
+        return dataclasses.replace(
+            request_state, result=dataclasses.replace(request_state.result, completions=processed_completions)
+        )
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        aggregated_stats: List[Stat] = []
+        per_instance_stats: List[PerInstanceStats] = []
+
+        processed_scenario_state = dataclasses.replace(
+            scenario_state,
+            request_states=[
+                self._process_request_state(request_state) for request_state in scenario_state.request_states
+            ],
+        )
+        for metric in self.metrics:
+            metric_result = metric.evaluate(processed_scenario_state, metric_service, eval_cache_path, parallelism)
+            aggregated_stats.extend(metric_result.aggregated_stats)
+            per_instance_stats.extend(metric_result.per_instance_stats)
+        return MetricResult(aggregated_stats=aggregated_stats, per_instance_stats=per_instance_stats)
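OutputProcessingMetric rewrites every completion with the configured processor function before delegating to the wrapped metrics, so the wrapped metrics never see the raw model output. A minimal sketch of how it might be configured follows; the inner metric class and its arguments are assumptions for illustration, not a spec taken from this release.

from helm.benchmark.metrics.metric import MetricSpec

# Illustrative wiring only: the wrapped metric spec below is an assumption.
metric_spec = MetricSpec(
    class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
    args={
        # Dotted path resolved by get_class_by_name (a function, as noted in the code above).
        "processor": "helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking",
        # Plain dicts matching _MetricSpecDict, converted via _dict_to_metric_spec.
        "metric_specs": [
            {
                "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
                "args": {"names": ["exact_match"]},
            }
        ],
    },
)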
helm/benchmark/metrics/output_processors.py
@@ -0,0 +1,15 @@
+import re
+
+
+def remove_deepseek_r1_thinking(input: str) -> str:
+    if "<think>" not in input:
+        return input
+
+    if "</think>\n\n" in input:
+        # The think block is usually followed by two newlines, so we should remove that
+        return re.sub("<think>.*</think>\n\n", "", input, flags=re.DOTALL)
+    elif "</think>" in input:
+        return re.sub("<think>.*</think>", "", input, flags=re.DOTALL)
+    else:
+        # Unclosed think block
+        return ""
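A quick illustration of the three cases remove_deepseek_r1_thinking distinguishes, derived directly from the branches above:

from helm.benchmark.metrics.output_processors import remove_deepseek_r1_thinking

# Closed think block followed by two newlines: block and trailing newlines are stripped.
assert remove_deepseek_r1_thinking("<think>step 1\nstep 2</think>\n\nParis") == "Paris"
# Closed think block without trailing newlines: only the block is stripped.
assert remove_deepseek_r1_thinking("<think>hmm</think>Paris") == "Paris"
# Unclosed think block: the whole completion is treated as thinking and dropped.
assert remove_deepseek_r1_thinking("<think>still thinking") == ""
# No think block at all: the input is returned unchanged.
assert remove_deepseek_r1_thinking("Paris") == "Paris"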
helm/benchmark/metrics/paraphrase_generation_metrics.py
@@ -2,8 +2,8 @@ from typing import List
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu
 
 
helm/benchmark/metrics/ranking_metrics.py
@@ -9,9 +9,9 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
 from helm.common.general import assert_present, binarize_dict
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 try:
     import pytrec_eval
helm/benchmark/metrics/reference_metric.py
@@ -14,9 +14,9 @@ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MO
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 
 
 @dataclass(frozen=True)
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py}
@@ -14,7 +14,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 
 
-class BhasaMachineTranslationMetric(Metric):
+class SEAHELMMachineTranslationMetric(Metric):
     """Machine Translation Metrics
 
     This class computes the following standard machine translation metrics
@@ -74,8 +74,8 @@ class BhasaMachineTranslationMetric(Metric):
         return result
 
 
-class BhasaQAMetric(Metric):
-    """Bhasa QA Metrics
+class SEAHELMQAMetric(Metric):
+    """SEAHELM QA Metrics
 
     This class computes the following standard SQuAD v1.1 metrics
 
helm/benchmark/metrics/seahelm_metrics_specs.py
@@ -0,0 +1,10 @@
+from typing import Any, Dict, List
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+def get_seahelm_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMMachineTranslationMetric")]
+
+
+def get_seahelm_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMQAMetric", args=args)]
helm/benchmark/metrics/starr_patient_instructions_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.starr_patient_instructions_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class StarrPatientInstructionsMetric(Metric):
+    """Score metrics for StarrPatientInstructions."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["starr_patient_instructions"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("starr_patient_instructions_accuracy")).add(score),
+        ]
helm/benchmark/metrics/summac/model_summac.py
@@ -10,7 +10,7 @@ import numpy.typing as npt
 import torch
 import os
 import json
-from . import utils_misc
+from helm.benchmark.metrics.summac import utils_misc
 
 model_map = {
     "snli-base": {"model_card": "boychaboy/SNLI_roberta-base", "entailment_idx": 0, "contradiction_idx": 2},
helm/benchmark/metrics/summarization_critique_metrics.py
@@ -2,10 +2,10 @@ from typing import Dict, List
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
 
 
helm/benchmark/metrics/summarization_metrics.py
@@ -16,12 +16,16 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .metric import Metric, MetricResult
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
-from .summac.model_summac import SummaCZS
-
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.summac.model_summac import SummaCZS
+
+try:
+    from bert_score import BERTScorer  # type: ignore
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["summarization"])
 
 
 QAFACTEVAL_URL: str = (
@@ -43,7 +47,13 @@ class SummarizationMetric(Metric):
     4. Faithfulness (SummaC)
     """
 
-    def __init__(
+    def __init__(
+        self,
+        task: str,
+        device: str = "cpu",
+        bertscore_model: str = "microsoft/deberta-large-mnli",
+        rescale_with_baseline: bool = True,
+    ):
         self.rouge_fns = {
             "rouge_1": get_rouge_function("rouge1"),
             "rouge_2": get_rouge_function("rouge2"),
@@ -56,7 +66,7 @@ class SummarizationMetric(Metric):
             spacy.cli.download("en_core_web_sm")
 
         try:
-            from summ_eval.data_stats_metric import DataStatsMetric
+            from summ_eval.data_stats_metric import DataStatsMetric  # type: ignore
         except ModuleNotFoundError as e:
             handle_module_not_found_error(e, ["summarization"])
 
@@ -71,7 +81,7 @@ class SummarizationMetric(Metric):
         else:
             self.compute_bertscore = True
             self.bert_scorer = BERTScorer(
-                model_type=
+                model_type=bertscore_model, lang="en", rescale_with_baseline=rescale_with_baseline, device=device
             )
             # Need GPU for faithfulness metrics since they are model-based.
             self.compute_faithfulness = True
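The widened constructor surfaces BERTScore settings that were previously hardcoded, while the defaults keep existing call sites working. A sketch of both call styles; the task value is an assumed placeholder.

from helm.benchmark.metrics.summarization_metrics import SummarizationMetric

# Existing call sites keep working through the new defaults (task name is illustrative).
metric = SummarizationMetric(task="summarization_xsum", device="cuda")

# The new keyword arguments expose the BERTScore configuration (values here are illustrative).
metric = SummarizationMetric(
    task="summarization_xsum",
    device="cuda",
    bertscore_model="microsoft/deberta-xlarge-mnli",  # default: microsoft/deberta-large-mnli
    rescale_with_baseline=False,                      # default: True
)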
helm/benchmark/metrics/test_bias_metrics.py
@@ -1,7 +1,11 @@
 from dataclasses import dataclass
 from typing import Callable, List, Optional
 
-from .bias_metrics import BiasMetric
+from helm.benchmark.metrics.bias_metrics import BiasMetric
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+
+
+install_nltk_resources()
 
 
 @dataclass(frozen=True)
helm/benchmark/metrics/test_classification_metrics.py
@@ -38,12 +38,14 @@ def _request_state(prediction: str, options: List[_Option]):
     )
 
 
-def
-
-
+def get_stat_value(stats: List[Stat], stat_name: str):
+    for stat in stats:
+        if stat.name.name == stat_name:
+            return stat.mean
+    raise ValueError(f"No stat with name {stat_name}")
 
 
-def
+def compute_stats(all_classes_counts: Dict[str, Dict[str, int]]):
     micro_counts: Dict[str, int] = defaultdict(int)
     for class_counts in all_classes_counts.values():
         for key, class_count in class_counts.items():
@@ -52,102 +54,172 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
     micro_recall = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fn"])
     micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
 
-    class_f1:
-    for class_counts in all_classes_counts.
+    class_f1: Dict[str, float] = {}
+    for class_name, class_counts in all_classes_counts.items():
         class_precision = class_counts["tp"] / (class_counts["tp"] + class_counts["fp"])
         class_recall = class_counts["tp"] / (class_counts["tp"] + class_counts["fn"])
-        class_f1
-    macro_f1 = sum(class_f1) / len(class_f1)
-
-
-        "classification_micro_f1": micro_f1,
-        "classification_macro_f1": macro_f1,
+        class_f1[class_name] = 2 * (class_precision * class_recall) / (class_precision + class_recall)
+    macro_f1 = sum(class_f1.values()) / len(class_f1)
+    class_name_to_support = {
+        class_name: class_counts["tp"] + class_counts["fn"] for class_name, class_counts in all_classes_counts.items()
     }
+    weighted_f1 = sum(class_f1[class_name] * support for class_name, support in class_name_to_support.items()) / sum(
+        support for support in class_name_to_support.values()
+    )
 
+    stats = {
+        "macro_f1": macro_f1,
+        "micro_f1": micro_f1,
+        "weighted_f1": weighted_f1,
+    }
+    for class_name, class_f1_score in class_f1.items():
+        stats[f"{class_name}_f1"] = class_f1_score
+    return stats
 
-def test_evaluate_instances_binary_generation():
-    metric = ClassificationMetric(delimiter=None)
 
+def test_evaluate_instances_default_parameters():
     request_states = [
         _request_state("yes", [_Option("yes", True)]),
+        _request_state("yes ", [_Option("yes", True)]),
+        _request_state("yeS", [_Option("yes", True)]),
+        _request_state("yes", [_Option("no", True)]),
+        _request_state("no", [_Option("yes", True)]),
+        _request_state("no", [_Option("no", True)]),
+        _request_state("invalid", [_Option("no", True)]),
+    ]
+
+    expected_stats = compute_stats(
+        {
+            "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
+            "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
+        }
+    )
+
+    actual_stats = ClassificationMetric().evaluate_instances(request_states, "")
+    actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
+    assert actual_macro_f1 == approx(expected_stats["macro_f1"])
+    actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
+    assert actual_micro_f1 == approx(expected_stats["micro_f1"])
+
+
+def test_evaluate_instances_yes_and_no():
+    labels = ["yes", "no"]
+    request_states = [
         _request_state("yes", [_Option("yes", True)]),
-        _request_state("yes", [_Option("yes", True)]),
+        _request_state("yes ", [_Option("yes", True)]),
+        _request_state("yeS", [_Option("yes", True)]),
         _request_state("yes", [_Option("no", True)]),
         _request_state("no", [_Option("yes", True)]),
         _request_state("no", [_Option("no", True)]),
         _request_state("invalid", [_Option("no", True)]),
     ]
 
-
-
-
-        {
-
-            "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
-        }
-    ),
+    expected_stats = compute_stats(
+        {
+            "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
+            "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
+        }
     )
 
+    actual_stats = ClassificationMetric(
+        scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
+    ).evaluate_instances(request_states, "")
+    actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
+    assert actual_macro_f1 == approx(expected_stats["macro_f1"])
+    actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
+    assert actual_micro_f1 == approx(expected_stats["micro_f1"])
+    actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
+    assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
+    actual_yes_f1 = get_stat_value(actual_stats, "classification_yes_f1")
+    assert actual_yes_f1 == approx(expected_stats["yes_f1"])
+    actual_no_f1 = get_stat_value(actual_stats, "classification_no_f1")
+    assert actual_no_f1 == approx(expected_stats["no_f1"])
+
 
 def test_evaluate_instances_multi_class():
-
-    metric = ClassificationMetric(delimiter=None)
+    labels = ["a", "b", "c"]
 
-    def
-        return [_Option(text, text == correct) for text in
+    def _gold_label(correct: str):
+        return [_Option(text, text == correct) for text in labels]
 
     request_states = [
-        _request_state("
-        _request_state("
-        _request_state("
-        _request_state("
-        _request_state("b",
-        _request_state("b",
-        _request_state("b",
-        _request_state("c",
-        _request_state("c",
-        _request_state("invalid",
+        _request_state("a", _gold_label("a")),
+        _request_state("a", _gold_label("a")),
+        _request_state("a", _gold_label("a")),
+        _request_state("a", _gold_label("b")),
+        _request_state("b", _gold_label("b")),
+        _request_state("b", _gold_label("b")),
+        _request_state("b", _gold_label("c")),
+        _request_state("c", _gold_label("a")),
+        _request_state("c", _gold_label("c")),
+        _request_state("invalid", _gold_label("c")),
     ]
-
-
-
-        {
-
-
-
-        }
-    ),
+
+    expected_stats = compute_stats(
+        {
+            "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
+            "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
+            "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
+        }
     )
 
+    actual_stats = ClassificationMetric(
+        scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
+    ).evaluate_instances(request_states, "")
+    actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
+    assert actual_macro_f1 == approx(expected_stats["macro_f1"])
+    actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
+    assert actual_micro_f1 == approx(expected_stats["micro_f1"])
+    actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
+    assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
+    actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
+    assert actual_a_f1 == approx(expected_stats["a_f1"])
+    actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
+    assert actual_b_f1 == approx(expected_stats["b_f1"])
+    actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
+    assert actual_c_f1 == approx(expected_stats["c_f1"])
+
 
 def test_evaluate_instances_multilabel():
-
-    metric = ClassificationMetric(delimiter=",")
+    labels = ["a", "b", "c"]
 
-    def
-        return [_Option(text, text in correct) for text in
+    def _gold_labels(correct: List[str]):
+        return [_Option(text, text in correct) for text in labels]
 
     request_states = [
-        _request_state("
-        _request_state("
-        _request_state("
-        _request_state("c",
-        _request_state("b",
-        _request_state("
-        _request_state("
-        _request_state("
+        _request_state("a,b", _gold_labels(["a", "b"])),
+        _request_state("a,b", _gold_labels(["a", "c"])),
+        _request_state("a", _gold_labels(["a"])),
+        _request_state("c", _gold_labels(["b"])),
+        _request_state("b", _gold_labels(["b", "c"])),
+        _request_state("a,b", _gold_labels(["c"])),
+        _request_state("a,c", _gold_labels(["a"])),
+        _request_state("a,b,c", _gold_labels(["a", "b", "c"])),
         _request_state("", []),
         _request_state("n/a", []),
-        _request_state("invalid",
+        _request_state("invalid", _gold_labels(["c"])),
     ]
 
-
-
-
-        {
-
-
-        "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
-        }
-    ),
+    expected_stats = compute_stats(
+        {
+            "a": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
+            "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
+            "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
+        }
     )
+
+    actual_stats = ClassificationMetric(
+        scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels, delimiter=","
+    ).evaluate_instances(request_states, "")
+    actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
+    assert actual_macro_f1 == approx(expected_stats["macro_f1"])
+    actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
+    assert actual_micro_f1 == approx(expected_stats["micro_f1"])
+    actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
+    assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
+    actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
+    assert actual_a_f1 == approx(expected_stats["a_f1"])
+    actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
+    assert actual_b_f1 == approx(expected_stats["b_f1"])
+    actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
+    assert actual_c_f1 == approx(expected_stats["c_f1"])
helm/benchmark/metrics/test_evaluate_reference_metrics.py
@@ -1,4 +1,7 @@
+import pytest
 from helm.benchmark.metrics.evaluate_reference_metrics import (
+    bleu_1,
+    chinese_bleu_1,
     exact_match,
     exact_match_indicator,
     final_number_exact_match,
@@ -28,3 +31,15 @@ def test_final_number_exact_match():
     assert final_number_exact_match("34.2", "2") == 0
     assert final_number_exact_match("342", "342.") == 1
     assert final_number_exact_match("3,420", "3420") == 1
+
+
+def test_bleu_1():
+    assert bleu_1("the quick brown fox jumped over the lazy dog", "the fox jumped over the dog") == pytest.approx(
+        0.6065306597126334
+    )
+
+
+def test_chinese_bleu_1():
+    assert chinese_bleu_1(
+        "太祖武皇帝,沛國譙人也,姓曹,諱操,字孟德,漢相國參之後。", "太祖武皇帝,沛國譙人也,漢相國參之後。"
+    ) == pytest.approx(0.5907775139012316)
helm/benchmark/metrics/test_statistic.py
@@ -3,8 +3,8 @@ from typing import Dict
 import pytest
 import statistics
 
-from .metric_name import MetricName
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 
 
 def test_stat_add():
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class AI21TokenCostEstimator(TokenCostEstimator):
|