crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/classification_metrics.py

@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from sklearn.metrics import f1_score
+from sklearn.metrics import f1_score, precision_score, recall_score
 from sklearn.preprocessing import MultiLabelBinarizer
 
 from helm.benchmark.adaptation.request_state import RequestState
@@ -9,9 +9,14 @@ from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
 from helm.benchmark.metrics.metric import MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import GeneratedOutput
 
 
+def _normalize_label_text(text: str) -> str:
+    return normalize_text(text, should_remove_articles=False)
+
+
 class ClassificationMetric(EvaluateInstancesMetric):
     """Defines metrics for multi-class classification using the generation adapter.
 
@@ -23,8 +28,12 @@ class ClassificationMetric(EvaluateInstancesMetric):
     reference. The predicted class for each instance is the normalized text of the generation.
 
     Note:
-
-
+    - It is highly recommended to specify the set of classes should be specified using the
+      `labels` parameter. Otherwise, the set of classes is derived from the correct references
+      from all the instances. This means that classes may be incorrectly omitted if they are never
+      used as a correct reference.
+    - The `averages` parameter is a list of averaging methods to be used.
+      It has the same meaning `average` as in scikit-learn.
     - Generations that are not in any of the known classes are counted as a
       negative prediction for every class.
     - Perturbed classes are considered different classes from unperturbed
@@ -32,8 +41,45 @@ class ClassificationMetric(EvaluateInstancesMetric):
     - Currently, multi-label classification is not supported.
     """
 
-
+    AVERAGE_OPTIONS = ["micro", "macro", "weighted", None]
+    SCORE_OPTIONS = ["f1", "precision", "recall"]
+
+    def __init__(
+        self,
+        averages: Optional[List[Optional[str]]] = None,
+        labels: Optional[List[str]] = None,
+        scores: Optional[List[str]] = None,
+        delimiter: Optional[str] = None,
+    ) -> None:
+        """Creates metrics for multi-class classification.
+
+        :param delimiter: For multi-label classification, the string delimiter between classes in the model's output.
+        :param average: The list of scores to compute (e.g. "f1", "precision", "recall").
+            Defaults to ["f1"].
+        :param average: The averaging methods (e.g. "micro", "macro", "weighted") to be used.
+            It has the same meaning `average` as in scikit-learn.
+            Defaults to ["macro", "micro"].
+        :param labels: The set of labels.
+        :return: A list of `Stat` objects.
+        """
+        self.averages = averages or ["macro", "micro"]
+        for average in self.averages:
+            if average not in ClassificationMetric.AVERAGE_OPTIONS:
+                raise ValueError(
+                    f"Each value in `averages` must be set to one of {ClassificationMetric.AVERAGE_OPTIONS}."
+                )
+        self.scores = scores or ["f1"]
+        for score_name in self.scores:
+            if score_name not in ClassificationMetric.SCORE_OPTIONS:
+                raise ValueError(f"Each value in `scores` must be set to one of {ClassificationMetric.SCORE_OPTIONS}.")
         self.delimiter = delimiter
+        self.labels = labels
+        if not self.labels:
+            hwarn(
+                "`labels` were not set on `ClassificationMetric`, "
+                "so they will be inferred from target references. "
+                "It is recommend to explicitly set `labels` on `ClassificationMetric`."
+            )
 
     def is_multi_label(self) -> bool:
         return bool(self.delimiter)
@@ -57,20 +103,38 @@ class ClassificationMetric(EvaluateInstancesMetric):
             references = request_state.instance.all_correct_references
             if not self.is_multi_label():
                 assert len(references) == 1
-            correct_ref_texts = [
+            correct_ref_texts = [_normalize_label_text(ref.output.text) for ref in references if ref.output.text]
             y_true.append(correct_ref_texts)
 
             input_text = request_state.result.completions[0].text
             predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
-            y_pred.append([
-
-
+            y_pred.append([_normalize_label_text(pred) for pred in predictions if pred])
+        mlb = MultiLabelBinarizer().fit(
+            [[_normalize_label_text(label) for label in self.labels]] if self.labels else y_true
+        )
         y_true = mlb.transform(y_true)
         y_pred = mlb.transform(y_pred)
-
-
-
-
+        stats: List[Stat] = []
+        for average in self.averages:
+            for score_name in self.scores:
+                if score_name == "f1":
+                    score_value = f1_score(y_pred=y_pred, y_true=y_true, average=average)
+                elif score_name == "precision":
+                    score_value = precision_score(y_pred=y_pred, y_true=y_true, average=average)
+                elif score_name == "recall":
+                    score_value = recall_score(y_pred=y_pred, y_true=y_true, average=average)
+                else:
+                    raise ValueError(
+                        f"Unknown score name: '{score_name}' - expected one of ['f1', 'precision', 'recall']"
+                    )
+                if average is None:
+                    for mlb_class, class_score_value in zip(mlb.classes_, score_value):
+                        stats.append(
+                            Stat(MetricName(f"classification_{mlb_class}_{score_name}")).add(class_score_value)
+                        )
+                else:
+                    stats.append(Stat(MetricName(f"classification_{average}_{score_name}")).add(score_value))
+        return stats
 
 
 class MultipleChoiceClassificationMetric(EvaluateInstancesMetric):
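For orientation, here is a minimal usage sketch (not part of the diff; the label values are illustrative) showing how the new constructor arguments fit together and which stat names they produce, based on the signature and naming pattern visible above:

```python
# Illustrative sketch only: exercises the new `labels`, `scores`, and `averages` arguments.
from helm.benchmark.metrics.classification_metrics import ClassificationMetric

metric = ClassificationMetric(
    labels=["positive", "negative", "neutral"],  # hypothetical label set; avoids inferring classes from references
    scores=["f1", "precision", "recall"],
    averages=["macro", None],  # None additionally emits one per-class stat per score
)
# Resulting stat names follow the patterns shown in the diff, e.g.
#   classification_macro_f1, classification_macro_precision, classification_macro_recall,
#   classification_positive_f1, classification_negative_f1, ...  (from the None average)
```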
helm/benchmark/metrics/cleva_harms_metrics.py

@@ -7,19 +7,20 @@ import numpy as np
 
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.common.request import RequestResult
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .bias_metrics import BiasMetric
-from .toxicity_metrics import ToxicityMetric
-from .copyright_metrics import BasicCopyrightMetric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.bias_metrics import BiasMetric
+from helm.benchmark.metrics.toxicity_metrics import ToxicityMetric
+from helm.benchmark.metrics.copyright_metrics import BasicCopyrightMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.runner import get_benchmark_output_path
 
 try:
     import jieba
@@ -71,7 +72,7 @@ class CLEVABiasMetric(BiasMetric):
             "demographic_category". One of "adjective" or "profession".
         """
         # Ensure all necessary data are downloaded
-        self.output_dir = os.path.join(
+        self.output_dir = os.path.join(get_benchmark_output_path(), "metrics", self.name)
         ensure_directory_exists(self.output_dir)
         for filename in self.FILE_NAMES:
             target_path = os.path.join(self.output_dir, filename)
@@ -199,7 +200,7 @@ class CLEVAToxicityMetric(ToxicityMetric):
             )
         except PerspectiveAPIClientCredentialsError as e:
             self._perspective_api_unavailable = True
-
+            hwarn(f"Skipping ToxicityMetrics because Perspective API Client unavailable due to error: {e}")
             hlog(
                 "To enable ToxicityMetrics, see: https://crfm-helm.readthedocs.io/en/latest/benchmark/#perspective-api"
            )
helm/benchmark/metrics/code_metrics.py

@@ -10,11 +10,11 @@ from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.code_scenario import CodeReference
-from . import code_metrics_helper
-from .metric import Metric, MetricResult
-from .metric_service import MetricService
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics import code_metrics_helper
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 
 MAXIMUM_MEMORY_BYTES = 8 * 1024 * 1024 * 1024  # 8GB.
 
helm/benchmark/metrics/comet_metric.py

@@ -0,0 +1,125 @@
+import logging
+from typing import List
+
+import comet
+from torch import nn
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import RequestResult
+
+
+class CometMetric(Metric):
+    """COMET machine translation metric using a regression model.
+    The model takes a triplet of source sentence, translation, and reference
+    and computes a score in the range [0, 1] reflecting the quality of the predicted
+    translation.
+
+    Paper:
+    @inproceedings{rei-etal-2022-comet,
+        title = "{COMET}-22: Unbabel-{IST} 2022 Submission for the Metrics Shared Task",
+        author = "Rei, Ricardo and
+          C. de Souza, Jos{\'e} G. and
+          Alves, Duarte and
+          Zerva, Chrysoula and
+          Farinha, Ana C and
+          Glushkova, Taisiya and
+          Lavie, Alon and
+          Coheur, Luisa and
+          Martins, Andr{\'e} F. T.",
+        editor = {Koehn, Philipp and
+          Barrault, Lo{\"\i}c and
+          Bojar, Ond{\v{r}}ej and
+          Bougares, Fethi and
+          Chatterjee, Rajen and
+          Costa-juss{\`a}, Marta R. and
+          Federmann, Christian and
+          Fishel, Mark and
+          Fraser, Alexander and
+          Freitag, Markus and
+          Graham, Yvette and
+          Grundkiewicz, Roman and
+          Guzman, Paco and
+          Haddow, Barry and
+          Huck, Matthias and
+          Jimeno Yepes, Antonio and
+          Kocmi, Tom and
+          Martins, Andr{\'e} and
+          Morishita, Makoto and
+          Monz, Christof and
+          Nagata, Masaaki and
+          Nakazawa, Toshiaki and
+          Negri, Matteo and
+          N{\'e}v{\'e}ol, Aur{\'e}lie and
+          Neves, Mariana and
+          Popel, Martin and
+          Turchi, Marco and
+          Zampieri, Marcos},
+        booktitle = "Proceedings of the Seventh Conference on Machine Translation (WMT)",
+        month = dec,
+        year = "2022",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2022.wmt-1.52",
+    }
+    """
+
+    METRIC_NAME = "comet"
+
+    def __init__(self, task: str, model_name: str = "Unbabel/wmt22-comet-da", device: str = "cpu"):
+        self.model_name = model_name
+        self.comet_scorer: nn.Module = self._load_model(model_name)
+        self.num_gpus = 0 if device == "cpu" else 1
+
+        # suppress warnings from PyTorch Lightning which spams terminal
+        logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.WARNING)
+        logging.getLogger("lightning.pytorch.accelerators.cuda").setLevel(logging.WARNING)
+
+    @staticmethod
+    def _load_model(model_name: str) -> nn.Module:
+        """Load Comet model from the checkpoint.
+
+        Returns:
+            The loaded model.
+        """
+        return comet.load_from_checkpoint(comet.download_model(model_name))
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        hlog(
+            f"Setting parallelism from {parallelism} to 1, since "
+            f"evaluating {self.__class__.__name__} with parallelism > 1 seg faults."
+        )
+        return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=1)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Compute the COMET score for this instance"""
+        assert len(request_state.instance.references) == 1
+        ref = request_state.instance.references[0].output.text
+        src = request_state.instance.input.text
+
+        result = request_state.result
+        if not isinstance(result, RequestResult):
+            raise TypeError(f"Expected a valid result, but got {result}!")
+        mt = result.completions[0].text.strip()
+
+        # comet requires this exact format
+        data = [dict(ref=ref, src=src, mt=mt)]
+        output = self.comet_scorer.predict(data, gpus=self.num_gpus, progress_bar=False)  # type: ignore
+        comet_score = output[0][0]  # extract the actual score
+
+        metric_result = [Stat(MetricName(self.METRIC_NAME)).add(comet_score)]
+
+        return metric_result
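As a rough orientation (not from the package), the new metric might be instantiated directly as below; the task string is an arbitrary illustrative value, and the default "Unbabel/wmt22-comet-da" checkpoint is downloaded on first use:

```python
# Illustrative sketch only; requires the optional unbabel-comet dependency.
from helm.benchmark.metrics.comet_metric import CometMetric

comet_metric = CometMetric(task="machine_translation", device="cpu")  # "machine_translation" is a placeholder
# For each instance, evaluate_generation() builds dict(ref=..., src=..., mt=...) and
# records a single "comet" Stat in the range [0, 1].
```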
helm/benchmark/metrics/common_metric_specs.py

@@ -44,11 +44,18 @@ def get_language_modeling_metric_specs(names: List[str]) -> List[MetricSpec]:
     ]
 
 
-def get_classification_metric_specs(
+def get_classification_metric_specs(
+    labels: Optional[List[str]] = None, delimiter: Optional[str] = None
+) -> List[MetricSpec]:
+    extra_args: Dict[str, Any] = {}
+    if labels:
+        extra_args["labels"] = labels
+    if delimiter:
+        extra_args["delimiter"] = delimiter
     return [
         MetricSpec(
             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args=
+            args=extra_args,
         )
     ]
 
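A hedged example of the new call shape (the label values are illustrative):

```python
# Illustrative sketch only: run specs can now forward labels/delimiter to ClassificationMetric.
from helm.benchmark.metrics.common_metric_specs import get_classification_metric_specs

metric_specs = get_classification_metric_specs(labels=["yes", "no"], delimiter=",")
# Roughly equivalent to:
# [MetricSpec(class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
#             args={"labels": ["yes", "no"], "delimiter": ","})]
```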
@@ -0,0 +1,72 @@
+import re
+from typing import Any, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+from helm.common.hierarchical_logger import hwarn
+
+
+def _strip_string(str: str) -> Any:
+    # from https://stackoverflow.com/a/4703508
+    numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?"
+    match = re.search(numeric_const_pattern, str)
+    if match:
+        try:
+            return float(str[match.start() : match.end()])
+        except Exception:
+            return None
+    return None
+
+
+def float_equiv(str1: str, str2: str, eps: float = 1e-6) -> float:
+    """Check if two values have the same float value, up to a small tolerance.
+
+    This is the implementation used in the IBM Enterprise Benchmark paper.
+
+    Note: This is a "mostly-correct" equality function and does not handle some cases correctly:
+
+    - If both values are non-floats, then it will always return 1.0,
+      regardless of whether the strings match.
+    - If either or both values have different units (e.g. currency symbols,
+      trailing "M" or "B", trailing %), the values will not be converted to the same
+      units before comparison.
+    """
+    try:
+        ss1 = _strip_string(str1)
+        ss2 = _strip_string(str2)
+
+        if ss1 is None or ss2 is None:
+            hwarn("float_equiv returning 1.0 because both values are non-floats")
+            return 0.0
+        return float(abs(ss1 - ss2) < eps)
+    except Exception:
+        return float(str1 == str2)
+
+
+class ConvFinQACalcMetric(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_answer = request_state.result.completions[0].text
+
+        assert len(request_state.instance.references) == 1
+        assert len(request_state.instance.references[0].tags) == 1
+        assert request_state.instance.references[0].tags[0] == CORRECT_TAG
+        gold_answer = request_state.instance.references[0].output.text
+
+        return [
+            Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
+        ]
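A few worked examples of `float_equiv`, following directly from the regex-based number extraction above; the importing module path is an assumption, since the file name is not visible in this hunk.

```python
# Hedged examples of float_equiv; the module path below is hypothetical.
from helm.benchmark.metrics.conv_fin_qa_calc_metrics import float_equiv  # hypothetical path

assert float_equiv("The total is 3.14 million", "3.14") == 1.0  # first number on each side matches
assert float_equiv("-2.5e3", "-2500") == 1.0                    # scientific notation is parsed
assert float_equiv("5%", "0.05") == 0.0                         # units/percentages are not normalized
```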
@@ -9,10 +9,10 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import RequestResult
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat

 try:
     import numba
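This hunk, and several of the hunks that follow, apply the same mechanical change: package-relative imports are replaced with fully qualified ones. A before/after sketch of the pattern, using module names taken from the hunk above:

```python
# Before: relative import, resolved against the importing module's own package.
# from .metric_name import MetricName
# After: absolute import, valid regardless of where the importing file sits.
from helm.benchmark.metrics.metric_name import MetricName
```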
@@ -0,0 +1,29 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class CzechBankQAMetrics(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        # assert len(request_state.instance.references) > 1
+        # category_text = request_state.instance.references[0].output.text
+        # category_parts = category_text.split(".")
+        # assert len(category_parts) == 3
+        assert request_state.annotations
+        error_rate = 0.0 if request_state.annotations["czech_bank_qa"]["error"] is None else 1.0
+        return [
+            Stat(MetricName("error_rate")).add(error_rate),
+        ]
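The metric above only inspects the `error` field that the `czech_bank_qa` annotator attaches to each request state; a minimal sketch of that dependency (any annotation fields beyond `error` are assumptions):

```python
# Hedged sketch of the annotation lookup performed by CzechBankQAMetrics.
annotations = {"czech_bank_qa": {"error": None}}  # successful execution (assumed shape)
error_rate = 0.0 if annotations["czech_bank_qa"]["error"] is None else 1.0
assert error_rate == 0.0

annotations = {"czech_bank_qa": {"error": "no such table"}}  # illustrative error message
error_rate = 0.0 if annotations["czech_bank_qa"]["error"] is None else 1.0
assert error_rate == 1.0
```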
@@ -1,8 +1,8 @@
 from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
 import numpy as np
 from fairlearn.metrics import equalized_odds_difference
@@ -1,7 +1,7 @@
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from typing import List
 import re

@@ -3,8 +3,8 @@ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from helm.common.request import GeneratedOutput
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat

 import numpy as np

@@ -0,0 +1,14 @@
+from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
+
+
+class DischargeMeMetric(LLMJuryMetric):
+    """Score metrics for DischargeMe."""
+
+    def __init__(self):
+        super().__init__(
+            metric_name="dischargeme_accuracy",
+            scenario_name="dischargeme",
+            annotator_models=ANNOTATOR_MODELS,
+            default_score=1.0,
+        )
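`DischargeMeMetric` is one of several thin `LLMJuryMetric` subclasses added in this release. A hedged sketch of the same pattern for a hypothetical scenario follows; the annotator module and names below are placeholders, not real package symbols.

```python
# Hedged sketch: the LLMJuryMetric subclass pattern applied to a made-up scenario.
from helm.benchmark.annotation.my_scenario_annotator import ANNOTATOR_MODELS  # hypothetical module
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


class MyScenarioMetric(LLMJuryMetric):
    """Score metrics for a hypothetical MyScenario task."""

    def __init__(self):
        super().__init__(
            metric_name="my_scenario_accuracy",
            scenario_name="my_scenario",
            annotator_models=ANNOTATOR_MODELS,
            default_score=1.0,
        )
```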
@@ -11,10 +11,10 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import RequestResult, GeneratedOutput
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat

 try:
     from sacrebleu.metrics import BLEU
@@ -8,11 +8,11 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-from .metric import MetricInterface, MetricResult, PerInstanceStats
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .tokens.auto_token_cost_estimator import AutoTokenCostEstimator
-from .tokens.token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator


 @dataclass
@@ -3,15 +3,15 @@ from typing import Dict, List, Optional
 import json
 import importlib_resources as resources

-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat


 EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data"
@@ -112,8 +112,8 @@ class EfficiencyMetric:
         if num_prompt_tokens <= num_output_tokens:
             num_output_tokens -= num_prompt_tokens
         else:
-
-                f"
+            hwarn(
+                f"num_prompt_tokens ({num_prompt_tokens}) > num_output_tokens ({num_output_tokens}) "
                 f"for prompt: {prompt}"
             )
             num_output_tokens = 0
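The branch above adjusts the output-token count when a model echoes the prompt: if the completion is at least as long as the prompt, the prompt tokens are subtracted; otherwise a warning is logged and the count is clamped to zero. A standalone sketch of that arithmetic (not package code):

```python
# Hedged sketch of the token-count adjustment in EfficiencyMetric above.
def adjust_output_tokens(num_prompt_tokens: int, num_output_tokens: int) -> int:
    if num_prompt_tokens <= num_output_tokens:
        return num_output_tokens - num_prompt_tokens  # drop the echoed prompt tokens
    return 0  # completion shorter than prompt: treat the generated count as zero

assert adjust_output_tokens(100, 130) == 30
assert adjust_output_tokens(100, 80) == 0
```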