crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,752 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
metrics:
|
|
4
|
+
# Infrastructure metrics:
|
|
5
|
+
- name: num_perplexity_tokens
|
|
6
|
+
display_name: '# tokens'
|
|
7
|
+
description: Average number of tokens in the predicted output (for language modeling, the input too).
|
|
8
|
+
- name: num_bytes
|
|
9
|
+
display_name: '# bytes'
|
|
10
|
+
description: Average number of bytes in the predicted output (for language modeling, the input too).
|
|
11
|
+
|
|
12
|
+
- name: num_references
|
|
13
|
+
display_name: '# ref'
|
|
14
|
+
description: Number of references.
|
|
15
|
+
- name: num_train_trials
|
|
16
|
+
display_name: '# trials'
|
|
17
|
+
description: Number of trials, where in each trial we choose an independent, random set of training instances.
|
|
18
|
+
- name: estimated_num_tokens_cost
|
|
19
|
+
display_name: 'cost'
|
|
20
|
+
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
|
|
21
|
+
- name: num_prompt_tokens
|
|
22
|
+
display_name: '# prompt tokens'
|
|
23
|
+
description: Number of tokens in the prompt.
|
|
24
|
+
- name: num_prompt_characters
|
|
25
|
+
display_name: '# prompt chars'
|
|
26
|
+
description: Number of characters in the prompt.
|
|
27
|
+
- name: num_completion_tokens
|
|
28
|
+
display_name: '# completion tokens'
|
|
29
|
+
description: Actual number of completion tokens (over all completions).
|
|
30
|
+
- name: num_output_tokens
|
|
31
|
+
display_name: '# output tokens'
|
|
32
|
+
description: Actual number of output tokens.
|
|
33
|
+
- name: max_num_output_tokens
|
|
34
|
+
display_name: 'Max output tokens'
|
|
35
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
|
|
36
|
+
- name: num_requests
|
|
37
|
+
display_name: '# requests'
|
|
38
|
+
description: Number of distinct API requests.
|
|
39
|
+
- name: num_instances
|
|
40
|
+
display_name: '# eval'
|
|
41
|
+
description: Number of evaluation instances.
|
|
42
|
+
- name: num_train_instances
|
|
43
|
+
display_name: '# train'
|
|
44
|
+
description: Number of training instances (e.g., in-context examples).
|
|
45
|
+
- name: prompt_truncated
|
|
46
|
+
display_name: truncated
|
|
47
|
+
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
|
|
48
|
+
- name: finish_reason_length
|
|
49
|
+
display_name: finish b/c length
|
|
50
|
+
description: Fraction of instances where the the output was terminated because of the max tokens limit.
|
|
51
|
+
- name: finish_reason_stop
|
|
52
|
+
display_name: finish b/c stop
|
|
53
|
+
description: Fraction of instances where the the output was terminated because of the stop sequences.
|
|
54
|
+
- name: finish_reason_endoftext
|
|
55
|
+
display_name: finish b/c endoftext
|
|
56
|
+
description: Fraction of instances where the the output was terminated because the end of text token was generated.
|
|
57
|
+
- name: finish_reason_unknown
|
|
58
|
+
display_name: finish b/c unknown
|
|
59
|
+
description: Fraction of instances where the the output was terminated for unknown reasons.
|
|
60
|
+
- name: num_completions
|
|
61
|
+
display_name: '# completions'
|
|
62
|
+
description: Number of completions.
|
|
63
|
+
- name: predicted_index
|
|
64
|
+
display_name: Predicted index
|
|
65
|
+
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
|
|
66
|
+
|
|
67
|
+
# Accuracy metrics:
|
|
68
|
+
- name: exact_match
|
|
69
|
+
display_name: Exact match
|
|
70
|
+
short_display_name: EM
|
|
71
|
+
description: Fraction of instances that the predicted output matches a correct reference exactly.
|
|
72
|
+
lower_is_better: false
|
|
73
|
+
- name: quasi_exact_match
|
|
74
|
+
display_name: Quasi-exact match
|
|
75
|
+
short_display_name: EM
|
|
76
|
+
description: Fraction of instances that the predicted output matches a correct reference up to light processing.
|
|
77
|
+
lower_is_better: false
|
|
78
|
+
- name: prefix_exact_match
|
|
79
|
+
display_name: Prefix exact match
|
|
80
|
+
short_display_name: PEM
|
|
81
|
+
description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
|
|
82
|
+
lower_is_better: false
|
|
83
|
+
- name: quasi_prefix_exact_match
|
|
84
|
+
# TODO: should call this prefix_quasi_exact_match
|
|
85
|
+
display_name: Prefix quasi-exact match
|
|
86
|
+
short_display_name: PEM
|
|
87
|
+
description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
|
|
88
|
+
lower_is_better: false
|
|
89
|
+
|
|
90
|
+
- name: rouge_1
|
|
91
|
+
display_name: ROUGE-1
|
|
92
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
|
|
93
|
+
lower_is_better: false
|
|
94
|
+
- name: rouge_2
|
|
95
|
+
display_name: ROUGE-2
|
|
96
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
|
|
97
|
+
lower_is_better: false
|
|
98
|
+
- name: rouge_l
|
|
99
|
+
display_name: ROUGE-L
|
|
100
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
|
|
101
|
+
lower_is_better: false
|
|
102
|
+
- name: bleu_1
|
|
103
|
+
display_name: BLEU-1
|
|
104
|
+
description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
|
|
105
|
+
lower_is_better: false
|
|
106
|
+
- name: bleu_4
|
|
107
|
+
display_name: BLEU-4
|
|
108
|
+
description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
|
|
109
|
+
lower_is_better: false
|
|
110
|
+
- name: f1_score
|
|
111
|
+
display_name: F1
|
|
112
|
+
description: Average F1 score in terms of word overlap between the model output and correct reference.
|
|
113
|
+
lower_is_better: false
|
|
114
|
+
- name: cider
|
|
115
|
+
display_name: CIDEr
|
|
116
|
+
description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
|
|
117
|
+
lower_is_better: false
|
|
118
|
+
|
|
119
|
+
# Machine Translation metrics
|
|
120
|
+
- name: bleu
|
|
121
|
+
display_name: BLEU
|
|
122
|
+
short_display_name: BLEU
|
|
123
|
+
description: BLEU score based on [Post, (2018)](https://aclanthology.org/W18-6319/).
|
|
124
|
+
lower_is_better: false
|
|
125
|
+
|
|
126
|
+
# Speech Recognition metrics
|
|
127
|
+
- name: wer_score
|
|
128
|
+
display_name: Word Error Rate
|
|
129
|
+
short_display_name: WER
|
|
130
|
+
description: Word error rate between model predictions and ground truth answers for ASR tasks.
|
|
131
|
+
lower_is_better: true
|
|
132
|
+
|
|
133
|
+
- name: mer_score
|
|
134
|
+
display_name: Match Error Rate
|
|
135
|
+
short_display_name: MER
|
|
136
|
+
description: Word match error rate between model predictions and ground truth answers.
|
|
137
|
+
lower_is_better: true
|
|
138
|
+
|
|
139
|
+
- name: wip_score
|
|
140
|
+
display_name: Word Information Preservation
|
|
141
|
+
short_display_name: WIP
|
|
142
|
+
description: Word information preservation (WIP) for evaluating the preserved information of ASR.
|
|
143
|
+
lower_is_better: false
|
|
144
|
+
|
|
145
|
+
- name: cer_score
|
|
146
|
+
display_name: Character Error Rate
|
|
147
|
+
short_display_name: CER
|
|
148
|
+
description: Character error rate (CER) for evaluating the accuracy of ASR.
|
|
149
|
+
lower_is_better: true
|
|
150
|
+
|
|
151
|
+
- name: chinese_wer_score
|
|
152
|
+
display_name: Chinese Word Error Rate
|
|
153
|
+
short_display_name: Chinese WER
|
|
154
|
+
description: Chinese word error rate between model predictions and ground truth answers for ASR tasks.
|
|
155
|
+
lower_is_better: true
|
|
156
|
+
|
|
157
|
+
- name: chinese_mer_score
|
|
158
|
+
display_name: Chinese Match Error Rate
|
|
159
|
+
short_display_name: Chinese MER
|
|
160
|
+
description: Chinese word match error rate between model predictions and ground truth answers.
|
|
161
|
+
lower_is_better: true
|
|
162
|
+
|
|
163
|
+
- name: chinese_wip_score
|
|
164
|
+
display_name: Chinese Word Information Preservation
|
|
165
|
+
short_display_name: Chinese WIP
|
|
166
|
+
description: Chinese word information preservation (WIP) for evaluating the preserved information of ASR.
|
|
167
|
+
lower_is_better: false
|
|
168
|
+
|
|
169
|
+
- name: chinese_cer_score
|
|
170
|
+
display_name: Chinese Character Error Rate
|
|
171
|
+
short_display_name: Chinese CER
|
|
172
|
+
description: Chinese character error rate (CER) for evaluating the accuracy of Chiese ASR.
|
|
173
|
+
lower_is_better: true
|
|
174
|
+
|
|
175
|
+
# Toxicity metrics
|
|
176
|
+
- name: expected_max_toxicity
|
|
177
|
+
display_name: Average max toxicity
|
|
178
|
+
short_display_name: Avg. max toxicity
|
|
179
|
+
lower_is_better: true
|
|
180
|
+
description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
|
|
181
|
+
|
|
182
|
+
- name: max_toxicity_probability
|
|
183
|
+
display_name: max toxicity
|
|
184
|
+
lower_is_better: true
|
|
185
|
+
description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
|
|
186
|
+
|
|
187
|
+
- name: toxic_frac
|
|
188
|
+
display_name: Toxic fraction
|
|
189
|
+
lower_is_better: true
|
|
190
|
+
description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
|
|
191
|
+
|
|
192
|
+
- name: gpt4_audio_critique
|
|
193
|
+
display_name: GPT-4o Judge Critique
|
|
194
|
+
lower_is_better: false
|
|
195
|
+
description: Judgements by GPT-4o
|
|
196
|
+
|
|
197
|
+
############################################################
|
|
198
|
+
perturbations: []
|
|
199
|
+
|
|
200
|
+
############################################################
|
|
201
|
+
metric_groups:
|
|
202
|
+
- name: accuracy
|
|
203
|
+
display_name: Accuracy
|
|
204
|
+
metrics:
|
|
205
|
+
- name: ${main_name}
|
|
206
|
+
split: ${main_split}
|
|
207
|
+
|
|
208
|
+
- name: efficiency
|
|
209
|
+
display_name: Efficiency
|
|
210
|
+
metrics:
|
|
211
|
+
- name: inference_runtime
|
|
212
|
+
split: ${main_split}
|
|
213
|
+
|
|
214
|
+
- name: general_information
|
|
215
|
+
display_name: General information
|
|
216
|
+
metrics:
|
|
217
|
+
- name: num_instances
|
|
218
|
+
split: ${main_split}
|
|
219
|
+
- name: num_train_instances
|
|
220
|
+
split: ${main_split}
|
|
221
|
+
- name: prompt_truncated
|
|
222
|
+
split: ${main_split}
|
|
223
|
+
- name: num_prompt_tokens
|
|
224
|
+
split: ${main_split}
|
|
225
|
+
- name: num_output_tokens
|
|
226
|
+
split: ${main_split}
|
|
227
|
+
|
|
228
|
+
############################################################
|
|
229
|
+
|
|
230
|
+
run_groups:
|
|
231
|
+
- name: audio_scenarios
|
|
232
|
+
display_name: Audio Scenarios
|
|
233
|
+
description: Audio Scenarios
|
|
234
|
+
category: All scenarios
|
|
235
|
+
subgroups:
|
|
236
|
+
- auditory_perception
|
|
237
|
+
- emotion_detection
|
|
238
|
+
- knowledge
|
|
239
|
+
- robustness
|
|
240
|
+
- multilinguality
|
|
241
|
+
- toxicity_detection
|
|
242
|
+
- safety
|
|
243
|
+
- fairness
|
|
244
|
+
- bias
|
|
245
|
+
|
|
246
|
+
- name: auditory_perception
|
|
247
|
+
display_name: Auditory Perception
|
|
248
|
+
description: Interpreting fundamental information in an audio clip
|
|
249
|
+
category: Core scenarios
|
|
250
|
+
subgroups:
|
|
251
|
+
- audiocaps
|
|
252
|
+
- voxceleb2
|
|
253
|
+
- vocal_sound
|
|
254
|
+
- librispeech
|
|
255
|
+
|
|
256
|
+
- name: emotion_detection
|
|
257
|
+
display_name: Emotion Detection
|
|
258
|
+
description: Detecting emotions in audio clips.
|
|
259
|
+
category: Core scenarios
|
|
260
|
+
subgroups:
|
|
261
|
+
- meld_audio
|
|
262
|
+
- mustard
|
|
263
|
+
|
|
264
|
+
- name: knowledge
|
|
265
|
+
display_name: Knowledge
|
|
266
|
+
description: Recalling facts or information contained in the audio LLM.
|
|
267
|
+
category: Core scenarios
|
|
268
|
+
subgroups:
|
|
269
|
+
- air_bench_chat
|
|
270
|
+
- air_bench_foundation
|
|
271
|
+
|
|
272
|
+
- name: robustness
|
|
273
|
+
display_name: Robustness
|
|
274
|
+
description: Producing desired answers under invariant perturbations to the audio.
|
|
275
|
+
category: Core scenarios
|
|
276
|
+
subgroups:
|
|
277
|
+
- speech_robust_bench
|
|
278
|
+
|
|
279
|
+
- name: multilinguality
|
|
280
|
+
display_name: Multilinguality
|
|
281
|
+
description: Performs the same task when the language is changed
|
|
282
|
+
category: Core scenarios
|
|
283
|
+
subgroups:
|
|
284
|
+
- covost2
|
|
285
|
+
- fleurs
|
|
286
|
+
- multilingual_librispeech
|
|
287
|
+
|
|
288
|
+
- name: toxicity_detection
|
|
289
|
+
display_name: Toxicity detection
|
|
290
|
+
description: Identifying and avoiding offensive or damaging materials.
|
|
291
|
+
category: Core scenarios
|
|
292
|
+
subgroups:
|
|
293
|
+
- mutox
|
|
294
|
+
|
|
295
|
+
- name: safety
|
|
296
|
+
display_name: Safety
|
|
297
|
+
description: Refusing to produce answers that cause harm to humans.
|
|
298
|
+
category: Core scenarios
|
|
299
|
+
subgroups:
|
|
300
|
+
- voice_jailbreak_attacks
|
|
301
|
+
|
|
302
|
+
- name: fairness
|
|
303
|
+
display_name: Fairness
|
|
304
|
+
description: Whether the model is fair to all groups or demographics.
|
|
305
|
+
category: Core scenarios
|
|
306
|
+
subgroups:
|
|
307
|
+
- fleurs_fairness
|
|
308
|
+
- librispeech_fairness
|
|
309
|
+
|
|
310
|
+
- name: bias
|
|
311
|
+
display_name: Bias
|
|
312
|
+
description: Whether the model is biased towards certain groups or demographics.
|
|
313
|
+
category: Core scenarios
|
|
314
|
+
subgroups:
|
|
315
|
+
- parade
|
|
316
|
+
|
|
317
|
+
- name: covost2
|
|
318
|
+
display_name: CoVost-2
|
|
319
|
+
description: >
|
|
320
|
+
CoVost-2 is a large-scale multilingual speech translation corpus covering translations from 21 languages
|
|
321
|
+
into English and from English into 15 languages.
|
|
322
|
+
|
|
323
|
+
The dataset contains the audio, transcriptions, and translations in the following languages:
|
|
324
|
+
French, German, Dutch, Russian, Spanish, Italian, Turkish, Persian, Swedish, Mongolian, Chinese,
|
|
325
|
+
Welsh, Catalan, Slovenian, Estonian, Indonesian, Arabic, Tamil, Portuguese, Latvian, and Japanese
|
|
326
|
+
([Wang et al, 2020](https://arxiv.org/abs/2007.10310)).
|
|
327
|
+
metric_groups:
|
|
328
|
+
- accuracy
|
|
329
|
+
- general_information
|
|
330
|
+
environment:
|
|
331
|
+
main_name: bleu
|
|
332
|
+
main_split: test
|
|
333
|
+
taxonomy:
|
|
334
|
+
task: audio machine translation
|
|
335
|
+
what: audio, transcriptions, and translations in 15 languages
|
|
336
|
+
who: real speakers
|
|
337
|
+
when: "2020"
|
|
338
|
+
language: 15 languages
|
|
339
|
+
|
|
340
|
+
- name: vocal_sound
|
|
341
|
+
display_name: VocalSound
|
|
342
|
+
description: >
|
|
343
|
+
VocalSound dataset consisting of over 21,000 crowdsourced recordings of laughter, sighs, coughs, throat
|
|
344
|
+
clearing, sneezes, and sniffs from 3,365 unique subjects.
|
|
345
|
+
|
|
346
|
+
Different from previous datasets, the VocalSound dataset contains meta information such as speaker
|
|
347
|
+
age, gender, native language, country, and health condition ([Gong et al, 2022](https://arxiv.org/abs/2205.03433)).
|
|
348
|
+
metric_groups:
|
|
349
|
+
- accuracy
|
|
350
|
+
- general_information
|
|
351
|
+
environment:
|
|
352
|
+
main_name: quasi_exact_match
|
|
353
|
+
main_split: test
|
|
354
|
+
taxonomy:
|
|
355
|
+
task: audio classification
|
|
356
|
+
what: audio samples of behaviours ("cough", "laughter", "sigh", "sneeze", "sniff", "throatclearing")
|
|
357
|
+
who: real speakers
|
|
358
|
+
when: "2022"
|
|
359
|
+
language: English
|
|
360
|
+
|
|
361
|
+
- name: multilingual_librispeech
|
|
362
|
+
display_name: Multilingual Librispeech
|
|
363
|
+
description: >
|
|
364
|
+
Multilingual Librispeech is derived from read audiobooks from LibriVox and consists of 8 languages,
|
|
365
|
+
including about 44.5K hours of English and a total of about 6K hours for other languages.
|
|
366
|
+
|
|
367
|
+
The dataset contains the audio and transcriptions in the following languages:
|
|
368
|
+
Dutch, German, French, Spanish, Italian, Portuguese", Polish ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411)).
|
|
369
|
+
metric_groups:
|
|
370
|
+
- accuracy
|
|
371
|
+
- general_information
|
|
372
|
+
environment:
|
|
373
|
+
main_name: wer_score
|
|
374
|
+
main_split: test
|
|
375
|
+
taxonomy:
|
|
376
|
+
task: audio captioning
|
|
377
|
+
what: audio and transcriptions in 8 languages
|
|
378
|
+
who: real speakers
|
|
379
|
+
when: "2020"
|
|
380
|
+
language: 8 languages
|
|
381
|
+
|
|
382
|
+
- name: fleurs
|
|
383
|
+
display_name: FLEURS
|
|
384
|
+
description: >
|
|
385
|
+
FLEURS is an n-way parallel speech dataset in 102 languages built on top of the machine translation FLoRes-101
|
|
386
|
+
benchmark, with approximately 12 hours of speech supervision per language. FLEURS can be used for a variety of
|
|
387
|
+
speech tasks, including Automatic Speech Recognition (ASR), Speech Language Identification (Speech LangID),
|
|
388
|
+
Translation and Retrieval.
|
|
389
|
+
|
|
390
|
+
The dataset contains the audio, transcriptions, and language in 102 different languages, which are divided into
|
|
391
|
+
7 language groups: Western European, Eastern European, Central Asia Middle North African, Sub Saharan African,
|
|
392
|
+
South Asian, South East Asian, Chinese Japanase Korean ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)).
|
|
393
|
+
metric_groups:
|
|
394
|
+
- accuracy
|
|
395
|
+
- general_information
|
|
396
|
+
environment:
|
|
397
|
+
main_name: wer_score
|
|
398
|
+
main_split: test
|
|
399
|
+
taxonomy:
|
|
400
|
+
task: audio classification
|
|
401
|
+
what: audio, transcripts, and language names in 102 languages
|
|
402
|
+
who: real speakers
|
|
403
|
+
when: "2022"
|
|
404
|
+
language: 102 languages
|
|
405
|
+
|
|
406
|
+
- name: fleurs_fairness
|
|
407
|
+
display_name: FLEURS Fairness
|
|
408
|
+
description: >
|
|
409
|
+
FLEURS is an n-way parallel speech dataset in 102 languages built on top of the machine translation FLoRes-101
|
|
410
|
+
benchmark, with approximately 12 hours of speech supervision per language. FLEURS can be used for a variety of
|
|
411
|
+
speech tasks, including Automatic Speech Recognition (ASR), Speech Language Identification (Speech LangID),
|
|
412
|
+
Translation and Retrieval.
|
|
413
|
+
|
|
414
|
+
We only use the English subset of the dataset for the fairness task. We ask the model to do ASR on
|
|
415
|
+
audio files from different gender groups ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)).
|
|
416
|
+
metric_groups:
|
|
417
|
+
- accuracy
|
|
418
|
+
- general_information
|
|
419
|
+
environment:
|
|
420
|
+
main_name: wer_score
|
|
421
|
+
main_split: test
|
|
422
|
+
taxonomy:
|
|
423
|
+
task: audio classification
|
|
424
|
+
what: audio, transcripts, and gender of the speaker
|
|
425
|
+
who: real speakers
|
|
426
|
+
when: "2022"
|
|
427
|
+
language: English
|
|
428
|
+
|
|
429
|
+
- name: audiocaps
|
|
430
|
+
display_name: AudioCaps
|
|
431
|
+
description: >
|
|
432
|
+
AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
|
|
433
|
+
via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
|
|
434
|
+
musical instruments and genres, and common everyday environmental sounds.
|
|
435
|
+
([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
|
|
436
|
+
metric_groups:
|
|
437
|
+
- accuracy
|
|
438
|
+
- general_information
|
|
439
|
+
environment:
|
|
440
|
+
main_name: gpt4_audio_critique
|
|
441
|
+
main_split: test
|
|
442
|
+
taxonomy:
|
|
443
|
+
task: audio captioning
|
|
444
|
+
what: audio clips in the wild
|
|
445
|
+
who: real speakers
|
|
446
|
+
when: "2019"
|
|
447
|
+
language: English
|
|
448
|
+
|
|
449
|
+
- name: voxceleb2
|
|
450
|
+
display_name: VoxCeleb2
|
|
451
|
+
description: >
|
|
452
|
+
VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
|
|
453
|
+
interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
|
|
454
|
+
speakers with their gender, race, identity information in 145 different nationalities, covering
|
|
455
|
+
a wide range of accents, ages, ethnicities and languages.
|
|
456
|
+
([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
|
|
457
|
+
metric_groups:
|
|
458
|
+
- accuracy
|
|
459
|
+
- general_information
|
|
460
|
+
environment:
|
|
461
|
+
main_name: exact_match
|
|
462
|
+
main_split: test
|
|
463
|
+
taxonomy:
|
|
464
|
+
task: audio identification
|
|
465
|
+
what: audio clips in the wild
|
|
466
|
+
who: real speakers
|
|
467
|
+
when: "2018"
|
|
468
|
+
language: English, Germany, French
|
|
469
|
+
|
|
470
|
+
  - name: speech_robust_bench
    display_name: Robust Speech Bench
    description: >
      Speech Robust Bench (Shah et al, 2024) is a comprehensive benchmark for evaluating
      the robustness of ASR models to diverse corruptions. SRB is composed of 114 input
      perturbations which simulate a heterogeneous range of corruptions that ASR models
      may encounter when deployed in the wild. In this scenario, we select 4 subsets:
      accent_cv, accent_cv_es, chinme, and AIM for evaluation.

      The dataset contains the audio and transcriptions for all subsets
      ([Shah et al, 2024](https://arxiv.org/abs/2403.07937)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: wer_score
      main_split: test
    taxonomy:
      task: audio recognition
      what: audio, transcripts of audio samples in a wide range of perturbations
      who: real speakers
      when: "2024"
      language: English, Spanish

  - name: audio_pairs
    display_name: Audio PAIRS
    description: >
      Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
      racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
      clips using OpenAI's TTS-1-HD API. This dataset is also modified to add "unclear" as an
      opt-out choice.

      The dataset contains the audio and question for three subsets: occupation, status, and potential_crime.
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: audio classification
      what: audio and question of audio samples to examine models' gender and racial bias
      who: OpenAI's TTS-1-HD
      when: "2024"
      language: English

  - name: meld_audio
    display_name: Multimodal EmotionLines Dataset (MELD) Audio
    description: >
      The Multimodal EmotionLines Dataset (MELD) was created by enhancing and extending the EmotionLines dataset.
      MELD has more than 1400 dialogues and 13000 utterances from the Friends TV series, with multiple speakers
      participating in the dialogues. Each utterance in a dialogue is labeled with one of seven emotions:
      Anger, Disgust, Sadness, Joy, Neutral, Surprise, or Fear.
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: test
    taxonomy:
      task: audio classification
      what: Classify audio by emotion
      who: Friends TV series
      when: "2018"
      language: English

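MELD Audio uses `quasi_exact_match` as its main metric, whereas VoxCeleb2 and Audio PAIRS above use plain `exact_match`. The difference, roughly, is that the quasi variant normalizes both strings before comparing; the sketch below uses one plausible normalization (lowercasing, stripping punctuation, collapsing whitespace) and may differ in detail from HELM's own rules.

```python
import string

def exact_match(gold: str, pred: str) -> float:
    return 1.0 if gold == pred else 0.0

def quasi_exact_match(gold: str, pred: str) -> float:
    # Illustrative normalization: lowercase, drop punctuation, collapse whitespace.
    # HELM's actual normalization rules may differ.
    def normalize(text: str) -> str:
        text = text.lower().translate(str.maketrans("", "", string.punctuation))
        return " ".join(text.split())
    return 1.0 if normalize(gold) == normalize(pred) else 0.0

print(exact_match("Joy", "joy."))        # 0.0
print(quasi_exact_match("Joy", "joy."))  # 1.0
```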
  - name: casual_conversations2
    display_name: Casual Conversations 2
    description: >
      Casual Conversations v2 (Porgali et al, 2023) is composed of 5,567 participants (26,467 videos).
      The videos feature paid individuals who agreed to participate in the project and explicitly provided
      their own age, gender, language/dialect, geo-location, disability, physical adornment, and physical
      attribute labels. The videos were recorded in Brazil, India, Indonesia, Mexico, the Philippines,
      the United States, and Vietnam with a diverse set of adults in various categories.

      The dataset contains two classification tasks: age and gender classification
      ([Porgali et al., 2023](https://arxiv.org/abs/2303.04838)). We phrase both tasks as multiple-choice
      question answering.
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: audio classification
      what: audio, spoken language, speaker's gender, age information of audio samples
      who: real speakers
      when: "2023"
      language: 10 languages

  - name: air_bench_chat
    display_name: Air-Bench Chat
    description: >
      Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
      approximately 19k single-choice questions. The latter contains 2k instances of open-ended question-and-answer data.
      We consider the chat benchmark in this scenario.

      The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed.
      ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: gpt4_audio_critique
      main_split: test
    taxonomy:
      task: audio question answering
      what: audio, question, and answer of audio samples
      who: real speakers
      when: "2024"
      language: English

  - name: air_bench_foundation
    display_name: Air-Bench Foundation
    description: >
      Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
      approximately 19k single-choice questions. The latter contains 2k instances of open-ended question-and-answer data.
      We consider the foundation benchmark in this scenario.

      The dataset contains the audio multiple-choice question answering task in three subjects: music instrument classification, music
      genre recognition, and music question-answering.
      ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: audio question answering
      what: audio, question, and answer of the question
      who: real speakers
      when: "2024"
      language: English

  - name: mutox
    display_name: MuTox
    description: >
      MuTox is a multilingual dataset for audio-based toxicity detection. It consists of 20k audio
      utterances for English and Spanish, and 4k for the other languages
      ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: toxicity detection
      what: samples of utterances
      who: real speakers
      when: "2024"
      language: 30 languages

  - name: mustard
    display_name: MUStARD
    description: >
      A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
      TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous.
      ([Castro et al, 2019](https://arxiv.org/abs/1906.01815)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: sarcasm detection
      what: clips from television shows
      who: real speakers
      when: "2019"
      language: English

  - name: voice_jailbreak_attacks
    display_name: Voice Jailbreak Attacks Against GPT-4o
    description: >
      Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: toxic_frac
      main_split: test
    taxonomy:
      task: refusal for safety
      what: voice jailbreak attacks against GPT-4o
      who: AI-generated speech
      when: "2024"
      language: English

  - name: ami
    display_name: AMI Meeting Corpus
    description: >
      The AMI Meeting Corpus (Carletta et al. 2005) is a multi-modal data set consisting of
      100 hours of meeting recordings. It was created in the context of a project developing
      meeting browsing technology. The corpus was recorded using a wide range of devices
      including close-talking and far-field microphones, individual and room-view video
      cameras, projection, a whiteboard, and individual pens, all of which produce output
      signals that are synchronized with each other.

      The dataset contains the audio and transcriptions for all subsets
      ([Carletta et al, 2005](https://link.springer.com/chapter/10.1007/11677482_3)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: wer_score
      main_split: test
    taxonomy:
      task: audio recognition
      what: audio, transcripts of audio samples from meeting environments
      who: real speakers
      when: "2005"
      language: English

  - name: librispeech
    display_name: LibriSpeech
    description: >
      The LibriSpeech corpus (Panayotov et al. 2015) is derived from audiobooks that are part
      of the LibriVox project, and contains 1000 hours of speech sampled at 16 kHz. The
      data has separately prepared language-model training data and pre-built language models.
      This corpus is one of the most widely used ASR corpora and has been extended to many
      applications such as robust ASR and multilingual ASR tasks.

      The dataset contains the audio and transcriptions for all subsets
      ([Panayotov et al. 2015](https://ieeexplore.ieee.org/document/7178964)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: wer_score
      main_split: test
    taxonomy:
      task: audio recognition
      what: audio, transcripts of audio samples in daily scenarios
      who: real speakers
      when: "2015"
      language: English

  - name: librispeech_fairness
    display_name: LibriSpeech Fairness
    description: >
      The LibriSpeech corpus (Panayotov et al. 2015) is derived from audiobooks that are part
      of the LibriVox project, and contains 1000 hours of speech sampled at 16 kHz. The
      data has separately prepared language-model training data and pre-built language models.
      This corpus is one of the most widely used ASR corpora and has been extended to many
      applications such as robust ASR and multilingual ASR tasks.

      The dataset contains the audio and transcriptions for all subsets. We ask the model to do
      ASR on audio files from different gender groups
      ([Panayotov et al. 2015](https://ieeexplore.ieee.org/document/7178964)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: wer_score
      main_split: test
    taxonomy:
      task: audio recognition
      what: audio, transcripts of audio samples in daily scenarios
      who: real speakers
      when: "2015"
      language: English

  - name: parade
    display_name: PARADE
    description: >
      The PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status
      bias in vision-language models. We collect a new dataset for an audio-text multiple-choice QA
      task that probes occupation and status bias. The dataset consists of 436
      audio-text QA pairs with 3 options each.
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: audio classification
      what: audio, question and answer given the audio
      who: OpenAI's TTS
      when: "2025"
      language: English
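Every entry above follows the same shape: a `name`, display metadata, `metric_groups`, an `environment` naming the main metric and split, and a `taxonomy`. As an illustration of how such a fragment could be consumed, the sketch below parses it with PyYAML and prints each run group's main metric; the file path is a placeholder rather than the schema file actually bundled with this package.

```python
import yaml  # PyYAML

# "schema_audio.yaml" is a placeholder path for a file containing the
# run_groups entries listed above.
with open("schema_audio.yaml") as f:
    schema = yaml.safe_load(f)

for group in schema.get("run_groups", []):
    env = group.get("environment", {})
    print(f"{group['name']}: main metric {env.get('main_name')} on split {env.get('main_split')}")
```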