crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py

```diff
@@ -0,0 +1,96 @@
+from typing import List
+import os
+import json
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+from helm.common.general import ensure_file_downloaded
+
+
+class AMIScenario(Scenario):
+    """AMI Meeting Corpus
+    The AMI Meeting Corpus (Carletta et al. 2005) is a multi-modal data set consisting of
+    100 hours of meeting recordings. It is being created in the context of a project that
+    is developing meeting browsing technology. The corpus is being recorded using a wide
+    range of devices including close-talking and far-field microphones, individual and
+    room-view video cameras, projection, a whiteboard, and individual pens, all of which
+    produce output signals that are synchronized with each other.
+
+    Paper: https://link.springer.com/chapter/10.1007/11677482_3
+    Code: https://groups.inf.ed.ac.uk/ami/corpus/
+
+    Citation:
+    @inproceedings{Carletta2005TheAM,
+    title={The AMI Meeting Corpus: A Pre-announcement},
+    author={Jean Carletta and Simone Ashby and Sebastien Bourban and Mike Flynn and Ma{\"e}l Guillemot
+    and Thomas Hain and Jaroslav Kadlec and Vasilis Karaiskos and Wessel Kraaij and Melissa Kronenthal
+    and Guillaume Lathoud and Mike Lincoln and Agnes Lisowska Masson and Iain McCowan and Wilfried Post
+    and Dennis Reidsma and Pierre D. Wellner},
+    booktitle={Machine Learning for Multimodal Interaction},
+    year={2005},
+    url={https://api.semanticscholar.org/CorpusID:6118869}
+    }
+    """
+
+    HF_DATASET_NAME = "edinburghcstr/ami"
+    HF_MAPPING_URL = (
+        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
+    )
+    name = "ami"
+    description = (
+        "Speech recognition of speech recorded with different devices "
+        "([Carletta et al, 2005](https://link.springer.com/chapter/10.1007/11677482_3))."
+    )
+    SUBJECT_DICT = {
+        "ihm": {"mapping_key": "ami_ihm_id2line", "srb_mapping": "nearfield"},
+        "sdm": {"mapping_key": "ami_sdm_id2line", "srb_mapping": "farfield"},
+    }
+    tags: List[str] = ["audio", "recognition"]
+
+    def __init__(self, subject: str) -> None:
+        super().__init__()
+        subject = subject.lower()
+        if subject not in AMIScenario.SUBJECT_DICT.keys():
+            raise ValueError(f"Invalid subject. Valid subjects are: {AMIScenario.SUBJECT_DICT.keys()}")
+
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
+        ensure_file_downloaded(source_url=AMIScenario.HF_MAPPING_URL, target_path=mapping_local_path)
+        meta_data = load_dataset(
+            AMIScenario.HF_DATASET_NAME,
+            name=self._subject,
+            cache_dir=output_path,
+            split=TEST_SPLIT,
+        )
+        index_mappings = AMIScenario.SUBJECT_DICT[self._subject]["mapping_key"]
+        srb_mappings = AMIScenario.SUBJECT_DICT[self._subject]["srb_mapping"]
+        mapping_dict = json.load(open(mapping_local_path))
+        srb_mapping_keys = mapping_dict["srb_aim_field_key2audio"][srb_mappings]
+        index2line_num = mapping_dict[index_mappings]
+        for line_num in tqdm(list(srb_mapping_keys)):
+            row = meta_data[int(index2line_num[line_num])]
+            local_audio_name = f"{self._subject}_{line_num}.wav"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+            answer = row["text"].lower()
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
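The new audio scenarios in this release all follow the same contract: construct the scenario (optionally with a subset argument), then call `get_instances(output_path)` to materialize audio files on disk and return labeled `Instance` objects. A minimal smoke test for the AMI scenario, as a sketch assuming crfm-helm 0.5.5 is installed and the datasets are reachable; `./scratch` is an arbitrary directory, not a HELM convention:

```python
# Minimal sketch, not part of this diff: exercise the new AMI scenario locally.
from helm.benchmark.scenarios.audio_language.ami_scenario import AMIScenario

scenario = AMIScenario(subject="ihm")  # "ihm" (nearfield) or "sdm" (farfield)
instances = scenario.get_instances(output_path="./scratch")

first = instances[0]
print(len(instances))  # number of test instances
print(first.input.multimedia_content.media_objects[0].location)  # path to the extracted .wav
print(first.references[0].output.text)  # gold transcript, lowercased by the scenario
```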
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py

```diff
@@ -0,0 +1,62 @@
+"""Scenarios for audio models"""
+
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+
+
+class AudioMNISTScenario(Scenario):
+    """AudioMNIST
+
+    The AudioMNIST (Becker et al, 2023) dataset consists of a dataset of 30000 audio samples of
+    spoken digits (0-9) of 60 different speakers. The task is to classify the digit from the
+    audio sample.
+
+    Paper: https://arxiv.org/abs/1807.03418
+    Code: https://github.com/soerenab/AudioMNIST
+
+    Citation:
+    @article{audiomnist2023,
+    title = {AudioMNIST: Exploring Explainable Artificial Intelligence for audio analysis on a simple benchmark},
+    journal = {Journal of the Franklin Institute},
+    year = {2023},
+    issn = {0016-0032},
+    doi = {https://doi.org/10.1016/j.jfranklin.2023.11.038},
+    url = {https://www.sciencedirect.com/science/article/pii/S0016003223007536},
+    author = {Sören Becker and Johanna Vielhaben and Marcel Ackermann and Klaus-Robert Müller and Sebastian Lapuschkin and Wojciech Samek},
+    keywords = {Deep learning, Neural networks, Interpretability, Explainable artificial intelligence, Audio classification, Speech recognition},
+    }
+    """  # noqa: E501
+
+    name = "audio_mnist"
+    description = "Classify an audio sample of a spoken digit ([Becker et al, 2023](https://arxiv.org/abs/1807.03418))."
+    tags = ["audio", "classification"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        wav_save_dir: str = os.path.join(output_path, "audio_files")
+        ensure_directory_exists(wav_save_dir)
+        for row in tqdm(load_dataset("flexthink/audiomnist", cache_dir=output_path, split=TEST_SPLIT)):
+            local_audio_path = os.path.join(wav_save_dir, row["audio"]["path"])
+            audio_array = row["audio"]["array"]
+            ensure_audio_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=str(row["digit"])), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
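Both hunks above call `ensure_audio_file_exists_from_array` from the new `helm/common/audio_utils.py` (listed above with +111 lines but not shown in these hunks). As a hypothetical stand-in consistent with the call sites, assuming the `soundfile` package, it might look like the sketch below; the implementation actually shipped in 0.5.5 may differ:

```python
# Hypothetical stand-in for helm.common.audio_utils.ensure_audio_file_exists_from_array;
# the real body is not shown in these hunks.
import os

import numpy as np
import soundfile as sf


def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sampling_rate: int) -> None:
    """Write the waveform to `path` as an audio file, skipping work if it already exists."""
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        sf.write(path, array, sampling_rate)
```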
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py

```diff
@@ -0,0 +1,62 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+import json
+
+
+class AudioPAIRSScenario(Scenario):
+    """Audio PAIRS
+
+    Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
+    racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
+    clips using OpenAI's TTS-1-HD API.
+
+    This dataset is also modified to add an option to opt-out with "unclear" as a choice.
+    """
+
+    DOWNLOADING_URL = "https://huggingface.co/datasets/UCSC-VLAA/Audio_PAIRS/resolve/main/audio_pairs_files.zip"
+    SUJECTS = ["occupation", "status", "potential_crime"]
+
+    name = "audio_pairs"
+    description = "Examining gender and racial bias in AudioLMs using a converted audio from the PAIRS dataset."
+    tags: List[str] = ["audio", "classification"]
+
+    def __init__(self, subject: str) -> None:
+        super().__init__()
+
+        if subject not in AudioPAIRSScenario.SUJECTS:
+            raise ValueError(f"Invalid subject. Valid subjects are: {AudioPAIRSScenario.SUJECTS}")
+
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        downloading_dir: str = os.path.join(output_path, "download")
+        ensure_file_downloaded(source_url=AudioPAIRSScenario.DOWNLOADING_URL, target_path=downloading_dir, unpack=True)
+        data_dir: str = os.path.join(downloading_dir, "audio_pairs_files")
+        audio_file_folder = os.path.join(data_dir, self._subject)
+        audio_instruction_path = os.path.join(data_dir, "audio_pairs_instructions.json")
+        audio_instructions = json.load(open(audio_instruction_path))[self._subject]
+        for audio_file_name, instruction in tqdm(audio_instructions.items()):
+            local_audio_file_name = "_".join(audio_file_name.split("_")[:-1]) + ".mp3"
+            local_audio_path: str = os.path.join(audio_file_folder, local_audio_file_name)
+            content = [
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=instruction),
+            ]
+            input = Input(multimedia_content=MultimediaObject(content))
+            references = [Reference(Output(text="unclear"), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
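The one non-obvious step in `get_instances` above is the key-to-filename conversion: each JSON key carries a trailing `_`-separated suffix that is dropped before `.mp3` is appended. A worked example with a made-up key (the real key format in `audio_pairs_instructions.json` is not shown in this diff):

```python
# Hypothetical key name, for illustration only.
audio_file_name = "occupation_prompt_12_v1"
local_audio_file_name = "_".join(audio_file_name.split("_")[:-1]) + ".mp3"
assert local_audio_file_name == "occupation_prompt_12.mp3"  # suffix "_v1" dropped
```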
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py

```diff
@@ -0,0 +1,59 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+import pandas as pd
+
+
+class AudioCapsScenario(Scenario):
+    """AudioCaps
+
+    AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
+    via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
+    musical instruments and genres, and common everyday environmental sounds.
+
+    Paper: https://aclanthology.org/N19-1011.pdf
+    Code: https://github.com/cdjkim/audiocaps
+
+    Citation:
+    @inproceedings{audiocaps,
+    title={AudioCaps: Generating Captions for Audios in The Wild},
+    author={Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee},
+    booktitle={NAACL-HLT},
+    year={2019}
+    }
+    """
+
+    DOWNLOADING_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/wav_files.zip"
+    REFERENCE_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/test.csv"
+
+    name = "audiocaps"
+    description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
+([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
+    tags: List[str] = ["audio", "captioning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        data_dir: str = os.path.join(output_path, "wav_files")
+        ensure_file_downloaded(source_url=AudioCapsScenario.DOWNLOADING_URL, target_path=data_dir, unpack=True)
+        for _, row in tqdm(pd.read_csv(AudioCapsScenario.REFERENCE_URL, sep=",").iterrows()):
+            audiocap_id = row["audiocap_id"]
+            audio_path: str = os.path.join(data_dir, f"{audiocap_id}.wav")
+            assert os.path.exists(audio_path), f"Audio file does not exist at path: {audio_path}"
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path)])
+            )
+            references = [Reference(Output(text=str(row["caption"])), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
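Because the scenario above asserts on the first missing wav file, a pre-flight check that lists all gaps at once can save repeated runs. A small sketch using the same URLs as the hunk; the `./scratch` path is arbitrary:

```python
# Optional pre-flight check, not part of this diff: confirm every audiocap_id
# in test.csv has a matching wav before running the scenario.
import os
import pandas as pd

data_dir = "./scratch/wav_files"  # wherever the zip above was unpacked
df = pd.read_csv("https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/test.csv")
missing = [i for i in df["audiocap_id"] if not os.path.exists(os.path.join(data_dir, f"{i}.wav"))]
print(f"{len(missing)} of {len(df)} clips missing")
```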
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py

```diff
@@ -0,0 +1,152 @@
+from typing import List, Optional
+import os
+
+from tqdm import tqdm
+import json
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.audio_utils import extract_audio
+
+
+class CasualConversations2Scenario(Scenario):
+    """
+    Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
+    The videos feature paid individuals who agreed to participate in the project and explicitly provided
+    Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
+    themselves. The videos were recorded in Brazil, India, Indonesia, Mexico, Philippines, United States,
+    and Vietnam with a diverse set of adults in various categories.
+
+    The dataset contains the audio, speaker's age, gender information in the following languages:
+    English, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Tamil, Telugu, and Vietnamese.
+
+    Paper: https://arxiv.org/abs/2303.04838
+    Dataset: https://ai.meta.com/datasets/casual-conversations-v2-dataset/
+
+    Requires downloading Causal Conversations V2 from https://ai.meta.com/datasets/casual-conversations-v2-downloads
+
+    Citation:
+    @inproceedings{porgali2023casual,
+    title={The casual conversations v2 dataset},
+    author={Porgali, Bilal and Albiero, V{\'\i}tor and Ryda, Jordan and Ferrer, Cristian Canton and Hazirbas, Caner},
+    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+    pages={10--17},
+    year={2023}
+    }
+    """
+
+    SUBJECTS = ["age", "gender"]
+    SCRIPT_DOWNLOADING_URL = (
+        "https://huggingface.co/datasets/UCSC-VLAA/Causal_Conversation_V2_script/"
+        "resolve/main/CasualConversationsV2_v2.json"
+    )
+    AGE_INSTRUCTION = "Listen to the audio and take your best guess to estimate the speaker's age."
+    GENDER_INSTRUCTION = "Listen to the audio and take your best guess to determine the speaker's gender."
+    name = "casual_conversations2"
+    description = (
+        "A large scale multilingual speech translation corpus "
+        "([Porgali et al., 2023](https://arxiv.org/abs/2303.04838))."
+    )
+    tags = ["audio", "classification", "multilinguality"]
+    gender_options: List[str] = ["male", "female", "transgender male", "transgender female", "non-binary", "other"]
+    age_options: List[str] = ["18-30", "31-50", "51+", "other"]
+
+    def __init__(self, subject: str) -> None:
+        super().__init__()
+
+        if subject not in self.SUBJECTS:
+            raise ValueError(f"Invalid subject. Valid subjects are: {CasualConversations2Scenario.SUBJECTS}")
+
+        self._subject: str = subject
+        self._convert_answer_to_label_func = (
+            self._convert_age_to_label if subject == "age" else self._convert_gender_to_label
+        )
+        self.options = self.age_options if subject == "age" else self.gender_options
+        self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION
+
+    def _convert_age_to_label(self, age: str) -> str:
+        if age != "prefer not to say":
+            age_int = int(age)
+            if 18 <= age_int <= 30:
+                return "A"
+            elif 31 <= age_int <= 50:
+                return "B"
+            elif 51 <= age_int:
+                return "C"
+            else:
+                raise ValueError(f"Invalid age: {age}")
+        else:
+            return "D"
+
+    def _convert_gender_to_label(self, gender: Optional[str]) -> str:
+        if gender is not None and gender != "prefer not to say":
+            if gender == "cis man":
+                return "A"
+            elif gender == "cis woman":
+                return "B"
+            elif gender == "transgender man":
+                return "C"
+            elif gender == "transgender woman":
+                return "D"
+            elif gender == "non-binary":
+                return "E"
+            else:
+                raise ValueError(f"Invalid gender: {gender}")
+        else:
+            return "F"
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir: str = os.path.join(output_path, "videos_files")
+        assert os.path.exists(data_dir), (
+            f"Download the video files from Meta's Casual Conversations v2 dataset from "
+            f"(https://ai.meta.com/datasets/casual-conversations-v2-downloads) and unzip and place at {data_dir}."
+        )
+        script_file_path: str = os.path.join(output_path, "CasualConversationsV2.json")
+        audio_file_folder: str = os.path.join(output_path, "audio_files")
+        ensure_directory_exists(audio_file_folder)
+        ensure_file_downloaded(self.SCRIPT_DOWNLOADING_URL, script_file_path)
+        audio_scripts = json.load(open(script_file_path))
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        for file_name in tqdm(os.listdir(data_dir)):
+            if file_name.endswith(".mp4"):
+                local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
+                local_video_path: str = os.path.join(data_dir, file_name)
+
+                if not os.path.exists(local_audio_path):
+                    extract_audio(local_video_path, local_audio_path)
+                assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
+
+                subject_answer = audio_scripts[file_name][self._subject]
+                answer = self._convert_answer_to_label_func(subject_answer)
+                # The given correct answer is a letter, but we need an index
+                correct_answer_index: int = ord(answer) - ord("A")
+                # The options are originally appended to the question
+
+                references: List[Reference] = []
+                for i, option in enumerate(self.options):
+                    reference: Reference
+                    is_correct: bool = i == correct_answer_index
+                    reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                    references.append(reference)
+
+                content = [
+                    MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                    MediaObject(content_type="text/plain", text=self.instruction),
+                ]
+
+                input = Input(multimedia_content=MultimediaObject(content))
+                instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
```
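Note how the letter labels from the two `_convert_*_to_label` helpers are turned back into reference indices with `ord(answer) - ord("A")`, which is why each option list must stay aligned with its mapping function. A quick round-trip check derived from the code above:

```python
# Round-trip check for the age path: _convert_age_to_label("42") returns "B",
# and ord("B") - ord("A") == 1 selects "31-50" as the CORRECT_TAG reference.
age_options = ["18-30", "31-50", "51+", "other"]
correct_answer_index = ord("B") - ord("A")
assert age_options[correct_answer_index] == "31-50"

# "prefer not to say" maps to "D" for age and "F" for gender, both of which
# land on the trailing "other" option of their respective option lists.
```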
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py

```diff
@@ -0,0 +1,99 @@
+"""Scenarios for audio models"""
+
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from collections import OrderedDict
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.hierarchical_logger import hlog
+
+
+class CommonVoice15Scenario(Scenario):
+    """CommonVoice15 Scenario
+
+    The most recent release of CommonVoice15 (Ardila et al, 2019) includes 114 languages. Over 50,000
+    individuals have participated so far, resulting in 2,500 hours of collected audio. This is the largest
+    audio corpus in the public domain for speech recognition, both in terms of number of hours and number
+    of languages. The task is to recognize the speech from the audio sample.
+
+
+
+    Paper: https://arxiv.org/abs/1912.06670
+    Code: https://github.com/common-voice/common-voice
+
+    Citation:
+    @article{ardila2019common,
+    title={Common voice: A massively-multilingual speech corpus},
+    author={Ardila, Rosana and Branson, Megan and Davis, Kelly and
+    Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais,
+    Reuben and Saunders, Lindsay and Tyers, Francis M and Weber, Gregor},
+    journal={arXiv preprint arXiv:1912.06670},
+    year={2019}
+    }
+
+    """
+
+    HF_DATASET_NAME = "mozilla-foundation/common_voice_15_0"
+
+    # Randomly selected 4 languages from 114 languages in the Common Voice 15 dataset following
+    # Qwen2-Audio (https://arxiv.org/abs/2407.10759). The full language is:
+    # https://huggingface.co/datasets/mozilla-foundation/common_voice_15_0/blob/main/languages.py
+    _COMMON_VOICE_TEST_LANG_TO_ID = OrderedDict(
+        [
+            ("English", "en"),
+            ("Chinese_hk", "zh-HK"),
+            ("German", "de"),
+            ("French", "fr"),
+        ]
+    )
+
+    name = "common_voice_15"
+    description = "Speech recognition for 4 languages from 114 different languages in Common Voice 15 \
+([Ardila et al, 2019](https://arxiv.org/abs/1912.06670))."
+    tags: List[str] = ["audio", "recognition", "multilinguality"]
+
+    def __init__(self, language: str) -> None:
+        super().__init__()
+
+        language = language.capitalize()
+        if language not in CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys():
+            raise ValueError(
+                f"Invalid language. Valid languages are: {CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys()}"
+            )
+
+        self._language: str = language
+        hlog(
+            "You need to sign in Huggingface to download the dataset. Please remember "
+            "to sign in to download the dataset."
+        )
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        language_category = CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID[self._language]
+        for row in tqdm(
+            load_dataset(
+                CommonVoice15Scenario.HF_DATASET_NAME,
+                name=language_category,
+                cache_dir=output_path,
+                split=TEST_SPLIT,
+                trust_remote_code=True,
+            )
+        ):
+            local_audio_path = row["path"]
+            answer = row["sentence"]
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
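As the `hlog` warning in the constructor says, `mozilla-foundation/common_voice_15_0` is a gated dataset, so the `load_dataset` call above will fail without Hugging Face credentials. One way to authenticate first, assuming the standard `huggingface_hub` API (not itself part of this diff):

```python
# Log in before running the scenario; the token must belong to an account
# that has accepted the Common Voice 15 terms on the Hugging Face Hub.
from huggingface_hub import login

login()  # or login(token="hf_...")
```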