crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic. See the registry's advisory page for this release for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from typing import List
|
|
3
|
+
import os
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from pydub import AudioSegment
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from helm.benchmark.scenarios.scenario import (
|
|
11
|
+
TEST_SPLIT,
|
|
12
|
+
Scenario,
|
|
13
|
+
Instance,
|
|
14
|
+
Reference,
|
|
15
|
+
CORRECT_TAG,
|
|
16
|
+
Input,
|
|
17
|
+
Output,
|
|
18
|
+
)
|
|
19
|
+
from helm.common.audio_utils import is_invalid_audio_file
|
|
20
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
21
|
+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
22
|
+
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MuToxScenario(Scenario):
    """
    MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector

    MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
    audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
    dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
    a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
    1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
    classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.

    Languages:
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",

    The columns of the dataset are:

    id: a string id of the segment;
    lang: 3-letter language code;
    partition: one of train, dev, or devtest
    public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
    audio_file_transcript: text transcription of the segment;
    contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
    label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
    etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
    detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
    mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
    as float numbers with any value (they can be interpreted as logits,
    i.e. probabilities before a sigmoid transformation).

    Citation:

    @misc{costajussà2023mutox,
        title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
        author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
        Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
        year={2023},
        eprint={},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
    }
    """

    # Annotation TSV with one row per audio segment (all languages and partitions)
    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"

    # Maps a human-readable language name to the 3-letter code used in the TSV's `lang` column.
    # NOTE(review): the attribute name contains a typo ("LANGAUGE"); kept as-is since it is part
    # of the class's public interface.
    LANGAUGE_CODES = {
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin_Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western_Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",
    }

    name = "mutox"
    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
    # Fixed: the last tag previously had a trailing space ("toxicity "), which would break
    # exact-match tag filtering.
    tags = ["audio", "classification", "toxicity"]

    @staticmethod
    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
        """
        Append `bad_audio_file` (one filename per line) to the file at `output_path`.

        Many of the links do not exist or point to broken audio, so we keep track of them
        and skip them in future runs to significantly speed up gathering the instances.
        """
        with open(output_path, "a") as f:
            f.write(bad_audio_file + "\n")

    def __init__(self, language: str) -> None:
        """
        Args:
            language: Human-readable language name; must be a key of `LANGAUGE_CODES`.

        Raises:
            KeyError: If `language` is not one of the supported languages.
        """
        super().__init__()
        self._language_code: str = self.LANGAUGE_CODES[language]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build instances from the devtest partition of the configured language, downloading
        and clipping each referenced audio segment on demand."""
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "mutox.tsv")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Read previously-recorded bad audio files so they can be skipped cheaply
        bad_audio_files: set[str] = set()
        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
        if os.path.exists(bad_audio_files_path):
            # Each line is the audio file name
            with open(bad_audio_files_path, "r") as f:
                for line in f:
                    bad_audio_files.add(line.strip())
            hlog(f"Found {len(bad_audio_files)} bad audio files.")

        # Where the audio files will be downloaded to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        instances: List[Instance] = []
        df = pd.read_csv(annotations_path, delimiter="\t")
        hlog(f"Found {len(df)} rows in the dataset")

        valid_count: int = 0
        total_count: int = 0
        for row in tqdm(df.itertuples()):
            # Only process examples that are in devtest and the language we're interested in
            if row.partition != "devtest":
                continue

            if row.lang != self._language_code:
                continue

            total_count += 1

            # Discard known bad audio files
            audio_filename: str = f"{row.id}.mp3"
            with htrack_block(f"Processing audio file: {audio_filename}"):
                if audio_filename in bad_audio_files:
                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
                    continue

                local_audio_path: str = os.path.join(audio_path, audio_filename)
                if not os.path.exists(local_audio_path):
                    # The provided URL has the complete audio, so we need to download it and clip it
                    # public_url_segment: a string formatted as url:start:end,
                    if not isinstance(row.public_url_segment, str):
                        # Sometimes URL is just a float causing an error. Skip those.
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    # NOTE(review): the dataset docs describe this field as colon-delimited
                    # ("url:start:end"), but it is split on whitespace here — presumably the TSV
                    # stores it space-separated; confirm against real data before changing.
                    parts = row.public_url_segment.split()
                    if len(parts) != 3:
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    audio_url, start_ms_str, end_ms_str = parts
                    start_ms, end_ms = int(start_ms_str), int(end_ms_str)

                    # Download the full audio file
                    try:
                        # Use the response as a context manager so the connection is released
                        # even when an exception is raised mid-download.
                        with requests.get(audio_url, stream=True) as response:
                            response.raise_for_status()

                            # Load audio from the downloaded content
                            audio = AudioSegment.from_file(BytesIO(response.content))

                        # Clip the audio (AudioSegment slicing is in milliseconds)
                        clipped_audio = audio[start_ms:end_ms]

                        # Save the clipped file
                        clipped_audio.export(local_audio_path, format="mp3")
                    except Exception as e:
                        # Some files are no longer available or invalid
                        hlog(
                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
                        )
                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                        continue

                if is_invalid_audio_file(local_audio_path):
                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                    continue
                else:
                    valid_count += 1
                    hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")

                input = Input(
                    multimedia_content=MultimediaObject(
                        media_objects=[
                            # Following what was prompted annotators as outlined in the paper
                            MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
                            MediaObject(location=local_audio_path, content_type="audio/mpeg"),
                        ]
                    )
                )

                # label == 1 means the segment was annotated as containing toxicity
                is_toxic: bool = row.label == 1
                references = [
                    Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
                    Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
                ]
                instances.append(Instance(input=input, references=references, split=TEST_SPLIT))

        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
        return instances
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
CORRECT_TAG,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
)
|
|
14
|
+
from tqdm import tqdm
|
|
15
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
16
|
+
from helm.common.general import ensure_file_downloaded
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PARADEScenario(Scenario):
    """PARADE

    PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status bias
    in vision-language models. We collect a new dataset of audio-text multi-choice QA task that
    involves exploring occupation and status bias. The dataset consists of 436 audio-text QA pairs
    with 3 options each.
    """

    # Mapping from audio voice/subset to per-example annotations (question, options, label, audio path)
    ANNOT_URL = (
        "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/audio_result" "_path_mapping_v2.json"
    )
    MALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/onyx.zip"
    FEMALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/nova.zip"

    PARADE_INSTRUCTION = "\n\n Answer the question with one of the following options: A, B, or C."

    SUBSET_LIST = ["occupation", "status"]
    # Maps the user-facing voice name to the TTS voice key used in the annotation file
    VOICE_MAPPING = {"male": "onyx", "female": "nova"}

    name = "parade"
    description = "Exploring occupation and status bias in the audio-text multi-choice QA task."
    tags: List[str] = ["audio", "bias"]

    def __init__(self, subset: str, voice: str) -> None:
        """
        Args:
            subset: One of `SUBSET_LIST` ("occupation" or "status"); case-insensitive.
            voice: One of `VOICE_MAPPING` keys ("male" or "female"); case-insensitive.

        Raises:
            ValueError: If `subset` or `voice` is not supported.
        """
        super().__init__()

        subset = subset.lower()
        voice = voice.lower()
        if subset not in PARADEScenario.SUBSET_LIST:
            raise ValueError(f"Invalid subset. Valid subsets are: {PARADEScenario.SUBSET_LIST}")

        if voice not in PARADEScenario.VOICE_MAPPING.keys():
            raise ValueError(f"Invalid voice. Valid voices are: {PARADEScenario.VOICE_MAPPING.keys()}")

        self._subset: str = subset
        self._voice: str = voice

    def _convert_answer_to_label(self, options: list, answer: str) -> str:
        """Return the letter ("A"/"B"/"C") of `answer` within `options`.

        Raises:
            ValueError: If `answer` is not present in `options`.
        """
        option_list = ["A", "B", "C"]
        return option_list[options.index(answer)]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download annotations and audio for the configured voice/subset and build instances."""
        instances: List[Instance] = []
        annot_save_path = os.path.join(output_path, "annotation.json")
        ensure_file_downloaded(source_url=PARADEScenario.ANNOT_URL, target_path=annot_save_path)
        # Fixed: use a context manager instead of json.load(open(...)) so the file handle
        # is always closed.
        with open(annot_save_path) as annot_file:
            annotations = json.load(annot_file)[self.VOICE_MAPPING[self._voice]][self._subset]

        # Flatten the two-level annotation mapping into a list of example dicts
        test_annotations = []
        for key in annotations:
            for key2 in annotations[key]:
                test_annotations.append(annotations[key][key2])

        # Download (and unpack) the audio archive for the selected voice
        audio_save_dir = os.path.join(output_path, "audio_files")
        if self._voice == "male":
            ensure_file_downloaded(source_url=PARADEScenario.MALE_AUDIO, target_path=audio_save_dir, unpack=True)
        else:
            ensure_file_downloaded(source_url=PARADEScenario.FEMALE_AUDIO, target_path=audio_save_dir, unpack=True)

        for row in tqdm(test_annotations):
            local_audio_path = os.path.join(output_path, "audio_files", row["path"])
            answer = self._convert_answer_to_label(row["options"], row["label"])
            # The given correct answer is a letter, but we need an index
            correct_answer_index: int = ord(answer) - ord("A")
            references: List[Reference] = []
            question = row["question"]
            for i, option in enumerate(row["options"]):
                reference: Reference
                is_correct: bool = i == correct_answer_index
                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
                references.append(reference)

            input = Input(
                multimedia_content=MultimediaObject(
                    [
                        MediaObject(content_type="audio/mpeg", location=local_audio_path),
                        MediaObject(content_type="text/plain", text=question + self.PARADE_INSTRUCTION),
                    ]
                )
            )
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Scenarios for audio models"""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
import os
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
Reference,
|
|
11
|
+
TEST_SPLIT,
|
|
12
|
+
CORRECT_TAG,
|
|
13
|
+
Input,
|
|
14
|
+
Output,
|
|
15
|
+
)
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
from datasets import load_dataset
|
|
18
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
19
|
+
from helm.common.audio_utils import ensure_audio_file_exists_from_array
|
|
20
|
+
from helm.common.general import ensure_file_downloaded
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SpeechRobustBenchScenario(Scenario):
    """Speech Robust Bench Scenario

    Speech Robust Bench (Shah et al, 2024) is a comprehensive benchmark for evaluating
    the robustness of ASR models to diverse corruptions. SRB is composed of 114 input
    perturbations which simulate an heterogeneous range of corruptions that ASR models
    may encounter when deployed in the wild. In this scenario, we select four subsets
    in the benchmark for evaluation, each corresponds to a clean version of audio task.

    Paper: https://arxiv.org/abs/2403.07937
    Code: https://github.com/ahmedshah1494/speech_robust_bench

    Citation:
    @article{shah2024speech,
        title={Speech robust bench: A robustness benchmark for speech recognition},
        author={Shah, Muhammad A and Noguero, David Solans and Heikkila, Mikko A and Raj,
        Bhiksha and Kourtellis, Nicolas},
        journal={arXiv preprint arXiv:2403.07937},
        year={2024}
    }
    """

    HF_DATASET_NAME = "mshah1/speech_robust_bench"
    # Maps instance keys to dataset row indices (per subject/split/perturbation level)
    HF_MAPPING_URL = (
        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
    )

    # Select four subsets of the dataset for the benchmark
    SUBJECTS_DICT = {
        "ami_far": {
            "name": "in-the-wild-AMI",
            "split": "farfield",
            "type": "audio/wav",
            "mapping_key": "srb_aim_field_key2audio",
        },
        "ami_near": {
            "name": "in-the-wild-AMI",
            "split": "nearfield",
            "type": "audio/wav",
            "mapping_key": "srb_aim_field_key2audio",
        },
        "librispeech_gnoise": {
            "name": "librispeech_asr-test.clean_pertEval_500_30",
            "split": "gnoise.1",
            "type": "audio/mp3",
            "mapping_key": "srb_librispeech_noises_key2audio",
        },
        "librispeech_env_noise": {
            "name": "librispeech_asr-test.clean_pertEval_500_30",
            "split": "env_noise_esc50.1",
            "type": "audio/mp3",
            "mapping_key": "srb_librispeech_noises_key2audio",
        },
    }
    # There are 30 different perturbation samples for each LibriSpeech ID
    PERTURBATION_LEVELS = list(range(1, 31))
    name = "speech_robust_bench"
    description = (
        "Speech recognition for 4 datasets with a wide range of corruptions"
        "([Shah et al, 2024](https://arxiv.org/abs/2403.07937))."
    )
    tags: List[str] = ["audio", "recognition", "robustness", "multilinguality"]

    def __init__(self, subject: str, level: int) -> None:
        """
        Args:
            subject: One of `SUBJECTS_DICT` keys.
            level: Perturbation sample index, 1-30 (see `PERTURBATION_LEVELS`).

        Raises:
            ValueError: If `subject` or `level` is not supported.
        """
        super().__init__()

        self._subject = subject
        if self._subject not in SpeechRobustBenchScenario.SUBJECTS_DICT.keys():
            raise ValueError(f"Invalid subject. Valid subjects are: {SpeechRobustBenchScenario.SUBJECTS_DICT.keys()}")
        self._level = level
        if self._level not in SpeechRobustBenchScenario.PERTURBATION_LEVELS:
            raise ValueError(f"Invalid level. Valid levels are: {SpeechRobustBenchScenario.PERTURBATION_LEVELS}")

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the configured subject/split from Hugging Face, materialize each mapped audio row
        to a local file, and build one transcription instance per mapping key."""
        instances: List[Instance] = []
        # Hoisted: look up the subject's configuration once instead of once per field
        subject_info = SpeechRobustBenchScenario.SUBJECTS_DICT[self._subject]
        subject_name = subject_info["name"]
        subject_split = subject_info["split"]
        subject_type = subject_info["type"]
        subject_audio_type = subject_type.split("/")[-1]
        subject_mapping = subject_info["mapping_key"]

        audio_save_dir = os.path.join(output_path, "audio_files")
        mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
        ensure_file_downloaded(source_url=SpeechRobustBenchScenario.HF_MAPPING_URL, target_path=mapping_local_path)
        # Fixed: use a context manager instead of json.load(open(...)) so the file handle
        # is always closed.
        with open(mapping_local_path) as mapping_file:
            mapping_keys = json.load(mapping_file)[subject_mapping][subject_split]

        meta_data = load_dataset(
            SpeechRobustBenchScenario.HF_DATASET_NAME,
            name=subject_name,
            cache_dir=output_path,
            split=subject_split,
        )
        for line_num in tqdm(list(mapping_keys.keys())):
            # `level` is 1-based; the mapping stores one dataset row index per perturbation sample
            row = meta_data[int(mapping_keys[line_num][self._level - 1])]
            local_audio_name = f"{self._subject}_{subject_split}_{line_num}.{subject_audio_type}"
            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
            answer = row["text"].lower()
            input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type=subject_type, location=local_audio_path)])
            )
            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Scenarios for audio models"""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Input,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
15
|
+
from tqdm import tqdm
|
|
16
|
+
|
|
17
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
18
|
+
from helm.common.general import ensure_file_downloaded
|
|
19
|
+
from helm.common.audio_utils import is_invalid_audio_file
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class VocalSoundScenario(Scenario):
    """Vocal Sound Scenario

    The VocalSound (Gong et al, 2022) dataset consists of 21,000 crowdsourced recordings
    of laughter, sighs, coughs, throat clearing, sneezes, and sniffs from 3,365 unique subjects.
    The task is to classify the human behaviour from the audio sample.

    Paper: https://arxiv.org/abs/2205.03433
    Code: https://github.com/YuanGongND/vocalsound

    Citation:
    @INPROCEEDINGS{gong_vocalsound,
        author={Gong, Yuan and Yu, Jin and Glass, James},
        booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
        title={Vocalsound: A Dataset for Improving Human Vocal Sounds Recognition},
        year={2022},
        pages={151-155},
        doi={10.1109/ICASSP43922.2022.9746828}
    }
    """  # noqa: E501

    DOWNLOADING_URL = "https://www.dropbox.com/s/c5ace70qh1vbyzb/vs_release_16k.zip"

    name = "vocal_sound"
    # Fixed: the description previously said "a spoken digit" (copy-paste from another
    # scenario); this dataset contains human vocal sounds such as laughter and coughs.
    description = (
        "Classify the human vocal sound (e.g., laughter, cough) in an audio sample "
        "([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
    )
    tags: List[str] = ["audio", "classification"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the VocalSound release and build one classification instance per valid
        16 kHz WAV file; the label is parsed from the filename suffix."""
        instances: List[Instance] = []
        down_loading_path = os.path.join(output_path, "download")
        ensure_file_downloaded(VocalSoundScenario.DOWNLOADING_URL, down_loading_path, unpack=True)
        wav_save_dir = os.path.join(down_loading_path, "audio_16k")
        for file_name in tqdm(os.listdir(wav_save_dir)):
            local_audio_path: str = os.path.join(wav_save_dir, file_name)
            # Skip non-WAV files and files that fail audio validation
            if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
                continue

            input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
            )

            # Filenames look like "<subject>_<label>.wav"; the label is the last underscore field
            answer: str = file_name.split("_")[-1].split(".")[0]
            if answer == "throatclearing":
                answer = "throat clearing"

            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
TEST_SPLIT,
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Input,
|
|
9
|
+
)
|
|
10
|
+
from helm.common.audio_utils import is_invalid_audio_file
|
|
11
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class VoiceJailbreakAttacksScenario(Scenario):
    """
    Voice Jailbreak Attacks Against GPT-4o

    Paper: https://arxiv.org/abs/2405.19103

    The examples used in this scenario was generated following the instructions in the GitHub:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack
    Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".

    We ran:
    python tts/prompt2audio.py --dataset baseline --voice fable
    python tts/prompt2audio.py --dataset textjailbreak --voice fable

    then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:

    voice_jailbreak_attacks/
        baseline_fable/
            *.wav
        textjailbreak_fable/
            *.wav

    Base prompts (30 total):
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
    Jailbreak templates:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv

    The authors used Attack Success Rate (ASR) as the evaluation metric.
    Specifically, two authors manually label each response to determine if it answers the forbidden
    question. If there is any uncertainty, the authors discuss to reach a conclusion

    Citation:

    @article{SWBZ24,
        author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
        title = {{Voice Jailbreak Attacks Against GPT-4o}},
        journal = {{CoRR abs/2405.19103}},
        year = {2024}
    }
    """

    VALID_SUBSETS = ["baseline", "textjailbreak"]

    name = "voice_jailbreak_attacks"
    description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
    tags = ["audio", "safety"]

    def __init__(self, subset: str = "baseline") -> None:
        """
        Args:
            subset: One of `VALID_SUBSETS` ("baseline" or "textjailbreak").

        Raises:
            ValueError: If `subset` is not supported.
        """
        super().__init__()
        # Fixed: input validation used `assert`, which is silently stripped under `python -O`;
        # raise an explicit exception instead.
        if subset not in self.VALID_SUBSETS:
            raise ValueError(f"Invalid subset: {subset}")
        self._subset = subset

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one reference-free instance per pre-generated WAV file in the subset folder.

        Raises:
            FileNotFoundError: If the expected audio directory has not been placed at
                `output_path` (see the class docstring for how to generate it).
            ValueError: If a WAV file in the directory fails audio validation.
        """
        audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
        # Fixed: replaced `assert` checks (stripped under -O) with explicit raises.
        if not os.path.exists(audio_directory_path):
            raise FileNotFoundError(f"Audio directory does not exist: {audio_directory_path}")

        instances: List[Instance] = []
        for file in os.listdir(audio_directory_path):
            # Fixed: match the ".wav" extension (with the dot) rather than any name
            # merely ending in the letters "wav".
            if not file.endswith(".wav"):
                continue

            audio_path: str = os.path.join(audio_directory_path, file)
            if is_invalid_audio_file(audio_path):
                raise ValueError(f"Invalid audio file: {audio_path}")

            input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        MediaObject(location=audio_path, content_type="audio/wav"),
                    ]
                )
            )
            # No references: responses are judged externally (e.g., Attack Success Rate)
            instances.append(Instance(input=input, references=[], split=TEST_SPLIT))

        return instances
|