crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged by the registry; details are available on the registry's listing page.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

Selected new files follow, shown in full as added hunks.

helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py (new file, +96):

```diff
@@ -0,0 +1,96 @@
+from typing import List
+import os
+import json
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from collections import OrderedDict
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+from helm.common.general import ensure_file_downloaded
+
+
+class LibriSpeechFairnessScenario(Scenario):
+    """LibriSpeech Fairness Scenario
+    The LibriSpeech corpus (Panayotov et al. 2015) is derived from audiobooks that are part of the LibriVox
+    project, and contains 1000 hours of speech sampled at 16 kHz. The data has separately prepared language-model
+    training data and pre-built language models. This corpus is one of the most widely used ASR corpora, and
+    has been extended to many applications such as robust ASR and multilingual ASR tasks.
+
+    Paper: https://ieeexplore.ieee.org/document/7178964
+    Code: https://www.openslr.org/12
+
+    Citation:
+    @INPROCEEDINGS{7178964,
+      author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
+      booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+      title={Librispeech: An ASR corpus based on public domain audio books},
+      year={2015},
+      doi={10.1109/ICASSP.2015.7178964}}
+    """
+
+    HF_DATASET_NAME = "openslr/librispeech_asr"
+    HF_MAPPING_URL = (
+        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
+    )
+    GENDER_MAPPING_URL = (
+        "https://huggingface.co/datasets/PahaII/librispeech_id2gender/resolve/main/librispeech_id2gender.json"
+    )
+    GENDERS = ["male", "female"]
+
+    name = "librispeech_fairness"
+    description = (
+        "Widely-used speech corpus for the speech recognition task "
+        "([Panayotov et al. 2015](https://ieeexplore.ieee.org/document/7178964))."
+    )
+    tags: List[str] = ["audio", "recognition"]
+
+    def __init__(self, gender: str) -> None:
+        super().__init__()
+
+        if gender.lower() not in LibriSpeechFairnessScenario.GENDERS:
+            raise ValueError(
+                f"Invalid gender input: {gender}. Valid genders are: {LibriSpeechFairnessScenario.GENDERS}"
+            )
+
+        self._gender: str = gender
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        mapping_local_path = os.path.join(output_path, "librispeech_id2gender.json")
+        ensure_file_downloaded(
+            source_url=LibriSpeechFairnessScenario.GENDER_MAPPING_URL, target_path=mapping_local_path
+        )
+        meta_data = load_dataset(
+            LibriSpeechFairnessScenario.HF_DATASET_NAME,
+            name="clean",
+            cache_dir=output_path,
+            split=TEST_SPLIT,
+        )
+        gender_mapping_dict = json.load(open(mapping_local_path))
+        loading_cases: List[OrderedDict] = []
+        for row in tqdm(meta_data):
+            if gender_mapping_dict[str(row["speaker_id"])] == self._gender.lower():
+                loading_cases.append(row)
+
+        for row in tqdm(loading_cases):
+            local_audio_name = f"librispeech_{row['id']}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+            answer = row["text"].lower()
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mp3", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
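The scenario above follows the common HELM pattern: the constructor validates its filter argument, and `get_instances` materializes audio files on disk and wraps each one in an `Instance` with a reference transcript. As a quick orientation, here is a minimal sketch of driving it directly; the output path is illustrative, and running it downloads the LibriSpeech test split plus the gender-mapping JSON.

```python
# Illustrative only: exercises the new scenario outside a full HELM run.
# The output_path below is an arbitrary scratch directory, not a HELM convention.
from helm.benchmark.scenarios.audio_language.librispeech_fairness_scenario import (
    LibriSpeechFairnessScenario,
)

scenario = LibriSpeechFairnessScenario(gender="female")
instances = scenario.get_instances(output_path="scratch/librispeech_fairness")
print(f"{len(instances)} test instances")

# Each instance carries one audio MediaObject plus a lowercased reference transcript.
first = instances[0]
print(first.input.multimedia_content.media_objects[0].location)  # path to an .mp3 file
print(first.references[0].output.text)
```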
helm/benchmark/scenarios/audio_language/librispeech_scenario.py (new file, +80):

```diff
@@ -0,0 +1,80 @@
+from typing import List
+import os
+import json
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+from helm.common.general import ensure_file_downloaded
+
+
+class LibriSpeechScenario(Scenario):
+    """LibriSpeech Corpus
+    The LibriSpeech corpus (Panayotov et al. 2015) is derived from audiobooks that are part of the LibriVox
+    project, and contains 1000 hours of speech sampled at 16 kHz. The data has separately prepared language-model
+    training data and pre-built language models. This corpus is one of the most widely used ASR corpora, and
+    has been extended to many applications such as robust ASR and multilingual ASR tasks.
+
+    Paper: https://ieeexplore.ieee.org/document/7178964
+    Code: https://www.openslr.org/12
+
+    Citation:
+    @INPROCEEDINGS{7178964,
+      author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
+      booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+      title={Librispeech: An ASR corpus based on public domain audio books},
+      year={2015},
+      doi={10.1109/ICASSP.2015.7178964}}
+    """
+
+    HF_DATASET_NAME = "openslr/librispeech_asr"
+    HF_MAPPING_URL = (
+        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
+    )
+    SRB_KEY = "srb_librispeech_noises_key2audio"
+    SRB_SUBSET = "gnoise.1"
+    MAPPING_KEY = "librispeech_id2line"
+
+    name = "librispeech"
+    description = (
+        "Widely-used speech corpus for the speech recognition task "
+        "([Panayotov et al. 2015](https://ieeexplore.ieee.org/document/7178964))."
+    )
+    tags: List[str] = ["audio", "recognition"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
+        ensure_file_downloaded(source_url=LibriSpeechScenario.HF_MAPPING_URL, target_path=mapping_local_path)
+        meta_data = load_dataset(
+            LibriSpeechScenario.HF_DATASET_NAME,
+            name="clean",
+            cache_dir=output_path,
+            split=TEST_SPLIT,
+        )
+        mapping_dict = json.load(open(mapping_local_path))
+        srb_mapping_keys = mapping_dict[self.SRB_KEY][self.SRB_SUBSET]
+        index2line_num = mapping_dict[self.MAPPING_KEY]
+        for line_num in tqdm(list(srb_mapping_keys)):
+            row = meta_data[int(index2line_num[line_num])]
+            local_audio_name = f"{self.name}_{line_num}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+            answer = row["text"].lower()
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mp3", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
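Both LibriSpeech scenarios delegate the audio-caching step to `ensure_audio_file_exists_from_array`, which lives in the new `helm/common/audio_utils.py` (+111 lines in the listing above) and whose body is not part of this diff. Judging from the call sites, a plausible sketch of its contract, assuming a `soundfile`-style backend, looks like this:

```python
# Sketch only: the real helper in helm/common/audio_utils.py is not shown in this
# diff, and it may use a different encoder (e.g., for the .mp3 outputs above).
import os

import numpy as np
import soundfile as sf  # assumed dependency


def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sample_rate: int) -> None:
    """Write the waveform to `path` once; later calls become cheap no-ops."""
    if os.path.exists(path):
        return
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # soundfile infers the container format from the file extension.
    sf.write(path, array, sample_rate)
```

The idempotence matters here: `get_instances` can be re-run across benchmark runs without re-encoding every clip.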
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py (new file, +113):

```diff
@@ -0,0 +1,113 @@
+import os
+from typing import List
+
+import pandas as pd
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import ensure_audio_file_exists_from_array, get_array_from_audio_file
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class MELDAudioScenario(Scenario):
+    """Multimodal EmotionLines Dataset (MELD) Audio
+
+    The Multimodal EmotionLines Dataset (MELD) was created by enhancing and extending the EmotionLines dataset.
+    MELD has more than 1400 dialogues and 13000 utterances from the Friends TV series. Multiple speakers participated
+    in the dialogues. Each utterance in a dialogue has been labeled with one of seven emotions:
+    Anger, Disgust, Sadness, Joy, Neutral, Surprise, and Fear.
+
+    The task is to classify the emotion based on only the audio clip.
+
+    Website: https://affective-meld.github.io/
+    Paper: https://arxiv.org/abs/1810.02508
+    Dataset: https://huggingface.co/datasets/DavidCombei/Wav2Vec_MELD_Audio
+
+    Citation:
+    S. Poria, D. Hazarika, N. Majumder, G. Naik, R. Mihalcea,
+    E. Cambria. MELD: A Multimodal Multi-Party Dataset
+    for Emotion Recognition in Conversation. (2018)
+
+    Chen, S.Y., Hsu, C.C., Kuo, C.C. and Ku, L.W.
+    EmotionLines: An Emotion Corpus of Multi-Party
+    Conversations. arXiv preprint arXiv:1802.08379 (2018).
+    """  # noqa: E501
+
+    name = "meld_audio"
+    description = "Classify emotions in audio clips from the television series Friends ([Poria et al, 2018](https://arxiv.org/abs/1810.02508))."  # noqa: E501
+    tags = ["audio", "classification"]
+
+    LABEL_NAMES = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
+    CSV_URL_PREFIX = (
+        "https://raw.githubusercontent.com/declare-lab/MELD/2d2011b409d3ca2d7e94460cd007d434b1d0a102/data/MELD/"
+    )
+    SPLIT_NAME_TO_CSV_FILE_NAME = {
+        VALID_SPLIT: "dev_sent_emo.csv",
+        TRAIN_SPLIT: "train_sent_emo.csv",
+        TEST_SPLIT: "test_sent_emo.csv",
+    }
+    SPLIT_NAME_TO_TGZ_FILE_NAME = {
+        VALID_SPLIT: "audios_validation.tgz",
+        TRAIN_SPLIT: "audios_train.tgz",
+        TEST_SPLIT: "audios_test.tgz",
+    }
+    SAMPLE_RATE = 16000
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        csv_dir = os.path.join(output_path, "csv")
+        ensure_directory_exists(csv_dir)
+
+        instances: List[Instance] = []
+        # Only download the test split.
+        # We don't need the train split (because we use zero shot) or the validation split.
+        split_name = TEST_SPLIT
+
+        # Download the CSV to get the labels and IDs
+        csv_file_name = MELDAudioScenario.SPLIT_NAME_TO_CSV_FILE_NAME[split_name]
+        csv_file_path = os.path.join(csv_dir, csv_file_name)
+        ensure_file_downloaded(MELDAudioScenario.CSV_URL_PREFIX + csv_file_name, csv_file_path)
+        df = pd.read_csv(csv_file_path, header=0).rename(columns={"Sr No.": "serial_number"})
+
+        # Download FLAC files
+        flac_dir = os.path.join(output_path, f"flac_{split_name}")
+        ensure_file_downloaded(
+            source_url=f"https://huggingface.co/datasets/zrr1999/MELD_Text_Audio/resolve/main/archive/{MELDAudioScenario.SPLIT_NAME_TO_TGZ_FILE_NAME[split_name]}?download=true",  # noqa: E501
+            target_path=flac_dir,
+            unpack=True,
+            unpack_type="untar",
+        )
+
+        wav_dir = os.path.join(output_path, f"wav_{split_name}")
+        ensure_directory_exists(wav_dir)
+        for row in tqdm(df.itertuples()):
+            # Transcode FLAC to WAV
+            wav_file_name = f"dia{row.Dialogue_ID}_utt{row.Utterance_ID}.wav"
+            wav_file_path = os.path.join(wav_dir, wav_file_name)
+            if not os.path.isfile(wav_file_path):
+                flac_file_name = f"dia{row.Dialogue_ID}_utt{row.Utterance_ID}.flac"
+                flac_file_path = os.path.join(flac_dir, flac_file_name)
+                audio_array = get_array_from_audio_file(flac_file_path, MELDAudioScenario.SAMPLE_RATE)
+                ensure_audio_file_exists_from_array(wav_file_path, audio_array, MELDAudioScenario.SAMPLE_RATE)
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[MediaObject(location=wav_file_path, content_type="audio/wav")]
+                )
+            )
+            assert row.Emotion in MELDAudioScenario.LABEL_NAMES
+            references = [Reference(output=Output(text=row.Emotion), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=str(f"awoo{row.serial_number}"), input=input, references=references, split=split_name
+            )
+            instances.append(instance)
+        return instances
```
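The transcoding loop decodes each FLAC clip through `get_array_from_audio_file`, another `audio_utils` helper that this diff does not show. A sketch of the assumed behavior, with `librosa` standing in as the decoder:

```python
# Sketch only: the actual get_array_from_audio_file in helm/common/audio_utils.py
# is not part of this diff; librosa here is an assumed stand-in decoder.
import librosa
import numpy as np


def get_array_from_audio_file(path: str, sample_rate: int) -> np.ndarray:
    """Decode an audio file to a mono float32 waveform, resampled to `sample_rate`."""
    array, _ = librosa.load(path, sr=sample_rate, mono=True)
    return array
```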
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py (new file, +80):

```diff
@@ -0,0 +1,80 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+
+
+class MultilingualLibriSpeechScenario(Scenario):
+    """Multilingual LibriSpeech
+
+    The Multilingual LibriSpeech (Pratap et al, 2020) dataset is derived from read audiobooks
+    from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total
+    of about 6K hours for the other 7 languages. The task is to recognize the textual content from the
+    audio sample.
+
+    Paper: https://arxiv.org/abs/2012.03411
+    Code: https://www.openslr.org/
+
+    Citation:
+    @article{Pratap2020MLSAL,
+      title={MLS: A Large-Scale Multilingual Dataset for Speech Research},
+      author={Vineel Pratap and Qiantong Xu and Anuroop Sriram and Gabriel Synnaeve and Ronan Collobert},
+      journal={ArXiv},
+      year={2020},
+      volume={abs/2012.03411}
+    }
+    """
+
+    HF_DATASET_NAME = "facebook/multilingual_librispeech"
+    LANGUAGE_LIST: List[str] = ["dutch", "german", "french", "spanish", "italian", "portuguese", "polish"]
+
+    name = "multilingual_librispeech"
+    description = (
+        "Speech recognition in 7 different languages ([Pratap et al, 2020](https://arxiv.org/abs/2012.03411))."
+    )
+    tags: List[str] = ["audio", "multilinguality", "recognition"]
+
+    def __init__(self, language: str) -> None:
+        super().__init__()
+
+        language = language.lower()
+        if language not in MultilingualLibriSpeechScenario.LANGUAGE_LIST:
+            raise ValueError(f"Invalid language. Valid languages are: {MultilingualLibriSpeechScenario.LANGUAGE_LIST}")
+
+        self._language: str = language
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        for idx, row in enumerate(
+            tqdm(
+                load_dataset(
+                    MultilingualLibriSpeechScenario.HF_DATASET_NAME,
+                    name=self._language,
+                    cache_dir=output_path,
+                    split=TEST_SPLIT,
+                )
+            )
+        ):
+            local_audio_path = os.path.join(audio_save_dir, str(idx) + "_" + row["original_path"].split("/")[-1])
+            # Write the decoded audio array to the local path
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+            answer = row["transcript"]
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
```
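Because the constructor rejects anything outside `LANGUAGE_LIST`, one scenario object covers exactly one language, so covering all seven means instantiating the class once per entry. An illustrative sketch (paths are arbitrary):

```python
# Illustrative only: builds one scenario per supported language and counts instances.
from helm.benchmark.scenarios.audio_language.multilingual_librispeech_scenario import (
    MultilingualLibriSpeechScenario,
)

for language in MultilingualLibriSpeechScenario.LANGUAGE_LIST:
    scenario = MultilingualLibriSpeechScenario(language=language)
    # Each call downloads and caches that language's test split under output_path.
    instances = scenario.get_instances(output_path=f"scratch/mls_{language}")
    print(language, len(instances))
```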
helm/benchmark/scenarios/audio_language/mustard_scenario.py (new file, +142):

```diff
@@ -0,0 +1,142 @@
+import json
+import os
+from typing import List
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import is_invalid_audio_file, extract_audio
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class MUStARDScenario(Scenario):
+    """
+    MUStARD: Multimodal Sarcasm Detection Dataset
+
+    A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
+    TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
+    of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
+    additional information on the scenario where it occurs.
+
+    This scenario extracts only the audio from the given videos.
+
+    The columns of the dataset are:
+    - utterance: The text of the target utterance to classify.
+    - speaker: Speaker of the target utterance.
+    - context: List of utterances (in chronological order) preceding the target utterance.
+    - context_speakers: Respective speakers of the context utterances.
+    - sarcasm: Binary label for sarcasm tag.
+
+    More specifically, an example looks like this:
+
+    "1_60": {
+      "utterance": "It's just a privilege to watch your mind at work.",
+      "speaker": "SHELDON",
+      "context": [
+        "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
+        "My apologies. What's your plan?"
+      ],
+      "context_speakers": [
+        "LEONARD",
+        "SHELDON"
+      ],
+      "show": "BBT",
+      "sarcasm": true
+    }
+
+    The key is the video id.
+
+    The video folder has two subfolders:
+    - context_final: Contains the context videos (e.g., 1_60_c.mp4)
+    - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)
+
+    Citation:
+
+    @inproceedings{mustard,
+        title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
+        author = "Castro, Santiago and
+          Hazarika, Devamanyu and
+          P{\'e}rez-Rosas, Ver{\'o}nica and
+          Zimmermann, Roger and
+          Mihalcea, Rada and
+          Poria, Soujanya",
+        booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
+                     (Volume 1: Long Papers)",
+        month = "7",
+        year = "2019",
+        address = "Florence, Italy",
+        publisher = "Association for Computational Linguistics",
+    }
+    """
+
+    RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
+    ANNOTATIONS_URL: str = (
+        "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/" "sarcasm_data.json"
+    )
+
+    name = "mustard"
+    description = "Sarcasm detection benchmark ([Castro et al, 2019](https://arxiv.org/abs/1906.01815))."
+    tags = ["audio", "classification", "toxicity", "sarcasm detection"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations
+        annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
+        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
+
+        # Where the video files will be downloaded to
+        video_path: str = os.path.join(output_path, "video")
+        ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)
+
+        # Where the audio files will be extracted to
+        audio_path: str = os.path.join(output_path, "audio")
+        ensure_directory_exists(audio_path)
+
+        instances: List[Instance] = []
+        annotations = json.load(open(annotations_path, "r"))
+        for key, row in tqdm(annotations.items()):
+            # Extract the audio from the context video
+            context_audio_path: str = os.path.join(audio_path, f"{key}_c.mp3")
+            if not os.path.exists(context_audio_path):
+                # Extract the audio from the video
+                context_video_path: str = os.path.join(video_path, "context_final", f"{key}_c.mp4")
+                extract_audio(context_video_path, context_audio_path)
+            assert not is_invalid_audio_file(context_audio_path), f"Invalid audio file: {context_audio_path}"
+
+            # Extract the audio from the target utterance video
+            utterance_audio_path: str = os.path.join(audio_path, f"{key}.mp3")
+            if not os.path.exists(utterance_audio_path):
+                utterance_video_path: str = os.path.join(video_path, "utterances_final", f"{key}.mp4")
+                extract_audio(utterance_video_path, utterance_audio_path)
+            assert not is_invalid_audio_file(utterance_audio_path), f"Invalid audio file: {utterance_audio_path}"
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        # Input both the context and the utterance audio
+                        MediaObject(text="Context:", content_type="text/plain"),
+                        MediaObject(location=context_audio_path, content_type="audio/mpeg"),
+                        MediaObject(text="Utterance:", content_type="text/plain"),
+                        MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
+                        MediaObject(
+                            text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
+                        ),
+                    ]
+                )
+            )
+            is_sarcastic: bool = row["sarcasm"]
+            references = [
+                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
+                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
+            ]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        return instances
```
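`extract_audio` and `is_invalid_audio_file` are likewise imported from the new `helm/common/audio_utils.py`, whose implementation this diff omits. One common way to implement the extraction step, assumed rather than confirmed here, is to shell out to ffmpeg:

```python
# Sketch only: the real extract_audio in helm/common/audio_utils.py is not shown
# in this diff. This version assumes ffmpeg is available on PATH.
import subprocess


def extract_audio(video_path: str, audio_path: str) -> None:
    """Strip the audio track from a video file and encode it (here, as MP3)."""
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-vn", "-q:a", "2", audio_path],
        check=True,
        capture_output=True,
    )
```

The assertions after each extraction then act as a guard that the ffmpeg (or equivalent) step produced a readable audio file.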