PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py ADDED Viewed

@@ -0,0 +1,80 @@
+from typing import List
+import os
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+class MultilingualLibriSpeechScenario(Scenario):
+    """Multilingual Librispeech
+    The Multilingual LibriSpeech (Pratap et al, 2020) dataset is derived from read audiobooks
+    from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total
+    of about 6K hours for other 7 languages. The task is to recognize the textual content from the
+    audio sample.
+    Paper: https://arxiv.org/abs/2012.03411
+    Code: https://www.openslr.org/
+    Citation:
+    @article{Pratap2020MLSAL,
+        title={MLS: A Large-Scale Multilingual Dataset for Speech Research},
+        author={Vineel Pratap and Qiantong Xu and Anuroop Sriram and Gabriel Synnaeve and Ronan Collobert},
+        journal={ArXiv},
+        year={2020},
+        volume={abs/2012.03411}
+        }
+    """
+    HF_DATASET_NAME = "facebook/multilingual_librispeech"
+    LANGUAGE_LIST: List[str] = ["dutch", "german", "french", "spanish", "italian", "portuguese", "polish"]
+    name = "multilingual_librispeech"
+    description = (
+        "Speech recognition in 7 different languages ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411))."
+    )
+    tags: List[str] = ["audio", "multilinguality", "recognition"]
+    def __init__(self, language: str) -> None:
+        super().__init__()
+        language = language.lower()
+        if language not in MultilingualLibriSpeechScenario.LANGUAGE_LIST:
+            raise ValueError(f"Invalid language. Valid languages are: {MultilingualLibriSpeechScenario.LANGUAGE_LIST}")
+        self._language: str = language
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        for idx, row in enumerate(
+            tqdm(
+                load_dataset(
+                    MultilingualLibriSpeechScenario.HF_DATASET_NAME,
+                    name=self._language,
+                    cache_dir=output_path,
+                    split=TEST_SPLIT,
+                )
+            )
+        ):
+            local_audio_path = os.path.join(audio_save_dir, str(idx) + "_" + row["original_path"].split("/")[-1])
+            # download to the local path
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+            answer = row["transcript"]
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances

helm/benchmark/scenarios/audio_language/mustard_scenario.py ADDED Viewed

@@ -0,0 +1,142 @@
+import json
+import os
+from typing import List
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import is_invalid_audio_file, extract_audio
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+class MUStARDScenario(Scenario):
+    """
+    MUStARD: Multimodal Sarcasm Detection Dataset
+    A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
+    TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
+    of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
+    additional information on the scenario where it occurs.
+    We just extract the audio from the given videos.
+    The columns of the dataset are:
+    - utterance: The text of the target utterance to classify.
+    - speaker: Speaker of the target utterance.
+    - context: List of utterances (in chronological order) preceding the target utterance.
+    - context_speakers: Respective speakers of the context utterances.
+    - sarcasm: Binary label for sarcasm tag.
+    More specifically an example looks like this:
+    "1_60": {
+        "utterance": "It's just a privilege to watch your mind at work.",
+        "speaker": "SHELDON",
+        "context": [
+            "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
+            "My apologies. What's your plan?"
+        ],
+        "context_speakers": [
+            "LEONARD",
+            "SHELDON"
+        ],
+        "show": "BBT",
+        "sarcasm": true
+    }
+    The key is the video id.
+    The video folder has two subfolders:
+    - context_final: Contains the context videos (e.g., 1_60_c.mp4)
+    - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)
+    Citation:
+    @inproceedings{mustard,
+        title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
+        author = "Castro, Santiago  and
+          Hazarika, Devamanyu  and
+          P{\'e}rez-Rosas, Ver{\'o}nica  and
+          Zimmermann, Roger  and
+          Mihalcea, Rada  and
+          Poria, Soujanya",
+        booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
+                    (Volume 1: Long Papers)",
+        month = "7",
+        year = "2019",
+        address = "Florence, Italy",
+        publisher = "Association for Computational Linguistics",
+    }
+    """
+    RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
+    ANNOTATIONS_URL: str = (
+        "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/" "sarcasm_data.json"
+    )
+    name = "mustard"
+    description = "Sarcasm detection benchmark ([Castro et al, 2018](https://arxiv.org/abs/1906.01815))."
+    tags = ["audio", "classification", "toxicity", "sarcasm detection"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations
+        annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
+        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
+        # Where the video files will be downloaded to
+        video_path: str = os.path.join(output_path, "video")
+        ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)
+        # Where the audio files will be extracted to
+        audio_path: str = os.path.join(output_path, "audio")
+        ensure_directory_exists(audio_path)
+        instances: List[Instance] = []
+        annotations = json.load(open(annotations_path, "r"))
+        for key, row in tqdm(annotations.items()):
+            # Extract the audio from the context video
+            context_audio_path: str = os.path.join(audio_path, f"{key}_c.mp3")
+            if not os.path.exists(context_audio_path):
+                # Extract the audio from the video
+                context_video_path: str = os.path.join(video_path, "context_final", f"{key}_c.mp4")
+                extract_audio(context_video_path, context_audio_path)
+            assert not is_invalid_audio_file(context_audio_path), f"Invalid audio file: {context_audio_path}"
+            # Extract the audio from the target utterance video
+            utterance_audio_path: str = os.path.join(audio_path, f"{key}.mp3")
+            if not os.path.exists(utterance_audio_path):
+                utterance_video_path: str = os.path.join(video_path, "utterances_final", f"{key}.mp4")
+                extract_audio(utterance_video_path, utterance_audio_path)
+            assert not is_invalid_audio_file(utterance_audio_path), f"Invalid audio file: {utterance_audio_path}"
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        # Input both the context and the utterance audio
+                        MediaObject(text="Context:", content_type="text/plain"),
+                        MediaObject(location=context_audio_path, content_type="audio/mpeg"),
+                        MediaObject(text="Utterance:", content_type="text/plain"),
+                        MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
+                        MediaObject(
+                            text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
+                        ),
+                    ]
+                )
+            )
+            is_sarcastic: bool = row["sarcasm"]
+            references = [
+                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
+                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
+            ]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances

helm/benchmark/scenarios/audio_language/mutox_scenario.py ADDED Viewed

@@ -0,0 +1,254 @@
+from io import BytesIO
+from typing import List
+import os
+import requests
+from pydub import AudioSegment
+from tqdm import tqdm
+import pandas as pd
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import is_invalid_audio_file
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog, htrack_block
+class MuToxScenario(Scenario):
+    """
+    MuTox: MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector
+    MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
+    audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
+    dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
+    a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
+    1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
+    classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.
+    Languages:
+        "Arabic": "arb",
+        "Bengali": "ben",
+        "Bulgarian": "bul",
+        "Catalan": "cat",
+        "Czech": "ces",
+        "Mandarin Chinese": "cmn",
+        "Danish": "dan",
+        "German": "deu",
+        "Greek": "ell",
+        "English": "eng",
+        "Estonian": "est",
+        "Western Persian": "fas",
+        "Finnish": "fin",
+        "French": "fra",
+        "Hebrew": "heb",
+        "Hindi": "hin",
+        "Hungarian": "hun",
+        "Indonesian": "ind",
+        "Italian": "ita",
+        "Dutch": "nld",
+        "Polish": "pol",
+        "Portuguese": "por",
+        "Russian": "rus",
+        "Spanish": "spa",
+        "Slovak": "slk",
+        "Swahili": "swh",
+        "Tagalog": "tgl",
+        "Turkish": "tur",
+        "Urdu": "urd",
+        "Vietnamese": "vie",
+    The columns of the dataset are:
+    id: a string id of the segment;
+    lang: 3-letter language code;
+    partition: one of train, dev, or devtest
+    public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
+    audio_file_transcript: text transctiption of the segment;
+    contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
+    label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
+    etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
+    detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
+    mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
+    as float numbers with any value (they can be interpreted as logits,
+    i.e. probabilities before a sigmoid transformation).
+    Citation:
+    @misc{costajussà2023mutox,
+      title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
+      author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
+      Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
+      year={2023},
+      eprint={},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+    }
+    """
+    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"
+    LANGAUGE_CODES = {
+        "Arabic": "arb",
+        "Bengali": "ben",
+        "Bulgarian": "bul",
+        "Catalan": "cat",
+        "Czech": "ces",
+        "Mandarin_Chinese": "cmn",
+        "Danish": "dan",
+        "German": "deu",
+        "Greek": "ell",
+        "English": "eng",
+        "Estonian": "est",
+        "Western_Persian": "fas",
+        "Finnish": "fin",
+        "French": "fra",
+        "Hebrew": "heb",
+        "Hindi": "hin",
+        "Hungarian": "hun",
+        "Indonesian": "ind",
+        "Italian": "ita",
+        "Dutch": "nld",
+        "Polish": "pol",
+        "Portuguese": "por",
+        "Russian": "rus",
+        "Spanish": "spa",
+        "Slovak": "slk",
+        "Swahili": "swh",
+        "Tagalog": "tgl",
+        "Turkish": "tur",
+        "Urdu": "urd",
+        "Vietnamese": "vie",
+    }
+    name = "mutox"
+    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
+    tags = ["audio", "classification", "toxicity "]
+    @staticmethod
+    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
+        """
+        Many of the links do not exist or point to broken so we keep track of them
+        and skip them in the future runs to significantly speed up gathering the instances.
+        """
+        with open(output_path, "a") as f:
+            f.write(bad_audio_file + "\n")
+    def __init__(self, language: str) -> None:
+        super().__init__()
+        self._language_code: str = self.LANGAUGE_CODES[language]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations
+        annotations_path: str = os.path.join(output_path, "mutox.tsv")
+        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
+        # Read bad audio files
+        bad_audio_files: set[str] = set()
+        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
+        if os.path.exists(bad_audio_files_path):
+            # Each line is the audio file name
+            with open(bad_audio_files_path, "r") as f:
+                for line in f:
+                    bad_audio_files.add(line.strip())
+            hlog(f"Found {len(bad_audio_files)} bad audio files.")
+        # Where the audio files will be downloaded to
+        audio_path: str = os.path.join(output_path, "audio")
+        ensure_directory_exists(audio_path)
+        instances: List[Instance] = []
+        df = pd.read_csv(annotations_path, delimiter="\t")
+        hlog(f"Found {len(df)} rows in the dataset")
+        valid_count: int = 0
+        total_count: int = 0
+        for row in tqdm(df.itertuples()):
+            # Only proces examples that are in devtest and the language we're interested in
+            if row.partition != "devtest":
+                continue
+            if row.lang != self._language_code:
+                continue
+            total_count += 1
+            # Discard known bad audio files
+            audio_filename: str = f"{row.id}.mp3"
+            with htrack_block(f"Processing audio file: {audio_filename}"):
+                if audio_filename in bad_audio_files:
+                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
+                    continue
+                local_audio_path: str = os.path.join(audio_path, audio_filename)
+                if not os.path.exists(local_audio_path):
+                    # The provided URL has the complete audio, so we need to download it and clip it
+                    # public_url_segment: a string formatted as url:start:end,
+                    if not isinstance(row.public_url_segment, str):
+                        # Sometimes URL is just a float causing an error. Skip those.
+                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
+                        continue
+                    parts = row.public_url_segment.split()
+                    if len(parts) != 3:
+                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
+                        continue
+                    audio_url, start_ms_str, end_ms_str = parts
+                    start_ms, end_ms = int(start_ms_str), int(end_ms_str)
+                    # Download the full audio file
+                    try:
+                        response = requests.get(audio_url, stream=True)
+                        response.raise_for_status()
+                        # Load audio from the downloaded file
+                        audio = AudioSegment.from_file(BytesIO(response.content))
+                        # Clip the audio
+                        clipped_audio = audio[start_ms:end_ms]
+                        # Save the clipped file
+                        clipped_audio.export(local_audio_path, format="mp3")
+                    except Exception as e:
+                        # Some files are no longer available or invalid
+                        hlog(
+                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
+                        )
+                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
+                        continue
+                if is_invalid_audio_file(local_audio_path):
+                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
+                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
+                    continue
+                else:
+                    valid_count += 1
+                    hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        # Following what was prompted annotators as outlined in the paper
+                        MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
+                        MediaObject(location=local_audio_path, content_type="audio/mpeg"),
+                    ]
+                )
+            )
+            is_toxic: bool = row.label == 1
+            references = [
+                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
+                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
+            ]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
+        return instances

helm/benchmark/scenarios/audio_language/parade_scenario.py ADDED Viewed

@@ -0,0 +1,97 @@
+from typing import List
+import os
+import json
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+class PARADEScenario(Scenario):
+    """PARADE
+    PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status bias
+    in vision-language models. We collect a new dataset of audio-text multi-choice QA task that
+    involves exploring occupation and status bias. The dataset consists of 436 audio-text QA pairs
+    with 3 options each.
+    """
+    ANNOT_URL = (
+        "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/audio_result" "_path_mapping_v2.json"
+    )
+    MALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/onyx.zip"
+    FEMALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/nova.zip"
+    PARADE_INSTRUCTION = "\n\n Answer the question with one of the following options: A, B, or C."
+    SUBSET_LIST = ["occupation", "status"]
+    VOICE_MAPPING = {"male": "onyx", "female": "nova"}
+    name = "parade"
+    description = "Exploring occupation and status bias in the audio-text multi-choice QA task."
+    tags: List[str] = ["audio", "bias"]
+    def __init__(self, subset: str, voice: str) -> None:
+        super().__init__()
+        subset = subset.lower()
+        voice = voice.lower()
+        if subset not in PARADEScenario.SUBSET_LIST:
+            raise ValueError(f"Invalid subset. Valid subsets are: {PARADEScenario.SUBSET_LIST}")
+        if voice not in PARADEScenario.VOICE_MAPPING.keys():
+            raise ValueError(f"Invalid voice. Valid voices are: {PARADEScenario.VOICE_MAPPING.keys()}")
+        self._subset: str = subset
+        self._voice: str = voice
+    def _convert_answer_to_label(self, options: list, answer: str) -> str:
+        option_list = ["A", "B", "C"]
+        return option_list[options.index(answer)]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        annot_save_path = os.path.join(output_path, "annotation.json")
+        ensure_file_downloaded(source_url=PARADEScenario.ANNOT_URL, target_path=annot_save_path)
+        annotations = json.load(open(annot_save_path))[self.VOICE_MAPPING[self._voice]][self._subset]
+        test_annotations = []
+        for key in annotations:
+            for key2 in annotations[key]:
+                test_annotations.append(annotations[key][key2])
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        if self._voice == "male":
+            ensure_file_downloaded(source_url=PARADEScenario.MALE_AUDIO, target_path=audio_save_dir, unpack=True)
+        else:
+            ensure_file_downloaded(source_url=PARADEScenario.FEMALE_AUDIO, target_path=audio_save_dir, unpack=True)
+        for row in tqdm(test_annotations):
+            local_audio_path = os.path.join(output_path, "audio_files", row["path"])
+            answer = self._convert_answer_to_label(row["options"], row["label"])
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            references: List[Reference] = []
+            question = row["question"]
+            for i, option in enumerate(row["options"]):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                        MediaObject(content_type="text/plain", text=question + self.PARADE_INSTRUCTION),
+                    ]
+                )
+            )
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl