crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/scenarios/test_mmlu_pro_scenario.py
@@ -0,0 +1,53 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_mmlu_pro_scenario():
+    with TemporaryDirectory() as tmpdir:
+        # Test for the "math" subject
+        scenario = MMLUProScenario(subject="math")
+        instances = scenario.get_instances(tmpdir)
+        assert instances[1].input == Input(
+            text="Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?"  # noqa: E501
+        )
+
+        # Ensure it handles up to 10 answer options
+        assert instances[1].references == [
+            Reference(output=Output(text="ST + TS is the identity map of V onto itself."), tags=[]),
+            Reference(output=Output(text="TS = 0"), tags=[]),
+            Reference(output=Output(text="ST = 1"), tags=[]),
+            Reference(output=Output(text="ST - TS = 0"), tags=[]),
+            Reference(output=Output(text="ST = T"), tags=[]),
+            Reference(output=Output(text="ST = 0"), tags=[]),
+            Reference(output=Output(text="ST = TS"), tags=[]),
+            Reference(output=Output(text="ST - TS is the identity map of V onto itself."), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="TS = T"), tags=[]),
+            Reference(output=Output(text="ST = S"), tags=[]),
+        ]
+        assert instances[1].split == "train"
+
+        scenario = MMLUProScenario(subject="health")
+        instances = scenario.get_instances(tmpdir)
+
+        assert instances[0].input == Input(
+            text="Which of the following is the body cavity that contains the pituitary gland?"
+        )
+
+        # Check references with more answer choices and correct tagging
+        assert instances[0].references == [
+            Reference(output=Output(text="Ventral"), tags=[]),
+            Reference(output=Output(text="Dorsal"), tags=[]),
+            Reference(output=Output(text="Buccal"), tags=[]),
+            Reference(output=Output(text="Thoracic"), tags=[]),
+            Reference(output=Output(text="Pericardial"), tags=[]),
+            Reference(output=Output(text="Abdominal"), tags=[]),
+            Reference(output=Output(text="Spinal"), tags=[]),
+            Reference(output=Output(text="Pelvic"), tags=[]),
+            Reference(output=Output(text="Pleural"), tags=[]),
+            Reference(output=Output(text="Cranial"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"

helm/benchmark/scenarios/test_oab_exams_scenario.py
@@ -0,0 +1,51 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.oab_exams_scenario import OABExamsScenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_oab_exams_scenario():
+    oab_scenario = OABExamsScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = oab_scenario.get_instances(tmpdir)
+        assert len(instances) == 2210
+        assert instances[100].split == TEST_SPLIT
+
+        assert instances[100].input.text.startswith(
+            "O Congresso Nacional e suas respectivas Casas se reúnem anualmente para a atividade legislativa."
+        )
+        assert len(instances[100].input.text) == 178
+
+        assert instances[100].references == [
+            Reference(
+                output=Output(
+                    text="Legislatura: o período compreendido entre 2 de fevereiro a 17 de julho e 1º de agosto a 22 de dezembro."  # noqa: E501
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="Sessão legislativa: os quatro anos equivalentes ao mandato dos parlamentares."  # noqa: E501
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="Sessão conjunta: a reunião da Câmara dos Deputados e do Senado Federal destinada, por exemplo, "
+                    "a conhecer do veto presidencial e sobre ele deliberar."  # noqa: E501
+                ),
+                tags=[CORRECT_TAG],
+            ),
+            Reference(
+                output=Output(
+                    text="Sessão extraordinária: a que ocorre por convocação ou do Presidente do Senado Federal ou do "
+                    "Presidente da Câmara dos Deputados ou do Presidente da República e mesmo por requerimento da maioria "
+                    "dos membros de ambas as Casas para, excepcionalmente, inaugurar a "
+                    "sessão legislativa e eleger as respectivas mesas diretoras."  # noqa: E501
+                ),
+                tags=[],
+            ),
+        ]
+        assert instances[100].references[2].is_correct

helm/benchmark/scenarios/test_omni_math_scenario.py
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.omni_math_scenario import OmniMATHScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_omni_math_scenario_get_instances():
+    omni_math_scenario = OmniMATHScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = omni_math_scenario.get_instances(tmpdir)
+        assert len(instances) == 4428
+        assert instances[0].input == Input(
+            text=(
+                "Let $ n(\\ge2) $ be a positive integer. Find the minimum $ m $, "
+                "so that there exists $x_{ij}(1\\le i ,j\\le n)$ satisfying:\n(1)For every "
+                "$1\\le i ,j\\le n, x_{ij}=max\\{x_{i1},x_{i2},...,x_{ij}\\} $ or $ x_{ij}="
+                "max\\{x_{1j},x_{2j},...,x_{ij}\\}.$\n(2)For every $1\\le i \\le n$, there "
+                "are at most $m$ indices $k$ with $x_{ik}=max\\{x_{i1},x_{i2},...,x_{ik}\\}."
+                "$\n(3)For every $1\\le j \\le n$, there are at most $m$ indices $k$ with "
+                "$x_{kj}=max\\{x_{1j},x_{2j},...,x_{kj}\\}.$"
+            )
+        )
+        assert instances[0].split == TEST_SPLIT
+        assert instances[0].references
+        assert instances[0].references[0].output.text == "1 + \\left\\lceil \\frac{n}{2} \\right\\rceil"

helm/benchmark/scenarios/test_tweetsentbr_scenario.py
@@ -0,0 +1,24 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario
+from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_tweetsentbr_scenario():
+    tweetsentbr = TweetSentBRScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = tweetsentbr.get_instances(tmpdir)
+        assert len(instances) == 2085
+        assert instances[0].split == TRAIN_SPLIT
+
+        assert instances[0].input.text.startswith("joca tá com a corda toda 😂 😂 😂 😂")
+        assert len(instances[0].input.text) == 32
+
+        assert instances[0].references == [
+            Reference(
+                output=Output(text="Positivo"),
+                tags=[CORRECT_TAG],
+            )
+        ]

helm/benchmark/scenarios/test_wildbench_scenario.py
@@ -0,0 +1,15 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.wildbench_scenario import WildBenchScenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_wildbench_scenario_get_instances():
+    wildbench_scenario = WildBenchScenario(subset="v2")
+    with TemporaryDirectory() as tmpdir:
+        instances = wildbench_scenario.get_instances(tmpdir)
+        assert len(instances) == 1024
+        assert instances[0].split == TEST_SPLIT
+        assert instances[0].extra_data

helm/benchmark/scenarios/test_winogrande_afr_scenario.py
@@ -0,0 +1,19 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.winogrande_afr_scenario import Winogrande_Afr_Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_winogrande_afr_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = Winogrande_Afr_Scenario(lang="am")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 3674
+        assert instances[0].input == Input(text="ሳራ ከማሪያ በጣም የተሻለች የቀዶ ጥገና ሐኪም ስለነበረች ሁልጊዜ _ ቀላል ህመሞችን ታክማለች.")
+        assert instances[0].references == [
+            Reference(output=Output(text="ሳራ"), tags=[]),
+            Reference(output=Output(text="ማሪያ"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"

helm/benchmark/scenarios/thai_exam_scenario.py
@@ -4,7 +4,16 @@ import json
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class ThaiExamScenario(Scenario):

helm/benchmark/scenarios/the_pile_scenario.py
@@ -7,7 +7,7 @@ from typing import Dict, List
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 
 class ThePileScenario(Scenario):

helm/benchmark/scenarios/truthful_qa_scenario.py
@@ -3,7 +3,16 @@ import os
 from typing import List, Dict, Any
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    VALID_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class TruthfulQAScenario(Scenario):

helm/benchmark/scenarios/tweetsentbr_scenario.py
@@ -0,0 +1,66 @@
+from typing import Any, List, Dict
+from pathlib import Path
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class TweetSentBRScenario(Scenario):
+    """
+    TweetSentBR is a corpus of tweets in Brazilian Portuguese. It was labeled by several
+    annotators following steps established in the literature for improving reliability on
+    the task of sentiment analysis. Each tweet was annotated with one of the following three classes:
+
+    Positive - tweets where the user meant a positive reaction to or evaluation of the main topic of the post;
+    Negative - tweets where the user meant a negative reaction to or evaluation of the main topic of the post;
+    Neutral - tweets belonging to neither of the previous classes, usually not making a point, off topic,
+    irrelevant, confusing, or containing only objective data.
+
+    This dataset is a subset of TweetSentBR: it contains only 75 samples from the training set
+    and all 2,000+ instances of the test set. It is meant for evaluating language models in a few-shot setting.
+    """
+
+    name = "simple_classification"
+    description = "Classify tweets into Positive, Negative or Neutral."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
+        for example in dataset[split]:
+            input = Input(text=example["sentence"])
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
+            ]
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            if split not in dataset:  # skip splits missing from the downloaded dataset
+                hlog(f"{split} split doesn't exist, skipping")
+                continue
+            instances.extend(self.process_dataset(dataset, splits[split]))
+
+        return instances
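
For orientation, here is a minimal usage sketch of the scenario added above. It is not part of the diff; the output directory name is a placeholder, and running it requires network access for the Hugging Face download.

from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario

# Build the instances into a local working directory (path is arbitrary).
scenario = TweetSentBRScenario()
instances = scenario.get_instances("./tweetsentbr_data")

# Inputs are tweets; the single correct reference is the Portuguese label
# ("Positivo", "Negativo" or "Neutro").
print(len(instances))
print(instances[0].split, instances[0].input.text)
print([reference.output.text for reference in instances[0].references])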

helm/benchmark/scenarios/twitter_aae_scenario.py
@@ -4,7 +4,7 @@ from typing import List
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"

helm/benchmark/scenarios/unitxt_scenario.py
@@ -32,13 +32,19 @@ class UnitxtScenario(Scenario):
         self.kwargs = kwargs
 
     def get_instances(self, output_path: str) -> List[Instance]:
-
+        if len(self.kwargs) == 1 and "recipe" in self.kwargs:
+            dataset_name = self.kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
         dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
         instances: List[Instance] = []
 
         for unitxt_split_name, helm_split_name in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
-
+            dataset_split = dataset.get(unitxt_split_name)
+            if dataset_split is None:
+                continue
+            for index, row in enumerate(dataset_split):
                 references = [
                     Reference(
                         output=Output(text=reference_text),
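
To illustrate the dataset-name logic introduced above: with ordinary keyword arguments the name is assembled pair by pair, while a lone "recipe" argument is passed through verbatim. A small sketch of the two branches; the card/template values are illustrative, not taken from the diff.

# Multiple kwargs are joined into "key=value" pairs (dicts preserve insertion order):
kwargs = {"card": "cards.sst2", "template": "templates.classification.multi_class.simple"}
dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
assert dataset_name == "card=cards.sst2,template=templates.classification.multi_class.simple"

# A single "recipe" key is used unchanged as the dataset name:
kwargs = {"recipe": "card=cards.sst2,template=templates.classification.multi_class.simple"}
if len(kwargs) == 1 and "recipe" in kwargs:
    dataset_name = kwargs["recipe"]
assert dataset_name == "card=cards.sst2,template=templates.classification.multi_class.simple"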

helm/benchmark/scenarios/vicuna_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
 
 
 class VicunaScenario(Scenario):

helm/benchmark/scenarios/vision_language/blink_scenario.py
@@ -0,0 +1,140 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class BlinkScenario(Scenario):
+    """
+    BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”,
+    but pose significant challenges for VLMs.
+
+    Website: https://zeyofu.github.io/blink/
+
+    @article{fu2024blink,
+        title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
+        author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
+        Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+        journal={arXiv preprint arXiv:2404.12390},
+        year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"
+
+    VALID_CATEGORIES: List[str] = [
+        "Art_Style",
+        "Counting",
+        "Forensic_Detection",
+        "Functional_Correspondence",
+        "IQ_Test",
+        "Jigsaw",
+        "Multi-view_Reasoning",
+        "Object_Localization",
+        "Relative_Depth",
+        "Relative_Reflectance",
+        "Semantic_Correspondence",
+        "Spatial_Relation",
+        "Visual_Correspondence",
+        "Visual_Similarity",
+    ]
+
+    name = "blink"
+    description = (
+        "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
+        "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
+        self._category: str = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        def save_image(image) -> str:
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+            return local_image_path
+
+        def get_image_header(image_index: int) -> str:
+            if image_index == 1:
+                return "First image:"
+            elif image_index == 2:
+                return "Second image:"
+            elif image_index == 3:
+                return "Third image:"
+            elif image_index == 4:
+                return "Fourth image:"
+            else:
+                raise ValueError(f"Invalid image index: {image_index}")
+
+        instances: List[Instance] = []
+        for row in tqdm(
+            load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
+        ):
+            # Save the image(s) to disk
+            has_multiple_images: bool = row["image_2"] is not None
+            content: List[MediaObject] = []
+
+            if has_multiple_images:
+                # An example can have up to 4 images
+                for i in range(1, 5):
+                    image_i = row[f"image_{i}"]
+                    if image_i is None:
+                        break
+
+                    # Before each image, include a header text that indicates which number image it is.
+                    # Some prompts refer to specific image numbers within the question, e.g.,
+                    # "Given three similar but different images, take the first image as reference.
+                    # Can you tell which one of the latter two images is most similar to the first one?
+                    # Select from the following choices. (A) the second image (B) the third image"
+                    image_path: str = save_image(image_i)
+                    content.extend(
+                        [
+                            MediaObject(text=get_image_header(i), content_type="text/plain"),
+                            MediaObject(location=image_path, content_type="image/jpeg"),
+                        ]
+                    )
+            else:
+                image1 = row["image_1"]
+                image1_path: str = save_image(image1)
+                content.append(MediaObject(location=image1_path, content_type="image/jpeg"))
+
+            # Add the prompt that has both the question and the answer choices
+            prompt: str = row["prompt"]
+            # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. since we are just expecting the letter answer
+            prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
+            content.append(MediaObject(text=prompt, content_type="text/plain"))
+
+            # The answer has the correct letter choices surrounded by parentheses
+            paren_letter_answer: str = row["answer"]
+            assert (
+                paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
+            ), f"Unexpected answer format: {paren_letter_answer}"
+            letter_answer: str = paren_letter_answer[1]
+            references: List[Reference] = [
+                Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
+            ]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
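
A minimal usage sketch for the scenario above (not part of the diff; the directory name is a placeholder, and the dataset is fetched from Hugging Face at run time):

from helm.benchmark.scenarios.vision_language.blink_scenario import BlinkScenario

# "Counting" is one of the entries in VALID_CATEGORIES above;
# any other value raises ValueError.
scenario = BlinkScenario(category="Counting")
instances = scenario.get_instances("./blink_data")

# Multi-image examples interleave "First image:"-style headers with the images;
# the single correct reference is the bare answer letter.
print(instances[0].references[0].output.text)  # e.g. "A"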

helm/benchmark/scenarios/vision_language/mm_star_scenario.py
@@ -0,0 +1,95 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class MMStarScenario(Scenario):
+    """
+    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
+    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
+    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
+    are first roughly selected from current benchmarks with an automated pipeline; strict human review is then
+    applied to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
+    multi-modal capabilities for the solution.
+
+    Website: https://mmstar-benchmark.github.io/
+
+    @article{chen2024we,
+        title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
+        author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
+        Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
+        journal={arXiv preprint arXiv:2403.20330},
+        year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
+
+    VALID_CATEGORIES: List[str] = [
+        "coarse perception",
+        "fine-grained perception",
+        "instance reasoning",
+        "logical reasoning",
+        "math",
+        "science technology",
+    ]
+
+    name = "mm_star"
+    description = (
+        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
+        "meticulously selected by humans. "
+        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        category = category.replace("_", " ")
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
+        if category == "science technology":
+            category = "science & technology"
+
+        self._category: str = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
+            # Filter by category
+            category: str = row["category"]
+            if category != self._category:
+                continue
+
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
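
And a corresponding sketch for MMStar (again not part of the diff; the directory name is arbitrary), showing the category normalization performed in __init__ above:

from helm.benchmark.scenarios.vision_language.mm_star_scenario import MMStarScenario

# Underscores in the category argument are normalized to spaces, so
# "science_technology" validates and is stored internally as "science & technology".
scenario = MMStarScenario(category="science_technology")
instances = scenario.get_instances("./mm_star_data")
print(instances[0].split)  # all instances are assigned VALID_SPLIT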