crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml}

@@ -160,6 +160,11 @@ metrics:
     short_display_name: Exact Match
     description: Unsorted List Exact Match
     lower_is_better: false
+  - name: f1_strings
+    display_name: F1 Strings
+    short_display_name: F1 Strings
+    description: F1 Strings
+    lower_is_better: false
 
   # FinQA Accuracy
   - name: program_accuracy
@@ -174,21 +179,44 @@ metrics:
     lower_is_better: false
 
   # SciGen Accuracy
-  - name:
+  - name: llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference
     display_name: Rating
     short_display_name: Rating
-    description: Rating by Llama 3 (
+    description: Rating by Llama 3.1 (70B) LLM as judge
+    lower_is_better: false
+
+  # Robustness
+  # NOTE: This is a "virtual" metric that is not produced directly by the metrics, but will appear as an aggregate table.
+  # Run `helm-summarize` with `--summarizer-class helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer`
+  # to compute the values of this metric.
+  - name: robustness
+    display_name: Robustness
+    short_display_name: Robustness
+    description: Robustness
     lower_is_better: false
 
 perturbations: []
 
 metric_groups:
-  - name:
-    display_name:
+  - name: performance_metrics
+    display_name: Performance
+    aggregation_strategies:
+      - mean
     metrics:
       - name: ${main_name}
         split: __all__
 
+  # NOTE: Robustness is a "virtual" metric that is not produced directly by the metrics, but will appear as an aggregate table.
+  # Run `helm-summarize` with `--summarizer-class helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer`
+  # to compute the values of this metric.
+  - name: robustness_metrics
+    display_name: Robustness
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: robustness
+        split: __all__
+
   - name: generation_metrics
     display_name: Other Generation Metrics
     hide_win_rates: true
@@ -244,22 +272,85 @@ run_groups:
     description: Table Scenarios
     category: All Scenarios
     subgroups:
-      -
-      -
-      -
-      -
+      - fin_qa
+      - numeric_nlg
+      - qtsumm
+      - scigen
+      - tab_fact
+      - tablebench_data_analysis
+      - tablebench_fact_checking
+      - tablebench_numerical_reasoning
+      - turl_col_type
+      - wikitq
+
+  - name: fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
 
-  - name:
+  - name: numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      -
-      -
+      - performance_metrics
+      - robustness_metrics
+      # - generation_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: qtsumm
+    display_name: QTSumm
+    short_display_name: QTSumm
+    description: QTFumm
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      # - generation_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: scigen
+    display_name: SciGen
+    description: SciGen
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: rougeL
       main_split: test
     taxonomy:
       task: "?"
@@ -268,13 +359,13 @@ run_groups:
       when: "?"
       language: English
 
-  - name:
+  - name: tab_fact
     display_name: TabFact
     short_display_name: TabFact
     description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      -
-      -
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
@@ -287,17 +378,17 @@ run_groups:
       when: "?"
       language: English
 
-  - name:
-    display_name:
-    short_display_name:
-    description: "
+  - name: tablebench_data_analysis
+    display_name: Tablebench Data Analysis
+    short_display_name: Tablebench Data Analysis
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      -
-      -
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: rougeL
       main_split: test
     taxonomy:
       task: "?"
@@ -306,32 +397,74 @@ run_groups:
       when: "?"
       language: English
 
-  - name:
-    display_name:
-
+  - name: tablebench_fact_checking
+    display_name: Tablebench Fact Checking
+    short_display_name: Tablebench Fact Checking
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      -
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: rougeL
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
-      when:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
       language: English
 
-  - name:
-    display_name:
-
+  - name: tablebench_numerical_reasoning
+    display_name: Tablebench Numerical Reasoning
+    short_display_name: Tablebench Numerical Reasoning
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: turl_col_type
+    display_name: Turl Col Type
+    description: Turl Col Type
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_micro
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
     metric_groups:
-      -
+      - performance_metrics
+      - robustness_metrics
+      # - classification_metrics
       - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: f1_strings
       main_split: test
     taxonomy:
       task: "?"
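The comments in the schema above describe `robustness` as a "virtual" metric that is produced by a dedicated summarizer rather than by the metric classes themselves. A minimal invocation sketch follows; the `--summarizer-class` flag and class path are quoted directly from the comment, while the suite name is a placeholder and any other arguments are assumptions rather than something stated in this diff:

    helm-summarize --suite <your-suite> --summarizer-class helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer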
helm/benchmark/static/schema_tweetsentbr.yaml (new file)

@@ -0,0 +1,146 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core Scenarios
+    description: Core Scenarios
+    category: All scenarios
+    subgroups:
+      - tweetsentbr
+
+  - name: tweetsentbr
+    display_name: TweetSentBR
+    description: TweetSentBR
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "text classification"
+      what: "tweets with sentiments"
+      who: "?"
+      when: "2018"
+      language: Portuguese
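In the new schema above, the metric groups reference `${main_name}` and `${main_split}` placeholders. These appear to be filled in per run group from its `environment` block when results are summarized; this expansion behavior is an assumption based on how the same placeholders are used in the other HELM schema files in this diff, not something the diff states. A hypothetical expansion for the `tweetsentbr` run group would be:

    metrics:
      - name: exact_match   # ${main_name} resolved from environment.main_name (assumed)
        split: test         # ${main_split} resolved from environment.main_split (assumed)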
helm/benchmark/static/schema_vhelm.yaml

@@ -122,6 +122,11 @@ metrics:
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
     lower_is_better: false
+  - name: quasi_leave_articles_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
   - name: prefix_exact_match
     display_name: Prefix exact match
     short_display_name: PEM
@@ -295,6 +300,8 @@ run_groups:
       - viz_wiz
       - flickr30k
       - pope
+      - mm_star_perception
+      - blink_perception
   - name: reasoning
     display_name: Reasoning
     description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
@@ -305,6 +312,8 @@ run_groups:
       - seed_bench
       - mementos
       - real_world_qa
+      - mm_star_reasoning
+      - blink_reasoning
   - name: knowledge
     display_name: Knowledge
     description: Does the model have knowledge about the world and common sense?
@@ -314,6 +323,8 @@ run_groups:
       - mmmu
       - mme
       - vibe_eval
+      - mm_star_knowledge
+      - blink_knowledge
   - name: bias
     display_name: Bias
     description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
@@ -347,8 +358,6 @@ run_groups:
     description: Is the model robust to perturbations? We focus on both text and image perturbations.
     category: Core scenarios
     subgroups:
-      - vqa_robustness
-      - a_okvqa_robustness
       - unicorn
       - bingo
   - name: multilinguality
@@ -394,22 +403,6 @@ run_groups:
       when: "2023"
       language: English
 
-  - name: a_okvqa_robustness
-    display_name: A-OKVQA (robustness)
-    description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
-    metric_groups:
-      - robustness
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: valid
-    taxonomy:
-      task: multiple-choice question answering
-      what: Real-world images
-      who: Human experts
-      when: "2023"
-      language: English
-
   - name: a_okvqa_chinese
     display_name: A-OKVQA (chinese)
     description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
@@ -474,6 +467,102 @@ run_groups:
       when: "2023"
       language: Swahili
 
+  - name: mm_star_perception
+    display_name: MM-Star (Perception subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_reasoning
+    display_name: MM-Star (Reasoning subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_knowledge
+    display_name: MM-Star (Knowledge subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_perception
+    display_name: BLINK (Perception subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_knowledge
+    display_name: BLINK (Knowledge subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_reasoning
+    display_name: BLINK (Reasoning subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
   - name: crossmodal_3600
     display_name: Crossmodal 3600
     description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
@@ -603,22 +692,6 @@ run_groups:
       when: "2017"
       language: English
 
-  - name: vqa_robustness
-    display_name: VQAv2 (robustness)
-    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
-    metric_groups:
-      - robustness
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: short-answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: English
-
   - name: math_vista
     display_name: MathVista
     description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
@@ -658,7 +731,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering
@@ -834,7 +907,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_leave_articles_exact_match
       main_split: test
    taxonomy:
       task: short-answer question answering