crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_vhelm.yaml

@@ -122,6 +122,11 @@ metrics:
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
     lower_is_better: false
+  - name: quasi_leave_articles_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
   - name: prefix_exact_match
     display_name: Prefix exact match
     short_display_name: PEM

@@ -295,25 +300,32 @@ run_groups:
       - viz_wiz
       - flickr30k
       - pope
+      - mm_star_perception
+      - blink_perception
   - name: reasoning
     display_name: Reasoning
     description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
     category: Core scenarios
     subgroups:
+      - mmmu
+      - exams_v
       - gqa
       - math_vista
       - seed_bench
       - mementos
       - real_world_qa
+      - mm_star_reasoning
+      - blink_reasoning
   - name: knowledge
     display_name: Knowledge
     description: Does the model have knowledge about the world and common sense?
     category: Core scenarios
     subgroups:
       - a_okvqa_base
-      - mmmu
       - mme
       - vibe_eval
+      - mm_star_knowledge
+      - blink_knowledge
   - name: bias
     display_name: Bias
     description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.

@@ -347,8 +359,6 @@ run_groups:
     description: Is the model robust to perturbations? We focus on both text and image perturbations.
     category: Core scenarios
     subgroups:
-      - vqa_robustness
-      - a_okvqa_robustness
       - unicorn
       - bingo
   - name: multilinguality

@@ -360,7 +370,6 @@ run_groups:
       - a_okvqa_hindi
       - a_okvqa_spanish
       - a_okvqa_swahili
-      - exams_v
       - bingo_multilinguality
   - name: a_okvqa_base
     display_name: A-OKVQA

@@ -369,7 +378,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -385,23 +394,7 @@ run_groups:
       - fairness
       - general_information
     environment:
-      main_name:
-      main_split: valid
-    taxonomy:
-      task: multiple-choice question answering
-      what: Real-world images
-      who: Human experts
-      when: "2023"
-      language: English
-
-  - name: a_okvqa_robustness
-    display_name: A-OKVQA (robustness)
-    description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
-    metric_groups:
-      - robustness
-      - general_information
-    environment:
-      main_name: exact_match
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -417,7 +410,7 @@ run_groups:
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -433,7 +426,7 @@
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -449,7 +442,7 @@
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -465,7 +458,7 @@
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -474,6 +467,102 @@ run_groups:
       when: "2023"
       language: Swahili

+  - name: mm_star_perception
+    display_name: MM-Star (Perception subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_reasoning
+    display_name: MM-Star (Reasoning subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_knowledge
+    display_name: MM-Star (Knowledge subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_perception
+    display_name: BLINK (Perception subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_knowledge
+    display_name: BLINK (Knowledge subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_reasoning
+    display_name: BLINK (Reasoning subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but difficulty for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
   - name: crossmodal_3600
     display_name: Crossmodal 3600
     description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))

@@ -513,7 +602,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering

@@ -529,7 +618,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: toxicity identification

@@ -562,7 +651,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering

@@ -578,7 +667,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering

@@ -594,23 +683,7 @@ run_groups:
       - fairness
       - general_information
     environment:
-      main_name:
-      main_split: valid
-    taxonomy:
-      task: short-answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: English
-
-  - name: vqa_robustness
-    display_name: VQAv2 (robustness)
-    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
-    metric_groups:
-      - robustness
-      - general_information
-    environment:
-      main_name: quasi_exact_match
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering

@@ -642,7 +715,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -658,7 +731,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering

@@ -722,7 +795,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering

@@ -738,7 +811,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering

@@ -754,7 +827,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering

@@ -802,7 +875,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering

@@ -818,7 +891,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering

@@ -834,7 +907,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_leave_articles_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering

@@ -850,7 +923,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
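The recurring one-line change across the hunks above fills in each run group's environment.main_name, which in HELM's schema format appears to name the metric reported as the group's headline number (main_split names the split it is computed on). A minimal sketch of the run-group shape these hunks edit; example_group is a hypothetical name, not taken from the diff:

    # Sketch of a HELM schema run_groups entry. `example_group` is hypothetical;
    # field meanings follow HELM's schema conventions, not this specific diff.
    run_groups:
      - name: example_group
        display_name: Example Group
        metric_groups:
          - accuracy
          - general_information
        environment:
          main_name: quasi_prefix_exact_match  # headline metric; blank in many groups before 0.5.6
          main_split: valid                    # split whose scores are displayed
        taxonomy:
          task: multiple-choice question answering

Going by its name and HELM's quasi-exact-match conventions, the newly added quasi_leave_articles_exact_match metric appears to be a quasi-exact match whose text normalization leaves articles (a, an, the) in place rather than stripping them.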
@@ -0,0 +1,219 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
perturbations: []
|
|
4
|
+
############################################################
|
|
5
|
+
metrics:
|
|
6
|
+
- name: num_references
|
|
7
|
+
display_name: '# ref'
|
|
8
|
+
description: Number of references.
|
|
9
|
+
- name: num_train_trials
|
|
10
|
+
display_name: '# trials'
|
|
11
|
+
description: Number of trials, where in each trial we choose an independent, random set of training instances.
|
|
12
|
+
- name: estimated_num_tokens_cost
|
|
13
|
+
display_name: 'cost'
|
|
14
|
+
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
|
|
15
|
+
- name: num_prompt_tokens
|
|
16
|
+
display_name: '# prompt tokens'
|
|
17
|
+
description: Number of tokens in the prompt.
|
|
18
|
+
- name: num_prompt_characters
|
|
19
|
+
display_name: '# prompt chars'
|
|
20
|
+
description: Number of characters in the prompt.
|
|
21
|
+
- name: num_completion_tokens
|
|
22
|
+
display_name: '# completion tokens'
|
|
23
|
+
description: Actual number of completion tokens (over all completions).
|
|
24
|
+
- name: num_output_tokens
|
|
25
|
+
display_name: '# output tokens'
|
|
26
|
+
description: Actual number of output tokens.
|
|
27
|
+
- name: max_num_output_tokens
|
|
28
|
+
display_name: 'Max output tokens'
|
|
29
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
|
|
30
|
+
- name: num_requests
|
|
31
|
+
display_name: '# requests'
|
|
32
|
+
description: Number of distinct API requests.
|
|
33
|
+
- name: num_instances
|
|
34
|
+
display_name: '# eval'
|
|
35
|
+
description: Number of evaluation instances.
|
|
36
|
+
- name: num_train_instances
|
|
37
|
+
display_name: '# train'
|
|
38
|
+
description: Number of training instances (e.g., in-context examples).
|
|
39
|
+
- name: prompt_truncated
|
|
40
|
+
display_name: truncated
|
|
41
|
+
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
|
|
42
|
+
- name: finish_reason_length
|
|
43
|
+
display_name: finish b/c length
|
|
44
|
+
description: Fraction of instances where the the output was terminated because of the max tokens limit.
|
|
45
|
+
- name: finish_reason_stop
|
|
46
|
+
display_name: finish b/c stop
|
|
47
|
+
description: Fraction of instances where the the output was terminated because of the stop sequences.
|
|
48
|
+
- name: finish_reason_endoftext
|
|
49
|
+
display_name: finish b/c endoftext
|
|
50
|
+
description: Fraction of instances where the the output was terminated because the end of text token was generated.
|
|
51
|
+
- name: finish_reason_unknown
|
|
52
|
+
display_name: finish b/c unknown
|
|
53
|
+
description: Fraction of instances where the the output was terminated for unknown reasons.
|
|
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Vision Language metrics [image]:
+  - name: earth_mover_similarity
+    display_name: Earth Mover Similarity
+    short_display_name: EMD-Sim
+    description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: pixel_similarity
+    display_name: Pixel Similarity
+    short_display_name: PS
+    description: Pixel Similarity between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: sift_similarity
+    display_name: SIFT Similarity
+    short_display_name: SIFT
+    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: compilation_success
+    display_name: Compilation success
+    description: Fraction of instances where the generated code compiles successfully.
+    lower_is_better: false
+  - name: lpips_similarity
+    display_name: LPIPS similarity
+    short_display_name: LPIPS
+    description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: fid_similarity
+    display_name: FID similarity
+    short_display_name: FID
+    description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: ssim_similarity
+    display_name: SSIM
+    short_display_name: SSIM
+    description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: quasi_leave_articles_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: cider
+    display_name: CIDEr
+    description: Evaluates the quality of generated captions by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+    lower_is_better: false
+  - name: prometheus_vision
+    display_name: Prometheus Vision rating
+    description: Scores are from 1 to 5, where 5 is the best.
+    lower_is_better: false
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: All
+    description: All scenarios across capabilities
+    category: All scenarios
+    subgroups:
+      - msr_vtt
+
+  - name: msr_vtt
+    display_name: MSR-VTT
+    description: A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: captioning
+      what: Real-world videos
+      who: Human experts
+      when: "2016"
+      language: English
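In the schema above, `metric_groups` reference `${main_name}` and `${main_split}` placeholders that each run group fills in through its `environment` block (for `msr_vtt`: `f1_score` and `test`). A minimal sketch of how such placeholders can be resolved, assuming a local copy of the file saved as `schema.yaml` (the substitution code is illustrative, not HELM's own API):

```python
# Illustrative sketch: expand the ${main_name}/${main_split} placeholders
# that metric_groups use, from a run group's environment. How HELM itself
# performs this substitution is an assumption here; "schema.yaml" is a
# hypothetical local copy of the file diffed above.
from string import Template

import yaml  # third-party: pip install pyyaml

with open("schema.yaml") as f:
    schema = yaml.safe_load(f)

# Pick a run group and its environment bindings.
run_group = next(g for g in schema["run_groups"] if g["name"] == "msr_vtt")
env = run_group["environment"]  # {'main_name': 'f1_score', 'main_split': 'test'}

# Expand the placeholders in every metric group.
for group in schema["metric_groups"]:
    for metric in group["metrics"]:
        name = Template(metric["name"]).safe_substitute(env)
        split = Template(metric["split"]).safe_substitute(env)
        print(f"{group['name']}: {name} on split {split}")
# accuracy: f1_score on split test
# general_information: num_instances on split test
# ...
```

`safe_substitute` leaves any unresolved placeholder intact instead of raising, which suits run groups whose environments only bind a subset of the variables.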
Binary file