crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_heim.yaml
@@ -0,0 +1,1389 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
metrics:
|
|
4
|
+
|
|
5
|
+
- name: num_perplexity_tokens
|
|
6
|
+
display_name: "# tokens"
|
|
7
|
+
description: Average number of tokens in the predicted output (for language modeling,
|
|
8
|
+
the input too).
|
|
9
|
+
|
|
10
|
+
- name: num_bytes
|
|
11
|
+
display_name: "# bytes"
|
|
12
|
+
description: Average number of bytes in the predicted output (for language modeling,
|
|
13
|
+
the input too).
|
|
14
|
+
|
|
15
|
+
- name: num_references
|
|
16
|
+
display_name: "# ref"
|
|
17
|
+
description: Number of references.
|
|
18
|
+
|
|
19
|
+
- name: num_train_trials
|
|
20
|
+
display_name: "# trials"
|
|
21
|
+
description: Number of trials, where in each trial we choose an independent, random
|
|
22
|
+
set of training instances.
|
|
23
|
+
|
|
24
|
+
- name: estimated_num_tokens_cost
|
|
25
|
+
display_name: cost
|
|
26
|
+
description: An estimate of the number of tokens (including prompt and output completions)
|
|
27
|
+
needed to perform the request.
|
|
28
|
+
|
|
29
|
+
- name: num_prompt_tokens
|
|
30
|
+
display_name: "# prompt tokens"
|
|
31
|
+
description: Number of tokens in the prompt.
|
|
32
|
+
|
|
33
|
+
- name: num_prompt_characters
|
|
34
|
+
display_name: "# prompt chars"
|
|
35
|
+
description: Number of characters in the prompt.
|
|
36
|
+
|
|
37
|
+
- name: num_completion_tokens
|
|
38
|
+
display_name: "# completion tokens"
|
|
39
|
+
description: Actual number of completion tokens (over all completions).
|
|
40
|
+
|
|
41
|
+
- name: num_output_tokens
|
|
42
|
+
display_name: "# output tokens"
|
|
43
|
+
description: Actual number of output tokens.
|
|
44
|
+
|
|
45
|
+
- name: max_num_output_tokens
|
|
46
|
+
display_name: Max output tokens
|
|
47
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier
|
|
48
|
+
due to stop sequences).
|
|
49
|
+
|
|
50
|
+
- name: num_requests
|
|
51
|
+
display_name: "# requests"
|
|
52
|
+
description: Number of distinct API requests.
|
|
53
|
+
|
|
54
|
+
- name: num_instances
|
|
55
|
+
display_name: "# eval"
|
|
56
|
+
description: Number of evaluation instances.
|
|
57
|
+
|
|
58
|
+
- name: num_train_instances
|
|
59
|
+
display_name: "# train"
|
|
60
|
+
description: Number of training instances (e.g., in-context examples).
|
|
61
|
+
|
|
62
|
+
- name: prompt_truncated
|
|
63
|
+
display_name: truncated
|
|
64
|
+
description: Fraction of instances where the prompt itself was truncated (implies
|
|
65
|
+
that there were no in-context examples).
|
|
66
|
+
|
|
67
|
+
- name: finish_reason_length
|
|
68
|
+
display_name: finish b/c length
|
|
69
|
+
description: Fraction of instances where the the output was terminated because of
|
|
70
|
+
the max tokens limit.
|
|
71
|
+
|
|
72
|
+
- name: finish_reason_stop
|
|
73
|
+
display_name: finish b/c stop
|
|
74
|
+
description: Fraction of instances where the the output was terminated because of
|
|
75
|
+
the stop sequences.
|
|
76
|
+
|
|
77
|
+
- name: finish_reason_endoftext
|
|
78
|
+
display_name: finish b/c endoftext
|
|
79
|
+
description: Fraction of instances where the the output was terminated because the
|
|
80
|
+
end of text token was generated.
|
|
81
|
+
|
|
82
|
+
- name: finish_reason_unknown
|
|
83
|
+
display_name: finish b/c unknown
|
|
84
|
+
description: Fraction of instances where the the output was terminated for unknown
|
|
85
|
+
reasons.
|
|
86
|
+
|
|
87
|
+
- name: num_completions
|
|
88
|
+
display_name: "# completions"
|
|
89
|
+
description: Number of completions.
|
|
90
|
+
|
|
91
|
+
- name: predicted_index
|
|
92
|
+
display_name: Predicted index
|
|
93
|
+
description: Integer index of the reference (0, 1, ...) that was predicted by the
|
|
94
|
+
model (for multiple-choice).
|
|
95
|
+
|
|
96
|
+
- name: training_co2_cost
|
|
97
|
+
display_name: Estimated training emissions (kg CO2)
|
|
98
|
+
short_display_name: Training emissions (kg CO2)
|
|
99
|
+
lower_is_better: true
|
|
100
|
+
description: Estimate of the CO2 emissions from training the model.
|
|
101
|
+
|
|
102
|
+
- name: training_energy_cost
|
|
103
|
+
display_name: Estimated training energy cost (MWh)
|
|
104
|
+
short_display_name: Training energy (MWh)
|
|
105
|
+
lower_is_better: true
|
|
106
|
+
description: Estimate of the amount of energy used to train the model.
|
|
107
|
+
|
|
108
|
+
- name: inference_runtime
|
|
109
|
+
display_name: Observed inference runtime (s)
|
|
110
|
+
short_display_name: Observed inference time (s)
|
|
111
|
+
lower_is_better: true
|
|
112
|
+
description: Average observed time to process a request to the model (via an API,
|
|
113
|
+
and thus depends on particular deployment).
|
|
114
|
+
|
|
115
|
+
- name: inference_idealized_runtime
|
|
116
|
+
display_name: Idealized inference runtime (s)
|
|
117
|
+
short_display_name: Idealized inference time (s)
|
|
118
|
+
lower_is_better: true
|
|
119
|
+
description: Average time to process a request to the model based solely on the
|
|
120
|
+
model architecture (using Megatron-LM).
|
|
121
|
+
|
|
122
|
+
- name: inference_denoised_runtime
|
|
123
|
+
display_name: Denoised inference runtime (s)
|
|
124
|
+
short_display_name: Denoised inference time (s)
|
|
125
|
+
lower_is_better: true
|
|
126
|
+
description: Average time to process a request to the model minus performance contention
|
|
127
|
+
by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
|
|
128
|
+
|
|
129
|
+
- name: batch_size
|
|
130
|
+
display_name: Batch size
|
|
131
|
+
description: For batch jobs, how many requests are in a batch.
|
|
132
|
+
|
|
133
|
+
- name: fid
|
|
134
|
+
display_name: FID
|
|
135
|
+
short_display_name: FID
|
|
136
|
+
description: Fréchet Inception Distance (FID) is a metric used for evaluating the
|
|
137
|
+
quality of images generated by models. The FID compares the distribution of generated
|
|
138
|
+
images with the distribution of real images in feature space.
|
|
139
|
+
lower_is_better: true
|
|
140
|
+
|
|
141
|
+
- name: inception_score
|
|
142
|
+
display_name: Inception Score (IS)
|
|
143
|
+
short_display_name: Inception Score
|
|
144
|
+
description: Inception score (IS) is a metric used for evaluating the quality of
|
|
145
|
+
image-based generative models. The score is calculated based on the output of
|
|
146
|
+
a separate, pretrained Inception v3 image classification model applied to a sample
|
|
147
|
+
of generated images.
|
|
148
|
+
lower_is_better: false
|
|
149
|
+
|
|
150
|
+
- name: kernel_inception_distance
|
|
151
|
+
display_name: Kernel Inception Distance (KID)
|
|
152
|
+
short_display_name: Kernel Inception Distance
|
|
153
|
+
description: Kernel Inception Distance (KID) is a metric used to assess the quality
|
|
154
|
+
of image-based generative models. It was proposed to replace FID.
|
|
155
|
+
lower_is_better: true
|
|
156
|
+
|
|
157
|
+
- name: expected_lpips_score
|
|
158
|
+
display_name: Expected Learned Perceptual Image Patch Similarity (LPIPS) score
|
|
159
|
+
short_display_name: Expected LPIPS score
|
|
160
|
+
description: The Learned Perceptual Image Patch Similarity (LPIPS) is used to judge
|
|
161
|
+
the perceptual similarity between two images. LPIPS computes the similarity between
|
|
162
|
+
the activations of two image patches for some pre-defined network.
|
|
163
|
+
lower_is_better: true
|
|
164
|
+
|
|
165
|
+
- name: expected_multi_scale_ssim_score
|
|
166
|
+
display_name: Expected Multi-scale Structural Similarity Index Measure (SSIM)
|
|
167
|
+
short_display_name: Expected Multi-Scale SSIM
|
|
168
|
+
description: The Multi-scale Structural Similarity Index Measure (MS-SSIM) is measure
|
|
169
|
+
of image quality and a generalization of Structural Similarity Index Measure (SSIM)
|
|
170
|
+
by incorporating image details at different resolution scores.
|
|
171
|
+
lower_is_better: false
|
|
172
|
+
|
|
173
|
+
- name: expected_psnr_score
|
|
174
|
+
display_name: Expected Peak Signal-to-Noise Ratio (PSNR)
|
|
175
|
+
short_display_name: Expected PSNR
|
|
176
|
+
description: Peak signal-to-noise ratio (PSNR) is the ratio between the maximum
|
|
177
|
+
possible power of a signal and the power of corrupting noise that affects the
|
|
178
|
+
fidelity of its representation.
|
|
179
|
+
lower_is_better: false
|
|
180
|
+
|
|
181
|
+
- name: expected_uiqi_score
|
|
182
|
+
display_name: Expected Universal Image Quality Index (UIQI)
|
|
183
|
+
short_display_name: Expected UIQI
|
|
184
|
+
description: The Universal Image Quality Index (UIQI) is a full-reference image
|
|
185
|
+
quality assessment method that measures the similarity between two images by comparing
|
|
186
|
+
their luminance, contrast, and structure.
|
|
187
|
+
lower_is_better: false
|
|
188
|
+
|
|
189
|
+
- name: gender_imbalance
|
|
190
|
+
display_name: Gender imbalance
|
|
191
|
+
short_display_name: Gender imbalance
|
|
192
|
+
description: The gender imbalance metric assesses the presence of binary gender
|
|
193
|
+
bias within a set of images.
|
|
194
|
+
lower_is_better: true
|
|
195
|
+
|
|
196
|
+
- name: skin_tone_imbalance
|
|
197
|
+
display_name: Skin tone imbalance
|
|
198
|
+
short_display_name: Skin tone imbalance
|
|
199
|
+
description: This skin tone metric assesses the presence of skin tone bias within
|
|
200
|
+
a set of images.
|
|
201
|
+
lower_is_better: true
|
|
202
|
+
|
|
203
|
+
- name: expected_clip_score
|
|
204
|
+
display_name: Expected CLIP score
|
|
205
|
+
short_display_name: Expected CLIP score
|
|
206
|
+
description: CLIPscore measures how well an image is aligned with a corresponding
|
|
207
|
+
natural language description, using the pre-trained CLIP model.
|
|
208
|
+
lower_is_better: false
|
|
209
|
+
|
|
210
|
+
- name: max_clip_score
|
|
211
|
+
display_name: Maximum CLIP score
|
|
212
|
+
short_display_name: Max CLIP score
|
|
213
|
+
description: CLIPscore measures how well an image is aligned with a corresponding
|
|
214
|
+
natural language description, using the pre-trained CLIP model.
|
|
215
|
+
lower_is_better: false
|
|
216
|
+
|
|
217
|
+
- name: detection_correct_frac
|
|
218
|
+
display_name: Detection correct fraction
|
|
219
|
+
short_display_name: Detection correct frac
|
|
220
|
+
description: Fraction of correct images according to the ViTDet object detector
|
|
221
|
+
with ViT-B backbone and detectron2 library to automatically detect objects specified
|
|
222
|
+
in the prompts.
|
|
223
|
+
lower_is_better: false
|
|
224
|
+
|
|
225
|
+
- name: expected_aesthetics_score
|
|
226
|
+
display_name: Expected aesthetics score
|
|
227
|
+
short_display_name: Expected aesthetics score
|
|
228
|
+
description: Following LAION-Aesthetics for determining the aesthetic score of a
|
|
229
|
+
generated image.
|
|
230
|
+
lower_is_better: false
|
|
231
|
+
|
|
232
|
+
- name: max_aesthetics_score
|
|
233
|
+
display_name: Maximum aesthetics score
|
|
234
|
+
short_display_name: Max aesthetics score
|
|
235
|
+
description: Following LAION-Aesthetics for determining the aesthetic score of a
|
|
236
|
+
generated image.
|
|
237
|
+
lower_is_better: false
|
|
238
|
+
|
|
239
|
+
- name: aesthetics_human
|
|
240
|
+
display_name: Aesthetics (human)
|
|
241
|
+
short_display_name: Aesthetics (human)
|
|
242
|
+
description: How aesthetically pleasing is the image according to human evaluators.
|
|
243
|
+
lower_is_better: false
|
|
244
|
+
|
|
245
|
+
- name: image_text_alignment_human
|
|
246
|
+
display_name: Image text alignment (human)
|
|
247
|
+
short_display_name: Image text alignment (human)
|
|
248
|
+
description: How well does the image match the description according to human evaluators.
|
|
249
|
+
lower_is_better: false
|
|
250
|
+
|
|
251
|
+
- name: clear_subject_human
|
|
252
|
+
display_name: Clear subject (human)
|
|
253
|
+
short_display_name: Clear subject (human)
|
|
254
|
+
description: Is it clear who the subject(s) of the image is according to human evaluators.
|
|
255
|
+
lower_is_better: false
|
|
256
|
+
|
|
257
|
+
- name: originality_human
|
|
258
|
+
display_name: Originality (human)
|
|
259
|
+
short_display_name: Originality (human)
|
|
260
|
+
description: How original is the image, given it was created with the description
|
|
261
|
+
according to human evaluators.
|
|
262
|
+
lower_is_better: false
|
|
263
|
+
|
|
264
|
+
- name: copyright_human
|
|
265
|
+
display_name: Copyright (human)
|
|
266
|
+
short_display_name: Copyright (human)
|
|
267
|
+
description: Whether the image is a derivative work of another image according to
|
|
268
|
+
human evaluators.
|
|
269
|
+
lower_is_better: true
|
|
270
|
+
|
|
271
|
+
- name: toxic_prompt_perspective_api
|
|
272
|
+
display_name: Toxic prompt (Perspective API)
|
|
273
|
+
short_display_name: Toxic prompt (Perspective API)
|
|
274
|
+
description: Whether the prompt is toxic according to the Perspective API
|
|
275
|
+
lower_is_better: true
|
|
276
|
+
|
|
277
|
+
- name: toxic_prompt_openai
|
|
278
|
+
display_name: Toxic prompt (OpenAI)
|
|
279
|
+
short_display_name: Toxic prompt (OpenAI)
|
|
280
|
+
description: Whether the prompt violated [OpenAI's content policy](https://labs.openai.com/policies/content-policy).
|
|
281
|
+
lower_is_better: true
|
|
282
|
+
|
|
283
|
+
- name: expected_max_nsfw_score
|
|
284
|
+
display_name: Expected maximum NSFW score
|
|
285
|
+
short_display_name: Expected max NSFW score
|
|
286
|
+
description: NSFW (not safe for work) classifier from LAION
|
|
287
|
+
lower_is_better: true
|
|
288
|
+
|
|
289
|
+
- name: expected_max_nsfw_score_given_toxic_prompt=True
|
|
290
|
+
display_name: Expected maximum NSFW score given toxic prompt
|
|
291
|
+
short_display_name: Expected max NSFW score given toxic prompt
|
|
292
|
+
description: NSFW (not safe for work) classifier from LAION
|
|
293
|
+
lower_is_better: true
|
|
294
|
+
|
|
295
|
+
- name: expected_max_nsfw_score_given_toxic_prompt=False
|
|
296
|
+
display_name: Expected maximum NSFW score given non-toxic prompt
|
|
297
|
+
short_display_name: Expected max NSFW score given non-toxic prompt
|
|
298
|
+
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: fractal_dimension_loss
    display_name: Fractal dimension loss
    short_display_name: Fractal dimension loss
    description: Fractal coefficient is a measure of the fractal patterns in an image.
      The ideal value is 1.4.
    lower_is_better: true

  - name: max_nsfw_probability
    display_name: Expected maximum NSFW probability
    short_display_name: Expected max NSFW probability
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: max_nsfw_probability_given_toxic_prompt=True
    display_name: Expected maximum NSFW probability given toxic prompt
    short_display_name: Expected max NSFW probability given toxic prompt
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: max_nsfw_probability_given_toxic_prompt=False
    display_name: Expected maximum NSFW probability given non-toxic prompt
    short_display_name: Expected max NSFW probability given non-toxic prompt
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: nsfw_frac
    display_name: NSFW frac
    short_display_name: NSFW frac
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: nsfw_frac_given_toxic_prompt=True
    display_name: NSFW frac given toxic prompt
    short_display_name: NSFW frac given toxic prompt
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: nsfw_frac_given_toxic_prompt=False
    display_name: NSFW frac given non-toxic prompt
    short_display_name: NSFW frac given non-toxic prompt
    description: NSFW (not safe for work) classifier from LAION
    lower_is_better: true

  - name: nudity_frac
    display_name: Nudity frac
    short_display_name: Nudity frac
    description: Pre-trained NudeNet to determine whether an image contains nudity.
    lower_is_better: true

  - name: blacked_out_frac
    display_name: Blacked out frac
    short_display_name: Blacked out frac
    description: This metric represents whether the image has been blacked out.
    lower_is_better: true

  - name: blacked_out_frac_given_toxic_prompt=True
    display_name: Blacked out frac given toxic prompt
    short_display_name: Blacked out frac given toxic prompt
    description: This metric represents whether the image has been blacked out.
    lower_is_better: true

  - name: blacked_out_frac_given_toxic_prompt=False
    display_name: Blacked out frac given non-toxic prompt
    short_display_name: Blacked out frac given non-toxic prompt
    description: This metric represents whether the image has been blacked out.
    lower_is_better: true

  - name: watermark_frac
    display_name: Watermark frac
    short_display_name: Watermark frac
    description: Watermark detector from LAION to determine whether an image contains
      watermarks.
    lower_is_better: true

  - name: expected_max_watermark_prob
    display_name: Expected maximum watermark prob
    short_display_name: Expected max watermark prob
    description: Watermark detector from LAION to determine whether an image contains
      watermarks.
    lower_is_better: true

  - name: photorealism_generated_human
    display_name: Photorealism w/ generated images (human)
    short_display_name: Photorealism - generated (human)
    description: Determine if the following image is AI-generated or real according
      to human raters.
    lower_is_better: false

  - name: photorealism_real_human
    display_name: Photorealism w/ real images (human)
    short_display_name: Photorealism - real (human)
    description: Determine if the following image is AI-generated or real according
      to human raters.
    lower_is_better: false

  - name: denoised_runtime
    display_name: Denoised runtime (in seconds)
    short_display_name: Denoised runtime (s)
    description: Denoised runtime is the runtime with the performance variance factored
      out as described [here](https://arxiv.org/abs/2305.02440).
    lower_is_better: true

  - name: prompt_length
    display_name: Prompt length (number of characters)
    short_display_name: Prompt length (characters)
    description: The number of characters in the prompt
    lower_is_better: false

  - name: inference_runtime
    display_name: Inference runtime (in seconds)
    short_display_name: Inference runtime (s)
    description: How long it took to generate the images
    lower_is_better: true

  - name: num_generated_images
    display_name: Number of generated images
    short_display_name: Number of generated images
    description: The number of images the model generated
    lower_is_better: false
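
A short aside on fractal_dimension_loss above: the loss penalizes the distance of an image's estimated fractal dimension from the aesthetically ideal value of 1.4 (hence lower_is_better). As a rough illustration only (not HELM's implementation; the binarization threshold and box scales below are assumptions), the dimension can be estimated by box counting:

```python
# Minimal box-counting sketch of a fractal-dimension loss; illustrative only.
import numpy as np

def box_counting_dimension(image: np.ndarray, threshold: float = 0.5) -> float:
    """Estimate the fractal (Minkowski-Bouligand) dimension of a grayscale
    array with values in [0, 1]; assumes a reasonably large image."""
    pixels = image < threshold  # assumed binarization into a foreground mask
    n = 2 ** int(np.floor(np.log2(min(pixels.shape))))
    pixels = pixels[:n, :n]  # crop to a power-of-two square
    sizes = 2 ** np.arange(int(np.log2(n)) - 1, 0, -1)
    counts = []
    for size in sizes:
        # Count boxes of side `size` containing at least one foreground pixel.
        boxes = pixels.reshape(n // size, size, n // size, size)
        counts.append(max(1, int(np.count_nonzero(boxes.any(axis=(1, 3))))))
    # The dimension is the slope of log N(s) against log(1/s).
    slope, _ = np.polyfit(np.log(1.0 / sizes), np.log(counts), 1)
    return float(slope)

def fractal_dimension_loss(image: np.ndarray, ideal: float = 1.4) -> float:
    return abs(box_counting_dimension(image) - ideal)
```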
perturbations:

  - name: robustness
    display_name: Robustness
    description: Computes worst case over different robustness perturbations (misspellings,
      formatting, contrast sets).

  - name: fairness
    display_name: Fairness
    description: Computes worst case over different fairness perturbations (changing
      dialect, race of names, gender).

  - name: typos
    display_name: Typos
    description: 'Randomly adds typos to each token in the input with probability 0.05
      and computes the per-instance worst-case performance between perturbed and unperturbed
      versions.'

  - name: synonym
    display_name: Synonyms
    description: 'Randomly substitutes words in the input with WordNet synonyms with
      probability 0.5 and computes the per-instance worst-case performance between perturbed
      and unperturbed versions.'

  - name: dialect
    display_name: SAE -> AAE
    short_display_name: Dialect
    description: 'Deterministically substitutes SAE words in input with AAE counterparts
      using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/)
      and computes the per-instance worst-case performance between perturbed and unperturbed
      versions.'

  - name: race
    display_name: First names by race (White -> Black)
    short_display_name: Race
    description: 'Deterministically substitutes White first names with Black first names
      sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230)
      and computes the per-instance worst-case performance between perturbed and unperturbed
      versions.'

  - name: gender
    display_name: Pronouns by gender (Male -> Female)
    short_display_name: Gender
    description: 'Deterministically substitutes male pronouns with female pronouns and
      computes the per-instance worst-case performance between perturbed and unperturbed
      versions.'

  - name: translate
    display_name: Translate
    short_display_name: Translate
    description: 'Translate text to other languages.'
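
The stochastic perturbations above (typos, synonym) share one pattern: perturb the input with some probability per token, then report the per-instance worst case between the perturbed and unperturbed runs. A minimal sketch of the typos case (not HELM's implementation; the adjacent-character swap and the `score_fn` callback are illustrative assumptions):

```python
# Illustrative sketch of the typos perturbation plus worst-case aggregation.
import random

def add_typos(text: str, prob: float = 0.05, seed: int = 0) -> str:
    """Perturb each token with probability `prob` by swapping two adjacent characters."""
    rng = random.Random(seed)
    tokens = []
    for token in text.split():
        if len(token) > 1 and rng.random() < prob:
            i = rng.randrange(len(token) - 1)
            token = token[:i] + token[i + 1] + token[i] + token[i + 2:]
        tokens.append(token)
    return " ".join(tokens)

def worst_case_score(prompt: str, score_fn) -> float:
    """Per-instance worst case between unperturbed and perturbed versions,
    assuming higher scores are better (use max for lower-is-better metrics)."""
    return min(score_fn(prompt), score_fn(add_typos(prompt)))
```

The deterministic perturbations (dialect, race, gender) follow the same aggregation but replace `add_typos` with a per-word dictionary lookup.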
############################################################
metric_groups:

  - name: efficiency
    display_name: Efficiency
    metrics:
      - name: inference_denoised_runtime
        split: "${main_split}"

  - name: efficiency_detailed
    display_name: Efficiency
    description: The efficiency of the model across both training and inference.
    metrics:
      - name: inference_runtime
        split: "${main_split}"
      - name: inference_idealized_runtime
        split: "${main_split}"
      - name: inference_denoised_runtime
        split: "${main_split}"
      - name: training_co2_cost
        split: "${main_split}"
      - name: training_energy_cost
        split: "${main_split}"

  - name: general_information
    display_name: General information
    metrics:
      - name: num_instances
        split: "${main_split}"
      - name: prompt_truncated
        split: "${main_split}"
      - name: num_prompt_tokens
        split: "${main_split}"

  - name: heim_inception
    display_name: HEIM inception metrics
    metrics:
      - name: fid
        split: __all__
      - name: inception_score
        split: __all__
      - name: kernel_inception_distance
        split: __all__

  - name: heim_fidelity
    display_name: HEIM Image fidelity metrics
    metrics:
      - name: expected_lpips_score
        split: "${main_split}"
      - name: expected_multi_scale_ssim_score
        split: "${main_split}"
      - name: expected_psnr_score
        split: "${main_split}"
      - name: expected_uiqi_score
        split: "${main_split}"

  - name: heim_gender
    display_name: HEIM gender metrics
    metrics:
      - name: gender_imbalance
        split: "${main_split}"

  - name: heim_skin_tone
    display_name: HEIM skin tone metrics
    metrics:
      - name: skin_tone_imbalance
        split: "${main_split}"

  - name: heim_alignment_human_metrics
    display_name: Image-text Alignment - Human Evaluation
    metrics:
      - name: image_text_alignment_human
        split: __all__
        perturbation_name: __all__

  - name: heim_alignment_clip_metrics
    display_name: Image-text Alignment - CLIP Score
    metrics:
      - name: expected_clip_score
        split: "${main_split}"
      - name: max_clip_score
        split: "${main_split}"

  - name: heim_quality_human_metrics
    display_name: Quality - Human Evaluation
    metrics:
      - name: photorealism_generated_human
        split: __all__
        perturbation_name: __all__

  - name: heim_quality_fid_metrics
    display_name: Quality - FID
    metrics:
      - name: fid
        split: __all__

  - name: heim_quality_inception_metrics
    display_name: Quality - Inception
    metrics:
      - name: inception_score
        split: __all__
      - name: kernel_inception_distance
        split: __all__

  - name: heim_quality_automated_metrics
    display_name: Quality - Other Automated Metrics
    metrics:
      - name: expected_lpips_score
        split: "${main_split}"
      - name: expected_multi_scale_ssim_score
        split: "${main_split}"
      - name: expected_psnr_score
        split: "${main_split}"
      - name: expected_uiqi_score
        split: "${main_split}"

  - name: heim_aesthetics_human_metrics
    display_name: Aesthetics - Human Evaluation
    metrics:
      - name: clear_subject_human
        split: __all__
        perturbation_name: __all__
      - name: aesthetics_human
        split: __all__
        perturbation_name: __all__

  - name: heim_aesthetics_automated_metrics
    display_name: Aesthetics - Automated Metrics
    metrics:
      - name: expected_aesthetics_score
        split: "${main_split}"
      - name: max_aesthetics_score
        split: "${main_split}"
      - name: fractal_dimension_loss
        split: "${main_split}"

  - name: heim_originality_human_metrics
    display_name: Originality - Human Evaluation
    metrics:
      - name: originality_human
        split: __all__
        perturbation_name: __all__

  - name: heim_originality_watermark_metrics
    display_name: Originality - Watermark
    metrics:
      - name: watermark_frac
        split: "${main_split}"

  - name: heim_reasoning_metrics
    display_name: Reasoning
    metrics:
      - name: detection_correct_frac
        split: "${main_split}"

  - name: heim_bias_metrics
    display_name: Bias
    metrics:
      - name: gender_imbalance
        split: "${main_split}"
      - name: skin_tone_imbalance
        split: "${main_split}"

  - name: heim_gender_metrics
    display_name: Gender imbalance
    metrics:
      - name: gender_imbalance
        split: "${main_split}"

  - name: heim_skin_tone_metrics
    display_name: Skin tone imbalance
    metrics:
      - name: skin_tone_imbalance
        split: "${main_split}"

  - name: heim_toxicity_metrics
    display_name: Toxicity (NSFW)
    metrics:
      - name: nsfw_frac_given_toxic_prompt=False
        split: "${main_split}"

  - name: heim_toxicity_black_metrics
    display_name: Toxicity (blacked out images)
    metrics:
      - name: blacked_out_frac
        split: "${main_split}"

  - name: heim_toxicity_nudity_metrics
    display_name: Toxicity (nudity)
    metrics:
      - name: nudity_frac
        split: "${main_split}"

  - name: heim_efficiency_metrics
    display_name: Efficiency
    metrics:
      - name: denoised_runtime
        split: __all__

  - name: heim_photorealism
    display_name: HEIM photorealism metrics
    metrics:
      - name: photorealism_generated_human
        split: __all__
        perturbation_name: __all__
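
Throughout these metric groups, each metric reference carries a split that is either a literal split name (__all__) or the "${main_split}" placeholder, which is filled in from the owning run group's environment.main_split (valid or test in the run groups below). A minimal sketch of that substitution, assuming plain string templating rather than HELM's actual resolver:

```python
# Illustrative resolution of a "${main_split}" placeholder against a run
# group's environment block; the data mirrors the schema entries in this file.
from string import Template

metric_ref = {"name": "inference_denoised_runtime", "split": "${main_split}"}
environment = {"main_split": "valid"}  # from a run group's `environment`

resolved = Template(metric_ref["split"]).substitute(environment)
print(resolved)  # -> valid
```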
############################################################
run_groups:

  - name: core_scenarios
    display_name: All scenarios
    description: All scenarios
    category: All scenarios
    subgroups:
      - mscoco_base
      - mscoco_fid
      - mscoco_efficiency
      - mscoco_gender
      - mscoco_dialect
      - mscoco_robustness
      - mscoco_chinese
      - mscoco_hindi
      - mscoco_spanish
      - mscoco_art_styles
      - cub200
      - draw_bench_image_quality
      - parti_prompts_image_quality
      - daily_dalle
      - landing_page
      - logos
      - magazine_cover
      - common_syntactic_processes
      - draw_bench_reasoning
      - parti_prompts_reasoning
      - relational_understanding
      - detection
      - winoground
      - parti_prompts_knowledge
      - draw_bench_knowledge
      - time_most_significant_historical_figures
      - demographic_stereotypes
      - mental_disorders
      - i2p

  - name: heim_alignment_scenarios
    display_name: Alignment
    description: Is the image semantically correct given the text (image-text alignment)
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_base
      - cub200
      - draw_bench_image_quality
      - draw_bench_reasoning
      - draw_bench_knowledge
      - parti_prompts_image_quality
      - parti_prompts_reasoning
      - parti_prompts_knowledge

  - name: heim_quality_scenarios
    display_name: Quality
    description: Do the generated images look like real images/photos
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_base
      - draw_bench_image_quality
      - parti_prompts_image_quality

  - name: heim_aesthetics_scenarios
    display_name: Aesthetics
    description: Is the image aesthetically pleasing
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_base
      - mscoco_art_styles
      - daily_dalle
      - logos
      - landing_page
      - magazine_cover

  - name: heim_originality_scenarios
    display_name: Originality
    description: Does the model generate creative images and prevent copyright infringement
    category: Scenarios for specific aspects
    subgroups:
      - daily_dalle
      - logos
      - landing_page
      - magazine_cover

  - name: heim_reasoning_scenarios
    display_name: Reasoning
    description: Does the model understand objects, counts, and spatial relations (compositionality)
    category: Scenarios for specific aspects
    subgroups:
      - common_syntactic_processes
      - draw_bench_reasoning
      - parti_prompts_reasoning
      - relational_understanding
      - detection
      - winoground

  - name: heim_knowledge_scenarios
    display_name: Knowledge
    description: Does the model have knowledge about the world or domains
    category: Scenarios for specific aspects
    subgroups:
      - time_most_significant_historical_figures
      - draw_bench_knowledge
      - parti_prompts_knowledge

  - name: heim_bias_scenarios
    display_name: Bias
    description: Are the generated images biased in demographic representation (e.g.,
      gender, skin tone)
    category: Scenarios for specific aspects
    subgroups:
      - demographic_stereotypes
      - mental_disorders

  - name: heim_toxicity_scenarios
    display_name: Toxicity
    description: Does the model generate toxic or inappropriate images (e.g., violence,
      sexual, illegal content)
    category: Scenarios for specific aspects
    subgroups:
      - i2p

  - name: heim_fairness_dialect_scenarios
    display_name: Fairness - African American Vernacular English (AAVE)
    description: Does the model exhibit performance disparities across social groups.
      The African American Vernacular English (AAVE) dialect perturbation converts each
      word to the corresponding word in AAVE if one exists.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_dialect

  - name: heim_fairness_gender_scenarios
    display_name: Fairness - Gender
    description: Does the model exhibit performance disparities across social groups.
      The gender perturbation maps male gender terms to female gender terms (e.g., son
      to daughter and father to mother).
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_gender

  - name: heim_robustness_scenarios
    display_name: Robustness
    description: Is the model robust to invariant input perturbations
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_robustness

  - name: heim_multilinguality_chinese_scenarios
    display_name: Multilinguality (Chinese)
    description: Does the model support non-English languages.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_chinese

  - name: heim_multilinguality_hindi_scenarios
    display_name: Multilinguality (Hindi)
    description: Does the model support non-English languages.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_hindi

  - name: heim_multilinguality_spanish_scenarios
    display_name: Multilinguality (Spanish)
    description: Does the model support non-English languages.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_spanish

  - name: heim_fid_scenarios
    display_name: Fidelity
    description: Fidelity metrics computed with MS-COCO.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_fid

  - name: heim_efficiency_scenarios
    display_name: Efficiency
    description: How fast is inference for the model
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_efficiency

  - name: heim_art_styles_scenarios
    display_name: Art styles
    description: To test the ability of these models to generate images in specific
      art styles.
    category: Scenarios for specific aspects
    subgroups:
      - mscoco_art_styles

  - name: mscoco
    display_name: MS-COCO (all)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312)).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_quality_human_metrics
      - heim_quality_automated_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_base
    display_name: MS-COCO (base)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312)).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_quality_human_metrics
      - heim_quality_automated_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_fid
    display_name: MS-COCO Fidelity
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      for fidelity. To compute the FID, we randomly selected 30,000 text prompts from
      MS-COCO and generated a single image for each prompt using the text-to-image generation
      model that we are evaluating. Then, we used [pytorch-fid](https://github.com/mseitzer/pytorch-fid)
      to compute the FID between the set of real images associated with the prompts
      and the set of generated images.
    metric_groups:
      - heim_quality_fid_metrics
      - heim_quality_inception_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality
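
The mscoco_fid description above spells out the procedure end to end: sample 30,000 prompts, generate one image per prompt, then compare the real and generated image sets. A minimal sketch of the final step using pytorch-fid's Python API (the two directory paths are placeholders):

```python
# Illustrative FID computation with pytorch-fid; directory paths are placeholders.
import torch
from pytorch_fid.fid_score import calculate_fid_given_paths

fid = calculate_fid_given_paths(
    ["real_images/", "generated_images/"],  # one image per MS-COCO prompt each
    batch_size=50,
    device="cuda" if torch.cuda.is_available() else "cpu",
    dims=2048,  # pool3 features of the standard InceptionV3 backbone
)
print(f"FID: {fid:.2f}")
```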
  - name: mscoco_efficiency
    display_name: MS-COCO Efficiency
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      for efficiency.
    metric_groups:
      - heim_efficiency_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Efficiency

  - name: mscoco_gender
    display_name: MS-COCO (fairness - gender)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with the gender perturbation, which maps male gender terms to female gender terms
      (e.g., son to daughter and father to mother).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality
  - name: mscoco_dialect
    display_name: MS-COCO (fairness - AAVE dialect)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with the African American Vernacular English (AAVE) dialect perturbation, which
      converts each word to the corresponding word in AAVE if one exists.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_robustness
    display_name: MS-COCO (robustness)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with input perturbations.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_chinese
    display_name: MS-COCO (Chinese)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with prompts translated to Chinese.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_hindi
    display_name: MS-COCO (Hindi)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with prompts translated to Hindi.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_spanish
    display_name: MS-COCO (Spanish)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with prompts translated to Spanish.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: mscoco_art_styles
    display_name: MS-COCO (Art styles)
    description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
      with prompts that generate images in specific art styles.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_quality_human_metrics
      - heim_aesthetics_human_metrics
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: Image quality

  - name: cub200
    display_name: Caltech-UCSD Birds-200-2011
    description: Caltech-UCSD Birds-200-2011 is a challenging dataset of 200 bird species
      with 10 captions for each bird ([paper](https://authors.library.caltech.edu/27452/1/CUB_200_2011.pdf),
      [paper](https://arxiv.org/abs/1711.10485)).
    metric_groups:
      - heim_alignment_clip_metrics
      - heim_quality_automated_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Image quality

  - name: draw_bench_image_quality
    display_name: DrawBench (image quality categories)
    description: A comprehensive and challenging set of prompts that support the evaluation
      and comparison of text-to-image models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Image quality

  - name: parti_prompts_image_quality
    display_name: PartiPrompts (image quality categories)
    description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
      us to more comprehensively evaluate and test the limits of text-to-image synthesis
      models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Image quality

  - name: daily_dalle
    display_name: dailydall.e
    description: DALL-E 2 prompts from [Chad Nelson's Instagram](https://www.instagram.com/dailydall.e/)
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_human_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Originality

  - name: landing_page
    display_name: Landing Page
    description: Prompts to generate landing pages for mobile or web applications.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_human_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Originality

  - name: logos
    display_name: Logos
    description: Prompts to generate logos for brands and companies
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_human_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Originality

  - name: magazine_cover
    display_name: Magazine Cover Photos
    description: Prompts to generate magazine cover photos
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_human_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_human_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Originality

  - name: common_syntactic_processes
    display_name: Common Syntactic Processes
    description: Prompts from 8 different grammatical phenomena ([paper](https://arxiv.org/abs/2210.12889)).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning

  - name: draw_bench_reasoning
    display_name: DrawBench (reasoning categories)
    description: A comprehensive and challenging set of prompts that support the evaluation
      and comparison of text-to-image models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning

  - name: parti_prompts_reasoning
    display_name: PartiPrompts (reasoning categories)
    description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
      us to more comprehensively evaluate and test the limits of text-to-image synthesis
      models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning

  - name: relational_understanding
    display_name: Relational Understanding
    description: Based on existing cognitive, linguistic, and developmental literature,
      the authors created a set of 15 relations (8 physical, 7 agentic) and a set of
      12 entities (6 objects, 6 agents)
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning
  - name: detection
    display_name: Detection (PaintSkills)
    description: A compositional diagnostic dataset and evaluation toolkit that measures
      visual reasoning skills ([paper](https://arxiv.org/abs/2202.04053)).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_reasoning_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning
  - name: winoground
    display_name: Winoground
    description: Winoground is a novel task and dataset for evaluating the ability of
      vision and language models to conduct visio-linguistic compositional reasoning
      ([paper](https://arxiv.org/abs/2204.03162)).
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_quality_automated_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Reasoning

  - name: parti_prompts_knowledge
    display_name: PartiPrompts (knowledge categories)
    description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
      us to more comprehensively evaluate and test the limits of text-to-image synthesis
      models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Knowledge

  - name: draw_bench_knowledge
    display_name: DrawBench (knowledge categories)
    description: A comprehensive and challenging set of prompts that support the evaluation
      and comparison of text-to-image models.
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Knowledge

  - name: time_most_significant_historical_figures
    display_name: TIME's most significant historical figures
    description: People from TIME's "The 100 Most Significant Figures in History" list
    metric_groups:
      - heim_alignment_human_metrics
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Knowledge

  - name: demographic_stereotypes
    display_name: Demographic Stereotypes
    description: Simple user prompts ("A photo of the face of...") generate images perpetuating
      dangerous racial, ethnic, gendered, class, and intersectional stereotypes.
    metric_groups:
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_bias_metrics
      - heim_gender_metrics
      - heim_skin_tone_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Unbiasedness

  - name: mental_disorders
    display_name: Mental Disorders
    description: Prompt text-to-image models with mental disorders listed by the World
      Health Organization
    metric_groups:
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_bias_metrics
      - heim_gender_metrics
      - heim_skin_tone_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Unbiasedness
  - name: i2p
    display_name: Inappropriate Image Prompts (I2P)
    description: Contains real user prompts for generative text-to-image models that
      are likely to produce inappropriate images ([paper](https://arxiv.org/abs/2211.05105)).
    metric_groups:
      - heim_alignment_clip_metrics
      - heim_aesthetics_automated_metrics
      - heim_originality_watermark_metrics
      - heim_toxicity_metrics
      - heim_toxicity_black_metrics
      - heim_toxicity_nudity_metrics
      - general_information
    environment:
      main_split: test
    taxonomy:
      task: Toxicity mitigation