crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
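For readers who want to reproduce a comparison like this locally, the sketch below shows one possible way to diff a single file across the two wheels. It is illustrative only: the local wheel filenames and the use of difflib are assumptions, not part of the release; the member paths come from the file listing below (schema_vlm.yaml is renamed to schema_vhelm.yaml in this release).

# Minimal sketch (not part of crfm-helm): diff one file across the two wheels.
# Assumes both wheels were fetched first, e.g. with
#   pip download crfm-helm==0.5.1 --no-deps
#   pip download crfm-helm==0.5.3 --no-deps
import difflib
import zipfile

OLD_WHEEL = "crfm_helm-0.5.1-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "crfm_helm-0.5.3-py3-none-any.whl"  # assumed local filename

def read_lines(wheel_path: str, member: str) -> list[str]:
    """Read one file out of a wheel; wheels are ordinary zip archives."""
    with zipfile.ZipFile(wheel_path) as wheel:
        return wheel.read(member).decode("utf-8").splitlines(keepends=True)

# The schema file was renamed between the two releases (see the listing below).
old_lines = read_lines(OLD_WHEEL, "helm/benchmark/static/schema_vlm.yaml")
new_lines = read_lines(NEW_WHEEL, "helm/benchmark/static/schema_vhelm.yaml")

for line in difflib.unified_diff(
    old_lines, new_lines, fromfile="schema_vlm.yaml", tofile="schema_vhelm.yaml"
):
    print(line, end="")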
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
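Several of the listed paths move modules from the image2structure packages to image2struct, so code that imported the old package paths needs the new module names. A minimal, hypothetical sketch follows; the module path is taken from the listing above, and this is not an official migration snippet.

import importlib

# Old package path (before this release): helm.benchmark.annotation.image2structure.latex_compiler_annotator
# New package path (from 0.5.3 onward):   helm.benchmark.annotation.image2struct.latex_compiler_annotator
annotator_module = importlib.import_module(
    "helm.benchmark.annotation.image2struct.latex_compiler_annotator"
)
print(annotator_module.__name__)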
@@ -1,66 +1,8 @@
 ---
 ############################################################
-
-
-
-    values:
-      - name: generation
-        description: Given the input, the model generates the output free-form.
-      - name: multiple_choice_joint
-        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-      - name: multiple_choice_separate_original
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-      - name: multiple_choice_separate_calibrated
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-      - name: language_modeling
-        description: Given the input, the model assigns the sequence a probability.
-      - name: instructions
-        description: The description of the task that is included at the very beginning of the prompt.
-      - name: global_prefix
-        description: The string that is prepended to the prompt.
-      - name: global_suffix
-        description: The string that is appended to the prompt.
-      - name: instance_prefix
-        description: The string that is included before each instance (e.g., '\n\n').
-      - name: input_prefix
-        description: The string that is included before each input (e.g., 'Question:').
-      - name: input_suffix
-        description: The string that is included after each input (e.g., '\n').
-      - name: reference_prefix
-        description: The string that is included before each reference (for multiple-choice questions).
-      - name: reference_suffix
-        description: The string that is included after each reference (for multiple-choice questions).
-      - name: output_prefix
-        description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-      - name: output_suffix
-        description: The string that is included after the correct answer/predicted output (e.g., '\n').
-      - name: substitutions
-        description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-      - name: max_train_instances
-        description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-      - name: max_eval_instances
-        description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-      - name: num_outputs
-        description: Maximum number of possible outputs to generate by sampling multiple outputs.
-      - name: num_train_trials
-        description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-      - name: sample_train
-        description: If true, randomly sample N training examples; if false, select N consecutive training examples
-      - name: model
-        description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-      - name: model_deployment
-        description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-      - name: temperature
-        description: Temperature parameter used in generation.
-      - name: max_tokens
-        description: Maximum number of tokens to generate.
-      - name: stop_sequences
-        description: List of sequences, where we stop generation if we encounter any of them.
-      - name: random
-        description: Random seed (string), which guarantees reproducibility.
-      - name: multi_label
-        description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+# For backwards compatibility with older versions of HELM.
+# TODO: Remove this after 2024-09-01.
+adapter: []
 ############################################################
 metrics:
   # Infrastructure metrics:
@@ -220,49 +162,10 @@ metrics:
     display_name: CIDEr
     description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
     lower_is_better: false
-
-
-
-
-    short_display_name: Stereotypes (race)
-    lower_is_better: true
-    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-  - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
-    display_name: Stereotypical associations (race, adjectives)
-    short_display_name: Stereotypes (race)
-    lower_is_better: true
-    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
-  - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
-    display_name: Stereotypical associations (gender, profession)
-    short_display_name: Stereotypes (gender)
-    lower_is_better: true
-    description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-  - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
-    display_name: Stereotypical associations (gender, adjectives)
-    short_display_name: Stereotypes (gender)
-    lower_is_better: true
-    description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
-  - name: bias_metric:mode=representation,demographic_category=race
-    display_name: Demographic representation (race)
-    short_display_name: Representation (race)
-    lower_is_better: true
-    description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
-  - name: bias_metric:mode=representation,demographic_category=gender
-    display_name: Demographic representation (gender)
-    short_display_name: Representation (gender)
-    lower_is_better: true
-    description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
-  - name: bbq_metric_ambiguous_bias
-    display_name: BBQ (ambiguous)
-    lower_is_better: true
-    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
-  - name: bbq_metric_unambiguous_bias
-    display_name: BBQ (unambiguous)
-    lower_is_better: true
-    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+  - name: prometheus_vision
+    display_name: Prometheus Vision rating
+    description: Scores are from 1 to 5, where 5 is the best.
+    lower_is_better: false

   # Toxicity metrics
   - name: expected_max_toxicity
@@ -281,6 +184,9 @@ metrics:

 ############################################################
 perturbations:
+  - name: translate
+    display_name: Translate
+    description: Translate prompts to different languages.
   - name: robustness
     display_name: Robustness
     description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
@@ -322,12 +228,6 @@ metric_groups:
       - name: ${main_name}
         split: ${main_split}

-  - name: efficiency
-    display_name: Efficiency
-    metrics:
-      - name: inference_runtime
-        split: ${main_split}
-
   - name: general_information
     display_name: General information
     metrics:
@@ -348,50 +248,126 @@ metric_groups:
       - name: toxic_frac
         split: ${main_split}

-  - name:
-    display_name:
+  - name: fairness
+    display_name: Fairness
     metrics:
-      - name:
-        split: ${main_split}
-      - name: compilation_success
-        split: ${main_split}
-      - name: fid_similarity
+      - name: ${main_name}
         split: ${main_split}
-
+        perturbation_name: fairness
+
+  - name: robustness
+    display_name: Robustness
+    metrics:
+      - name: ${main_name}
         split: ${main_split}
+        perturbation_name: robustness

-  - name:
-    display_name:
+  - name: translate
+    display_name: Translate
     metrics:
-      - name:
+      - name: ${main_name}
         split: ${main_split}
+        perturbation_name: translate
+

 ############################################################
 run_groups:
   - name: core_scenarios
-    display_name:
-    description:
+    display_name: All
+    description: All scenarios across capabilities
     category: All scenarios
     subgroups:
-      -
-      -
+      - visual_perception
+      - reasoning
+      - knowledge
+      - bias
+      - fairness
+      - toxicity
+      - robustness
+      - multilinguality
+  - name: visual_perception
+    display_name: Visual perception
+    description: Is the output semantically correct, given the text and image inputs?
+    category: Core scenarios
+    subgroups:
+      - vqa_base
       - viz_wiz
-      -
+      - flickr30k
+  - name: reasoning
+    display_name: Reasoning
+    description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+    category: Core scenarios
+    subgroups:
+      - gqa
+      - math_vista
+      - seed_bench
+  - name: real_world_reasoning
+    display_name: Real-world Reasoning
+    description: Reasoning in the real-world
+    category: Core scenarios
+    subgroups:
+      - gqa
+      - seed_bench
+      - mementos
+      - real_world_qa
+  - name: knowledge
+    display_name: Knowledge
+    description: Does the model have knowledge about the world and common sense?
+    category: Core scenarios
+    subgroups:
+      - a_okvqa_base
       - mmmu
-      -
+      - mme
+      - vibe_eval
+      - real_world_qa
+  - name: bias
+    display_name: Bias
+    description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+    category: Core scenarios
+    subgroups:
+      - pairs
+  - name: fairness
+    display_name: Fairness
+    description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+    category: Core scenarios
+    subgroups:
+      - vqa_dialect
+      - a_okvqa_dialect
+      - crossmodal_3600
+      - fair_face
+  - name: toxicity
+    display_name: Toxicity
+    description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+    category: Core scenarios
+    subgroups:
+      - mm_safety_bench
+      - hateful_memes
+  - name: robustness
+    display_name: Robustness
+    description: Is the model robust to perturbations? We focus on both text and image perturbations.
+    category: Core scenarios
+    subgroups:
+      - vqa_robustness
+      - a_okvqa_robustness
       - unicorn
       - bingo
-      - multipanelvqa
       - pope
-
-
+  - name: multilinguality
+    display_name: Multilinguality
+    description: Do the model support non-English languages?
+    category: Core scenarios
+    subgroups:
+      - a_okvqa_chinese
+      - a_okvqa_hindi
+      - a_okvqa_spanish
+      - a_okvqa_swahili
+      - exams_v

-  - name:
+  - name: a_okvqa_base
     display_name: A-OKVQA
-    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([
+    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
@@ -403,15 +379,110 @@ run_groups:
       when: "2023"
       language: English

+  - name: a_okvqa_dialect
+    display_name: A-OKVQA (AAE)
+    description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - fairness
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: a_okvqa_robustness
+    display_name: A-OKVQA (robustness)
+    description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - robustness
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: a_okvqa_chinese
+    display_name: A-OKVQA (chinese)
+    description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - translate
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: Chinese
+
+  - name: a_okvqa_hindi
+    display_name: A-OKVQA (hindi)
+    description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - translate
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: Hindi
+
+  - name: a_okvqa_spanish
+    display_name: A-OKVQA (spanish)
+    description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - translate
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: Spanish
+
+  - name: a_okvqa_swahili
+    display_name: A-OKVQA (swahili)
+    description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - translate
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: Swahili
+
   - name: crossmodal_3600
     display_name: Crossmodal 3600
-    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([
+    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: prometheus_vision
       main_split: test
     taxonomy:
       task: multilingual captioning
@@ -422,13 +493,12 @@ run_groups:

   - name: flickr30k
     display_name: Flickr30k
-    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([
+    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: prometheus_vision
       main_split: test
     taxonomy:
       task: image captioning
@@ -439,145 +509,112 @@ run_groups:

   - name: gqa
     display_name: GQA
-    description: Questions about real-world visual reasoning and compositional QA
+    description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2019"
       language: English

-  - name: heim_human_eval
-    display_name: HEIM Human Eval Scenario
-    description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: multiple-choice question answering
-      what: AI-generated images
-      who: Text-to-image models
-      when: "2024"
-      language: English
-
-  - name: image2structure
-    display_name: Image2Structure
-    description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
-    category: All scenarios
-    subgroups:
-      - image2latex
-      - image2webpage
-      - image2musicsheet
-
   - name: hateful_memes
     display_name: Hateful Memes
-    description:
+    description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: toxicity identification
+      what: Memes
+      who: Human experts
       when: "2020"
       language: English

   - name: mm_safety_bench
     display_name: MM-SafetyBench
-    description:
+    description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
     metric_groups:
       - accuracy
-      - efficiency
       - toxicity
       - general_information
     environment:
       main_name: toxic_frac
       main_split: test
     taxonomy:
-      task:
-      what:
+      task: toxicity mitigation
+      what: Jail-break images
       who: Human experts
       when: "2023"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: viz_wiz
+    display_name: VizWiz
+    description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
-      what: Real
-      who:
-      when: "
+      task: short-answer question answering
+      what: Real-world images
+      who: Visually impaired people
+      when: "2018"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: vqa_base
+    display_name: VQAv2
+    description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
-      what: Real
+      task: short-answer question answering
+      what: Real-world images
       who: Human experts
-      when: "
+      when: "2017"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: vqa_dialect
+    display_name: VQAv2 (AAE)
+    description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
-      -
-      - efficiency
+      - fairness
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
-      who:
-      when: "
+      who: Human experts
+      when: "2017"
       language: English

-  - name:
-    display_name: VQAv2
-    description: Open-ended questions about real-world images [
+  - name: vqa_robustness
+    display_name: VQAv2 (robustness)
+    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
-      -
-      - efficiency
+      - robustness
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
@@ -585,10 +622,9 @@ run_groups:

   - name: math_vista
     display_name: MathVista
-    description:
+    description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
@@ -602,16 +638,15 @@ run_groups:

   - name: mmmu
     display_name: MMMU
-    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [
+    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: multiple-choice question answering
       what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
       who: Human experts
       when: "2023"
@@ -619,7 +654,7 @@ run_groups:

   - name: unicorn
     display_name: Unicorn
-    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
     metric_groups:
       - accuracy
       - general_information
@@ -627,7 +662,7 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: OOD images and sketch images
       who: Human experts
       when: "2023"
@@ -635,48 +670,31 @@ run_groups:

   - name: bingo
     display_name: Bingo
-    description: Open-ended questions about biased images
+    description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
     metric_groups:
       - accuracy
+      - general_information
     environment:
-      main_name:
+      main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
       who: Human experts
       when: "2023"
       language: English, Chinese, Japanese, etc.
-
-  - name: multipanelvqa
-    display_name: MultipanelVQA
-    description: Question about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: short answer or multiple-choice question answering
-      what: Real-world or synthetic multipanel images
-      who: Human experts
-      when: "2024"
-      language: English
-
+
   - name: pope
     display_name: POPE
-    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2023"
@@ -684,11 +702,9 @@ run_groups:

   - name: seed_bench
     display_name: Seed Bench
-    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
-      including the comprehension of both the image and video modality
+    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
@@ -702,10 +718,9 @@ run_groups:

   - name: mme
     display_name: MME
-    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
@@ -717,107 +732,98 @@ run_groups:
       when: "2023"
       language: English

-  - name:
-    display_name:
-    description: A
+  - name: vibe_eval
+    display_name: Vibe Eval
+    description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
     metric_groups:
       - accuracy
+      - general_information
     environment:
-      main_name:
+      main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
-      what:
+      task: short-answer question answering
+      what: Knowledge intensive
       who: Human experts
       when: "2024"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: mementos
+    display_name: Mementos
+    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
     metric_groups:
       - accuracy
-      - generation_image
-      - generation_text
-      - efficiency
       - general_information
     environment:
-      main_name:
-      main_split:
+      main_name: prometheus_vision
+      main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: short-answer question answering
+      what: Image sequences of comics, daily life and robotics
+      who: Human experts
       when: "2024"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: pairs
+    display_name: PAIRS
+    description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
     metric_groups:
       - accuracy
-      - generation_image
-      - generation_text
-      - efficiency
       - general_information
     environment:
-      main_name:
-      main_split:
+      main_name: exact_match
+      main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: multiple-choice question answering
+      what: Bias
+      who: Human experts
       when: "2024"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: fair_face
+    display_name: FairFace
+    description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
     metric_groups:
       - accuracy
-      - generation_image
-      - efficiency
       - general_information
     environment:
-      main_name:
+      main_name: exact_match
       main_split: valid
     taxonomy:
-      task:
-      what:
-      who:
-      when: "
+      task: multiple-choice question answering
+      what: Fairness
+      who: Human experts
+      when: "2019"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: real_world_qa
+    display_name: RealWorldQA
+    description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: short-answer question answering
+      what: Real world images
+      who: Human experts
       when: "2024"
       language: English

-  - name:
-    display_name:
-    description:
+  - name: exams_v
+    display_name: Exams-V
+    description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
     environment:
       main_name: exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
-      what:
+      what: Exam questions
       who: Human experts
       when: "2024"
-      language: English
+      language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish
|