crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
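The largest single addition is the new VHELM presentation schema, helm/benchmark/static/schema_vhelm.yaml (+824 lines), reproduced below. For orientation, here is a minimal sketch (illustrative only, not part of the package) of how that file can be inspected with PyYAML; the path matches the listing above, and the top-level keys (metrics, perturbations, metric_groups, run_groups) match the file contents shown below.

```python
# Sketch only: inspect the new VHELM schema from an unpacked wheel / repo checkout.
# Assumes PyYAML is installed; the path below is the file added in this release.
import yaml

SCHEMA_PATH = "helm/benchmark/static/schema_vhelm.yaml"

with open(SCHEMA_PATH) as f:
    schema = yaml.safe_load(f)

# Top-level sections defined by the file.
print(sorted(schema.keys()))  # ['adapter', 'metric_groups', 'metrics', 'perturbations', 'run_groups']
print(f"{len(schema['metrics'])} metrics, {len(schema['run_groups'])} run groups")

# Each leaf run group names its metric groups and the environment used to
# fill in the ${main_name}/${main_split} placeholders in those groups.
vqa = next(g for g in schema["run_groups"] if g["name"] == "vqa_base")
print(vqa["metric_groups"], vqa["environment"])
```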
@@ -0,0 +1,824 @@
---
############################################################
# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Vision Language metrics [text]:
  - name: edit_similarity
    display_name: Edit similarity (Levenshtein)
    short_display_name: Edit sim.
    lower_is_better: false
    description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.

  # Vision Language metrics [image]:
  - name: earth_mover_similarity
    display_name: Earth Mover Similarity
    short_display_name: EMD-Sim
    description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
    lower_is_better: false
  - name: pixel_similarity
    display_name: Pixel Similarity
    short_display_name: PS
    description: Pixel Similarity between an image generated by the model and the target image.
    lower_is_better: false
  - name: sift_similarity
    display_name: SIFT Similarity
    short_display_name: SIFT
    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
    lower_is_better: false
  - name: compilation_success
    display_name: Compilation success
    description: Fraction of instances where the generated code compiles successfully.
    lower_is_better: false
  - name: lpips_similarity
    display_name: LPIPS similarity
    short_display_name: LPIPS
    description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
    lower_is_better: false
  - name: fid_similarity
    display_name: FID similarity
    short_display_name: FID
    description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
    lower_is_better: false
  - name: ssim_similarity
    display_name: SSIM
    short_display_name: SSIM
    description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
    lower_is_better: false

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false

  - name: rouge_1
    display_name: ROUGE-1
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
    lower_is_better: false
  - name: rouge_2
    display_name: ROUGE-2
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
    lower_is_better: false
  - name: rouge_l
    display_name: ROUGE-L
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
    lower_is_better: false
  - name: bleu_1
    display_name: BLEU-1
    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
    lower_is_better: false
  - name: bleu_4
    display_name: BLEU-4
    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
    lower_is_better: false
  - name: f1_score
    display_name: F1
    description: Average F1 score in terms of word overlap between the model output and correct reference.
    lower_is_better: false
  - name: cider
    display_name: CIDEr
    description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
    lower_is_better: false
  - name: prometheus_vision
    display_name: Prometheus Vision rating
    description: Scores are from 1 to 5, where 5 is the best.
    lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
    display_name: Average max toxicity
    short_display_name: Avg. max toxicity
    lower_is_better: true
    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
  - name: max_toxicity_probability
    display_name: max toxicity
    lower_is_better: true
    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
  - name: toxic_frac
    display_name: Toxic fraction
    lower_is_better: true
    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).

############################################################
perturbations:
  - name: translate
    display_name: Translate
    description: Translate prompts to different languages.
  - name: robustness
    display_name: Robustness
    description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
  - name: fairness
    display_name: Fairness
    description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
  - name: typos
    display_name: Typos
    description: >
      Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
      performance between perturbed and unperturbed versions.
  - name: synonym
    display_name: Synonyms
    description: >
      Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
      worst-case performance between perturbed and unperturbed versions.
  - name: dialect
    display_name: SAE -> AAE
    short_display_name: Dialect
    description: >
      Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
  - name: race
    display_name: First names by race (White -> Black)
    short_display_name: Race
    description: >
      Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
  - name: gender
    display_name: Pronouns by gender (Male -> Female)
    short_display_name: Gender
    description: >
      Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
      performance between perturbed and unperturbed versions.

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  - name: general_information
    display_name: General information
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}

  - name: toxicity
    display_name: Toxicity
    metrics:
      - name: toxic_frac
        split: ${main_split}

  - name: fairness
    display_name: Fairness
    metrics:
      - name: ${main_name}
        split: ${main_split}
        perturbation_name: fairness

  - name: robustness
    display_name: Robustness
    metrics:
      - name: ${main_name}
        split: ${main_split}
        perturbation_name: robustness

  - name: translate
    display_name: Translate
    metrics:
      - name: ${main_name}
        split: ${main_split}
        perturbation_name: translate


############################################################
run_groups:
  - name: core_scenarios
    display_name: All
    description: All scenarios across capabilities
    category: All scenarios
    subgroups:
      - visual_perception
      - reasoning
      - knowledge
      - bias
      - fairness
      - toxicity
      - robustness
      - multilinguality
  - name: visual_perception
    display_name: Visual perception
    description: Is the output semantically correct, given the text and image inputs?
    category: Core scenarios
    subgroups:
      - vqa_base
      - viz_wiz
      - flickr30k
  - name: reasoning
    display_name: Reasoning
    description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
    category: Core scenarios
    subgroups:
      - gqa
      - math_vista
      - seed_bench
  - name: real_world_reasoning
    display_name: Real-world Reasoning
    description: Reasoning in the real-world
    category: Core scenarios
    subgroups:
      - gqa
      - seed_bench
      - mementos
  - name: knowledge
    display_name: Knowledge
    description: Does the model have knowledge about the world and common sense?
    category: Core scenarios
    subgroups:
      - a_okvqa_base
      - mmmu
      - mme
      - vibe_eval
  - name: bias
    display_name: Bias
    description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
    category: Core scenarios
    subgroups:
      - pairs
  - name: fairness
    display_name: Fairness
    description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
    category: Core scenarios
    subgroups:
      - vqa_dialect
      - a_okvqa_dialect
      - crossmodal_3600
  - name: toxicity
    display_name: Toxicity
    description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
    category: Core scenarios
    subgroups:
      - mm_safety_bench
      - hateful_memes
  - name: robustness
    display_name: Robustness
    description: Is the model robust to perturbations? We focus on both text and image perturbations.
    category: Core scenarios
    subgroups:
      - vqa_robustness
      - a_okvqa_robustness
      - unicorn
      - bingo
      - pope
  - name: multilinguality
    display_name: Multilinguality
    description: Do the model support non-English languages?
    category: Core scenarios
    subgroups:
      - a_okvqa_chinese
      - a_okvqa_hindi
      - a_okvqa_spanish
      - a_okvqa_swahili

  - name: a_okvqa_base
    display_name: A-OKVQA
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: a_okvqa_dialect
    display_name: A-OKVQA (AAE)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - fairness
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: a_okvqa_robustness
    display_name: A-OKVQA (robustness)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - robustness
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: a_okvqa_chinese
    display_name: A-OKVQA (chinese)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: Chinese

  - name: a_okvqa_hindi
    display_name: A-OKVQA (hindi)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: Hindi

  - name: a_okvqa_spanish
    display_name: A-OKVQA (spanish)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: Spanish

  - name: a_okvqa_swahili
    display_name: A-OKVQA (swahili)
    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: Swahili

  - name: crossmodal_3600
    display_name: Crossmodal 3600
    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: prometheus_vision
      main_split: test
    taxonomy:
      task: multilingual captioning
      what: Real-world images
      who: Human experts
      when: "2022"
      language: 36 languages

  - name: flickr30k
    display_name: Flickr30k
    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: prometheus_vision
      main_split: test
    taxonomy:
      task: image captioning
      what: Flickr images
      who: Human experts
      when: "2014"
      language: English

  - name: gqa
    display_name: GQA
    description: Questions about real-world visual reasoning and compositional QA
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: short answer question answering
      what: Real-world images
      who: Human experts
      when: "2019"
      language: English

  - name: hateful_memes
    display_name: Hateful Memes
    description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: multimodal classification
      what: images and text
      who: annotators from Amazon Mechanical Turk
      when: "2020"
      language: English

  - name: mm_safety_bench
    display_name: MM-SafetyBench
    description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
    metric_groups:
      - accuracy
      - toxicity
      - general_information
    environment:
      main_name: toxic_frac
      main_split: test
    taxonomy:
      task: safety
      what: safety images
      who: Human experts
      when: "2023"
      language: English

  - name: viz_wiz
    display_name: VizWiz
    description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
    metric_groups:
      - accuracy
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Visually impaired people
      when: "2018"
      language: English

  - name: vqa_base
    display_name: VQAv2
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: English

  - name: vqa_dialect
    display_name: VQAv2 (AAE)
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - fairness
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: English

  - name: vqa_robustness
    display_name: VQAv2 (robustness)
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - robustness
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: English

  - name: vqa_chinese
    display_name: VQAv2 (chinese)
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: Chinese

  - name: vqa_hindi
    display_name: VQAv2 (hindi)
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: Hindi

  - name: vqa_spanish
    display_name: VQAv2 (spanish)
    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
    metric_groups:
      - translate
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: multimodal short answer question answering
      what: Real-world images
      who: Human experts
      when: "2017"
      language: Spanish

  - name: math_vista
    display_name: MathVista
    description: Evaluating Math Reasoning in Visual Contexts
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
      what: Evaluating Math Reasoning in Visual Contexts
      who: Human experts
      when: "2024"
      language: English

  - name: mmmu
    display_name: MMMU
    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multimodal multiple-choice question answering
      what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
      who: Human experts
      when: "2023"
      language: English

  - name: unicorn
    display_name: Unicorn
    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: short answer question answering
      what: OOD images and sketch images
      who: Human experts
      when: "2023"
      language: English

  - name: bingo
    display_name: Bingo
    description: Open-ended questions about biased images
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: prometheus_vision
      main_split: test
    taxonomy:
      task: short answer question answering
      what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
      who: Human experts
      when: "2023"
      language: English, Chinese, Japanese, etc.

  - name: pope
    display_name: POPE
    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: short answer question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: seed_bench
    display_name: Seed Bench
    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: mme
    display_name: MME
    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
      what: Real-world images
      who: Human experts
      when: "2023"
      language: English

  - name: vibe_eval
    display_name: Vibe Eval
    description: hard evaluation suite for measuring progress of multimodal language models
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: prometheus_vision
      main_split: test
    taxonomy:
      task: short answer question answering
      what: Knowledge intensive
      who: Human experts
      when: "2024"
      language: English

  - name: mementos
    display_name: Mementos
    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: prometheus_vision
      main_split: test
    taxonomy:
      task: short answer question answering
      what: Image sequences of comics, dailylife and robotics
      who: Human experts
      when: "2024"
      language: English

  - name: pairs
    display_name: PAIRS
    description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
      what: Bias
      who: Human experts
      when: "2024"
      language: English