crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_vhelm.yaml (new file)
@@ -0,0 +1,824 @@
+ ---
+ ############################################################
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Vision Language metrics [text]:
+   - name: edit_similarity
+     display_name: Edit similarity (Levenshtein)
+     short_display_name: Edit sim.
+     lower_is_better: false
+     description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
+
+   # Vision Language metrics [image]:
+   - name: earth_mover_similarity
+     display_name: Earth Mover Similarity
+     short_display_name: EMD-Sim
+     description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: pixel_similarity
+     display_name: Pixel Similarity
+     short_display_name: PS
+     description: Pixel Similarity between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: sift_similarity
+     display_name: SIFT Similarity
+     short_display_name: SIFT
+     description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: compilation_success
+     display_name: Compilation success
+     description: Fraction of instances where the generated code compiles successfully.
+     lower_is_better: false
+   - name: lpips_similarity
+     display_name: LPIPS similarity
+     short_display_name: LPIPS
+     description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: fid_similarity
+     display_name: FID similarity
+     short_display_name: FID
+     description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: ssim_similarity
+     display_name: SSIM
+     short_display_name: SSIM
+     description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
+     lower_is_better: false
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+
+   - name: rouge_1
+     display_name: ROUGE-1
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: rouge_2
+     display_name: ROUGE-2
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+     lower_is_better: false
+   - name: rouge_l
+     display_name: ROUGE-L
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+     lower_is_better: false
+   - name: bleu_1
+     display_name: BLEU-1
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: bleu_4
+     display_name: BLEU-4
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+     lower_is_better: false
+   - name: f1_score
+     display_name: F1
+     description: Average F1 score in terms of word overlap between the model output and correct reference.
+     lower_is_better: false
+   - name: cider
+     display_name: CIDEr
+     description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+     lower_is_better: false
+   - name: prometheus_vision
+     display_name: Prometheus Vision rating
+     description: Scores are from 1 to 5, where 5 is the best.
+     lower_is_better: false
+
+   # Toxicity metrics
+   - name: expected_max_toxicity
+     display_name: Average max toxicity
+     short_display_name: Avg. max toxicity
+     lower_is_better: true
+     description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: max_toxicity_probability
+     display_name: max toxicity
+     lower_is_better: true
+     description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: toxic_frac
+     display_name: Toxic fraction
+     lower_is_better: true
+     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+ ############################################################
+ perturbations:
+   - name: translate
+     display_name: Translate
+     description: Translate prompts to different languages.
+   - name: robustness
+     display_name: Robustness
+     description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+   - name: fairness
+     display_name: Fairness
+     description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+   - name: typos
+     display_name: Typos
+     description: >
+       Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+       performance between perturbed and unperturbed versions.
+   - name: synonym
+     display_name: Synonyms
+     description: >
+       Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+       worst-case performance between perturbed and unperturbed versions.
+   - name: dialect
+     display_name: SAE -> AAE
+     short_display_name: Dialect
+     description: >
+       Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+   - name: race
+     display_name: First names by race (White -> Black)
+     short_display_name: Race
+     description: >
+       Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+   - name: gender
+     display_name: Pronouns by gender (Male -> Female)
+     short_display_name: Gender
+     description: >
+       Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+       performance between perturbed and unperturbed versions.
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+   - name: toxicity
+     display_name: Toxicity
+     metrics:
+       - name: toxic_frac
+         split: ${main_split}
+
+   - name: fairness
+     display_name: Fairness
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+         perturbation_name: fairness
+
+   - name: robustness
+     display_name: Robustness
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+         perturbation_name: robustness
+
+   - name: translate
+     display_name: Translate
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+         perturbation_name: translate
+
+
+ ############################################################
+ run_groups:
+   - name: core_scenarios
+     display_name: All
+     description: All scenarios across capabilities
+     category: All scenarios
+     subgroups:
+       - visual_perception
+       - reasoning
+       - knowledge
+       - bias
+       - fairness
+       - toxicity
+       - robustness
+       - multilinguality
+   - name: visual_perception
+     display_name: Visual perception
+     description: Is the output semantically correct, given the text and image inputs?
+     category: Core scenarios
+     subgroups:
+       - vqa_base
+       - viz_wiz
+       - flickr30k
+   - name: reasoning
+     display_name: Reasoning
+     description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+     category: Core scenarios
+     subgroups:
+       - gqa
+       - math_vista
+       - seed_bench
+   - name: real_world_reasoning
+     display_name: Real-world Reasoning
+     description: Reasoning in the real-world
+     category: Core scenarios
+     subgroups:
+       - gqa
+       - seed_bench
+       - mementos
+   - name: knowledge
+     display_name: Knowledge
+     description: Does the model have knowledge about the world and common sense?
+     category: Core scenarios
+     subgroups:
+       - a_okvqa_base
+       - mmmu
+       - mme
+       - vibe_eval
+   - name: bias
+     display_name: Bias
+     description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+     category: Core scenarios
+     subgroups:
+       - pairs
+   - name: fairness
+     display_name: Fairness
+     description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+     category: Core scenarios
+     subgroups:
+       - vqa_dialect
+       - a_okvqa_dialect
+       - crossmodal_3600
+   - name: toxicity
+     display_name: Toxicity
+     description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+     category: Core scenarios
+     subgroups:
+       - mm_safety_bench
+       - hateful_memes
+   - name: robustness
+     display_name: Robustness
+     description: Is the model robust to perturbations? We focus on both text and image perturbations.
+     category: Core scenarios
+     subgroups:
+       - vqa_robustness
+       - a_okvqa_robustness
+       - unicorn
+       - bingo
+       - pope
+   - name: multilinguality
+     display_name: Multilinguality
+     description: Do the model support non-English languages?
+     category: Core scenarios
+     subgroups:
+       - a_okvqa_chinese
+       - a_okvqa_hindi
+       - a_okvqa_spanish
+       - a_okvqa_swahili
+
+   - name: a_okvqa_base
+     display_name: A-OKVQA
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: a_okvqa_dialect
+     display_name: A-OKVQA (AAE)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - fairness
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: a_okvqa_robustness
+     display_name: A-OKVQA (robustness)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - robustness
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: a_okvqa_chinese
+     display_name: A-OKVQA (chinese)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: Chinese
+
+   - name: a_okvqa_hindi
+     display_name: A-OKVQA (hindi)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: Hindi
+
+   - name: a_okvqa_spanish
+     display_name: A-OKVQA (spanish)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: Spanish
+
+   - name: a_okvqa_swahili
+     display_name: A-OKVQA (swahili)
+     description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: Swahili
+
+   - name: crossmodal_3600
+     display_name: Crossmodal 3600
+     description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: prometheus_vision
+       main_split: test
+     taxonomy:
+       task: multilingual captioning
+       what: Real-world images
+       who: Human experts
+       when: "2022"
+       language: 36 languages
+
+   - name: flickr30k
+     display_name: Flickr30k
+     description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: prometheus_vision
+       main_split: test
+     taxonomy:
+       task: image captioning
+       what: Flickr images
+       who: Human experts
+       when: "2014"
+       language: English
+
+   - name: gqa
+     display_name: GQA
+     description: Questions about real-world visual reasoning and compositional QA
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2019"
+       language: English
+
+   - name: hateful_memes
+     display_name: Hateful Memes
+     description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multimodal classification
+       what: images and text
+       who: annotators from Amazon Mechanical Turk
+       when: "2020"
+       language: English
+
+   - name: mm_safety_bench
+     display_name: MM-SafetyBench
+     description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
+     metric_groups:
+       - accuracy
+       - toxicity
+       - general_information
+     environment:
+       main_name: toxic_frac
+       main_split: test
+     taxonomy:
+       task: safety
+       what: safety images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: viz_wiz
+     display_name: VizWiz
+     description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+     metric_groups:
+       - accuracy
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Visually impaired people
+       when: "2018"
+       language: English
+
+   - name: vqa_base
+     display_name: VQAv2
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: English
+
+   - name: vqa_dialect
+     display_name: VQAv2 (AAE)
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - fairness
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: English
+
+   - name: vqa_robustness
+     display_name: VQAv2 (robustness)
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - robustness
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: English
+
+   - name: vqa_chinese
+     display_name: VQAv2 (chinese)
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: Chinese
+
+   - name: vqa_hindi
+     display_name: VQAv2 (hindi)
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: Hindi
+
+   - name: vqa_spanish
+     display_name: VQAv2 (spanish)
+     description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+     metric_groups:
+       - translate
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2017"
+       language: Spanish
+
+   - name: math_vista
+     display_name: MathVista
+     description: Evaluating Math Reasoning in Visual Contexts
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: Evaluating Math Reasoning in Visual Contexts
+       who: Human experts
+       when: "2024"
+       language: English
+
+   - name: mmmu
+     display_name: MMMU
+     description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: multimodal multiple-choice question answering
+       what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: unicorn
+     display_name: Unicorn
+     description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: short answer question answering
+       what: OOD images and sketch images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: bingo
+     display_name: Bingo
+     description: Open-ended questions about biased images
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: prometheus_vision
+       main_split: test
+     taxonomy:
+       task: short answer question answering
+       what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+       who: Human experts
+       when: "2023"
+       language: English, Chinese, Japanese, etc.
+
+   - name: pope
+     display_name: POPE
+     description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: short answer question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: seed_bench
+     display_name: Seed Bench
+     description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: mme
+     display_name: MME
+     description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: Real-world images
+       who: Human experts
+       when: "2023"
+       language: English
+
+   - name: vibe_eval
+     display_name: Vibe Eval
+     description: hard evaluation suite for measuring progress of multimodal language models
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: prometheus_vision
+       main_split: test
+     taxonomy:
+       task: short answer question answering
+       what: Knowledge intensive
+       who: Human experts
+       when: "2024"
+       language: English
+
+   - name: mementos
+     display_name: Mementos
+     description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: prometheus_vision
+       main_split: test
+     taxonomy:
+       task: short answer question answering
+       what: Image sequences of comics, dailylife and robotics
+       who: Human experts
+       when: "2024"
+       language: English
+
+   - name: pairs
+     display_name: PAIRS
+     description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: Bias
+       who: Human experts
+       when: "2024"
+       language: English
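
Because schema_vhelm.yaml ships inside the wheel (file 80 in the list above), the new VHELM configuration can be inspected directly from an installed crfm-helm 0.5.2. The following sketch is illustrative only and is not part of the diff; it assumes Python 3.9+ (for importlib.resources.files), that PyYAML is available, and that the file is packaged at helm/benchmark/static/schema_vhelm.yaml as the RECORD indicates.

import importlib.resources

import yaml  # PyYAML

# Locate the packaged schema inside the installed helm distribution.
schema_text = (
    importlib.resources.files("helm")
    .joinpath("benchmark/static/schema_vhelm.yaml")
    .read_text()
)
schema = yaml.safe_load(schema_text)

# Print each run group with its main metric and evaluation split
# (aggregate groups without an "environment" block print None).
for group in schema["run_groups"]:
    env = group.get("environment", {})
    print(group["name"], env.get("main_name"), env.get("main_split"))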