crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (56) hide show
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,304 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: generation_multimodal
10
+ description: Given the multimodal input, the model generates the output free-form.
11
+ - name: multiple_choice_joint
12
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
13
+ - name: multiple_choice_separate_original
14
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
15
+ - name: multiple_choice_separate_calibrated
16
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
17
+ - name: language_modeling
18
+ description: Given the input, the model assigns the sequence a probability.
19
+ - name: instructions
20
+ description: The description of the task that is included at the very beginning of the prompt.
21
+ - name: global_prefix
22
+ description: The string that is prepended to the prompt.
23
+ - name: global_suffix
24
+ description: The string that is appended to the prompt.
25
+ - name: instance_prefix
26
+ description: The string that is included before each instance (e.g., '\n\n').
27
+ - name: input_prefix
28
+ description: The string that is included before each input (e.g., 'Question:').
29
+ - name: input_suffix
30
+ description: The string that is included after each input (e.g., '\n').
31
+ - name: reference_prefix
32
+ description: The string that is included before each reference (for multiple-choice questions).
33
+ - name: reference_suffix
34
+ description: The string that is included after each reference (for multiple-choice questions).
35
+ - name: output_prefix
36
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
37
+ - name: output_suffix
38
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
39
+ - name: substitutions
40
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
41
+ - name: max_train_instances
42
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
43
+ - name: max_eval_instances
44
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
45
+ - name: num_outputs
46
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
47
+ - name: num_train_trials
48
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
49
+ - name: sample_train
50
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
51
+ - name: model
52
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
53
+ - name: model_deployment
54
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
55
+ - name: temperature
56
+ description: Temperature parameter used in generation.
57
+ - name: max_tokens
58
+ description: Maximum number of tokens to generate.
59
+ - name: stop_sequences
60
+ description: List of sequences, where we stop generation if we encounter any of them.
61
+ - name: random
62
+ description: Random seed (string), which guarantees reproducibility.
63
+ - name: multi_label
64
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
65
+
66
+ ############################################################
67
+ metrics:
68
+ # Infrastructure metrics:
69
+ - name: num_perplexity_tokens
70
+ display_name: '# tokens'
71
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
72
+ - name: num_bytes
73
+ display_name: '# bytes'
74
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
75
+
76
+ - name: num_references
77
+ display_name: '# ref'
78
+ description: Number of references.
79
+ - name: num_train_trials
80
+ display_name: '# trials'
81
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
82
+ - name: estimated_num_tokens_cost
83
+ display_name: 'cost'
84
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
85
+ - name: num_prompt_tokens
86
+ display_name: '# prompt tokens'
87
+ description: Number of tokens in the prompt.
88
+ - name: num_prompt_characters
89
+ display_name: '# prompt chars'
90
+ description: Number of characters in the prompt.
91
+ - name: num_completion_tokens
92
+ display_name: '# completion tokens'
93
+ description: Actual number of completion tokens (over all completions).
94
+ - name: num_output_tokens
95
+ display_name: '# output tokens'
96
+ description: Actual number of output tokens.
97
+ - name: max_num_output_tokens
98
+ display_name: 'Max output tokens'
99
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
100
+ - name: num_requests
101
+ display_name: '# requests'
102
+ description: Number of distinct API requests.
103
+ - name: num_instances
104
+ display_name: '# eval'
105
+ description: Number of evaluation instances.
106
+ - name: num_train_instances
107
+ display_name: '# train'
108
+ description: Number of training instances (e.g., in-context examples).
109
+ - name: prompt_truncated
110
+ display_name: truncated
111
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
112
+ - name: finish_reason_length
113
+ display_name: finish b/c length
114
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
115
+ - name: finish_reason_stop
116
+ display_name: finish b/c stop
117
+ description: Fraction of instances where the output was terminated because of the stop sequences.
118
+ - name: finish_reason_endoftext
119
+ display_name: finish b/c endoftext
120
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
121
+ - name: finish_reason_unknown
122
+ display_name: finish b/c unknown
123
+ description: Fraction of instances where the output was terminated for unknown reasons.
124
+ - name: num_completions
125
+ display_name: '# completions'
126
+ description: Number of completions.
127
+ - name: predicted_index
128
+ display_name: Predicted index
129
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
130
+
131
+ # Vision Language metrics [text]:
132
+ - name: edit_similarity
133
+ display_name: Edit similarity (Levenshtein)
134
+ short_display_name: Edit sim.
135
+ lower_is_better: false
136
+ description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
137
+
138
+ # Vision Language metrics [image]:
139
+ - name: block_emd_similarity
140
+ display_name: Block Earth Mover Similarity
141
+ short_display_name: Block EMS
142
+ description: Block Earth Mover Similarity
143
+ lower_is_better: false
144
+ - name: block_emd_similarity_white
145
+ display_name: Block Earth Mover Similarity (white)
146
+ short_display_name: Block EMS (white)
147
+ description: Block Earth Mover Similarity (white)
148
+ lower_is_better: false
149
+ - name: block_emd_similarity_median_color
150
+ display_name: Block Earth Mover Similarity (median)
151
+ short_display_name: Block EMS (median)
152
+ description: Block Earth Mover Similarity (median)
153
+ lower_is_better: false
154
+ - name: pixel_similarity
155
+ display_name: Pixel Similarity
156
+ short_display_name: PS
157
+ description: Pixel Similarity between an image generated by the model and the target image.
158
+ lower_is_better: false
159
+ - name: sift_similarity
160
+ display_name: SIFT Similarity
161
+ short_display_name: SIFT
162
+ description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
163
+ lower_is_better: false
164
+ - name: compilation_success
165
+ display_name: Compilation success
166
+ description: Fraction of instances where the generated code compiles successfully.
167
+ lower_is_better: false
168
+ - name: lpips_similarity
169
+ display_name: LPIPS similarity
170
+ short_display_name: LPIPS
171
+ description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
172
+ lower_is_better: false
173
+ - name: fid_similarity
174
+ display_name: FID similarity
175
+ short_display_name: FID
176
+ description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
177
+ lower_is_better: false
178
+ - name: ssim_similarity
179
+ display_name: SSIM
180
+ short_display_name: SSIM
181
+ description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
182
+ lower_is_better: false
183
+
184
+ # Accuracy metrics:
185
+ - name: exact_match
186
+ display_name: Exact match
187
+ short_display_name: EM
188
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
189
+ lower_is_better: false
190
+ - name: quasi_exact_match
191
+ display_name: Quasi-exact match
192
+ short_display_name: EM
193
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
194
+ lower_is_better: false
195
+ - name: prefix_exact_match
196
+ display_name: Prefix exact match
197
+ short_display_name: PEM
198
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
199
+ lower_is_better: false
200
+ - name: quasi_prefix_exact_match
201
+ # TODO: should call this prefix_quasi_exact_match
202
+ display_name: Prefix quasi-exact match
203
+ short_display_name: PEM
204
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
205
+ lower_is_better: false
206
+
207
+ ############################################################
208
+ perturbations:
209
+ - name: robustness
210
+ display_name: Robustness
211
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
212
+
213
+ ############################################################
214
+ metric_groups:
215
+ - name: accuracy
216
+ display_name: Compilation Rate and Earth Mover Similarity
217
+ metrics:
218
+ - name: ${main_name}
219
+ split: ${main_split}
220
+ - name: compilation_success
221
+ split: ${main_split}
222
+
223
+ - name: generation_image
224
+ display_name: Generation (image)
225
+ metrics:
226
+ - name: pixel_similarity
227
+ split: ${main_split}
228
+ - name: compilation_success
229
+ split: ${main_split}
230
+ - name: fid_similarity
231
+ split: ${main_split}
232
+ - name: block_emd_similarity
233
+ split: ${main_split}
234
+ - name: block_emd_similarity_white
235
+ split: ${main_split}
236
+ - name: block_emd_similarity_median_color
237
+ split: ${main_split}
238
+
239
+ - name: generation_text
240
+ display_name: Generation (text)
241
+ metrics:
242
+ - name: edit_similarity
243
+ split: ${main_split}
244
+
245
+ ############################################################
246
+ run_groups:
247
+ - name: core_scenarios
248
+ display_name: Image2Structure
249
+ description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
250
+ category: All scenarios
251
+ subgroups:
252
+ - image2latex
253
+ - image2webpage
254
+ - image2musicsheet
255
+
256
+ - name: image2latex
257
+ display_name: Image2LaTeX
258
+ description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms, and tikz to LaTeX.
259
+ metric_groups:
260
+ - accuracy
261
+ - generation_image
262
+ - generation_text
263
+ environment:
264
+ main_name: block_emd_similarity
265
+ main_split: valid
266
+ taxonomy:
267
+ task: image-to-text
268
+ what: mathematical equations, tables, algorithms, tikz
269
+ who: n/a
270
+ when: "2024"
271
+ language: English
272
+
273
+ - name: image2webpage
274
+ display_name: Image2webpage
275
+ description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
276
+ metric_groups:
277
+ - accuracy
278
+ - generation_image
279
+ - generation_text
280
+ environment:
281
+ main_name: block_emd_similarity
282
+ main_split: valid
283
+ taxonomy:
284
+ task: image-to-text
285
+ what: css, html, javascript
286
+ who: n/a
287
+ when: "2024"
288
+ language: English
289
+
290
+ - name: image2musicsheet
291
+ display_name: Image2musicsheet
292
+ description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
293
+ metric_groups:
294
+ - accuracy
295
+ - generation_image
296
+ environment:
297
+ main_name: block_emd_similarity
298
+ main_split: valid
299
+ taxonomy:
300
+ task: image-to-text
301
+ what: music sheets
302
+ who: n/a
303
+ when: "2024"
304
+ language: English
@@ -0,0 +1,164 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: generation_multimodal
10
+ description: Given the multimodal input, the model generates the output free-form.
11
+ - name: multiple_choice_joint
12
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
13
+ - name: multiple_choice_joint_multimodal
14
+ description: Given the multimodal input, the model selects from multiple-choice options (A., B., C., D., E.).
15
+ - name: instructions
16
+ description: The description of the task that is included at the very beginning of the prompt.
17
+ - name: global_prefix
18
+ description: The string that is prepended to the prompt.
19
+ - name: global_suffix
20
+ description: The string that is appended to the prompt.
21
+ - name: instance_prefix
22
+ description: The string that is included before each instance (e.g., '\n\n').
23
+ - name: input_prefix
24
+ description: The string that is included before each input (e.g., 'Question:').
25
+ - name: input_suffix
26
+ description: The string that is included after each input (e.g., '\n').
27
+ - name: reference_prefix
28
+ description: The string that is included before each reference (for multiple-choice questions).
29
+ - name: reference_suffix
30
+ description: The string that is included after each reference (for multiple-choice questions).
31
+ - name: output_prefix
32
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
33
+ - name: output_suffix
34
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
35
+ - name: substitutions
36
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
37
+ - name: max_train_instances
38
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
39
+ - name: max_eval_instances
40
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
41
+ - name: num_outputs
42
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
43
+ - name: num_train_trials
44
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
45
+ - name: sample_train
46
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
47
+ - name: model
48
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
49
+ - name: model_deployment
50
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
51
+ - name: temperature
52
+ description: Temperature parameter used in generation.
53
+ - name: max_tokens
54
+ description: Maximum number of tokens to generate.
55
+ - name: stop_sequences
56
+ description: List of sequences, where we stop generation if we encounter any of them.
57
+ - name: random
58
+ description: Random seed (string), which guarantees reproducibility.
59
+ - name: multi_label
60
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
61
+
62
+ perturbations:
63
+ - name: robustness
64
+ display_name: Robustness
65
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
66
+
67
+ ############################################################
68
+ metrics:
69
+ # Accuracy metrics:
70
+ - name: exact_match
71
+ display_name: Exact match
72
+ short_display_name: EM
73
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
74
+ lower_is_better: false
75
+ - name: quasi_exact_match
76
+ display_name: Quasi-exact match
77
+ short_display_name: EM
78
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
79
+ lower_is_better: false
80
+ - name: prefix_exact_match
81
+ display_name: Prefix exact match
82
+ short_display_name: PEM
83
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
84
+ lower_is_better: false
85
+ - name: quasi_prefix_exact_match
86
+ # TODO: should call this prefix_quasi_exact_match
87
+ display_name: Prefix quasi-exact match
88
+ short_display_name: PEM
89
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
90
+ lower_is_better: false
91
+
92
+ - name: f1_score
93
+ display_name: F1
94
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
95
+ lower_is_better: false
96
+ - name: cider
97
+ display_name: CIDEr
98
+ description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
99
+ lower_is_better: false
100
+
101
+
102
+ ############################################################
103
+ metric_groups:
104
+ - name: accuracy
105
+ display_name: Accuracy
106
+ metrics:
107
+ - name: ${main_name}
108
+ split: ${main_split}
109
+
110
+ ############################################################
111
+ run_groups:
112
+ - name: core_scenarios
113
+ display_name: Core scenarios
114
+ description: The scenarios where we evaluate all the models.
115
+ category: All scenarios
116
+ subgroups:
117
+ - viz_wiz
118
+ - vqa
119
+ - mmmu
120
+
121
+ - name: viz_wiz
122
+ display_name: VizWiz
123
+ description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
124
+ metric_groups:
125
+ - accuracy
126
+ environment:
127
+ main_name: quasi_exact_match
128
+ main_split: valid
129
+ taxonomy:
130
+ task: multimodal short answer question answering
131
+ what: Real-world images
132
+ who: Visually impaired people
133
+ when: "2018"
134
+ language: English
135
+
136
+ - name: vqa
137
+ display_name: VQAv2
138
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
139
+ metric_groups:
140
+ - accuracy
141
+ environment:
142
+ main_name: quasi_exact_match
143
+ main_split: valid
144
+ taxonomy:
145
+ task: multimodal short answer question answering
146
+ what: Real-world images
147
+ who: Human experts
148
+ when: "2017"
149
+ language: English
150
+
151
+ - name: mmmu
152
+ display_name: MMMU
153
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
154
+ metric_groups:
155
+ - accuracy
156
+ environment:
157
+ main_name: exact_match
158
+ main_split: valid
159
+ taxonomy:
160
+ task: multimodal multiple-choice question answering
161
+ what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
162
+ who: Human experts
163
+ when: "2023"
164
+ language: English