crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_vlm.yaml
@@ -1,576 +0,0 @@
- ---
- ############################################################
- adapter:
-   - name: method
-     description: The high-level strategy for converting instances into a prompt for the language model.
-     values:
-       - name: generation
-         description: Given the input, the model generates the output free-form.
-       - name: multiple_choice_joint
-         description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-       - name: multiple_choice_separate_original
-         description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-       - name: multiple_choice_separate_calibrated
-         description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-       - name: language_modeling
-         description: Given the input, the model assigns the sequence a probability.
-   - name: instructions
-     description: The description of the task that is included at the very beginning of the prompt.
-   - name: global_prefix
-     description: The string that is prepended to the prompt.
-   - name: global_suffix
-     description: The string that is appended to the prompt.
-   - name: instance_prefix
-     description: The string that is included before each instance (e.g., '\n\n').
-   - name: input_prefix
-     description: The string that is included before each input (e.g., 'Question:').
-   - name: input_suffix
-     description: The string that is included after each input (e.g., '\n').
-   - name: reference_prefix
-     description: The string that is included before each reference (for multiple-choice questions).
-   - name: reference_suffix
-     description: The string that is included after each reference (for multiple-choice questions).
-   - name: output_prefix
-     description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-   - name: output_suffix
-     description: The string that is included after the correct answer/predicted output (e.g., '\n').
-   - name: substitutions
-     description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-   - name: max_train_instances
-     description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-   - name: max_eval_instances
-     description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-   - name: num_outputs
-     description: Maximum number of possible outputs to generate by sampling multiple outputs.
-   - name: num_train_trials
-     description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-   - name: sample_train
-     description: If true, randomly sample N training examples; if false, select N consecutive training examples.
-   - name: model
-     description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-   - name: model_deployment
-     description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-   - name: temperature
-     description: Temperature parameter used in generation.
-   - name: max_tokens
-     description: Maximum number of tokens to generate.
-   - name: stop_sequences
-     description: List of sequences, where we stop generation if we encounter any of them.
-   - name: random
-     description: Random seed (string), which guarantees reproducibility.
-   - name: multi_label
-     description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
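For illustration: the prefix and suffix fields above compose the prompt mechanically. Below is a minimal Python sketch of that composition, using the field names from the schema; the build_prompt helper and the example values are assumptions for illustration, not HELM's actual adapter code.

# Minimal sketch (not HELM's implementation): how the adapter's
# prefix/suffix fields above could compose a few-shot prompt.
def build_prompt(spec: dict, train_instances: list, eval_input: str) -> str:
    def render(instance: dict) -> str:
        block = spec["input_prefix"] + instance["input"] + spec["input_suffix"]
        if "output" in instance:  # in-context example with its answer
            return block + spec["output_prefix"] + instance["output"] + spec["output_suffix"]
        return block + spec["output_prefix"].rstrip()  # model completes from here

    parts = [spec["instructions"]] if spec.get("instructions") else []
    parts += [render(inst) for inst in train_instances]
    parts.append(render({"input": eval_input}))
    body = spec.get("instance_prefix", "\n\n").join(parts)
    return spec.get("global_prefix", "") + body + spec.get("global_suffix", "")

# Example values (assumed, for illustration only):
spec = {
    "instructions": "Answer the questions.",
    "instance_prefix": "\n\n",
    "input_prefix": "Question: ", "input_suffix": "\n",
    "output_prefix": "Answer: ", "output_suffix": "",
}
print(build_prompt(spec, [{"input": "1+1?", "output": "2"}], "2+2?"))
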
- ############################################################
- metrics:
-   # Infrastructure metrics:
-   - name: num_perplexity_tokens
-     display_name: '# tokens'
-     description: Average number of tokens in the predicted output (for language modeling, the input too).
-   - name: num_bytes
-     display_name: '# bytes'
-     description: Average number of bytes in the predicted output (for language modeling, the input too).
-
-   - name: num_references
-     display_name: '# ref'
-     description: Number of references.
-   - name: num_train_trials
-     display_name: '# trials'
-     description: Number of trials, where in each trial we choose an independent, random set of training instances.
-   - name: estimated_num_tokens_cost
-     display_name: 'cost'
-     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
-   - name: num_prompt_tokens
-     display_name: '# prompt tokens'
-     description: Number of tokens in the prompt.
-   - name: num_prompt_characters
-     display_name: '# prompt chars'
-     description: Number of characters in the prompt.
-   - name: num_completion_tokens
-     display_name: '# completion tokens'
-     description: Actual number of completion tokens (over all completions).
-   - name: num_output_tokens
-     display_name: '# output tokens'
-     description: Actual number of output tokens.
-   - name: max_num_output_tokens
-     display_name: 'Max output tokens'
-     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
-   - name: num_requests
-     display_name: '# requests'
-     description: Number of distinct API requests.
-   - name: num_instances
-     display_name: '# eval'
-     description: Number of evaluation instances.
-   - name: num_train_instances
-     display_name: '# train'
-     description: Number of training instances (e.g., in-context examples).
-   - name: prompt_truncated
-     display_name: truncated
-     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
-   - name: finish_reason_length
-     display_name: finish b/c length
-     description: Fraction of instances where the output was terminated because of the max tokens limit.
-   - name: finish_reason_stop
-     display_name: finish b/c stop
-     description: Fraction of instances where the output was terminated because of the stop sequences.
-   - name: finish_reason_endoftext
-     display_name: finish b/c endoftext
-     description: Fraction of instances where the output was terminated because the end of text token was generated.
-   - name: finish_reason_unknown
-     display_name: finish b/c unknown
-     description: Fraction of instances where the output was terminated for unknown reasons.
-   - name: num_completions
-     display_name: '# completions'
-     description: Number of completions.
-   - name: predicted_index
-     display_name: Predicted index
-     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
-
-   # Vision Language metrics [text]:
-   - name: edit_similarity
-     display_name: Edit similarity (Levenshtein)
-     short_display_name: Edit sim.
-     lower_is_better: false
-     description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
-
-   # Vision Language metrics [image]:
-   - name: earth_mover_similarity
-     display_name: Earth Mover Similarity
-     short_display_name: EMD-Sim
-     description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
-     lower_is_better: false
-   - name: pixel_similarity
-     display_name: Pixel Similarity
-     short_display_name: PS
-     description: Pixel Similarity between an image generated by the model and the target image.
-     lower_is_better: false
-   - name: sift_similarity
-     display_name: SIFT Similarity
-     short_display_name: SIFT
-     description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
-     lower_is_better: false
-   - name: compilation_success
-     display_name: Compilation success
-     description: Fraction of instances where the generated code compiles successfully.
-     lower_is_better: false
-   - name: lpips_similarity
-     display_name: LPIPS similarity
-     short_display_name: LPIPS
-     description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
-     lower_is_better: false
-   - name: fid_similarity
-     display_name: FID similarity
-     short_display_name: FID
-     description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
-     lower_is_better: false
-   - name: ssim_similarity
-     display_name: SSIM
-     short_display_name: SSIM
-     description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
-     lower_is_better: false
-
-   # Accuracy metrics:
-   - name: exact_match
-     display_name: Exact match
-     short_display_name: EM
-     description: Fraction of instances that the predicted output matches a correct reference exactly.
-     lower_is_better: false
-   - name: quasi_exact_match
-     display_name: Quasi-exact match
-     short_display_name: EM
-     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
-     lower_is_better: false
-   - name: prefix_exact_match
-     display_name: Prefix exact match
-     short_display_name: PEM
-     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
-     lower_is_better: false
-   - name: quasi_prefix_exact_match
-     # TODO: should call this prefix_quasi_exact_match
-     display_name: Prefix quasi-exact match
-     short_display_name: PEM
-     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
-     lower_is_better: false
-
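The edit_similarity metric above is fully specified by its description: 1 minus the Levenshtein distance, normalized by the length of the longer sequence. A minimal self-contained Python sketch of that formula (illustrative, not HELM's code):

# edit_similarity = 1 - levenshtein(a, b) / max(len(a), len(b))
def levenshtein(a: str, b: str) -> int:
    # Standard dynamic-programming edit distance, one row at a time.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[len(b)]

def edit_similarity(pred: str, ref: str) -> float:
    if not pred and not ref:
        return 1.0
    return 1.0 - levenshtein(pred, ref) / max(len(pred), len(ref))

assert edit_similarity("abc", "abc") == 1.0
assert edit_similarity("abcd", "abce") == 0.75  # one substitution in four characters
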
- ############################################################
- perturbations:
-   - name: robustness
-     display_name: Robustness
-     description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
-   - name: fairness
-     display_name: Fairness
-     description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
-   - name: typos
-     display_name: Typos
-     description: >
-       Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
-       performance between perturbed and unperturbed versions.
-   - name: synonym
-     display_name: Synonyms
-     description: >
-       Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
-       worst-case performance between perturbed and unperturbed versions.
-   - name: dialect
-     display_name: SAE -> AAE
-     short_display_name: Dialect
-     description: >
-       Deterministically substitutes SAE words in the input with AAE counterparts using the validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
-   - name: race
-     display_name: First names by race (White -> Black)
-     short_display_name: Race
-     description: >
-       Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
-   - name: gender
-     display_name: Pronouns by gender (Male -> Female)
-     short_display_name: Gender
-     description: >
-       Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
-       performance between perturbed and unperturbed versions.
-
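Every perturbation above reports "per-instance worst-case performance": for each instance, take the minimum score across its unperturbed and perturbed variants, then average over instances. A minimal sketch of that aggregation, with made-up scores:

# Minimal sketch of per-instance worst-case aggregation (illustrative only).
def worst_case_performance(scores_by_instance: dict) -> float:
    # Each value holds one instance's scores on its unperturbed and
    # perturbed variants; take the minimum per instance, then average.
    worst = [min(scores) for scores in scores_by_instance.values()]
    return sum(worst) / len(worst)

# Two instances, each scored on the original and a typo-perturbed prompt:
print(worst_case_performance({"q1": [1.0, 0.0], "q2": [1.0, 1.0]}))  # 0.5
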
- ############################################################
- metric_groups:
-   - name: accuracy
-     display_name: Accuracy
-     metrics:
-       - name: ${main_name}
-         split: ${main_split}
-
-   - name: efficiency
-     display_name: Efficiency
-     metrics:
-       - name: inference_runtime
-         split: ${main_split}
-
-   - name: general_information
-     display_name: General information
-     metrics:
-       - name: num_instances
-         split: ${main_split}
-       - name: num_train_instances
-         split: ${main_split}
-       - name: prompt_truncated
-         split: ${main_split}
-       - name: num_prompt_tokens
-         split: ${main_split}
-       - name: num_output_tokens
-         split: ${main_split}
-
-   - name: generation_image
-     display_name: Generation (image)
-     metrics:
-       - name: pixel_similarity
-         split: ${main_split}
-       - name: compilation_success
-         split: ${main_split}
-       - name: fid_similarity
-         split: ${main_split}
-       - name: earth_mover_similarity
-         split: ${main_split}
-
-   - name: generation_text
-     display_name: Generation (text)
-     metrics:
-       - name: edit_similarity
-         split: ${main_split}
-
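The ${main_name} and ${main_split} placeholders above are filled in per run group from its environment block (see the run_groups section below). The syntax matches Python's string.Template, so resolution can be sketched as follows; the exact mechanism HELM uses is not shown in this diff:

from string import Template

# A metric entry from the accuracy group above, plus the environment
# block of one run group (e.g. heim_human_eval below):
metric = {"name": "${main_name}", "split": "${main_split}"}
environment = {"main_name": "exact_match", "main_split": "test"}

resolved = {key: Template(value).substitute(environment)
            for key, value in metric.items()}
print(resolved)  # {'name': 'exact_match', 'split': 'test'}
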
- ############################################################
- run_groups:
-   - name: core_scenarios
-     display_name: Core scenarios
-     description: The scenarios where we evaluate all the models.
-     category: All scenarios
-     subgroups:
-       - hateful_memes
-       - heim_human_eval
-       - viz_wiz
-       - vqa
-       - mmmu
-       - image2structure
-       - unicorn
-       - bingo
-       - multipanelvqa
-       - pope
-       - seed_bench
-       - mme
-
-   - name: heim_human_eval
-     display_name: HEIM Human Eval Scenario
-     description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: multiple-choice question answering
-       what: AI-generated images
-       who: Text-to-image models
-       when: "2024"
-       language: English
-
-   - name: image2structure
-     display_name: Image2Structure
-     description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
-     category: All scenarios
-     subgroups:
-       - image2latex
-       - image2webpage
-       - image2musicsheet
-
-   - name: hateful_memes
-     display_name: Hateful Memes
-     description: The Hateful Memes benchmark for multimodal hate speech detection [(Kiela et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: multimodal classification
-       what: images and text
-       who: annotators from Amazon Mechanical Turk
-       when: "2020"
-       language: English
-
-   - name: viz_wiz
-     display_name: VizWiz
-     description: The VizWiz benchmark for visual question answering on images taken by blind people [(Gurari et al., 2018)](https://arxiv.org/pdf/1802.08218.pdf).
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: multimodal question answering
-       what: images and text
-       who: blind people
-       when: "2018"
-       language: English
-
-   - name: vqa
-     display_name: VQAv2
-     description: Open-ended questions about images
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: f1_score
-       main_split: valid
-     taxonomy:
-       task: short answer question answering
-       what: Real-world images
-       who: Human experts
-       when: "2017"
-       language: English
-
-   - name: mmmu
-     display_name: MMMU
-     description: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: valid
-     taxonomy:
-       task: multiple-choice question answering
-       what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
-       who: Human experts
-       when: "2023"
-       language: English
-
-   - name: unicorn
-     display_name: Unicorn
-     description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
-     metric_groups:
-       - accuracy
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: short answer question answering
-       what: OOD images and sketch images
-       who: Human experts
-       when: "2023"
-       language: English
-
-   - name: bingo
-     display_name: Bingo
-     description: Open-ended questions about biased images
-     metric_groups:
-       - accuracy
-     environment:
-       main_name: f1_score
-       main_split: test
-     taxonomy:
-       task: short answer question answering
-       what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
-       who: Human experts
-       when: "2023"
-       language: English, Chinese, Japanese, etc.
-
-   - name: multipanelvqa
-     display_name: MultipanelVQA
-     description: Questions about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: short answer or multiple-choice question answering
-       what: Real-world or synthetic multipanel images
-       who: Human experts
-       when: "2024"
-       language: English
-
-   - name: pope
-     display_name: POPE
-     description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: short answer question answering
-       what: Real-world images
-       who: Human experts
-       when: "2023"
-       language: English
-
-   - name: seed_bench
-     display_name: Seed Bench
-     description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
-       including the comprehension of both the image and video modality
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: multiple-choice question answering
-       what: Real-world images
-       who: Human experts
-       when: "2023"
-       language: English
-
-   - name: mme
-     display_name: MME
-     description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: multiple-choice question answering
-       what: Real-world images
-       who: Human experts
-       when: "2023"
-       language: English
-
-   - name: mementos
-     display_name: Mementos
-     description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
-     metric_groups:
-       - accuracy
-     environment:
-       main_name: f1_score
-       main_split: test
-     taxonomy:
-       task: short answer question answering
-       what: Image sequences of comics, daily life and robotics
-       who: Human experts
-       when: "2024"
-       language: English
-
-   - name: image2latex
-     display_name: Image2LaTeX
-     description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms and tikz to LaTeX.
-     metric_groups:
-       - accuracy
-       - generation_image
-       - generation_text
-       - efficiency
-       - general_information
-     environment:
-       main_name: earth_mover_similarity
-       main_split: valid
-     taxonomy:
-       task: image-to-text
-       what: mathematical equations, tables, algorithms, tikz
-       who: n/a
-       when: "2024"
-       language: English
-
-   - name: image2webpage
-     display_name: Image2webpage
-     description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
-     metric_groups:
-       - accuracy
-       - generation_image
-       - generation_text
-       - efficiency
-       - general_information
-     environment:
-       main_name: earth_mover_similarity
-       main_split: valid
-     taxonomy:
-       task: image-to-text
-       what: css, html, javascript
-       who: n/a
-       when: "2024"
-       language: English
-
-   - name: image2musicsheet
-     display_name: Image2musicsheet
-     description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
-     metric_groups:
-       - accuracy
-       - generation_image
-       - efficiency
-       - general_information
-     environment:
-       main_name: earth_mover_similarity
-       main_split: valid
-     taxonomy:
-       task: image-to-text
-       what: music sheets
-       who: n/a
-       when: "2024"
-       language: English
-
-   - name: chart2csv
-     display_name: Chart2CSV
-     description: The Chart2CSV benchmark for converting images of charts to CSV.
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: exact_match
-       main_split: test
-     taxonomy:
-       task: chart to CSV
-       what: plots
-       who: n/a
-       when: "2024"
-       language: English