crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
@@ -220,49 +162,10 @@ metrics:
  display_name: CIDEr
  description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
  lower_is_better: false
-
- # Bias metrics:
- - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
- display_name: Stereotypical associations (race, profession)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
- display_name: Stereotypical associations (race, adjectives)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
- display_name: Stereotypical associations (gender, profession)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
- display_name: Stereotypical associations (gender, adjectives)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=representation,demographic_category=race
- display_name: Demographic representation (race)
- short_display_name: Representation (race)
- lower_is_better: true
- description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
- - name: bias_metric:mode=representation,demographic_category=gender
- display_name: Demographic representation (gender)
- short_display_name: Representation (gender)
- lower_is_better: true
- description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bbq_metric_ambiguous_bias
- display_name: BBQ (ambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
- - name: bbq_metric_unambiguous_bias
- display_name: BBQ (unambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+ - name: prometheus_vision
+ display_name: Prometheus Vision rating
+ description: Scores are from 1 to 5, where 5 is the best.
+ lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
@@ -281,6 +184,9 @@ metrics:

  ############################################################
  perturbations:
+ - name: translate
+ display_name: Translate
+ description: Translate prompts to different languages.
  - name: robustness
  display_name: Robustness
  description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
@@ -322,12 +228,6 @@ metric_groups:
  - name: ${main_name}
  split: ${main_split}

- - name: efficiency
- display_name: Efficiency
- metrics:
- - name: inference_runtime
- split: ${main_split}
-
  - name: general_information
  display_name: General information
  metrics:
@@ -348,50 +248,126 @@ metric_groups:
  - name: toxic_frac
  split: ${main_split}

- - name: generation_image
- display_name: Generation (image)
+ - name: fairness
+ display_name: Fairness
  metrics:
- - name: pixel_similarity
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
- - name: fid_similarity
+ - name: ${main_name}
  split: ${main_split}
- - name: earth_mover_similarity
+ perturbation_name: fairness
+
+ - name: robustness
+ display_name: Robustness
+ metrics:
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: robustness

- - name: generation_text
- display_name: Generation (text)
+ - name: translate
+ display_name: Translate
  metrics:
- - name: edit_similarity
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: translate
+

  ############################################################
  run_groups:
  - name: core_scenarios
- display_name: Core scenarios
- description: The scenarios where we evaluate all the models.
+ display_name: All
+ description: All scenarios across capabilities
  category: All scenarios
  subgroups:
- - hateful_memes
- - heim_human_eval
+ - visual_perception
+ - reasoning
+ - knowledge
+ - bias
+ - fairness
+ - toxicity
+ - robustness
+ - multilinguality
+ - name: visual_perception
+ display_name: Visual perception
+ description: Is the output semantically correct, given the text and image inputs?
+ category: Core scenarios
+ subgroups:
+ - vqa_base
  - viz_wiz
- - vqa
+ - flickr30k
+ - name: reasoning
+ display_name: Reasoning
+ description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - math_vista
+ - seed_bench
+ - name: real_world_reasoning
+ display_name: Real-world Reasoning
+ description: Reasoning in the real-world
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - seed_bench
+ - mementos
+ - real_world_qa
+ - name: knowledge
+ display_name: Knowledge
+ description: Does the model have knowledge about the world and common sense?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_base
  - mmmu
- - image2structure
+ - mme
+ - vibe_eval
+ - real_world_qa
+ - name: bias
+ display_name: Bias
+ description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+ category: Core scenarios
+ subgroups:
+ - pairs
+ - name: fairness
+ display_name: Fairness
+ description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+ category: Core scenarios
+ subgroups:
+ - vqa_dialect
+ - a_okvqa_dialect
+ - crossmodal_3600
+ - fair_face
+ - name: toxicity
+ display_name: Toxicity
+ description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+ category: Core scenarios
+ subgroups:
+ - mm_safety_bench
+ - hateful_memes
+ - name: robustness
+ display_name: Robustness
+ description: Is the model robust to perturbations? We focus on both text and image perturbations.
+ category: Core scenarios
+ subgroups:
+ - vqa_robustness
+ - a_okvqa_robustness
  - unicorn
  - bingo
- - multipanelvqa
  - pope
- - seed_bench
- - mme
+ - name: multilinguality
+ display_name: Multilinguality
+ description: Do the model support non-English languages?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_chinese
+ - a_okvqa_hindi
+ - a_okvqa_spanish
+ - a_okvqa_swahili
+ - exams_v

- - name: a_okvqa
+ - name: a_okvqa_base
  display_name: A-OKVQA
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -403,15 +379,110 @@ run_groups:
  when: "2023"
  language: English

+ - name: a_okvqa_dialect
+ display_name: A-OKVQA (AAE)
+ description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - fairness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_robustness
+ display_name: A-OKVQA (robustness)
+ description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - robustness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_chinese
+ display_name: A-OKVQA (chinese)
+ description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Chinese
+
+ - name: a_okvqa_hindi
+ display_name: A-OKVQA (hindi)
+ description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Hindi
+
+ - name: a_okvqa_spanish
+ display_name: A-OKVQA (spanish)
+ description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Spanish
+
+ - name: a_okvqa_swahili
+ display_name: A-OKVQA (swahili)
+ description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Swahili
+
  - name: crossmodal_3600
  display_name: Crossmodal 3600
- description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+ description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: multilingual captioning
@@ -422,13 +493,12 @@ run_groups:

  - name: flickr30k
  display_name: Flickr30k
- description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+ description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: image captioning
@@ -439,145 +509,112 @@ run_groups:

  - name: gqa
  display_name: GQA
- description: Questions about real-world visual reasoning and compositional QA
+ description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2019"
  language: English

- - name: heim_human_eval
- display_name: HEIM Human Eval Scenario
- description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: multiple-choice question answering
- what: AI-generated images
- who: Text-to-image models
- when: "2024"
- language: English
-
- - name: image2structure
- display_name: Image2Structure
- description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
- category: All scenarios
- subgroups:
- - image2latex
- - image2webpage
- - image2musicsheet
-
  - name: hateful_memes
  display_name: Hateful Memes
- description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+ description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: multimodal classification
- what: images and text
- who: annotators from Amazon Mechanical Turk
+ task: toxicity identification
+ what: Memes
+ who: Human experts
  when: "2020"
  language: English

  - name: mm_safety_bench
  display_name: MM-SafetyBench
- description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
+ description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
  metric_groups:
  - accuracy
- - efficiency
  - toxicity
  - general_information
  environment:
  main_name: toxic_frac
  main_split: test
  taxonomy:
- task: safety
- what: safety images
+ task: toxicity mitigation
+ what: Jail-break images
  who: Human experts
  when: "2023"
  language: English

- - name: mscoco_captioning
- display_name: MSCOCO (captioning)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: viz_wiz
+ display_name: VizWiz
+ description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
- who: Human experts
- when: "2014"
+ task: short-answer question answering
+ what: Real-world images
+ who: Visually impaired people
+ when: "2018"
  language: English

- - name: mscoco_categorization
- display_name: MSCOCO (categorization)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: vqa_base
+ display_name: VQAv2
+ description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
+ task: short-answer question answering
+ what: Real-world images
  who: Human experts
- when: "2014"
+ when: "2017"
  language: English

- - name: viz_wiz
- display_name: VizWiz
- description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+ - name: vqa_dialect
+ display_name: VQAv2 (AAE)
+ description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
- - accuracy
- - efficiency
+ - fairness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
- who: Visually impaired people
- when: "2018"
+ who: Human experts
+ when: "2017"
  language: English

- - name: vqa
- display_name: VQAv2
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ - name: vqa_robustness
+ display_name: VQAv2 (robustness)
+ description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
- - accuracy
- - efficiency
+ - robustness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -585,10 +622,9 @@ run_groups:

  - name: math_vista
  display_name: MathVista
- description: Evaluating Math Reasoning in Visual Contexts
+ description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -602,16 +638,15 @@ run_groups:

  - name: mmmu
  display_name: MMMU
- description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: valid
  taxonomy:
- task: multimodal multiple-choice question answering
+ task: multiple-choice question answering
  what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
  who: Human experts
  when: "2023"
@@ -619,7 +654,7 @@ run_groups:

  - name: unicorn
  display_name: Unicorn
- description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+ description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
  metric_groups:
  - accuracy
  - general_information
@@ -627,7 +662,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: OOD images and sketch images
  who: Human experts
  when: "2023"
@@ -635,48 +670,31 @@ run_groups:

  - name: bingo
  display_name: Bingo
- description: Open-ended questions about biased images
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
  who: Human experts
  when: "2023"
  language: English, Chinese, Japanese, etc.
-
- - name: multipanelvqa
- display_name: MultipanelVQA
- description: Question about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: short answer or multiple-choice question answering
- what: Real-world or synthetic multipanel images
- who: Human experts
- when: "2024"
- language: English
-
+
  - name: pope
  display_name: POPE
- description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+ description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2023"
@@ -684,11 +702,9 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
- including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -702,10 +718,9 @@ run_groups:

  - name: mme
  display_name: MME
- description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+ description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -717,107 +732,98 @@ run_groups:
  when: "2023"
  language: English

- - name: mementos
- display_name: Mementos
- description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+ - name: vibe_eval
+ display_name: Vibe Eval
+ description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
- what: Image sequences of comics, dailylife and robotics
+ task: short-answer question answering
+ what: Knowledge intensive
  who: Human experts
  when: "2024"
  language: English

- - name: image2latex
- display_name: Image2LaTeX
- description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
+ - name: mementos
+ display_name: Mementos
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
  metric_groups:
  - accuracy
- - generation_image
- - generation_text
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
- main_split: valid
+ main_name: prometheus_vision
+ main_split: test
  taxonomy:
- task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
- who: n/a
+ task: short-answer question answering
+ what: Image sequences of comics, daily life and robotics
+ who: Human experts
  when: "2024"
  language: English

- - name: image2webpage
- display_name: Image2webpage
- description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
+ - name: pairs
+ display_name: PAIRS
+ description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
  metric_groups:
  - accuracy
- - generation_image
- - generation_text
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
- main_split: valid
+ main_name: exact_match
+ main_split: test
  taxonomy:
- task: image-to-text
- what: css, html, javascript
- who: n/a
+ task: multiple-choice question answering
+ what: Bias
+ who: Human experts
  when: "2024"
  language: English

- - name: image2musicsheet
- display_name: Image2musicsheet
- description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
+ - name: fair_face
+ display_name: FairFace
+ description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
  metric_groups:
  - accuracy
- - generation_image
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
+ main_name: exact_match
  main_split: valid
  taxonomy:
- task: image-to-text
- what: music sheets
- who: n/a
- when: "2024"
+ task: multiple-choice question answering
+ what: Fairness
+ who: Human experts
+ when: "2019"
  language: English

- - name: chart2csv
- display_name: Chart2CSV
- description: The Chart2CSV benchmark for converting images of charts to CSV.
+ - name: real_world_qa
+ display_name: RealWorldQA
+ description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: chart to CSV
- what: plots
- who: n/a
+ task: short-answer question answering
+ what: Real world images
+ who: Human experts
  when: "2024"
  language: English

- - name: pairs
- display_name: PAIRS
- description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
+ - name: exams_v
+ display_name: Exams-V
+ description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
  task: multiple-choice question answering
- what: Bias
+ what: Exam questions
  who: Human experts
  when: "2024"
- language: English
+ language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish