crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic.

Files changed (209):
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
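A change that recurs throughout the list above is the rename of the image2structure packages to image2struct (entries 13-15, 77-83, 106, and 204-209). The following is a minimal, hedged sketch of how downstream code might absorb that rename; the class name LatexCompilerAnnotator is inferred from the file name and is not confirmed by this diff.

```python
# Hedged compatibility sketch for the image2structure -> image2struct rename.
# The module paths come from the file list above; the imported class name is an
# assumption based on the file name, not something this diff shows.
try:
    # crfm-helm >= 0.5.4: new "image2struct" package
    from helm.benchmark.annotation.image2struct.latex_compiler_annotator import (
        LatexCompilerAnnotator,
    )
except ModuleNotFoundError:
    # crfm-helm <= 0.5.2: old "image2structure" package
    from helm.benchmark.annotation.image2structure.latex_compiler_annotator import (
        LatexCompilerAnnotator,
    )
```

The hunks reproduced below come from helm/benchmark/static/schema_vhelm.yaml (entry 111 above) and show how the VHELM run groups were reorganized in this release.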
@@ -282,6 +282,7 @@ run_groups:
  - knowledge
  - bias
  - fairness
+ - safety
  - toxicity
  - robustness
  - multilinguality
@@ -293,6 +294,7 @@ run_groups:
  - vqa_base
  - viz_wiz
  - flickr30k
+ - pope
  - name: reasoning
  display_name: Reasoning
  description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
@@ -301,14 +303,8 @@ run_groups:
  - gqa
  - math_vista
  - seed_bench
- - name: real_world_reasoning
- display_name: Real-world Reasoning
- description: Reasoning in the real-world
- category: Core scenarios
- subgroups:
- - gqa
- - seed_bench
  - mementos
+ - real_world_qa
  - name: knowledge
  display_name: Knowledge
  description: Does the model have knowledge about the world and common sense?
@@ -332,13 +328,20 @@ run_groups:
  - vqa_dialect
  - a_okvqa_dialect
  - crossmodal_3600
+ - fair_face
+ - bingo_fairness
  - name: toxicity
  display_name: Toxicity
  description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
  category: Core scenarios
  subgroups:
- - mm_safety_bench
  - hateful_memes
+ - name: safety
+ display_name: Safety
+ description: Refusing to produce answers that cause harm to humans
+ category: Core scenarios
+ subgroups:
+ - mm_safety_bench
  - name: robustness
  display_name: Robustness
  description: Is the model robust to perturbations? We focus on both text and image perturbations.
@@ -348,7 +351,6 @@ run_groups:
  - a_okvqa_robustness
  - unicorn
  - bingo
- - pope
  - name: multilinguality
  display_name: Multilinguality
  description: Do the model support non-English languages?
@@ -358,10 +360,11 @@ run_groups:
  - a_okvqa_hindi
  - a_okvqa_spanish
  - a_okvqa_swahili
-
+ - exams_v
+ - bingo_multilinguality
  - name: a_okvqa_base
  display_name: A-OKVQA
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
  - general_information
@@ -377,7 +380,7 @@ run_groups:

  - name: a_okvqa_dialect
  display_name: A-OKVQA (AAE)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - fairness
  - general_information
@@ -393,7 +396,7 @@ run_groups:

  - name: a_okvqa_robustness
  display_name: A-OKVQA (robustness)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - robustness
  - general_information
@@ -409,7 +412,7 @@ run_groups:

  - name: a_okvqa_chinese
  display_name: A-OKVQA (chinese)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -425,7 +428,7 @@ run_groups:

  - name: a_okvqa_hindi
  display_name: A-OKVQA (hindi)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -441,7 +444,7 @@ run_groups:

  - name: a_okvqa_spanish
  display_name: A-OKVQA (spanish)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -457,7 +460,7 @@ run_groups:

  - name: a_okvqa_swahili
  display_name: A-OKVQA (swahili)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -473,7 +476,7 @@ run_groups:

  - name: crossmodal_3600
  display_name: Crossmodal 3600
- description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+ description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
  - general_information
@@ -489,7 +492,7 @@ run_groups:

  - name: flickr30k
  display_name: Flickr30k
- description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+ description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
  - general_information
@@ -505,7 +508,7 @@ run_groups:

  - name: gqa
  display_name: GQA
- description: Questions about real-world visual reasoning and compositional QA
+ description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
  metric_groups:
  - accuracy
  - general_information
@@ -513,7 +516,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2019"
@@ -521,7 +524,7 @@ run_groups:

  - name: hateful_memes
  display_name: Hateful Memes
- description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+ description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
  metric_groups:
  - accuracy
  - general_information
@@ -529,15 +532,15 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: multimodal classification
- what: images and text
- who: annotators from Amazon Mechanical Turk
+ task: toxicity identification
+ what: Memes
+ who: Human experts
  when: "2020"
  language: English

  - name: mm_safety_bench
  display_name: MM-SafetyBench
- description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
+ description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
  metric_groups:
  - accuracy
  - toxicity
@@ -546,22 +549,23 @@ run_groups:
  main_name: toxic_frac
  main_split: test
  taxonomy:
- task: safety
- what: safety images
+ task: toxicity mitigation
+ what: Jail-break images
  who: Human experts
  when: "2023"
  language: English

  - name: viz_wiz
  display_name: VizWiz
- description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+ description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Visually impaired people
  when: "2018"
@@ -569,7 +573,7 @@ run_groups:

  - name: vqa_base
  display_name: VQAv2
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - accuracy
  - general_information
@@ -577,7 +581,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -585,7 +589,7 @@ run_groups:

  - name: vqa_dialect
  display_name: VQAv2 (AAE)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - fairness
  - general_information
@@ -593,7 +597,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -601,7 +605,7 @@ run_groups:

  - name: vqa_robustness
  display_name: VQAv2 (robustness)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - robustness
  - general_information
@@ -609,63 +613,15 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
  language: English

- - name: vqa_chinese
- display_name: VQAv2 (chinese)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Chinese
-
- - name: vqa_hindi
- display_name: VQAv2 (hindi)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Hindi
-
- - name: vqa_spanish
- display_name: VQAv2 (spanish)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Spanish
-
  - name: math_vista
  display_name: MathVista
- description: Evaluating Math Reasoning in Visual Contexts
+ description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
  metric_groups:
  - accuracy
  - general_information
@@ -681,7 +637,7 @@ run_groups:

  - name: mmmu
  display_name: MMMU
- description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
  metric_groups:
  - accuracy
  - general_information
@@ -689,7 +645,7 @@ run_groups:
  main_name: exact_match
  main_split: valid
  taxonomy:
- task: multimodal multiple-choice question answering
+ task: multiple-choice question answering
  what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
  who: Human experts
  when: "2023"
@@ -697,7 +653,7 @@ run_groups:

  - name: unicorn
  display_name: Unicorn
- description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+ description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
  metric_groups:
  - accuracy
  - general_information
@@ -705,7 +661,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: OOD images and sketch images
  who: Human experts
  when: "2023"
@@ -713,7 +669,23 @@ run_groups:

  - name: bingo
  display_name: Bingo
- description: Open-ended questions about biased images
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: prometheus_vision
+ main_split: test
+ taxonomy:
+ task: short-answer question answering
+ what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+ who: Human experts
+ when: "2023"
+ language: English, Chinese, Japanese, etc.
+
+ - name: bingo_fairness
+ display_name: Bingo (fairness)
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
  metric_groups:
  - accuracy
  - general_information
@@ -721,7 +693,23 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
+ what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+ who: Human experts
+ when: "2023"
+ language: English, Chinese, Japanese, etc.
+
+ - name: bingo_multilinguality
+ display_name: Bingo (multilinguality)
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: prometheus_vision
+ main_split: test
+ taxonomy:
+ task: short-answer question answering
  what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
  who: Human experts
  when: "2023"
@@ -729,7 +717,7 @@ run_groups:

  - name: pope
  display_name: POPE
- description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+ description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
  metric_groups:
  - accuracy
  - general_information
@@ -737,7 +725,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2023"
@@ -745,7 +733,7 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
  metric_groups:
  - accuracy
  - general_information
@@ -761,7 +749,7 @@ run_groups:

  - name: mme
  display_name: MME
- description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+ description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
  metric_groups:
  - accuracy
  - general_information
@@ -777,7 +765,7 @@ run_groups:

  - name: vibe_eval
  display_name: Vibe Eval
- description: hard evaluation suite for measuring progress of multimodal language models
+ description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
  metric_groups:
  - accuracy
  - general_information
@@ -785,7 +773,7 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Knowledge intensive
  who: Human experts
  when: "2024"
@@ -793,7 +781,7 @@ run_groups:

  - name: mementos
  display_name: Mementos
- description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
  metric_groups:
  - accuracy
  - general_information
@@ -801,15 +789,15 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
- what: Image sequences of comics, dailylife and robotics
+ task: short-answer question answering
+ what: Image sequences of comics, daily life and robotics
  who: Human experts
  when: "2024"
  language: English

  - name: pairs
  display_name: PAIRS
- description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
+ description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
  metric_groups:
  - accuracy
  - general_information
@@ -822,3 +810,51 @@ run_groups:
  who: Human experts
  when: "2024"
  language: English
+
+ - name: fair_face
+ display_name: FairFace
+ description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Fairness
+ who: Human experts
+ when: "2019"
+ language: English
+
+ - name: real_world_qa
+ display_name: RealWorldQA
+ description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: short-answer question answering
+ what: Real world images
+ who: Human experts
+ when: "2024"
+ language: English
+
+ - name: exams_v
+ display_name: Exams-V
+ description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: Exam questions
+ who: Human experts
+ when: "2024"
+ language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish
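
Taken together, the schema hunks above add a top-level safety aspect that takes mm_safety_bench from toxicity, remove the real_world_reasoning group in favor of a real_world_qa scenario under reasoning, move pope out of robustness, and extend fairness and multilinguality with fair_face, bingo_fairness, exams_v, and bingo_multilinguality. Below is a minimal sketch of how the reorganized groups could be inspected; it assumes a local copy of the 0.5.4 schema file, and the field names (run_groups, name, subgroups) follow the diff above.

```python
# Minimal sketch: print the subgroups of the reorganized VHELM run groups.
# Assumes a local copy of helm/benchmark/static/schema_vhelm.yaml from 0.5.4;
# field names follow the diff above and are not otherwise verified here.
import yaml

with open("helm/benchmark/static/schema_vhelm.yaml") as f:
    schema = yaml.safe_load(f)

for group in schema["run_groups"]:
    if group.get("name") in {"fairness", "toxicity", "safety", "multilinguality", "reasoning"}:
        print(group["name"], "->", group.get("subgroups", []))

# Expected, per the diff: the new safety group takes mm_safety_bench, toxicity
# keeps hateful_memes, fairness gains fair_face and bingo_fairness, and
# multilinguality gains exams_v and bingo_multilinguality.
```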