crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
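Two changes in the file list above affect downstream users directly: the image2structure modules are renamed to image2struct (entries 13-15, 71-77, 100, and 179-184), and the VHELM schema gains several new run groups (the schema diff follows below). A minimal sketch of the import-path update implied by the rename; the class name LatexCompilerAnnotator is an assumption inferred from the file name and is not confirmed by this diff:

# Hypothetical import-path update after the image2structure -> image2struct rename.
# The class name below is inferred from latex_compiler_annotator.py and may differ.

# crfm-helm 0.5.2:
# from helm.benchmark.annotation.image2structure.latex_compiler_annotator import LatexCompilerAnnotator

# crfm-helm 0.5.3:
from helm.benchmark.annotation.image2struct.latex_compiler_annotator import LatexCompilerAnnotator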
@@ -309,6 +309,7 @@ run_groups:
  - gqa
  - seed_bench
  - mementos
+ - real_world_qa
  - name: knowledge
  display_name: Knowledge
  description: Does the model have knowledge about the world and common sense?
@@ -318,6 +319,7 @@ run_groups:
  - mmmu
  - mme
  - vibe_eval
+ - real_world_qa
  - name: bias
  display_name: Bias
  description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
@@ -332,6 +334,7 @@ run_groups:
  - vqa_dialect
  - a_okvqa_dialect
  - crossmodal_3600
+ - fair_face
  - name: toxicity
  display_name: Toxicity
  description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
@@ -358,10 +361,11 @@ run_groups:
  - a_okvqa_hindi
  - a_okvqa_spanish
  - a_okvqa_swahili
+ - exams_v

  - name: a_okvqa_base
  display_name: A-OKVQA
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
  - general_information
@@ -377,7 +381,7 @@ run_groups:

  - name: a_okvqa_dialect
  display_name: A-OKVQA (AAE)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - fairness
  - general_information
@@ -393,7 +397,7 @@ run_groups:

  - name: a_okvqa_robustness
  display_name: A-OKVQA (robustness)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - robustness
  - general_information
@@ -409,7 +413,7 @@ run_groups:

  - name: a_okvqa_chinese
  display_name: A-OKVQA (chinese)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -425,7 +429,7 @@ run_groups:

  - name: a_okvqa_hindi
  display_name: A-OKVQA (hindi)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -441,7 +445,7 @@ run_groups:

  - name: a_okvqa_spanish
  display_name: A-OKVQA (spanish)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -457,7 +461,7 @@ run_groups:

  - name: a_okvqa_swahili
  display_name: A-OKVQA (swahili)
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - translate
  - general_information
@@ -473,7 +477,7 @@ run_groups:

  - name: crossmodal_3600
  display_name: Crossmodal 3600
- description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+ description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
  - general_information
@@ -489,7 +493,7 @@ run_groups:

  - name: flickr30k
  display_name: Flickr30k
- description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+ description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
  - general_information
@@ -505,7 +509,7 @@ run_groups:

  - name: gqa
  display_name: GQA
- description: Questions about real-world visual reasoning and compositional QA
+ description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
  metric_groups:
  - accuracy
  - general_information
@@ -513,7 +517,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2019"
@@ -521,7 +525,7 @@ run_groups:

  - name: hateful_memes
  display_name: Hateful Memes
- description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+ description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
  metric_groups:
  - accuracy
  - general_information
@@ -529,15 +533,15 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: multimodal classification
- what: images and text
- who: annotators from Amazon Mechanical Turk
+ task: toxicity identification
+ what: Memes
+ who: Human experts
  when: "2020"
  language: English

  - name: mm_safety_bench
  display_name: MM-SafetyBench
- description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
+ description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
  metric_groups:
  - accuracy
  - toxicity
@@ -546,22 +550,23 @@ run_groups:
  main_name: toxic_frac
  main_split: test
  taxonomy:
- task: safety
- what: safety images
+ task: toxicity mitigation
+ what: Jail-break images
  who: Human experts
  when: "2023"
  language: English

  - name: viz_wiz
  display_name: VizWiz
- description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+ description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Visually impaired people
  when: "2018"
@@ -569,7 +574,7 @@ run_groups:

  - name: vqa_base
  display_name: VQAv2
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - accuracy
  - general_information
@@ -577,7 +582,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -585,7 +590,7 @@ run_groups:

  - name: vqa_dialect
  display_name: VQAv2 (AAE)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - fairness
  - general_information
@@ -593,7 +598,7 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -601,7 +606,7 @@ run_groups:

  - name: vqa_robustness
  display_name: VQAv2 (robustness)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - robustness
  - general_information
@@ -609,63 +614,15 @@ run_groups:
  main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
  language: English

- - name: vqa_chinese
- display_name: VQAv2 (chinese)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Chinese
-
- - name: vqa_hindi
- display_name: VQAv2 (hindi)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Hindi
-
- - name: vqa_spanish
- display_name: VQAv2 (spanish)
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
- metric_groups:
- - translate
- - general_information
- environment:
- main_name: quasi_exact_match
- main_split: valid
- taxonomy:
- task: multimodal short answer question answering
- what: Real-world images
- who: Human experts
- when: "2017"
- language: Spanish
-
  - name: math_vista
  display_name: MathVista
- description: Evaluating Math Reasoning in Visual Contexts
+ description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
  metric_groups:
  - accuracy
  - general_information
@@ -681,7 +638,7 @@ run_groups:

  - name: mmmu
  display_name: MMMU
- description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
  metric_groups:
  - accuracy
  - general_information
@@ -689,7 +646,7 @@ run_groups:
  main_name: exact_match
  main_split: valid
  taxonomy:
- task: multimodal multiple-choice question answering
+ task: multiple-choice question answering
  what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
  who: Human experts
  when: "2023"
@@ -697,7 +654,7 @@ run_groups:

  - name: unicorn
  display_name: Unicorn
- description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+ description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
  metric_groups:
  - accuracy
  - general_information
@@ -705,7 +662,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: OOD images and sketch images
  who: Human experts
  when: "2023"
@@ -713,7 +670,7 @@ run_groups:

  - name: bingo
  display_name: Bingo
- description: Open-ended questions about biased images
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
  metric_groups:
  - accuracy
  - general_information
@@ -721,7 +678,7 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
  who: Human experts
  when: "2023"
@@ -729,7 +686,7 @@ run_groups:

  - name: pope
  display_name: POPE
- description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+ description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
  metric_groups:
  - accuracy
  - general_information
@@ -737,7 +694,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2023"
@@ -745,7 +702,7 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
  metric_groups:
  - accuracy
  - general_information
@@ -761,7 +718,7 @@ run_groups:

  - name: mme
  display_name: MME
- description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+ description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
  metric_groups:
  - accuracy
  - general_information
@@ -777,7 +734,7 @@ run_groups:

  - name: vibe_eval
  display_name: Vibe Eval
- description: hard evaluation suite for measuring progress of multimodal language models
+ description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
  metric_groups:
  - accuracy
  - general_information
@@ -785,7 +742,7 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Knowledge intensive
  who: Human experts
  when: "2024"
@@ -793,7 +750,7 @@ run_groups:

  - name: mementos
  display_name: Mementos
- description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
  metric_groups:
  - accuracy
  - general_information
@@ -801,15 +758,15 @@ run_groups:
  main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
- what: Image sequences of comics, dailylife and robotics
+ task: short-answer question answering
+ what: Image sequences of comics, daily life and robotics
  who: Human experts
  when: "2024"
  language: English

  - name: pairs
  display_name: PAIRS
- description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
+ description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
  metric_groups:
  - accuracy
  - general_information
@@ -822,3 +779,51 @@ run_groups:
  who: Human experts
  when: "2024"
  language: English
+
+ - name: fair_face
+ display_name: FairFace
+ description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Fairness
+ who: Human experts
+ when: "2019"
+ language: English
+
+ - name: real_world_qa
+ display_name: RealWorldQA
+ description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: short-answer question answering
+ what: Real world images
+ who: Human experts
+ when: "2024"
+ language: English
+
+ - name: exams_v
+ display_name: Exams-V
+ description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
+ metric_groups:
+ - accuracy
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: Exam questions
+ who: Human experts
+ when: "2024"
+ language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish
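The hunks above, apparently from helm/benchmark/static/schema_vhelm.yaml (entry 104 in the file list), mostly add citations to run-group descriptions, drop the translated VQAv2 variants, and register the new fair_face, real_world_qa, and exams_v run groups. A minimal sketch of how one might list the run groups bundled with an installed 0.5.3 wheel, assuming PyYAML is available and that helm.benchmark is a regular package whose static directory keeps the path shown above:

import os

import yaml  # assumption: PyYAML is installed alongside crfm-helm

import helm.benchmark

# Locate the VHELM schema shipped inside the installed package.
schema_path = os.path.join(
    os.path.dirname(helm.benchmark.__file__), "static", "schema_vhelm.yaml"
)
with open(schema_path) as f:
    schema = yaml.safe_load(f)

# Print each run group's name and display name, e.g. "fair_face FairFace".
for group in schema.get("run_groups", []):
    print(group["name"], group.get("display_name", ""))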