crfm-helm 0.5.1-py3-none-any.whl → 0.5.2-py3-none-any.whl

This diff reflects the content of publicly available package versions as released to their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (98)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
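To inspect any of the changes above in full rather than relying on the rendered diff, the two wheels can be compared locally. The following is a minimal sketch using only the Python standard library; it assumes both wheels have already been downloaded (for example with pip download crfm-helm==0.5.1 --no-deps and pip download crfm-helm==0.5.2 --no-deps), and the wheel filenames and schema paths are illustrative, taken from the file list above (entry 57).

# Minimal sketch: diff one file between the 0.5.1 and 0.5.2 wheels locally.
# Assumes both wheels are in the current directory; filenames are illustrative.
import difflib
import zipfile

OLD_WHEEL = "crfm_helm-0.5.1-py3-none-any.whl"
NEW_WHEEL = "crfm_helm-0.5.2-py3-none-any.whl"
# schema_vlm.yaml was renamed to schema_vhelm.yaml in 0.5.2 (entry 57 above),
# so the file is read from a different path in each wheel.
OLD_PATH = "helm/benchmark/static/schema_vlm.yaml"
NEW_PATH = "helm/benchmark/static/schema_vhelm.yaml"

def read_lines(wheel_path: str, member: str) -> list:
    """A wheel is a zip archive; read one member and split it into lines."""
    with zipfile.ZipFile(wheel_path) as wheel:
        return wheel.read(member).decode("utf-8").splitlines(keepends=True)

diff = difflib.unified_diff(
    read_lines(OLD_WHEEL, OLD_PATH),
    read_lines(NEW_WHEEL, NEW_PATH),
    fromfile=OLD_PATH,
    tofile=NEW_PATH,
)
print("".join(diff), end="")

Any other path from the list above can be substituted; note that renamed files (such as the dist-info directory) have different paths in the two wheels.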
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
@@ -220,49 +162,10 @@ metrics:
  display_name: CIDEr
  description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
  lower_is_better: false
-
- # Bias metrics:
- - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
- display_name: Stereotypical associations (race, profession)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
- display_name: Stereotypical associations (race, adjectives)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
- display_name: Stereotypical associations (gender, profession)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
- display_name: Stereotypical associations (gender, adjectives)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=representation,demographic_category=race
- display_name: Demographic representation (race)
- short_display_name: Representation (race)
- lower_is_better: true
- description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
- - name: bias_metric:mode=representation,demographic_category=gender
- display_name: Demographic representation (gender)
- short_display_name: Representation (gender)
- lower_is_better: true
- description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bbq_metric_ambiguous_bias
- display_name: BBQ (ambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
- - name: bbq_metric_unambiguous_bias
- display_name: BBQ (unambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+ - name: prometheus_vision
+ display_name: Prometheus Vision rating
+ description: Scores are from 1 to 5, where 5 is the best.
+ lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
@@ -281,6 +184,9 @@ metrics:

  ############################################################
  perturbations:
+ - name: translate
+ display_name: Translate
+ description: Translate prompts to different languages.
  - name: robustness
  display_name: Robustness
  description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
@@ -322,12 +228,6 @@ metric_groups:
  - name: ${main_name}
  split: ${main_split}

- - name: efficiency
- display_name: Efficiency
- metrics:
- - name: inference_runtime
- split: ${main_split}
-
  - name: general_information
  display_name: General information
  metrics:
@@ -348,50 +248,122 @@ metric_groups:
  - name: toxic_frac
  split: ${main_split}

- - name: generation_image
- display_name: Generation (image)
+ - name: fairness
+ display_name: Fairness
  metrics:
- - name: pixel_similarity
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
- - name: fid_similarity
+ - name: ${main_name}
  split: ${main_split}
- - name: earth_mover_similarity
+ perturbation_name: fairness
+
+ - name: robustness
+ display_name: Robustness
+ metrics:
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: robustness

- - name: generation_text
- display_name: Generation (text)
+ - name: translate
+ display_name: Translate
  metrics:
- - name: edit_similarity
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: translate
+

  ############################################################
  run_groups:
  - name: core_scenarios
- display_name: Core scenarios
- description: The scenarios where we evaluate all the models.
+ display_name: All
+ description: All scenarios across capabilities
  category: All scenarios
  subgroups:
- - hateful_memes
- - heim_human_eval
+ - visual_perception
+ - reasoning
+ - knowledge
+ - bias
+ - fairness
+ - toxicity
+ - robustness
+ - multilinguality
+ - name: visual_perception
+ display_name: Visual perception
+ description: Is the output semantically correct, given the text and image inputs?
+ category: Core scenarios
+ subgroups:
+ - vqa_base
  - viz_wiz
- - vqa
+ - flickr30k
+ - name: reasoning
+ display_name: Reasoning
+ description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - math_vista
+ - seed_bench
+ - name: real_world_reasoning
+ display_name: Real-world Reasoning
+ description: Reasoning in the real-world
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - seed_bench
+ - mementos
+ - name: knowledge
+ display_name: Knowledge
+ description: Does the model have knowledge about the world and common sense?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_base
  - mmmu
- - image2structure
+ - mme
+ - vibe_eval
+ - name: bias
+ display_name: Bias
+ description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+ category: Core scenarios
+ subgroups:
+ - pairs
+ - name: fairness
+ display_name: Fairness
+ description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+ category: Core scenarios
+ subgroups:
+ - vqa_dialect
+ - a_okvqa_dialect
+ - crossmodal_3600
+ - name: toxicity
+ display_name: Toxicity
+ description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+ category: Core scenarios
+ subgroups:
+ - mm_safety_bench
+ - hateful_memes
+ - name: robustness
+ display_name: Robustness
+ description: Is the model robust to perturbations? We focus on both text and image perturbations.
+ category: Core scenarios
+ subgroups:
+ - vqa_robustness
+ - a_okvqa_robustness
  - unicorn
  - bingo
- - multipanelvqa
  - pope
- - seed_bench
- - mme
+ - name: multilinguality
+ display_name: Multilinguality
+ description: Do the model support non-English languages?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_chinese
+ - a_okvqa_hindi
+ - a_okvqa_spanish
+ - a_okvqa_swahili

- - name: a_okvqa
+ - name: a_okvqa_base
  display_name: A-OKVQA
  description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -403,15 +375,110 @@ run_groups:
  when: "2023"
  language: English

+ - name: a_okvqa_dialect
+ display_name: A-OKVQA (AAE)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - fairness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_robustness
+ display_name: A-OKVQA (robustness)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - robustness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_chinese
+ display_name: A-OKVQA (chinese)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Chinese
+
+ - name: a_okvqa_hindi
+ display_name: A-OKVQA (hindi)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Hindi
+
+ - name: a_okvqa_spanish
+ display_name: A-OKVQA (spanish)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Spanish
+
+ - name: a_okvqa_swahili
+ display_name: A-OKVQA (swahili)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Swahili
+
  - name: crossmodal_3600
  display_name: Crossmodal 3600
  description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: multilingual captioning
@@ -425,10 +492,9 @@ run_groups:
  description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: image captioning
@@ -442,10 +508,9 @@ run_groups:
  description: Questions about real-world visual reasoning and compositional QA
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: short answer question answering
@@ -454,38 +519,11 @@ run_groups:
  when: "2019"
  language: English

- - name: heim_human_eval
- display_name: HEIM Human Eval Scenario
- description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: multiple-choice question answering
- what: AI-generated images
- who: Text-to-image models
- when: "2024"
- language: English
-
- - name: image2structure
- display_name: Image2Structure
- description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
- category: All scenarios
- subgroups:
- - image2latex
- - image2webpage
- - image2musicsheet
-
  - name: hateful_memes
  display_name: Hateful Memes
  description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -502,7 +540,6 @@ run_groups:
  description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
  metric_groups:
  - accuracy
- - efficiency
  - toxicity
  - general_information
  environment:
@@ -515,66 +552,61 @@ run_groups:
  when: "2023"
  language: English

- - name: mscoco_captioning
- display_name: MSCOCO (captioning)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: viz_wiz
+ display_name: VizWiz
+ description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
  metric_groups:
  - accuracy
- - efficiency
- - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
- who: Human experts
- when: "2014"
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Visually impaired people
+ when: "2018"
  language: English

- - name: mscoco_categorization
- display_name: MSCOCO (categorization)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: vqa_base
+ display_name: VQAv2
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
+ task: multimodal short answer question answering
+ what: Real-world images
  who: Human experts
- when: "2014"
+ when: "2017"
  language: English

- - name: viz_wiz
- display_name: VizWiz
- description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+ - name: vqa_dialect
+ display_name: VQAv2 (AAE)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
- - accuracy
- - efficiency
+ - fairness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: multimodal short answer question answering
  what: Real-world images
- who: Visually impaired people
- when: "2018"
+ who: Human experts
+ when: "2017"
  language: English

- - name: vqa
- display_name: VQAv2
+ - name: vqa_robustness
+ display_name: VQAv2 (robustness)
  description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
- - accuracy
- - efficiency
+ - robustness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: multimodal short answer question answering
@@ -583,12 +615,59 @@ run_groups:
  when: "2017"
  language: English

+ - name: vqa_chinese
+ display_name: VQAv2 (chinese)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Chinese
+
+ - name: vqa_hindi
+ display_name: VQAv2 (hindi)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Hindi
+
+ - name: vqa_spanish
+ display_name: VQAv2 (spanish)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Spanish
+
  - name: math_vista
  display_name: MathVista
  description: Evaluating Math Reasoning in Visual Contexts
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -605,7 +684,6 @@ run_groups:
  description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -638,8 +716,9 @@ run_groups:
  description: Open-ended questions about biased images
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: short answer question answering
@@ -647,30 +726,12 @@ run_groups:
  who: Human experts
  when: "2023"
  language: English, Chinese, Japanese, etc.
-
- - name: multipanelvqa
- display_name: MultipanelVQA
- description: Question about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: short answer or multiple-choice question answering
- what: Real-world or synthetic multipanel images
- who: Human experts
- when: "2024"
- language: English
-
+
  - name: pope
  display_name: POPE
  description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -684,11 +745,9 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
- including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -705,7 +764,6 @@ run_groups:
  description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -717,91 +775,35 @@ run_groups:
  when: "2023"
  language: English

- - name: mementos
- display_name: Mementos
- description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+ - name: vibe_eval
+ display_name: Vibe Eval
+ description: hard evaluation suite for measuring progress of multimodal language models
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: short answer question answering
- what: Image sequences of comics, dailylife and robotics
+ what: Knowledge intensive
  who: Human experts
  when: "2024"
  language: English

- - name: image2latex
- display_name: Image2LaTeX
- description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
- who: n/a
- when: "2024"
- language: English
-
- - name: image2webpage
- display_name: Image2webpage
- description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: css, html, javascript
- who: n/a
- when: "2024"
- language: English
-
- - name: image2musicsheet
- display_name: Image2musicsheet
- description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
- metric_groups:
- - accuracy
- - generation_image
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: n/a
- when: "2024"
- language: English
-
- - name: chart2csv
- display_name: Chart2CSV
- description: The Chart2CSV benchmark for converting images of charts to CSV.
+ - name: mementos
+ display_name: Mementos
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: chart to CSV
- what: plots
- who: n/a
+ task: short answer question answering
+ what: Image sequences of comics, dailylife and robotics
+ who: Human experts
  when: "2024"
  language: English

@@ -810,7 +812,6 @@ run_groups:
  description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match