crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/schema_vhelm.yaml:

@@ -309,6 +309,7 @@ run_groups:
       - gqa
       - seed_bench
       - mementos
+      - real_world_qa
   - name: knowledge
     display_name: Knowledge
     description: Does the model have knowledge about the world and common sense?
@@ -318,6 +319,7 @@ run_groups:
       - mmmu
       - mme
       - vibe_eval
+      - real_world_qa
   - name: bias
     display_name: Bias
     description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
@@ -332,6 +334,7 @@ run_groups:
       - vqa_dialect
       - a_okvqa_dialect
       - crossmodal_3600
+      - fair_face
   - name: toxicity
     display_name: Toxicity
     description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
@@ -358,10 +361,11 @@ run_groups:
       - a_okvqa_hindi
       - a_okvqa_spanish
       - a_okvqa_swahili
+      - exams_v

   - name: a_okvqa_base
     display_name: A-OKVQA
-    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([
+    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - accuracy
       - general_information
@@ -377,7 +381,7 @@ run_groups:

   - name: a_okvqa_dialect
     display_name: A-OKVQA (AAE)
-    description:
+    description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - fairness
       - general_information
@@ -393,7 +397,7 @@ run_groups:

   - name: a_okvqa_robustness
     display_name: A-OKVQA (robustness)
-    description:
+    description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - robustness
       - general_information
@@ -409,7 +413,7 @@ run_groups:

   - name: a_okvqa_chinese
     display_name: A-OKVQA (chinese)
-    description:
+    description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -425,7 +429,7 @@ run_groups:

   - name: a_okvqa_hindi
     display_name: A-OKVQA (hindi)
-    description:
+    description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -441,7 +445,7 @@ run_groups:

   - name: a_okvqa_spanish
     display_name: A-OKVQA (spanish)
-    description:
+    description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -457,7 +461,7 @@ run_groups:

   - name: a_okvqa_swahili
     display_name: A-OKVQA (swahili)
-    description:
+    description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -473,7 +477,7 @@ run_groups:

   - name: crossmodal_3600
     display_name: Crossmodal 3600
-    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([
+    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
     metric_groups:
       - accuracy
       - general_information
@@ -489,7 +493,7 @@ run_groups:

   - name: flickr30k
     display_name: Flickr30k
-    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([
+    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
     metric_groups:
       - accuracy
       - general_information
@@ -505,7 +509,7 @@ run_groups:

   - name: gqa
     display_name: GQA
-    description: Questions about real-world visual reasoning and compositional QA
+    description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
     metric_groups:
       - accuracy
       - general_information
@@ -513,7 +517,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2019"
@@ -521,7 +525,7 @@ run_groups:

   - name: hateful_memes
     display_name: Hateful Memes
-    description:
+    description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
     metric_groups:
       - accuracy
       - general_information
@@ -529,15 +533,15 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: toxicity identification
+      what: Memes
+      who: Human experts
       when: "2020"
       language: English

   - name: mm_safety_bench
     display_name: MM-SafetyBench
-    description:
+    description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
     metric_groups:
       - accuracy
       - toxicity
@@ -546,22 +550,23 @@ run_groups:
       main_name: toxic_frac
       main_split: test
     taxonomy:
-      task:
-      what:
+      task: toxicity mitigation
+      what: Jail-break images
       who: Human experts
       when: "2023"
       language: English

   - name: viz_wiz
     display_name: VizWiz
-    description: A benchmark for visual question answering with images and questions created by visually impaired people [
+    description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
     metric_groups:
       - accuracy
+      - general_information
     environment:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Visually impaired people
       when: "2018"
@@ -569,7 +574,7 @@ run_groups:

   - name: vqa_base
     display_name: VQAv2
-    description: Open-ended questions about real-world images [
+    description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - accuracy
       - general_information
@@ -577,7 +582,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
@@ -585,7 +590,7 @@ run_groups:

   - name: vqa_dialect
     display_name: VQAv2 (AAE)
-    description: Open-ended questions about real-world images [
+    description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - fairness
       - general_information
@@ -593,7 +598,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
@@ -601,7 +606,7 @@ run_groups:

   - name: vqa_robustness
     display_name: VQAv2 (robustness)
-    description: Open-ended questions about real-world images [
+    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - robustness
       - general_information
@@ -609,63 +614,15 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
       language: English

-  - name: vqa_chinese
-    display_name: VQAv2 (chinese)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Chinese
-
-  - name: vqa_hindi
-    display_name: VQAv2 (hindi)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Hindi
-
-  - name: vqa_spanish
-    display_name: VQAv2 (spanish)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Spanish
-
   - name: math_vista
     display_name: MathVista
-    description:
+    description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
     metric_groups:
       - accuracy
       - general_information
@@ -681,7 +638,7 @@ run_groups:

   - name: mmmu
     display_name: MMMU
-    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [
+    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
     metric_groups:
       - accuracy
       - general_information
@@ -689,7 +646,7 @@ run_groups:
       main_name: exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: multiple-choice question answering
       what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
       who: Human experts
       when: "2023"
@@ -697,7 +654,7 @@ run_groups:

   - name: unicorn
     display_name: Unicorn
-    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
     metric_groups:
       - accuracy
       - general_information
@@ -705,7 +662,7 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: OOD images and sketch images
       who: Human experts
       when: "2023"
@@ -713,7 +670,7 @@ run_groups:

   - name: bingo
     display_name: Bingo
-    description: Open-ended questions about biased images
+    description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
     metric_groups:
       - accuracy
       - general_information
@@ -721,7 +678,7 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
       who: Human experts
       when: "2023"
@@ -729,7 +686,7 @@ run_groups:

   - name: pope
     display_name: POPE
-    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
     metric_groups:
       - accuracy
       - general_information
@@ -737,7 +694,7 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2023"
@@ -745,7 +702,7 @@ run_groups:

   - name: seed_bench
     display_name: Seed Bench
-    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
+    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
     metric_groups:
       - accuracy
       - general_information
@@ -761,7 +718,7 @@ run_groups:

   - name: mme
     display_name: MME
-    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
     metric_groups:
       - accuracy
       - general_information
@@ -777,7 +734,7 @@ run_groups:

   - name: vibe_eval
     display_name: Vibe Eval
-    description:
+    description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
     metric_groups:
       - accuracy
       - general_information
@@ -785,7 +742,7 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Knowledge intensive
       who: Human experts
       when: "2024"
@@ -793,7 +750,7 @@ run_groups:

   - name: mementos
     display_name: Mementos
-    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
     metric_groups:
       - accuracy
       - general_information
@@ -801,15 +758,15 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
-      what: Image sequences of comics,
+      task: short-answer question answering
+      what: Image sequences of comics, daily life and robotics
       who: Human experts
       when: "2024"
       language: English

   - name: pairs
     display_name: PAIRS
-    description: Examining
+    description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
     metric_groups:
       - accuracy
       - general_information
@@ -822,3 +779,51 @@ run_groups:
       who: Human experts
       when: "2024"
       language: English
+
+  - name: fair_face
+    display_name: FairFace
+    description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Fairness
+      who: Human experts
+      when: "2019"
+      language: English
+
+  - name: real_world_qa
+    display_name: RealWorldQA
+    description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: short-answer question answering
+      what: Real world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: exams_v
+    display_name: Exams-V
+    description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Exam questions
+      who: Human experts
+      when: "2024"
+      language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish
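For readers who want the end state rather than the hunks, here is the new fair_face run group assembled from the added lines above, shown as plain YAML. The indentation is approximate, since the rendered diff does not preserve it; the field values themselves are copied verbatim from the diff. The real_world_qa and exams_v entries added in the same hunk follow the identical field layout, and the earlier hunks wire fair_face and real_world_qa into existing subgroup lists.

```yaml
# Assembled from the added lines of the final hunk of schema_vhelm.yaml (indentation approximate).
  - name: fair_face
    display_name: FairFace
    description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
    metric_groups:
      - accuracy
      - general_information
    environment:
      main_name: exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
      what: Fairness
      who: Human experts
      when: "2019"
      language: English
```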