crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/schema_vhelm.yaml

@@ -282,6 +282,7 @@ run_groups:
       - knowledge
       - bias
       - fairness
+      - safety
       - toxicity
       - robustness
       - multilinguality
@@ -293,6 +294,7 @@ run_groups:
       - vqa_base
       - viz_wiz
       - flickr30k
+      - pope
   - name: reasoning
     display_name: Reasoning
     description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
@@ -301,14 +303,8 @@ run_groups:
       - gqa
       - math_vista
       - seed_bench
-  - name: real_world_reasoning
-    display_name: Real-world Reasoning
-    description: Reasoning in the real-world
-    category: Core scenarios
-    subgroups:
-      - gqa
-      - seed_bench
       - mementos
+      - real_world_qa
   - name: knowledge
     display_name: Knowledge
     description: Does the model have knowledge about the world and common sense?
@@ -332,13 +328,20 @@ run_groups:
       - vqa_dialect
       - a_okvqa_dialect
       - crossmodal_3600
+      - fair_face
+      - bingo_fairness
   - name: toxicity
     display_name: Toxicity
     description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
     category: Core scenarios
     subgroups:
-      - mm_safety_bench
       - hateful_memes
+  - name: safety
+    display_name: Safety
+    description: Refusing to produce answers that cause harm to humans
+    category: Core scenarios
+    subgroups:
+      - mm_safety_bench
   - name: robustness
     display_name: Robustness
     description: Is the model robust to perturbations? We focus on both text and image perturbations.
@@ -348,7 +351,6 @@ run_groups:
       - a_okvqa_robustness
       - unicorn
       - bingo
-      - pope
   - name: multilinguality
     display_name: Multilinguality
     description: Do the model support non-English languages?
@@ -358,10 +360,11 @@ run_groups:
       - a_okvqa_hindi
       - a_okvqa_spanish
       - a_okvqa_swahili
-
+      - exams_v
+      - bingo_multilinguality
   - name: a_okvqa_base
     display_name: A-OKVQA
-    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([
+    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - accuracy
       - general_information
@@ -377,7 +380,7 @@ run_groups:

   - name: a_okvqa_dialect
     display_name: A-OKVQA (AAE)
-    description:
+    description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - fairness
       - general_information
@@ -393,7 +396,7 @@ run_groups:

   - name: a_okvqa_robustness
     display_name: A-OKVQA (robustness)
-    description:
+    description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - robustness
       - general_information
@@ -409,7 +412,7 @@ run_groups:

   - name: a_okvqa_chinese
     display_name: A-OKVQA (chinese)
-    description:
+    description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -425,7 +428,7 @@ run_groups:

   - name: a_okvqa_hindi
     display_name: A-OKVQA (hindi)
-    description:
+    description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -441,7 +444,7 @@ run_groups:

   - name: a_okvqa_spanish
     display_name: A-OKVQA (spanish)
-    description:
+    description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -457,7 +460,7 @@ run_groups:

   - name: a_okvqa_swahili
     display_name: A-OKVQA (swahili)
-    description:
+    description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
     metric_groups:
       - translate
       - general_information
@@ -473,7 +476,7 @@ run_groups:

   - name: crossmodal_3600
     display_name: Crossmodal 3600
-    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([
+    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
     metric_groups:
       - accuracy
       - general_information
@@ -489,7 +492,7 @@ run_groups:

   - name: flickr30k
     display_name: Flickr30k
-    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([
+    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
     metric_groups:
       - accuracy
       - general_information
@@ -505,7 +508,7 @@ run_groups:

   - name: gqa
     display_name: GQA
-    description: Questions about real-world visual reasoning and compositional QA
+    description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
     metric_groups:
       - accuracy
       - general_information
@@ -513,7 +516,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2019"
@@ -521,7 +524,7 @@ run_groups:

   - name: hateful_memes
     display_name: Hateful Memes
-    description:
+    description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
     metric_groups:
       - accuracy
       - general_information
@@ -529,15 +532,15 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task:
-      what:
-      who:
+      task: toxicity identification
+      what: Memes
+      who: Human experts
       when: "2020"
       language: English

   - name: mm_safety_bench
     display_name: MM-SafetyBench
-    description:
+    description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
     metric_groups:
       - accuracy
       - toxicity
@@ -546,22 +549,23 @@ run_groups:
       main_name: toxic_frac
       main_split: test
     taxonomy:
-      task:
-      what:
+      task: toxicity mitigation
+      what: Jail-break images
       who: Human experts
       when: "2023"
       language: English

   - name: viz_wiz
     display_name: VizWiz
-    description: A benchmark for visual question answering with images and questions created by visually impaired people [
+    description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
     metric_groups:
       - accuracy
+      - general_information
     environment:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Visually impaired people
       when: "2018"
@@ -569,7 +573,7 @@ run_groups:

   - name: vqa_base
     display_name: VQAv2
-    description: Open-ended questions about real-world images [
+    description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - accuracy
       - general_information
@@ -577,7 +581,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
@@ -585,7 +589,7 @@ run_groups:

   - name: vqa_dialect
     display_name: VQAv2 (AAE)
-    description: Open-ended questions about real-world images [
+    description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - fairness
       - general_information
@@ -593,7 +597,7 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
@@ -601,7 +605,7 @@ run_groups:

   - name: vqa_robustness
     display_name: VQAv2 (robustness)
-    description: Open-ended questions about real-world images [
+    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
     metric_groups:
       - robustness
       - general_information
@@ -609,63 +613,15 @@ run_groups:
       main_name: quasi_exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2017"
       language: English

-  - name: vqa_chinese
-    display_name: VQAv2 (chinese)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Chinese
-
-  - name: vqa_hindi
-    display_name: VQAv2 (hindi)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Hindi
-
-  - name: vqa_spanish
-    display_name: VQAv2 (spanish)
-    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
-    metric_groups:
-      - translate
-      - general_information
-    environment:
-      main_name: quasi_exact_match
-      main_split: valid
-    taxonomy:
-      task: multimodal short answer question answering
-      what: Real-world images
-      who: Human experts
-      when: "2017"
-      language: Spanish
-
   - name: math_vista
     display_name: MathVista
-    description:
+    description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
     metric_groups:
       - accuracy
       - general_information
@@ -681,7 +637,7 @@ run_groups:

   - name: mmmu
     display_name: MMMU
-    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [
+    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
     metric_groups:
       - accuracy
       - general_information
@@ -689,7 +645,7 @@ run_groups:
       main_name: exact_match
       main_split: valid
     taxonomy:
-      task:
+      task: multiple-choice question answering
       what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
       who: Human experts
       when: "2023"
@@ -697,7 +653,7 @@ run_groups:

   - name: unicorn
     display_name: Unicorn
-    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+    description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
     metric_groups:
       - accuracy
       - general_information
@@ -705,7 +661,7 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: OOD images and sketch images
       who: Human experts
       when: "2023"
@@ -713,7 +669,23 @@ run_groups:

   - name: bingo
     display_name: Bingo
-    description: Open-ended questions about biased images
+    description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: prometheus_vision
+      main_split: test
+    taxonomy:
+      task: short-answer question answering
+      what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+      who: Human experts
+      when: "2023"
+      language: English, Chinese, Japanese, etc.
+
+  - name: bingo_fairness
+    display_name: Bingo (fairness)
+    description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
     metric_groups:
       - accuracy
       - general_information
@@ -721,7 +693,23 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
+      what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+      who: Human experts
+      when: "2023"
+      language: English, Chinese, Japanese, etc.
+
+  - name: bingo_multilinguality
+    display_name: Bingo (multilinguality)
+    description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: prometheus_vision
+      main_split: test
+    taxonomy:
+      task: short-answer question answering
       what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
       who: Human experts
       when: "2023"
@@ -729,7 +717,7 @@ run_groups:

   - name: pope
     display_name: POPE
-    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
     metric_groups:
       - accuracy
       - general_information
@@ -737,7 +725,7 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Real-world images
       who: Human experts
       when: "2023"
@@ -745,7 +733,7 @@ run_groups:

   - name: seed_bench
     display_name: Seed Bench
-    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
+    description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
     metric_groups:
       - accuracy
       - general_information
@@ -761,7 +749,7 @@ run_groups:

   - name: mme
     display_name: MME
-    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+    description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
     metric_groups:
       - accuracy
       - general_information
@@ -777,7 +765,7 @@ run_groups:

   - name: vibe_eval
     display_name: Vibe Eval
-    description:
+    description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
     metric_groups:
       - accuracy
       - general_information
@@ -785,7 +773,7 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
+      task: short-answer question answering
       what: Knowledge intensive
       who: Human experts
       when: "2024"
@@ -793,7 +781,7 @@ run_groups:

   - name: mementos
     display_name: Mementos
-    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+    description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
     metric_groups:
       - accuracy
       - general_information
@@ -801,15 +789,15 @@ run_groups:
       main_name: prometheus_vision
       main_split: test
     taxonomy:
-      task: short
-      what: Image sequences of comics,
+      task: short-answer question answering
+      what: Image sequences of comics, daily life and robotics
       who: Human experts
       when: "2024"
       language: English

   - name: pairs
     display_name: PAIRS
-    description: Examining
+    description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
     metric_groups:
       - accuracy
       - general_information
@@ -822,3 +810,51 @@ run_groups:
       who: Human experts
       when: "2024"
       language: English
+
+  - name: fair_face
+    display_name: FairFace
+    description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Fairness
+      who: Human experts
+      when: "2019"
+      language: English
+
+  - name: real_world_qa
+    display_name: RealWorldQA
+    description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: short-answer question answering
+      what: Real world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: exams_v
+    display_name: Exams-V
+    description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Exam questions
+      who: Human experts
+      when: "2024"
+      language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish