crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
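A per-file comparison like the one below can be reproduced locally from the two wheels themselves. The sketch that follows is illustrative only (it is not part of crfm-helm) and assumes both wheels have already been fetched, e.g. with `pip download crfm-helm==0.5.1 --no-deps` and `pip download crfm-helm==0.5.2 --no-deps`; the renamed schema file from the listing above is used as the example member.

# Illustrative sketch: wheels are plain zip archives, so a packaged file can be
# read out of each version and compared with difflib.
import difflib
import zipfile


def read_lines(wheel_path: str, member: str):
    """Return the decoded lines of one file inside a wheel."""
    with zipfile.ZipFile(wheel_path) as wheel:
        return wheel.read(member).decode("utf-8").splitlines(keepends=True)


old = read_lines("crfm_helm-0.5.1-py3-none-any.whl", "helm/benchmark/static/schema_vlm.yaml")
new = read_lines("crfm_helm-0.5.2-py3-none-any.whl", "helm/benchmark/static/schema_vhelm.yaml")

for line in difflib.unified_diff(old, new, fromfile="schema_vlm.yaml", tofile="schema_vhelm.yaml"):
    print(line, end="")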
@@ -1,66 +1,8 @@
  ---
  ############################################################
-
-
-
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
@@ -220,49 +162,10 @@ metrics:
  display_name: CIDEr
  description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
  lower_is_better: false
-
-
-
-
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
- display_name: Stereotypical associations (race, adjectives)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
- display_name: Stereotypical associations (gender, profession)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
- display_name: Stereotypical associations (gender, adjectives)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=representation,demographic_category=race
- display_name: Demographic representation (race)
- short_display_name: Representation (race)
- lower_is_better: true
- description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
- - name: bias_metric:mode=representation,demographic_category=gender
- display_name: Demographic representation (gender)
- short_display_name: Representation (gender)
- lower_is_better: true
- description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bbq_metric_ambiguous_bias
- display_name: BBQ (ambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
- - name: bbq_metric_unambiguous_bias
- display_name: BBQ (unambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+ - name: prometheus_vision
+ display_name: Prometheus Vision rating
+ description: Scores are from 1 to 5, where 5 is the best.
+ lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
@@ -281,6 +184,9 @@ metrics:

  ############################################################
  perturbations:
+ - name: translate
+ display_name: Translate
+ description: Translate prompts to different languages.
  - name: robustness
  display_name: Robustness
  description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
@@ -322,12 +228,6 @@ metric_groups:
  - name: ${main_name}
  split: ${main_split}

- - name: efficiency
- display_name: Efficiency
- metrics:
- - name: inference_runtime
- split: ${main_split}
-
  - name: general_information
  display_name: General information
  metrics:
@@ -348,50 +248,122 @@ metric_groups:
  - name: toxic_frac
  split: ${main_split}

- - name:
- display_name:
+ - name: fairness
+ display_name: Fairness
  metrics:
- - name:
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
- - name: fid_similarity
+ - name: ${main_name}
  split: ${main_split}
-
+ perturbation_name: fairness
+
+ - name: robustness
+ display_name: Robustness
+ metrics:
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: robustness

- - name:
- display_name:
+ - name: translate
+ display_name: Translate
  metrics:
- - name:
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: translate
+

  ############################################################
  run_groups:
  - name: core_scenarios
- display_name:
- description:
+ display_name: All
+ description: All scenarios across capabilities
  category: All scenarios
  subgroups:
- -
- -
+ - visual_perception
+ - reasoning
+ - knowledge
+ - bias
+ - fairness
+ - toxicity
+ - robustness
+ - multilinguality
+ - name: visual_perception
+ display_name: Visual perception
+ description: Is the output semantically correct, given the text and image inputs?
+ category: Core scenarios
+ subgroups:
+ - vqa_base
  - viz_wiz
- -
+ - flickr30k
+ - name: reasoning
+ display_name: Reasoning
+ description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - math_vista
+ - seed_bench
+ - name: real_world_reasoning
+ display_name: Real-world Reasoning
+ description: Reasoning in the real-world
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - seed_bench
+ - mementos
+ - name: knowledge
+ display_name: Knowledge
+ description: Does the model have knowledge about the world and common sense?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_base
  - mmmu
- -
+ - mme
+ - vibe_eval
+ - name: bias
+ display_name: Bias
+ description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+ category: Core scenarios
+ subgroups:
+ - pairs
+ - name: fairness
+ display_name: Fairness
+ description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+ category: Core scenarios
+ subgroups:
+ - vqa_dialect
+ - a_okvqa_dialect
+ - crossmodal_3600
+ - name: toxicity
+ display_name: Toxicity
+ description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+ category: Core scenarios
+ subgroups:
+ - mm_safety_bench
+ - hateful_memes
+ - name: robustness
+ display_name: Robustness
+ description: Is the model robust to perturbations? We focus on both text and image perturbations.
+ category: Core scenarios
+ subgroups:
+ - vqa_robustness
+ - a_okvqa_robustness
  - unicorn
  - bingo
- - multipanelvqa
  - pope
-
-
+ - name: multilinguality
+ display_name: Multilinguality
+ description: Do the model support non-English languages?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_chinese
+ - a_okvqa_hindi
+ - a_okvqa_spanish
+ - a_okvqa_swahili

- - name:
+ - name: a_okvqa_base
  display_name: A-OKVQA
  description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -403,15 +375,110 @@ run_groups:
  when: "2023"
  language: English

+ - name: a_okvqa_dialect
+ display_name: A-OKVQA (AAE)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - fairness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_robustness
+ display_name: A-OKVQA (robustness)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - robustness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_chinese
+ display_name: A-OKVQA (chinese)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Chinese
+
+ - name: a_okvqa_hindi
+ display_name: A-OKVQA (hindi)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Hindi
+
+ - name: a_okvqa_spanish
+ display_name: A-OKVQA (spanish)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Spanish
+
+ - name: a_okvqa_swahili
+ display_name: A-OKVQA (swahili)
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Swahili
+
  - name: crossmodal_3600
  display_name: Crossmodal 3600
  description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name:
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: multilingual captioning
@@ -425,10 +492,9 @@ run_groups:
  description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name:
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: image captioning
@@ -442,10 +508,9 @@ run_groups:
  description: Questions about real-world visual reasoning and compositional QA
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name:
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: short answer question answering
@@ -454,38 +519,11 @@ run_groups:
  when: "2019"
  language: English

- - name: heim_human_eval
- display_name: HEIM Human Eval Scenario
- description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: multiple-choice question answering
- what: AI-generated images
- who: Text-to-image models
- when: "2024"
- language: English
-
- - name: image2structure
- display_name: Image2Structure
- description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
- category: All scenarios
- subgroups:
- - image2latex
- - image2webpage
- - image2musicsheet
-
  - name: hateful_memes
  display_name: Hateful Memes
  description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -502,7 +540,6 @@ run_groups:
  description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
  metric_groups:
  - accuracy
- - efficiency
  - toxicity
  - general_information
  environment:
@@ -515,66 +552,61 @@ run_groups:
  when: "2023"
  language: English

- - name:
- display_name:
- description:
+ - name: viz_wiz
+ display_name: VizWiz
+ description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
  metric_groups:
  - accuracy
- - efficiency
- - general_information
  environment:
- main_name:
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task:
- what: Real
- who:
- when: "
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Visually impaired people
+ when: "2018"
  language: English

- - name:
- display_name:
- description:
+ - name: vqa_base
+ display_name: VQAv2
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name:
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task:
- what: Real
+ task: multimodal short answer question answering
+ what: Real-world images
  who: Human experts
- when: "
+ when: "2017"
  language: English

- - name:
- display_name:
- description:
+ - name: vqa_dialect
+ display_name: VQAv2 (AAE)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
- -
- - efficiency
+ - fairness
  - general_information
  environment:
- main_name:
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: multimodal short answer question answering
  what: Real-world images
- who:
- when: "
+ who: Human experts
+ when: "2017"
  language: English

- - name:
- display_name: VQAv2
+ - name: vqa_robustness
+ display_name: VQAv2 (robustness)
  description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
  metric_groups:
- -
- - efficiency
+ - robustness
  - general_information
  environment:
- main_name:
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
  task: multimodal short answer question answering
@@ -583,12 +615,59 @@ run_groups:
  when: "2017"
  language: English

+ - name: vqa_chinese
+ display_name: VQAv2 (chinese)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Chinese
+
+ - name: vqa_hindi
+ display_name: VQAv2 (hindi)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Hindi
+
+ - name: vqa_spanish
+ display_name: VQAv2 (spanish)
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: valid
+ taxonomy:
+ task: multimodal short answer question answering
+ what: Real-world images
+ who: Human experts
+ when: "2017"
+ language: Spanish
+
  - name: math_vista
  display_name: MathVista
  description: Evaluating Math Reasoning in Visual Contexts
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -605,7 +684,6 @@ run_groups:
  description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -638,8 +716,9 @@ run_groups:
  description: Open-ended questions about biased images
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name:
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: short answer question answering
@@ -647,30 +726,12 @@ run_groups:
  who: Human experts
  when: "2023"
  language: English, Chinese, Japanese, etc.
-
- - name: multipanelvqa
- display_name: MultipanelVQA
- description: Question about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: short answer or multiple-choice question answering
- what: Real-world or synthetic multipanel images
- who: Human experts
- when: "2024"
- language: English
-
+
  - name: pope
  display_name: POPE
  description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -684,11 +745,9 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
- including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -705,7 +764,6 @@ run_groups:
  description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -717,91 +775,35 @@ run_groups:
  when: "2023"
  language: English

- - name:
- display_name:
- description:
+ - name: vibe_eval
+ display_name: Vibe Eval
+ description: hard evaluation suite for measuring progress of multimodal language models
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name:
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: short answer question answering
- what:
+ what: Knowledge intensive
  who: Human experts
  when: "2024"
  language: English

- - name:
- display_name:
- description:
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
- who: n/a
- when: "2024"
- language: English
-
- - name: image2webpage
- display_name: Image2webpage
- description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: css, html, javascript
- who: n/a
- when: "2024"
- language: English
-
- - name: image2musicsheet
- display_name: Image2musicsheet
- description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
- metric_groups:
- - accuracy
- - generation_image
- - efficiency
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: n/a
- when: "2024"
- language: English
-
- - name: chart2csv
- display_name: Chart2CSV
- description: The Chart2CSV benchmark for converting images of charts to CSV.
+ - name: mementos
+ display_name: Mementos
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name:
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task:
- what:
- who:
+ task: short answer question answering
+ what: Image sequences of comics, dailylife and robotics
+ who: Human experts
  when: "2024"
  language: English

@@ -810,7 +812,6 @@ run_groups:
  description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match