crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the changes between the publicly released 0.5.0 and 0.5.1 versions of crfm-helm as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py
@@ -0,0 +1,134 @@
+import json
+import os
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class Crossmodal3600Scenario(Scenario):
+    """
+    Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated
+    with human-generated reference captions in 36 languages.
+
+    @inproceedings{ThapliyalCrossmodal2022,
+        author = {Ashish Thapliyal and Jordi Pont-Tuset and Xi Chen and Radu Soricut},
+        title = {{Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset}},
+        booktitle = {EMNLP},
+        year = {2022}
+    }
+
+    Paper: https://arxiv.org/abs/2205.12522
+    Website: https://google.github.io/crossmodal-3600/
+    """
+
+    LANGUAGE_TO_ID: Dict[str, str] = {
+        "arabic": "ar",
+        "bengali": "bn",
+        "chinese": "zh",
+        "croatian": "hr",
+        "cusco_quechua": "quz",
+        "czech": "cs",
+        "danish": "da",
+        "dutch": "nl",
+        "english": "en",
+        "persian": "fa",
+        "finnish": "fi",
+        "french": "fr",
+        "german": "de",
+        "greek": "el",
+        "hebrew": "he",
+        "hindi": "hi",
+        "hungarian": "hu",
+        "indonesian": "id",
+        "italian": "it",
+        "japanese": "ja",
+        "korean": "ko",
+        "maori": "mi",
+        "norwegian": "no",
+        "polish": "pl",
+        "portuguese": "pt",
+        "romanian": "ro",
+        "russian": "ru",
+        "spanish": "es",
+        "swahili": "sw",
+        "swedish": "sv",
+        "telugu": "te",
+        "thai": "th",
+        "turkish": "tr",
+        "ukrainian": "uk",
+        "vietnamese": "vi",
+    }
+
+    IMAGES_URL: str = "https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz"
+    CAPTIONS_URL: str = "https://google.github.io/crossmodal-3600/web-data/captions.zip"
+
+    name = "crossmodal_3600"
+    description = (
+        "Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated "
+        "with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))."
+    )
+    tags = ["vision-language", "multilinguality"]
+
+    def __init__(self, location: str, language: str):
+        super().__init__()
+        self._locale_id: str = self.LANGUAGE_TO_ID[location]
+        self._language_id: str = self.LANGUAGE_TO_ID[language]
+        self._instruction: str = f"Generate a short caption for the following image in {language}."
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(
+            source_url=self.IMAGES_URL,
+            target_path=images_path,
+            unpack=True,
+            unpack_type="untar",
+        )
+
+        captions_path: str = os.path.join(output_path, "captions.jsonl")
+        ensure_file_downloaded(
+            source_url=self.CAPTIONS_URL,
+            target_path=captions_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+        with open(captions_path, "r") as captions_file:
+            for line in captions_file:
+                example: Dict = json.loads(line)
+
+                locale_id: str = example["image/locale"]
+                if locale_id != self._locale_id:
+                    continue
+
+                key: str = example["image/key"]
+                image_path: str = os.path.join(images_path, f"{key}.jpg")
+                assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+                assert self._language_id in example, f"Language {self._language_id} not found in example"
+                all_captions: Dict = example[self._language_id]
+                captions: List[str] = all_captions["caption"]
+
+                content: List[MediaObject] = [
+                    MediaObject(text=self._instruction, content_type="text/plain"),
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
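As an aside (not part of the diff): the loader above expects one JSON object per line in captions.jsonl, keyed by `image/locale`, `image/key`, and a per-language block with a `caption` list. A minimal sketch of such a record; the field names come from the code above, while the key, locale, and caption strings are made up:

```python
# Illustrative only: a captions.jsonl record in the shape Crossmodal3600Scenario parses.
import json

record = {
    "image/locale": "en",             # compared against self._locale_id; non-matching records are skipped
    "image/key": "0123456789abcdef",  # used to locate images/<key>.jpg
    "en": {"caption": ["A short caption.", "Another reference caption."]},
}
example = json.loads(json.dumps(record))

# Every string in example[language_id]["caption"] becomes one CORRECT_TAG reference
# of a TEST_SPLIT instance that pairs the instruction text with the image.
assert len(example["en"]["caption"]) == 2
```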
helm/benchmark/scenarios/vision_language/flickr30k_scenario.py
@@ -0,0 +1,74 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class Flickr30KScenario(Scenario):
+    """
+    An image caption corpus consisting of 158,915 crowdsourced captions describing 31,783 Flickr images.
+
+    @article{young2014image,
+        title={From image descriptions to visual denotations: New similarity metrics for semantic
+               inference over event descriptions},
+        author={Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia},
+        journal={Transactions of the Association for Computational Linguistics},
+        volume={2},
+        pages={67--78},
+        year={2014},
+        publisher={MIT Press}
+    }
+
+    Paper: https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf
+    Website: https://shannon.cs.illinois.edu/DenotationGraph/
+    """
+
+    HF_DATASET_NAME: str = "nlphuji/flickr30k"
+
+    name = "flickr30k"
+    description = (
+        "An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr "
+        "images ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
+    )
+    tags = ["vision-language"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split="test")):
+            split: str = row["split"]
+            helm_split: str = VALID_SPLIT if split == "val" else split
+
+            image_filename: str = row["filename"]
+            local_image_path: str = os.path.join(images_path, image_filename)
+            image = row["image"]
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in row["caption"]],
+                    split=helm_split,
+                )
+            )
+
+        return instances
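A minimal sketch (not part of the release) of how one Flickr30K row maps onto HELM references and splits; the row values here are made up, only the column names mirror the loader above:

```python
# Illustrative only: one made-up Flickr30K row and the mapping performed by get_instances().
from helm.benchmark.scenarios.scenario import CORRECT_TAG, VALID_SPLIT, Output, Reference

row = {
    "split": "val",                   # only "val" is renamed to HELM's VALID_SPLIT; others pass through
    "filename": "example_12345.jpg",  # saved once under <output_path>/images/
    "caption": ["A dog runs on grass.", "A brown dog outdoors."],
}
helm_split = VALID_SPLIT if row["split"] == "val" else row["split"]
references = [Reference(Output(text=c), tags=[CORRECT_TAG]) for c in row["caption"]]
assert helm_split == VALID_SPLIT and len(references) == 2
```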
helm/benchmark/scenarios/vision_language/gqa_scenario.py
@@ -0,0 +1,91 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class GQAScenario(Scenario):
+    """
+    Questions about real-world visual reasoning and compositional QA
+
+    @misc{hudson2019gqa,
+        title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
+        author={Drew A. Hudson and Christopher D. Manning},
+        year={2019},
+        eprint={1902.09506},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+
+    Paper: https://arxiv.org/abs/1902.09506
+    Website: https://github.com/stanford-crfm/helm/issues/1951
+    """
+
+    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
+    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
+
+    name = "gqa"
+    description = (
+        "Questions about real-world visual reasoning and compositional QA "
+        "([paper](https://arxiv.org/abs/1902.09506))."
+    )
+    tags = ["vision-language", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        questions_path: str = os.path.join(output_path, "questions")
+        ensure_file_downloaded(
+            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
+        )
+
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The test split doesn't have annotations
+                continue
+
+            split: str = "val" if helm_split == VALID_SPLIT else helm_split
+
+            # Read the questions from the JSON
+            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
+            with open(questions_split_path, "r") as questions_file:
+                questions: Dict[str, Any] = json.load(questions_file)
+                for question_id, question_data in questions.items():
+                    question: str = question_data["question"]
+                    short_answer: str = question_data["answer"]
+                    full_answer: str = question_data["fullAnswer"]
+
+                    image_id: str = question_data["imageId"]
+                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")
+
+                    content: List[MediaObject] = [
+                        MediaObject(text=question, content_type="text/plain"),
+                        MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
+                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
+                            ],
+                            split=helm_split,
+                        )
+                    )
+
+        return instances
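Not part of the diff: the loader reads `{split}_balanced_questions.json`, a mapping from question ID to a record whose `question`, `answer`, `fullAnswer`, and `imageId` fields it uses. A small sketch of one such record and the two correct references produced from it; the values are illustrative, only the keys come from the code above:

```python
# Illustrative only: the shape of one GQA balanced-questions entry and its references.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

question_data = {
    "question": "What color is the cup on the table?",  # made-up question
    "answer": "white",                 # short answer -> first CORRECT_TAG reference
    "fullAnswer": "The cup is white.",  # full answer  -> second CORRECT_TAG reference
    "imageId": "2370799",              # resolved to images/<imageId>.jpg
}

references = [
    Reference(Output(text=question_data["answer"]), tags=[CORRECT_TAG]),
    Reference(Output(text=question_data["fullAnswer"]), tags=[CORRECT_TAG]),
]
assert all(CORRECT_TAG in ref.tags for ref in references)
```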
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
@@ -80,11 +80,13 @@ class HatefulMemesScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/jpeg"),
                 MediaObject(text=self.QUESTION, content_type="text/plain"),
             ]
-            answer: str = "Yes" if row["label"] == 1 else "No"
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[
+                    references=[
+                        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
+                        Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
+                    ],
                     split=split,
                 )
             )
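The change replaces the single answer string with an explicit Yes/No reference pair, tagging only the option that matches the binary label. A minimal sketch (not part of the diff) of the resulting references for a row labeled hateful; the row dict is made up:

```python
# Illustrative sketch of the new reference layout for one Hateful Memes row.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

row = {"label": 1}  # 1 = hateful, 0 = not hateful (made-up example row)
references = [
    Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
    Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
]
# Both answer options are now always present; only the one matching the label carries CORRECT_TAG.
assert [CORRECT_TAG in r.tags for r in references] == [True, False]
```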
helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
@@ -4,7 +4,7 @@ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_sc

 class MusicSheetScenario(Image2StructureScenario):
     BASE_PROMPT = (
-        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
         "This music sheet was created by me, and I would like to recreate it using Lilypond."
     )
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py
@@ -123,7 +123,7 @@ class WebpageScenario(Image2StructureScenario):
         " }\n"
         "]\n"
         "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
-        " they will be created for you automatically. Try to write some realistic code keeping in mind that
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
         " look like the image as much as feasibly possible."
     )

helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -0,0 +1,117 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MathVistaScenario(Scenario):
+    """
+    MathVista: Evaluating Math Reasoning in Visual Contexts
+
+    To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse
+    mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets
+    involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these
+    tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art
+    foundation models find challenging.
+
+    @inproceedings{lu2024mathvista,
+        author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi,
+                  Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
+        title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
+        booktitle={International Conference on Learning Representations (ICLR)},
+        year = {2024}
+    }
+
+    Paper: https://arxiv.org/abs/2310.02255
+    Website: https://mathvista.github.io/
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "AI4Math/MathVista"
+
+    # Only the testmini split has answers
+    SPLIT: str = "testmini"
+
+    # Supported difficulties
+    GRADES: List[str] = ["elementary_school", "high_school", "college", "daily_life"]
+    QUESTION_TYPES: List[str] = ["multi_choice", "free_form"]
+
+    name = "math_vista"
+    description = (
+        "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
+        "([paper](https://arxiv.org/abs/2310.02255))."
+    )
+    tags = ["vision-language", "reasoning", "math"]
+
+    def __init__(self, grade: str, question_type: str):
+        super().__init__()
+        assert grade in self.GRADES, f"Not supported: {grade}"
+        self._grade: str = grade.replace("_", " ")
+
+        assert question_type in self.QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "images"))
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=self.SPLIT, cache_dir=output_path)):
+            # Filter out the questions by type and grade (or difficulty)
+            if row["question_type"] != self._question_type or row["metadata"]["grade"] != self._grade:
+                continue
+
+            pid: str = row["pid"]
+            question: str = row["question"]
+            answer: str = row["answer"]
+
+            # Save the image locally
+            assert row["image"] == f"images/{pid}.jpg", f"Invalid image path: {row['image']} for question {pid}"
+            image_path: str = os.path.join(output_path, row["image"])
+
+            if not os.path.exists(image_path):
+                image = row["decoded_image"]
+                if image.mode in ("RGBA", "P", "LA"):
+                    image = image.convert("RGB")
+                image.save(image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(text=question, content_type="text/plain"),
+                MediaObject(location=image_path, content_type="image/jpeg"),
+            ]
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multi_choice":
+                options: List[str] = row["choices"]
+                for option in options:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else []))
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+                if row["unit"] is not None:
+                    references.append(Reference(Output(text=f"{answer} {row['unit']}"), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        assert (
+            len(instances) > 0
+        ), f"No instances found for subject {self._grade} and question type {self._question_type}"
+        return instances
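For multiple-choice questions the loader tags whichever option string equals the gold answer; for free-form questions it emits the answer itself (and, when a unit is present, an "answer unit" variant) as correct. A small sketch (not part of the diff) of the multi-choice branch with made-up values:

```python
# Illustrative sketch of MathVistaScenario's multi-choice reference construction.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

choices = ["3", "4", "5"]  # made-up option strings
answer = "4"               # made-up gold answer; must match one option exactly

references = [
    Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
    for option in choices
]
assert sum(CORRECT_TAG in r.tags for r in references) == 1
```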
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -0,0 +1,103 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MMSafetyBenchScenario(Scenario):
+    """
+    To evaluate the extent of this vulnerability in open-source VLMs, compiled a substantial dataset encompassing
+    13 scenarios with a total of 5,040 text-image pairs
+
+    @misc{liu2023queryrelevant,
+        title={Query-Relevant Images Jailbreak Large Multi-Modal Models},
+        author={Xin Liu and Yichen Zhu and Yunshi Lan and Chao Yang and Yu Qiao},
+        year={2023},
+        eprint={2311.17600},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2311.17600
+    Website: https://isxinliu.github.io/Project/MM-SafetyBench/
+    Questions: https://github.com/isXinLiu/MM-SafetyBench/tree/main/data/processed_questions
+    """
+
+    SUBSET_TO_DATASET_FOLDER: Dict[str, str] = {
+        "illegal_activity": "01-Illegal_Activitiy",  # This is intentionally misspelled to match the original name
+        "hate_speech": "02-HateSpeech",
+        "malware_generation": "03-Malware_Generation",
+        "physical_harm": "04-Physical_Harm",
+        "economic_harm": "05-EconomicHarm",
+        "fraud": "06-Fraud",
+        "sex": "07-Sex",
+        "political_lobbying": "08-Political_Lobbying",
+        "privacy_violence": "09-Privacy_Violence",
+        "legal_opinion": "10-Legal_Opinion",
+        "financial_advice": "11-Financial_Advice",
+        "health_consultation": "12-Health_Consultation",
+        "government_decision": "13-Gov_Decision",
+    }
+
+    QUESTIONS_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+    )
+    IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
+
+    name = "mm_safety_bench"
+    description = (
+        "Expose the vulnerability of open-source VLMs with toxic and biased content "
+        "([paper](https://arxiv.org/abs/2311.17600))."
+    )
+    tags = ["vision-language", "bias", "toxicity"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in self.SUBSET_TO_DATASET_FOLDER, f"Invalid subset: {subset}"
+        self._dataset: str = self.SUBSET_TO_DATASET_FOLDER[subset]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download all the images
+        images_path: str = os.path.join(output_path, "MM-SafetyBench(imgs)")
+        assert os.path.exists(images_path), (
+            f"Images path does not exist: {images_path}. Download the images "
+            f"from {self.IMAGES_URL}, unzip and place it at {output_path}"
+        )
+        # SD_TYPO seems to have the greatest attack success rate on the models they evaluated
+        images_path = os.path.join(images_path, self._dataset, "SD_TYPO")
+        assert os.path.exists(images_path)
+
+        questions_path: str = os.path.join(output_path, f"{self._dataset}.json")
+        questions_url: str = self.QUESTIONS_URL_TEMPLATE.format(dataset=self._dataset)
+        ensure_file_downloaded(source_url=questions_url, target_path=questions_path)
+
+        instances: List[Instance] = []
+
+        with open(questions_path, "r") as questions_file:
+            questions: Dict[str, Any] = json.load(questions_file)
+            for question_id, question_data in questions.items():
+                local_image_path: str = os.path.join(images_path, f"{question_id}.jpg")
+                assert os.path.exists(local_image_path), f"Image does not exist: {local_image_path}"
+
+                question: str = question_data["Rephrased Question"]
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
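Unlike the other new scenarios, the images here are not fetched automatically: `get_instances` asserts that a manually downloaded `MM-SafetyBench(imgs)` folder already sits under the output path. A sketch (not part of the diff) of the directory layout those assertions imply and of instantiating the scenario; the output path is a made-up placeholder:

```python
# Illustrative sketch of the manual setup MMSafetyBenchScenario expects.
# Assumed layout (download IMAGES_URL yourself and unzip it under the output path):
#   <output_path>/MM-SafetyBench(imgs)/01-Illegal_Activitiy/SD_TYPO/<question_id>.jpg
#   <output_path>/01-Illegal_Activitiy.json   (fetched automatically from GitHub)
import os

from helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario import MMSafetyBenchScenario

scenario = MMSafetyBenchScenario(subset="illegal_activity")
output_path = "./mm_safety_bench_output"  # made-up scratch directory
expected_images_dir = os.path.join(output_path, "MM-SafetyBench(imgs)", "01-Illegal_Activitiy", "SD_TYPO")
# get_instances(output_path) asserts this directory exists before building instances.
print(expected_images_dir)
```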
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py
@@ -0,0 +1,92 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances
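Not part of the diff: the annotations zip provides `captions_{split}2014.json` files whose `images` and `annotations` arrays the loader joins by image ID. A minimal sketch of that join with made-up, trimmed-down metadata; only the key names come from the code above:

```python
# Illustrative sketch of how MSCOCOCaptioningScenario groups COCO captions per image.
from collections import defaultdict
from typing import Dict, List

metadata = {  # made-up excerpt in the shape of captions_val2014.json
    "images": [{"id": 335111, "file_name": "COCO_val2014_000000335111.jpg"}],
    "annotations": [
        {"image_id": 335111, "caption": "a row of bikes on the sidewalk, 2 on the ground. "},
        {"image_id": 335111, "caption": "there are some bicycles laying on their sides"},
    ],
}

image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
for annotation in metadata["annotations"]:
    # The scenario's .rstrip() trims the trailing whitespace some COCO captions carry
    image_id_to_captions[annotation["image_id"]].append(annotation["caption"].rstrip())

assert len(image_id_to_captions[335111]) == 2
```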