crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (56)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,134 @@ (new file: helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py)
+import json
+import os
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class Crossmodal3600Scenario(Scenario):
+    """
+    Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated
+    with human-generated reference captions in 36 languages.
+
+    @inproceedings{ThapliyalCrossmodal2022,
+      author = {Ashish Thapliyal and Jordi Pont-Tuset and Xi Chen and Radu Soricut},
+      title = {{Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset}},
+      booktitle = {EMNLP},
+      year = {2022}
+    }
+
+    Paper: https://arxiv.org/abs/2205.12522
+    Website: https://google.github.io/crossmodal-3600/
+    """
+
+    LANGUAGE_TO_ID: Dict[str, str] = {
+        "arabic": "ar",
+        "bengali": "bn",
+        "chinese": "zh",
+        "croatian": "hr",
+        "cusco_quechua": "quz",
+        "czech": "cs",
+        "danish": "da",
+        "dutch": "nl",
+        "english": "en",
+        "persian": "fa",
+        "finnish": "fi",
+        "french": "fr",
+        "german": "de",
+        "greek": "el",
+        "hebrew": "he",
+        "hindi": "hi",
+        "hungarian": "hu",
+        "indonesian": "id",
+        "italian": "it",
+        "japanese": "ja",
+        "korean": "ko",
+        "maori": "mi",
+        "norwegian": "no",
+        "polish": "pl",
+        "portuguese": "pt",
+        "romanian": "ro",
+        "russian": "ru",
+        "spanish": "es",
+        "swahili": "sw",
+        "swedish": "sv",
+        "telugu": "te",
+        "thai": "th",
+        "turkish": "tr",
+        "ukrainian": "uk",
+        "vietnamese": "vi",
+    }
+
+    IMAGES_URL: str = "https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz"
+    CAPTIONS_URL: str = "https://google.github.io/crossmodal-3600/web-data/captions.zip"
+
+    name = "crossmodal_3600"
+    description = (
+        "Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated "
+        "with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))."
+    )
+    tags = ["vision-language", "multilinguality"]
+
+    def __init__(self, location: str, language: str):
+        super().__init__()
+        self._locale_id: str = self.LANGUAGE_TO_ID[location]
+        self._language_id: str = self.LANGUAGE_TO_ID[language]
+        self._instruction: str = f"Generate a short caption for the following image in {language}."
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(
+            source_url=self.IMAGES_URL,
+            target_path=images_path,
+            unpack=True,
+            unpack_type="untar",
+        )
+
+        captions_path: str = os.path.join(output_path, "captions.jsonl")
+        ensure_file_downloaded(
+            source_url=self.CAPTIONS_URL,
+            target_path=captions_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+        with open(captions_path, "r") as captions_file:
+            for line in captions_file:
+                example: Dict = json.loads(line)
+
+                locale_id: str = example["image/locale"]
+                if locale_id != self._locale_id:
+                    continue
+
+                key: str = example["image/key"]
+                image_path: str = os.path.join(images_path, f"{key}.jpg")
+                assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+                assert self._language_id in example, f"Language {self._language_id} not found in example"
+                all_captions: Dict = example[self._language_id]
+                captions: List[str] = all_captions["caption"]
+
+                content: List[MediaObject] = [
+                    MediaObject(text=self._instruction, content_type="text/plain"),
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
@@ -0,0 +1,74 @@ (new file: helm/benchmark/scenarios/vision_language/flickr30k_scenario.py)
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class Flickr30KScenario(Scenario):
+    """
+    An image caption corpus consisting of 158,915 crowdsourced captions describing 31,783 Flickr images.
+
+    @article{young2014image,
+      title={From image descriptions to visual denotations: New similarity metrics for semantic
+             inference over event descriptions},
+      author={Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia},
+      journal={Transactions of the Association for Computational Linguistics},
+      volume={2},
+      pages={67--78},
+      year={2014},
+      publisher={MIT Press}
+    }
+
+    Paper: https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf
+    Website: https://shannon.cs.illinois.edu/DenotationGraph/
+    """
+
+    HF_DATASET_NAME: str = "nlphuji/flickr30k"
+
+    name = "flickr30k"
+    description = (
+        "An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr "
+        "images ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
+    )
+    tags = ["vision-language"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split="test")):
+            split: str = row["split"]
+            helm_split: str = VALID_SPLIT if split == "val" else split
+
+            image_filename: str = row["filename"]
+            local_image_path: str = os.path.join(images_path, image_filename)
+            image = row["image"]
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in row["caption"]],
+                    split=helm_split,
+                )
+            )
+
+        return instances
@@ -0,0 +1,91 @@ (new file: helm/benchmark/scenarios/vision_language/gqa_scenario.py)
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class GQAScenario(Scenario):
+    """
+    Questions about real-world visual reasoning and compositional QA
+
+    @misc{hudson2019gqa,
+      title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
+      author={Drew A. Hudson and Christopher D. Manning},
+      year={2019},
+      eprint={1902.09506},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+    }
+
+    Paper: https://arxiv.org/abs/1902.09506
+    Website: https://github.com/stanford-crfm/helm/issues/1951
+    """
+
+    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
+    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
+
+    name = "gqa"
+    description = (
+        "Questions about real-world visual reasoning and compositional QA "
+        "([paper](https://arxiv.org/abs/1902.09506))."
+    )
+    tags = ["vision-language", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        questions_path: str = os.path.join(output_path, "questions")
+        ensure_file_downloaded(
+            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
+        )
+
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The test split doesn't have annotations
+                continue
+
+            split: str = "val" if helm_split == VALID_SPLIT else helm_split
+
+            # Read the questions from the JSON
+            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
+            with open(questions_split_path, "r") as questions_file:
+                questions: Dict[str, Any] = json.load(questions_file)
+                for question_id, question_data in questions.items():
+                    question: str = question_data["question"]
+                    short_answer: str = question_data["answer"]
+                    full_answer: str = question_data["fullAnswer"]
+
+                    image_id: str = question_data["imageId"]
+                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")
+
+                    content: List[MediaObject] = [
+                        MediaObject(text=question, content_type="text/plain"),
+                        MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
+                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
+                            ],
+                            split=helm_split,
+                        )
+                    )
+
+        return instances
@@ -80,11 +80,13 @@ class HatefulMemesScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/jpeg"),
                 MediaObject(text=self.QUESTION, content_type="text/plain"),
             ]
-            answer: str = "Yes" if row["label"] == 1 else "No"
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    references=[
+                        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
+                        Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
+                    ],
                     split=split,
                 )
             )
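The HatefulMemesScenario change above replaces a single gold "Yes"/"No" answer with two fixed references, tagging only the option that matches the label as correct. A minimal sketch of that reference pattern in isolation, reusing the Reference, Output, and CORRECT_TAG names from the diff; the helper function is illustrative, not part of HELM:

from typing import List

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference


def binary_yes_no_references(label: int) -> List[Reference]:
    # Illustrative helper: emit both answer options and tag only the one
    # matching the label, mirroring the HatefulMemesScenario diff above.
    return [
        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if label == 1 else []),
        Reference(Output(text="No"), tags=[CORRECT_TAG] if label == 0 else []),
    ]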
@@ -4,7 +4,7 @@ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_sc
 
 class MusicSheetScenario(Image2StructureScenario):
     BASE_PROMPT = (
-        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n"  # noqa: E501
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
         "This music sheet was created by me, and I would like to recreate it using Lilypond."
     )
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
@@ -123,7 +123,7 @@ class WebpageScenario(Image2StructureScenario):
         " }\n"
         "]\n"
         "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
-        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
         " look like the image as much as feasibly possible."
     )
 
@@ -0,0 +1,117 @@ (new file: helm/benchmark/scenarios/vision_language/math_vista_scenario.py)
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MathVistaScenario(Scenario):
+    """
+    MathVista: Evaluating Math Reasoning in Visual Contexts
+
+    To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse
+    mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets
+    involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these
+    tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art
+    foundation models find challenging.
+
+    @inproceedings{lu2024mathvista,
+      author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi,
+                Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
+      title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
+      booktitle = {International Conference on Learning Representations (ICLR)},
+      year = {2024}
+    }
+
+    Paper: https://arxiv.org/abs/2310.02255
+    Website: https://mathvista.github.io/
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "AI4Math/MathVista"
+
+    # Only the testmini split has answers
+    SPLIT: str = "testmini"
+
+    # Supported difficulties
+    GRADES: List[str] = ["elementary_school", "high_school", "college", "daily_life"]
+    QUESTION_TYPES: List[str] = ["multi_choice", "free_form"]
+
+    name = "math_vista"
+    description = (
+        "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
+        "([paper](https://arxiv.org/abs/2310.02255))."
+    )
+    tags = ["vision-language", "reasoning", "math"]
+
+    def __init__(self, grade: str, question_type: str):
+        super().__init__()
+        assert grade in self.GRADES, f"Not supported: {grade}"
+        self._grade: str = grade.replace("_", " ")
+
+        assert question_type in self.QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "images"))
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=self.SPLIT, cache_dir=output_path)):
+            # Filter out the questions by type and grade (or difficulty)
+            if row["question_type"] != self._question_type or row["metadata"]["grade"] != self._grade:
+                continue
+
+            pid: str = row["pid"]
+            question: str = row["question"]
+            answer: str = row["answer"]
+
+            # Save the image locally
+            assert row["image"] == f"images/{pid}.jpg", f"Invalid image path: {row['image']} for question {pid}"
+            image_path: str = os.path.join(output_path, row["image"])
+
+            if not os.path.exists(image_path):
+                image = row["decoded_image"]
+                if image.mode in ("RGBA", "P", "LA"):
+                    image = image.convert("RGB")
+                image.save(image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(text=question, content_type="text/plain"),
+                MediaObject(location=image_path, content_type="image/jpeg"),
+            ]
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multi_choice":
+                options: List[str] = row["choices"]
+                for option in options:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else []))
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))

+                if row["unit"] is not None:
+                    references.append(Reference(Output(text=f"{answer} {row['unit']}"), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        assert (
+            len(instances) > 0
+        ), f"No instances found for subject {self._grade} and question type {self._question_type}"
+        return instances
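The new MathVistaScenario takes a grade and a question_type, validates both against the GRADES and QUESTION_TYPES constants, and filters the testmini split accordingly. A usage sketch based only on the constructor and get_instances signature shown above; the output path is a placeholder:

from helm.benchmark.scenarios.vision_language.math_vista_scenario import MathVistaScenario

# Illustrative parameters drawn from the GRADES and QUESTION_TYPES constants above;
# the output path is a placeholder, not a path HELM requires.
scenario = MathVistaScenario(grade="high_school", question_type="multi_choice")
instances = scenario.get_instances(output_path="benchmark_output/scenarios/math_vista")
print(f"Loaded {len(instances)} multiple-choice instances")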
@@ -0,0 +1,103 @@ (new file: helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py)
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MMSafetyBenchScenario(Scenario):
+    """
+    To evaluate the extent of this vulnerability in open-source VLMs, compiled a substantial dataset encompassing
+    13 scenarios with a total of 5,040 text-image pairs
+
+    @misc{liu2023queryrelevant,
+      title={Query-Relevant Images Jailbreak Large Multi-Modal Models},
+      author={Xin Liu and Yichen Zhu and Yunshi Lan and Chao Yang and Yu Qiao},
+      year={2023},
+      eprint={2311.17600},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2311.17600
+    Website: https://isxinliu.github.io/Project/MM-SafetyBench/
+    Questions: https://github.com/isXinLiu/MM-SafetyBench/tree/main/data/processed_questions
+    """
+
+    SUBSET_TO_DATASET_FOLDER: Dict[str, str] = {
+        "illegal_activity": "01-Illegal_Activitiy",  # This is intentionally misspelled to match the original name
+        "hate_speech": "02-HateSpeech",
+        "malware_generation": "03-Malware_Generation",
+        "physical_harm": "04-Physical_Harm",
+        "economic_harm": "05-EconomicHarm",
+        "fraud": "06-Fraud",
+        "sex": "07-Sex",
+        "political_lobbying": "08-Political_Lobbying",
+        "privacy_violence": "09-Privacy_Violence",
+        "legal_opinion": "10-Legal_Opinion",
+        "financial_advice": "11-Financial_Advice",
+        "health_consultation": "12-Health_Consultation",
+        "government_decision": "13-Gov_Decision",
+    }
+
+    QUESTIONS_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+    )
+    IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
+
+    name = "mm_safety_bench"
+    description = (
+        "Expose the vulnerability of open-source VLMs with toxic and biased content "
+        "([paper](https://arxiv.org/abs/2311.17600))."
+    )
+    tags = ["vision-language", "bias", "toxicity"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in self.SUBSET_TO_DATASET_FOLDER, f"Invalid subset: {subset}"
+        self._dataset: str = self.SUBSET_TO_DATASET_FOLDER[subset]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download all the images
+        images_path: str = os.path.join(output_path, "MM-SafetyBench(imgs)")
+        assert os.path.exists(images_path), (
+            f"Images path does not exist: {images_path}. Download the images "
+            f"from {self.IMAGES_URL}, unzip and place it at {output_path}"
+        )
+        # SD_TYPO seems to have the greatest attack success rate on the models they evaluated
+        images_path = os.path.join(images_path, self._dataset, "SD_TYPO")
+        assert os.path.exists(images_path)
+
+        questions_path: str = os.path.join(output_path, f"{self._dataset}.json")
+        questions_url: str = self.QUESTIONS_URL_TEMPLATE.format(dataset=self._dataset)
+        ensure_file_downloaded(source_url=questions_url, target_path=questions_path)
+
+        instances: List[Instance] = []
+
+        with open(questions_path, "r") as questions_file:
+            questions: Dict[str, Any] = json.load(questions_file)
+            for question_id, question_data in questions.items():
+                local_image_path: str = os.path.join(images_path, f"{question_id}.jpg")
+                assert os.path.exists(local_image_path), f"Image does not exist: {local_image_path}"
+
+                question: str = question_data["Rephrased Question"]
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
@@ -0,0 +1,92 @@ (new file: helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py)
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances
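MSCOCOCaptioningScenario turns COCO's flat annotations list (records of image_id and caption) into per-image caption lists before building one reference per caption. A self-contained sketch of that grouping step on made-up records of the same shape:

from collections import defaultdict
from typing import Dict, List

# Made-up records in the shape of COCO's "annotations" list.
annotations = [
    {"image_id": 335111, "caption": "a row of bikes on the sidewalk, 2 on the ground. "},
    {"image_id": 335111, "caption": "a couple of bikes laying on their sides on a sidewalk."},
    {"image_id": 42, "caption": "a dog running on a beach"},
]

image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
for annotation in annotations:
    image_id_to_captions[annotation["image_id"]].append(annotation["caption"])

assert len(image_id_to_captions[335111]) == 2
# Trailing whitespace is stripped when captions become references, as in the
# scenario's caption.rstrip() call above.
print({image_id: [c.rstrip() for c in captions] for image_id, captions in image_id_to_captions.items()})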