crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/vision_language/gqa_scenario.py
@@ -0,0 +1,91 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class GQAScenario(Scenario):
+    """
+    Questions about real-world visual reasoning and compositional QA
+
+    @misc{hudson2019gqa,
+        title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
+        author={Drew A. Hudson and Christopher D. Manning},
+        year={2019},
+        eprint={1902.09506},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+
+    Paper: https://arxiv.org/abs/1902.09506
+    Website: https://github.com/stanford-crfm/helm/issues/1951
+    """
+
+    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
+    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
+
+    name = "gqa"
+    description = (
+        "Questions about real-world visual reasoning and compositional QA "
+        "([paper](https://arxiv.org/abs/1902.09506))."
+    )
+    tags = ["vision-language", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        questions_path: str = os.path.join(output_path, "questions")
+        ensure_file_downloaded(
+            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
+        )
+
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The test split doesn't have annotations
+                continue
+
+            split: str = "val" if helm_split == VALID_SPLIT else helm_split
+
+            # Read the questions from the JSON
+            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
+            with open(questions_split_path, "r") as questions_file:
+                questions: Dict[str, Any] = json.load(questions_file)
+                for question_id, question_data in questions.items():
+                    question: str = question_data["question"]
+                    short_answer: str = question_data["answer"]
+                    full_answer: str = question_data["fullAnswer"]
+
+                    image_id: str = question_data["imageId"]
+                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")
+
+                    content: List[MediaObject] = [
+                        MediaObject(text=question, content_type="text/plain"),
+                        MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
+                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
+                            ],
+                            split=helm_split,
+                        )
+                    )
+
+        return instances
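A quick orientation, not part of the diff: the new scenario can be exercised on its own. The module path comes from the file list above; the output directory is only an example.

    from helm.benchmark.scenarios.vision_language.gqa_scenario import GQAScenario

    # Downloads the GQA questions and images into the example directory, then builds
    # one Instance per balanced question for the train and validation splits.
    scenario = GQAScenario()
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/gqa")  # example path

    # Each instance carries both the short and the full answer as correct references.
    print(len(instances), [reference.output.text for reference in instances[0].references])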

helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
@@ -80,11 +80,13 @@ class HatefulMemesScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/jpeg"),
                 MediaObject(text=self.QUESTION, content_type="text/plain"),
             ]
-            answer: str = "Yes" if row["label"] == 1 else "No"
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    references=[
+                        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
+                        Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
+                    ],
                     split=split,
                 )
             )

helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):
@@ -38,13 +42,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets
@@ -110,6 +117,10 @@
                )
                continue

+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 
@@ -158,7 +169,7 @@
                # representing the structure (such as LaTeX code)
                multimedia_object = MultimediaObject([image_object])
                reference = Reference(
-                    output=Output(text=row["text"], multimedia_content=multimedia_object),
+                    output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                    tags=[CORRECT_TAG],
                )
            else:
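Taken together, these hunks thread a new difficulty argument from the constructor down to row filtering. A minimal sketch of how a subclass might opt into it (illustrative only; LatexScenario appears in the next hunk, and the argument values are just examples):

    from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_HARD
    from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # Rows whose "difficulty" field does not match are skipped; DIFFICULTY_ALL (the default) keeps everything.
    scenario = LatexScenario(subset="equation", recompile_prompt=False, difficulty=DIFFICULTY_HARD)
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/image2latex")  # example path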

helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py
@@ -1,4 +1,3 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
@@ -9,14 +8,11 @@ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_sc
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "real"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)

helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
@@ -1,10 +1,9 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
     BASE_PROMPT = (
-        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n"  # noqa: E501
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
         "This music sheet was created by me, and I would like to recreate it using Lilypond."
     )
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")

helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py
@@ -4,6 +4,7 @@ from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
@@ -123,12 +124,12 @@ class WebpageScenario(Image2StructureScenario):
         " }\n"
         "]\n"
         "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
-        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
         " look like the image as much as feasibly possible."
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "real"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +141,10 @@
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True

helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -0,0 +1,117 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MathVistaScenario(Scenario):
+    """
+    MathVista: Evaluating Math Reasoning in Visual Contexts
+
+    To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse
+    mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets
+    involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these
+    tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art
+    foundation models find challenging.
+
+    @inproceedings{lu2024mathvista,
+        author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi,
+            Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
+        title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
+        booktitle={International Conference on Learning Representations (ICLR)},
+        year = {2024}
+    }
+
+    Paper: https://arxiv.org/abs/2310.02255
+    Website: https://mathvista.github.io/
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "AI4Math/MathVista"
+
+    # Only the testmini split has answers
+    SPLIT: str = "testmini"
+
+    # Supported difficulties
+    GRADES: List[str] = ["elementary_school", "high_school", "college", "daily_life"]
+    QUESTION_TYPES: List[str] = ["multi_choice", "free_form"]
+
+    name = "math_vista"
+    description = (
+        "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
+        "([paper](https://arxiv.org/abs/2310.02255))."
+    )
+    tags = ["vision-language", "reasoning", "math"]
+
+    def __init__(self, grade: str, question_type: str):
+        super().__init__()
+        assert grade in self.GRADES, f"Not supported: {grade}"
+        self._grade: str = grade.replace("_", " ")
+
+        assert question_type in self.QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "images"))
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=self.SPLIT, cache_dir=output_path)):
+            # Filter out the questions by type and grade (or difficulty)
+            if row["question_type"] != self._question_type or row["metadata"]["grade"] != self._grade:
+                continue
+
+            pid: str = row["pid"]
+            question: str = row["question"]
+            answer: str = row["answer"]
+
+            # Save the image locally
+            assert row["image"] == f"images/{pid}.jpg", f"Invalid image path: {row['image']} for question {pid}"
+            image_path: str = os.path.join(output_path, row["image"])
+
+            if not os.path.exists(image_path):
+                image = row["decoded_image"]
+                if image.mode in ("RGBA", "P", "LA"):
+                    image = image.convert("RGB")
+                image.save(image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(text=question, content_type="text/plain"),
+                MediaObject(location=image_path, content_type="image/jpeg"),
+            ]
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multi_choice":
+                options: List[str] = row["choices"]
+                for option in options:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else []))
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+                if row["unit"] is not None:
+                    references.append(Reference(Output(text=f"{answer} {row['unit']}"), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        assert (
+            len(instances) > 0
+        ), f"No instances found for subject {self._grade} and question type {self._question_type}"
+        return instances
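A short usage sketch, not part of the diff; the grade and question_type values are picked from the GRADES and QUESTION_TYPES lists above, and the output path is arbitrary.

    from helm.benchmark.scenarios.vision_language.math_vista_scenario import MathVistaScenario

    # Anything outside GRADES / QUESTION_TYPES trips the constructor asserts.
    scenario = MathVistaScenario(grade="high_school", question_type="multi_choice")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/math_vista")  # example path

    # Multiple-choice rows yield one reference per option, with CORRECT_TAG only on the answer.
    for reference in instances[0].references:
        print(reference.output.text, reference.tags)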

helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -0,0 +1,103 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MMSafetyBenchScenario(Scenario):
+    """
+    To evaluate the extent of this vulnerability in open-source VLMs, compiled a substantial dataset encompassing
+    13 scenarios with a total of 5,040 text-image pairs
+
+    @misc{liu2023queryrelevant,
+        title={Query-Relevant Images Jailbreak Large Multi-Modal Models},
+        author={Xin Liu and Yichen Zhu and Yunshi Lan and Chao Yang and Yu Qiao},
+        year={2023},
+        eprint={2311.17600},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2311.17600
+    Website: https://isxinliu.github.io/Project/MM-SafetyBench/
+    Questions: https://github.com/isXinLiu/MM-SafetyBench/tree/main/data/processed_questions
+    """
+
+    SUBSET_TO_DATASET_FOLDER: Dict[str, str] = {
+        "illegal_activity": "01-Illegal_Activitiy",  # This is intentionally misspelled to match the original name
+        "hate_speech": "02-HateSpeech",
+        "malware_generation": "03-Malware_Generation",
+        "physical_harm": "04-Physical_Harm",
+        "economic_harm": "05-EconomicHarm",
+        "fraud": "06-Fraud",
+        "sex": "07-Sex",
+        "political_lobbying": "08-Political_Lobbying",
+        "privacy_violence": "09-Privacy_Violence",
+        "legal_opinion": "10-Legal_Opinion",
+        "financial_advice": "11-Financial_Advice",
+        "health_consultation": "12-Health_Consultation",
+        "government_decision": "13-Gov_Decision",
+    }
+
+    QUESTIONS_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+    )
+    IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
+
+    name = "mm_safety_bench"
+    description = (
+        "Expose the vulnerability of open-source VLMs with toxic and biased content "
+        "([paper](https://arxiv.org/abs/2311.17600))."
+    )
+    tags = ["vision-language", "bias", "toxicity"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in self.SUBSET_TO_DATASET_FOLDER, f"Invalid subset: {subset}"
+        self._dataset: str = self.SUBSET_TO_DATASET_FOLDER[subset]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download all the images
+        images_path: str = os.path.join(output_path, "MM-SafetyBench(imgs)")
+        assert os.path.exists(images_path), (
+            f"Images path does not exist: {images_path}. Download the images "
+            f"from {self.IMAGES_URL}, unzip and place it at {output_path}"
+        )
+        # SD_TYPO seems to have the greatest attack success rate on the models they evaluated
+        images_path = os.path.join(images_path, self._dataset, "SD_TYPO")
+        assert os.path.exists(images_path)
+
+        questions_path: str = os.path.join(output_path, f"{self._dataset}.json")
+        questions_url: str = self.QUESTIONS_URL_TEMPLATE.format(dataset=self._dataset)
+        ensure_file_downloaded(source_url=questions_url, target_path=questions_path)
+
+        instances: List[Instance] = []
+
+        with open(questions_path, "r") as questions_file:
+            questions: Dict[str, Any] = json.load(questions_file)
+            for question_id, question_data in questions.items():
+                local_image_path: str = os.path.join(images_path, f"{question_id}.jpg")
+                assert os.path.exists(local_image_path), f"Image does not exist: {local_image_path}"
+
+                question: str = question_data["Rephrased Question"]
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
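One operational note worth calling out, with a sketch (not part of the diff; the paths are examples): unlike the other new scenarios, the images are not downloaded automatically.

    from helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario import MMSafetyBenchScenario

    # The zip behind IMAGES_URL must be fetched and unpacked into the output directory
    # as "MM-SafetyBench(imgs)" beforehand, otherwise the assert in get_instances fires.
    scenario = MMSafetyBenchScenario(subset="illegal_activity")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/mm_safety_bench")  # example path

    # Instances have no references: the prompt is the SD_TYPO image plus the rephrased question.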

helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py
@@ -0,0 +1,92 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances

helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py
@@ -0,0 +1,117 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCategorizationScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2017 version of the dataset
+    for the categorization task.
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2017.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data_2017")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        super_categories_to_categories: Dict[str, List[str]] = defaultdict(list)
+        category_id_to_category: Dict[int, str] = {}
+        category_id_to_super_category: Dict[int, str] = {}
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"stuff_{coco_split}2017.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            for category_metadata in metadata["categories"]:
+                # Each metadata looks like this {'supercategory': 'textile', 'id': 92, 'name': 'banner'}
+                category_id: int = category_metadata["id"]
+                category: str = category_metadata["name"]
+                super_category: str = category_metadata["supercategory"]
+                super_categories_to_categories[super_category].append(category)
+                category_id_to_category[category_id] = category
+                category_id_to_super_category[category_id] = super_category
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_category_ids: Dict[int, List[int]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_category_ids[annotation["image_id"]].append(annotation["category_id"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                assert os.path.exists(image_path), f"Image path {image_path} does not exist"
+                category_ids: List[int] = image_id_to_category_ids[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                references: List[Reference] = []
+                correct_super_categories: Set[str] = set(
+                    category_id_to_super_category[category_id] for category_id in category_ids
+                )
+                # for category_id in category_ids:
+                #     category = category_id_to_category[category_id]
+                #     super_category = category_id_to_super_category[category_id]
+                #     references.extend(
+                #         [
+                #             Reference(Output(text=category), tags=[CORRECT_TAG]),
+                #             Reference(Output(text=super_category), tags=[CORRECT_TAG]),
+                #         ]
+                #     )
+                for super_category in super_categories_to_categories:
+                    references.append(
+                        Reference(
+                            Output(text=super_category),
+                            tags=[CORRECT_TAG] if super_category in correct_super_categories else [],
+                        )
+                    )
+
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=references,
+                        split=helm_split,
+                    )
+                )
+
+        return instances
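A closing sketch, not part of the diff, of how the categorization references behave; the output path is an example.

    from helm.benchmark.scenarios.scenario import CORRECT_TAG
    from helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario import MSCOCOCategorizationScenario

    scenario = MSCOCOCategorizationScenario()
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/mscoco")  # example path

    # Every supercategory appears as a reference; only those present in the image's
    # stuff annotations carry CORRECT_TAG, so the task reads as multi-label classification.
    correct = [r.output.text for r in instances[0].references if CORRECT_TAG in r.tags]
    print(correct)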