crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/vision_language/gqa_scenario.py

@@ -0,0 +1,91 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class GQAScenario(Scenario):
+    """
+    Questions about real-world visual reasoning and compositional QA
+
+    @misc{hudson2019gqa,
+        title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
+        author={Drew A. Hudson and Christopher D. Manning},
+        year={2019},
+        eprint={1902.09506},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+
+    Paper: https://arxiv.org/abs/1902.09506
+    Website: https://github.com/stanford-crfm/helm/issues/1951
+    """
+
+    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
+    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
+
+    name = "gqa"
+    description = (
+        "Questions about real-world visual reasoning and compositional QA "
+        "([paper](https://arxiv.org/abs/1902.09506))."
+    )
+    tags = ["vision-language", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        questions_path: str = os.path.join(output_path, "questions")
+        ensure_file_downloaded(
+            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
+        )
+
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The test split doesn't have annotations
+                continue
+
+            split: str = "val" if helm_split == VALID_SPLIT else helm_split
+
+            # Read the questions from the JSON
+            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
+            with open(questions_split_path, "r") as questions_file:
+                questions: Dict[str, Any] = json.load(questions_file)
+                for question_id, question_data in questions.items():
+                    question: str = question_data["question"]
+                    short_answer: str = question_data["answer"]
+                    full_answer: str = question_data["fullAnswer"]
+
+                    image_id: str = question_data["imageId"]
+                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")
+
+                    content: List[MediaObject] = [
+                        MediaObject(text=question, content_type="text/plain"),
+                        MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
+                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
+                            ],
+                            split=helm_split,
+                        )
+                    )
+
+        return instances

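A minimal usage sketch of the new GQAScenario (the output directory below is illustrative and not part of the release):

    from helm.benchmark.scenarios.scenario import CORRECT_TAG
    from helm.benchmark.scenarios.vision_language.gqa_scenario import GQAScenario

    scenario = GQAScenario()
    # Downloads the GQA questions and images, then builds one multimodal Instance per question
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/gqa")
    # Both the short answer and the full answer are attached as correct references
    print([ref.output.text for ref in instances[0].references if CORRECT_TAG in ref.tags])
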
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py

@@ -80,11 +80,13 @@ class HatefulMemesScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/jpeg"),
                 MediaObject(text=self.QUESTION, content_type="text/plain"),
             ]
-            answer: str = "Yes" if row["label"] == 1 else "No"
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[
+                    references=[
+                        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
+                        Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
+                    ],
                     split=split,
                 )
             )

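Instead of a single answer string, each instance now carries both options as references, with the correct tag only on the matching one. For a meme row with label == 1, the resulting references look roughly like this (a sketch of the constructed objects, not output from the library):

    references = [
        Reference(Output(text="Yes"), tags=[CORRECT_TAG]),
        Reference(Output(text="No"), tags=[]),
    ]
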
helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py

@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):

@@ -38,13 +42,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets

@@ -110,6 +117,10 @@ class Image2StructureScenario(Scenario):
                 )
                 continue
 
+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 

@@ -158,7 +169,7 @@ class Image2StructureScenario(Scenario):
             # representing the structure (such as LaTeX code)
             multimedia_object = MultimediaObject([image_object])
             reference = Reference(
-                output=Output(text=row["text"], multimedia_content=multimedia_object),
+                output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                 tags=[CORRECT_TAG],
             )
         else:

helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py

@@ -1,4 +1,3 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
@@ -9,14 +8,11 @@ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_sc
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "real"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)

helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py

@@ -1,10 +1,9 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
     BASE_PROMPT = (
-        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
         "This music sheet was created by me, and I would like to recreate it using Lilypond."
     )
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")

helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py

@@ -4,6 +4,7 @@ from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
@@ -123,12 +124,12 @@ class WebpageScenario(Image2StructureScenario):
         " }\n"
         "]\n"
         "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
-        " they will be created for you automatically. Try to write some realistic code keeping in mind that
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
         " look like the image as much as feasibly possible."
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "real"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +141,10 @@ class WebpageScenario(Image2StructureScenario):
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True

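A sketch of how the new difficulty argument can be threaded through one of the Image2Structure scenarios (the subset and output path are illustrative; DIFFICULTY_EASY is one of the constants added above):

    from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_EASY
    from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # Rows whose "difficulty" field does not match are skipped; DIFFICULTY_ALL (the default) disables the filter
    scenario = LatexScenario(subset="equation", difficulty=DIFFICULTY_EASY)
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/image2latex")
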
helm/benchmark/scenarios/vision_language/math_vista_scenario.py

@@ -0,0 +1,117 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MathVistaScenario(Scenario):
+    """
+    MathVista: Evaluating Math Reasoning in Visual Contexts
+
+    To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse
+    mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets
+    involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these
+    tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art
+    foundation models find challenging.
+
+    @inproceedings{lu2024mathvista,
+        author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi,
+            Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
+        title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
+        booktitle={International Conference on Learning Representations (ICLR)},
+        year = {2024}
+    }
+
+    Paper: https://arxiv.org/abs/2310.02255
+    Website: https://mathvista.github.io/
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "AI4Math/MathVista"
+
+    # Only the testmini split has answers
+    SPLIT: str = "testmini"
+
+    # Supported difficulties
+    GRADES: List[str] = ["elementary_school", "high_school", "college", "daily_life"]
+    QUESTION_TYPES: List[str] = ["multi_choice", "free_form"]
+
+    name = "math_vista"
+    description = (
+        "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
+        "([paper](https://arxiv.org/abs/2310.02255))."
+    )
+    tags = ["vision-language", "reasoning", "math"]
+
+    def __init__(self, grade: str, question_type: str):
+        super().__init__()
+        assert grade in self.GRADES, f"Not supported: {grade}"
+        self._grade: str = grade.replace("_", " ")
+
+        assert question_type in self.QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "images"))
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=self.SPLIT, cache_dir=output_path)):
+            # Filter out the questions by type and grade (or difficulty)
+            if row["question_type"] != self._question_type or row["metadata"]["grade"] != self._grade:
+                continue
+
+            pid: str = row["pid"]
+            question: str = row["question"]
+            answer: str = row["answer"]
+
+            # Save the image locally
+            assert row["image"] == f"images/{pid}.jpg", f"Invalid image path: {row['image']} for question {pid}"
+            image_path: str = os.path.join(output_path, row["image"])
+
+            if not os.path.exists(image_path):
+                image = row["decoded_image"]
+                if image.mode in ("RGBA", "P", "LA"):
+                    image = image.convert("RGB")
+                image.save(image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(text=question, content_type="text/plain"),
+                MediaObject(location=image_path, content_type="image/jpeg"),
+            ]
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multi_choice":
+                options: List[str] = row["choices"]
+                for option in options:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else []))
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+                if row["unit"] is not None:
+                    references.append(Reference(Output(text=f"{answer} {row['unit']}"), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        assert (
+            len(instances) > 0
+        ), f"No instances found for subject {self._grade} and question type {self._question_type}"
+        return instances

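A usage sketch for the new MathVistaScenario (the grade and question type must come from the GRADES and QUESTION_TYPES lists above; the output path is illustrative):

    from helm.benchmark.scenarios.scenario import CORRECT_TAG
    from helm.benchmark.scenarios.vision_language.math_vista_scenario import MathVistaScenario

    scenario = MathVistaScenario(grade="elementary_school", question_type="multi_choice")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/math_vista")
    # For multi_choice questions every option becomes a Reference; only the matching one carries CORRECT_TAG
    correct = [ref for ref in instances[0].references if CORRECT_TAG in ref.tags]
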
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py

@@ -0,0 +1,103 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MMSafetyBenchScenario(Scenario):
+    """
+    To evaluate the extent of this vulnerability in open-source VLMs, compiled a substantial dataset encompassing
+    13 scenarios with a total of 5,040 text-image pairs
+
+    @misc{liu2023queryrelevant,
+        title={Query-Relevant Images Jailbreak Large Multi-Modal Models},
+        author={Xin Liu and Yichen Zhu and Yunshi Lan and Chao Yang and Yu Qiao},
+        year={2023},
+        eprint={2311.17600},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2311.17600
+    Website: https://isxinliu.github.io/Project/MM-SafetyBench/
+    Questions: https://github.com/isXinLiu/MM-SafetyBench/tree/main/data/processed_questions
+    """
+
+    SUBSET_TO_DATASET_FOLDER: Dict[str, str] = {
+        "illegal_activity": "01-Illegal_Activitiy",  # This is intentionally misspelled to match the original name
+        "hate_speech": "02-HateSpeech",
+        "malware_generation": "03-Malware_Generation",
+        "physical_harm": "04-Physical_Harm",
+        "economic_harm": "05-EconomicHarm",
+        "fraud": "06-Fraud",
+        "sex": "07-Sex",
+        "political_lobbying": "08-Political_Lobbying",
+        "privacy_violence": "09-Privacy_Violence",
+        "legal_opinion": "10-Legal_Opinion",
+        "financial_advice": "11-Financial_Advice",
+        "health_consultation": "12-Health_Consultation",
+        "government_decision": "13-Gov_Decision",
+    }
+
+    QUESTIONS_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+    )
+    IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
+
+    name = "mm_safety_bench"
+    description = (
+        "Expose the vulnerability of open-source VLMs with toxic and biased content "
+        "([paper](https://arxiv.org/abs/2311.17600))."
+    )
+    tags = ["vision-language", "bias", "toxicity"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in self.SUBSET_TO_DATASET_FOLDER, f"Invalid subset: {subset}"
+        self._dataset: str = self.SUBSET_TO_DATASET_FOLDER[subset]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download all the images
+        images_path: str = os.path.join(output_path, "MM-SafetyBench(imgs)")
+        assert os.path.exists(images_path), (
+            f"Images path does not exist: {images_path}. Download the images "
+            f"from {self.IMAGES_URL}, unzip and place it at {output_path}"
+        )
+        # SD_TYPO seems to have the greatest attack success rate on the models they evaluated
+        images_path = os.path.join(images_path, self._dataset, "SD_TYPO")
+        assert os.path.exists(images_path)
+
+        questions_path: str = os.path.join(output_path, f"{self._dataset}.json")
+        questions_url: str = self.QUESTIONS_URL_TEMPLATE.format(dataset=self._dataset)
+        ensure_file_downloaded(source_url=questions_url, target_path=questions_path)
+
+        instances: List[Instance] = []
+
+        with open(questions_path, "r") as questions_file:
+            questions: Dict[str, Any] = json.load(questions_file)
+            for question_id, question_data in questions.items():
+                local_image_path: str = os.path.join(images_path, f"{question_id}.jpg")
+                assert os.path.exists(local_image_path), f"Image does not exist: {local_image_path}"
+
+                question: str = question_data["Rephrased Question"]
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances

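A sketch of driving the new MMSafetyBenchScenario; note that get_instances asserts the image archive has already been downloaded and unzipped manually under the output path (paths below are illustrative):

    from helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario import MMSafetyBenchScenario

    # "illegal_activity" is one of the keys of SUBSET_TO_DATASET_FOLDER above
    scenario = MMSafetyBenchScenario(subset="illegal_activity")
    # Expects <output_path>/MM-SafetyBench(imgs)/01-Illegal_Activitiy/SD_TYPO to already exist
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/mm_safety_bench")
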
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py

@@ -0,0 +1,92 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances

helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py

@@ -0,0 +1,117 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCategorizationScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2017 version of the dataset
+    for the categorization task.
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2017.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data_2017")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        super_categories_to_categories: Dict[str, List[str]] = defaultdict(list)
+        category_id_to_category: Dict[int, str] = {}
+        category_id_to_super_category: Dict[int, str] = {}
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"stuff_{coco_split}2017.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            for category_metadata in metadata["categories"]:
+                # Each metadata looks like this {'supercategory': 'textile', 'id': 92, 'name': 'banner'}
+                category_id: int = category_metadata["id"]
+                category: str = category_metadata["name"]
+                super_category: str = category_metadata["supercategory"]
+                super_categories_to_categories[super_category].append(category)
+                category_id_to_category[category_id] = category
+                category_id_to_super_category[category_id] = super_category
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_category_ids: Dict[int, List[int]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_category_ids[annotation["image_id"]].append(annotation["category_id"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                assert os.path.exists(image_path), f"Image path {image_path} does not exist"
+                category_ids: List[int] = image_id_to_category_ids[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                references: List[Reference] = []
+                correct_super_categories: Set[str] = set(
+                    category_id_to_super_category[category_id] for category_id in category_ids
+                )
+                # for category_id in category_ids:
+                #     category = category_id_to_category[category_id]
+                #     super_category = category_id_to_super_category[category_id]
+                #     references.extend(
+                #         [
+                #             Reference(Output(text=category), tags=[CORRECT_TAG]),
+                #             Reference(Output(text=super_category), tags=[CORRECT_TAG]),
+                #         ]
+                #     )
+                for super_category in super_categories_to_categories:
+                    references.append(
+                        Reference(
+                            Output(text=super_category),
+                            tags=[CORRECT_TAG] if super_category in correct_super_categories else [],
+                        )
+                    )
+
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=references,
+                        split=helm_split,
+                    )
+                )
+
+        return instances

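The categorization scenario emits one Reference per super-category found in the split's metadata and tags only those present in the image's annotations. For an image whose stuff annotations include the "textile" super-category, the references look roughly like this (super-category names other than "textile" are illustrative; the full set comes from metadata["categories"]):

    references = [
        Reference(Output(text="textile"), tags=[CORRECT_TAG]),  # present in this image's annotations
        Reference(Output(text="sky"), tags=[]),                 # not present, so left untagged
        # ... one Reference per super-category discovered in the metadata
    ]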