crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of crfm-helm has been flagged as potentially problematic; see the registry's release details for more information.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/vlm_run_specs.py +444 -65

@@ -7,10 +7,11 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
 )
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_ALL
 from helm.benchmark.metrics.common_metric_specs import (
-    get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
-
+    get_generative_harms_metric_specs,
+    get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -22,13 +23,14 @@ from helm.benchmark.annotation.annotator import AnnotatorSpec
 # Prototypical adapter specs for VLM evaluation


-def get_generation_adapter_spec(
+def _get_generation_adapter_spec(
     instructions: str = "",
     input_prefix: str = "",
     input_suffix: str = "",
     output_prefix: str = "",
     output_suffix: str = "",
     max_tokens: int = 100,
+    max_train_instances: int = 0,
     stop_sequences: Optional[List[str]] = None,
 ) -> AdapterSpec:
     return AdapterSpec(
@@ -40,23 +42,42 @@ def get_generation_adapter_spec(
         output_prefix=output_prefix,
         output_suffix=output_suffix,
         instance_prefix="\n",
-
-        max_train_instances=0,
+        max_train_instances=max_train_instances,
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
+        temperature=0.0,
         random=None,
     )


-def
-    return
-        instructions=
+def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions=(
+            "Just give a short answer without answering in a complete sentence."
+            if instructions is None
+            else instructions
+        ),
         max_tokens=20,
     )


-def
+def _get_captioning_adapter_spec() -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image. The caption should be short and does "
+        "not need to be a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def get_open_end_answer_generation_adapter_spec():
+    return _get_generation_adapter_spec(
+        instructions="Follow the given instruction and give your complete answer.",
+        max_tokens=100,
+    )
+
+
+def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 0,
@@ -84,7 +105,13 @@ def get_multiple_choice_joint_adapter_spec(
 # VHELM metric specs


-def
+def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(
+        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+    )
+
+
+def _get_image2structure_metric_specs(
     generation_type: str,
     metric_names: Optional[List[str]] = None,
     args: Optional[Dict] = None,
@@ -97,6 +124,7 @@ def get_image2structure_metric_specs(
         metric_names = [
             AnnotatedImageMetrics.PIXEL_SIMILARITY,
             AnnotatedImageMetrics.FID_SIMILARITY,
+            AnnotatedImageMetrics.BLOCK_EMD,
             AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
         ]
     if include_edit_similarity:
@@ -114,20 +142,77 @@ def get_image2structure_metric_specs(
             },
         ),
     ]
-    return metric_specs +
+    return metric_specs + get_basic_metric_specs([])
+
+
+def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
+def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+            },
+        )
+    ]
+
+
+def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]


 ############################################################
 # VHELM run specs


+@run_spec_function("a_okvqa")
+def get_a_okvqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.a_okvqa_scenario.AOKVQAScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "a_okvqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("chart2csv")
 def get_chart2csv_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
         args={},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
         "Please infer the missing labels based on the surrounding context. "
         "Just give the CSV without any explanation.",
@@ -145,15 +230,90 @@ def get_chart2csv_spec() -> RunSpec:
     )


+@run_spec_function("crossmodal_3600")
+def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
+        args={"location": location, "language": language},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete sentence in plain words",
+        max_tokens=20,
+    )
+
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "crossmodal_3600"
+    return RunSpec(
+        name=f"{run_spec_name}:location={location},language={language}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("flickr30k")
+def get_flickr30k_spec(num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image in plain words. The caption should "
+        "be short and needs to be a complete sentence.",
+        max_tokens=30,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "flickr30k"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("gqa")
+def get_gqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions="Answer the question using a single word."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "gqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("hateful_memes")
 def get_hateful_memes_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
     )
-    adapter_spec
-        instructions="Answer Yes or No without an explanation.",
-        max_tokens=3,
-    )
+    adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "hateful_memes"
@@ -166,13 +326,115 @@ def get_hateful_memes_spec() -> RunSpec:
     )


+@run_spec_function("mm_safety_bench")
+def get_mm_safety_bench_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario.MMSafetyBenchScenario",
+        args={"subset": subset},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "mm_safety_bench"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_captioning")
+def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_captioning_scenario.MSCOCOCaptioningScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec
+    if long:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a long, detailed caption for the following image.",
+            max_tokens=200,
+        )
+    else:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a caption for the following image. The caption should be short and does "
+            "not need to be a complete sentence.",
+            max_tokens=20,
+        )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "mscoco_captioning"
+    if long:
+        run_spec_name += "_long"
+
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_categorization")
+def get_mscoco_categorization_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario."
+        "MSCOCOCategorizationScenario",
+        args={},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "mscoco_categorization"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("originality_vlm")
+def get_originality_vlm_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "originality_vlm"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("viz_wiz")
 def get_viz_wiz_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
     )
-    adapter_spec: AdapterSpec =
-
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        # Following https://arxiv.org/abs/2310.03744
+        instructions="When the provided information is insufficient, respond with 'Unanswerable'. "
+        "Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "viz_wiz"
     return RunSpec(
@@ -189,8 +451,12 @@ def get_vqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
     )
-
-
+    # Following https://arxiv.org/abs/2310.03744
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions='Answer the question using a single word or phrase. When the question asks "How many...", '
+        "respond with just a number (e.g., 3) and not the word corresponding to the number."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "vqa"
     return RunSpec(
@@ -203,19 +469,21 @@ def get_vqa_spec() -> RunSpec:


 @run_spec_function("image2latex")
-def get_image2latex_spec(
+def get_image2latex_spec(
+    subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
-        include_edit_similarity=
+        include_edit_similarity=(subset != "real"),
         size_handling_method="padding",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -224,31 +492,41 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         )
     ]

-    run_spec_name: str = "image2latex"
+    run_spec_name: str = f"image2latex:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2latex_real"]
+    else:
+        groups = ["image2latex", f"image2latex_{difficulty}"]
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )


 @run_spec_function("image2webpage")
-def get_image2webpage_spec(
+def get_image2webpage_spec(
+    subset: str,
+    recompile_prompt: bool = False,
+    difficulty: str = DIFFICULTY_ALL,
+    args: Optional[Dict] = None,
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
-        include_edit_similarity=
+        include_edit_similarity=(subset != "real"),
         size_handling_method="none",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -257,28 +535,64 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         )
     ]

-    run_spec_name: str = "image2webpage"
+    run_spec_name: str = f"image2webpage:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2webpage_real"]
+    else:
+        groups = ["image2webpage", f"image2webpage_{difficulty}"]
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )

+@run_spec_function("math_vista")
+def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.math_vista_scenario.MathVistaScenario",
+        args={"grade": grade, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "free_form":
+        adapter_spec = _get_short_answer_generation_adapter_spec(
+            instructions="Just give the numerical answer without showing the steps, the unit, or percentage symbol."
+        )
+    elif question_type == "multi_choice":
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "math_vista"
+    return RunSpec(
+        name=f"{run_spec_name}:grade={grade},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("image2musicsheet")
-def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
-
+        # There os only one subset for music sheets
+        args={"subset": "music", "recompile_prompt": False, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="lilypond",
         args=args,
         include_edit_similarity=False,  # No ground truth for music sheets
@@ -290,13 +604,14 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         )
     ]

-    run_spec_name: str = "image2musicsheet"
+    run_spec_name: str = f"image2musicsheet:difficulty={difficulty}"
+    groups: List[str] = ["image2musicsheet", f"image2musicsheet_{difficulty}"]
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -310,10 +625,14 @@ def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec =
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec =
-            input_noun=None,
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None,
+            output_noun="Answer",
+            max_train_instances=0,
+            # instructions="Refer to the figure(s) and answer the multiple choice question by responding with just "
+            # "the letter of the correct answer (e.g., A, B, C, D, E).",
         )
     else:
         raise ValueError(f"Invalid question type: {question_type}")
@@ -335,14 +654,15 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
-        instructions="Only give
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Only give a yes/no or numerical answer without an explanation.",
+        max_tokens=1,  # the model may generate answer with a period
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "unicorn"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -351,16 +671,26 @@ def get_unicorn_spec(subject: str) -> RunSpec:


 @run_spec_function("bingo")
-def get_bingo_spec(subject: str) -> RunSpec:
+def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec =
-
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete and clear explanation in sentences without listing it out.",
+        max_tokens=100,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "bingo"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -377,9 +707,9 @@ def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec =
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec =
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:
@@ -401,7 +731,7 @@ def get_pope_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -422,14 +752,14 @@ def get_seed_bench_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "seed_bench"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -443,14 +773,14 @@ def get_mme_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "mme"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -464,7 +794,7 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
         args={"question_type": question_type},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,
@@ -482,18 +812,67 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
     )


+@run_spec_function("pairs")
+def get_pairs_spec(subset: str, person: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.pairs_scenario.PAIRSScenario",
+        args={"subset": subset, "person": person},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None,
+        output_noun="Answer",
+        num_outputs=1,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "pairs"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset},person={person}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("mementos")
-def get_mementos_spec(subject: str) -> RunSpec:
+def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
-    metric_specs: List[MetricSpec] =
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "mementos"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vibe_eval")
+def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "vibe_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,