crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/vlm_run_specs.py

@@ -10,7 +10,8 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
-
+    get_generative_harms_metric_specs,
+    get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function

@@ -22,7 +23,7 @@ from helm.benchmark.annotation.annotator import AnnotatorSpec
 # Prototypical adapter specs for VLM evaluation


-def get_generation_adapter_spec(
+def _get_generation_adapter_spec(
     instructions: str = "",
     input_prefix: str = "",
     input_suffix: str = "",

@@ -45,18 +46,31 @@ def get_generation_adapter_spec(
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
+        temperature=0.0,
         random=None,
     )


-def
-    return
-        instructions=
+def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions=(
+            "Just give a short answer without answering in a complete sentence."
+            if instructions is None
+            else instructions
+        ),
         max_tokens=20,
     )


-def get_multiple_choice_joint_adapter_spec(
+def _get_captioning_adapter_spec() -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image. The caption should be short and does "
+        "not need to be a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 0,

@@ -84,7 +98,13 @@ def get_multiple_choice_joint_adapter_spec(
 # VHELM metric specs


-def get_image2structure_metric_specs(
+def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(
+        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+    )
+
+
+def _get_image2structure_metric_specs(
     generation_type: str,
     metric_names: Optional[List[str]] = None,
     args: Optional[Dict] = None,

@@ -97,7 +117,9 @@ def get_image2structure_metric_specs(
         metric_names = [
             AnnotatedImageMetrics.PIXEL_SIMILARITY,
             AnnotatedImageMetrics.FID_SIMILARITY,
-            AnnotatedImageMetrics.
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY,
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM2,
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM1,
         ]
     if include_edit_similarity:
         metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)

@@ -121,13 +143,35 @@ def get_image2structure_metric_specs(
 # VHELM run specs


+@run_spec_function("a_okvqa")
+def get_a_okvqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.a_okvqa_scenario.AOKVQAScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "a_okvqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("chart2csv")
 def get_chart2csv_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
         args={},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
         "Please infer the missing labels based on the surrounding context. "
         "Just give the CSV without any explanation.",

@@ -145,15 +189,69 @@ def get_chart2csv_spec() -> RunSpec:
     )


+@run_spec_function("crossmodal_3600")
+def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
+        args={"location": location, "language": language},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=20)
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "crossmodal_3600"
+    return RunSpec(
+        name=f"{run_spec_name}:location={location},language={language}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("flickr30k")
+def get_flickr30k_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "flickr30k"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("gqa")
+def get_gqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions="Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "gqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("hateful_memes")
 def get_hateful_memes_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
     )
-    adapter_spec
-        instructions="Answer Yes or No without an explanation.",
-        max_tokens=3,
-    )
+    adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "hateful_memes"

@@ -166,13 +264,111 @@ def get_hateful_memes_spec() -> RunSpec:
     )


+@run_spec_function("mm_safety_bench")
+def get_mm_safety_bench_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario.MMSafetyBenchScenario",
+        args={"subset": subset},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "mm_safety_bench"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_captioning")
+def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_captioning_scenario.MSCOCOCaptioningScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec
+    if long:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a long, detailed caption for the following image.",
+            max_tokens=150,
+        )
+    else:
+        adapter_spec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "mscoco_captioning"
+    if long:
+        run_spec_name += "_long"
+
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_categorization")
+def get_mscoco_categorization_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario."
+        "MSCOCOCategorizationScenario",
+        args={},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "mscoco_categorization"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("originality_vlm")
+def get_originality_vlm_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "originality_vlm"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("viz_wiz")
 def get_viz_wiz_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
     )
-    adapter_spec: AdapterSpec =
-
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        # Following https://arxiv.org/abs/2310.03744
+        instructions="When the provided information is insufficient, respond with 'Unanswerable'. "
+        "Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "viz_wiz"
     return RunSpec(

@@ -189,8 +385,12 @@ def get_vqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
     )
-
-
+    # Following https://arxiv.org/abs/2310.03744
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions='Answer the question using a single word or phrase. When the question asks "How many...", '
+        "respond with just a number (e.g., 3) and not the word corresponding to the number."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "vqa"
     return RunSpec(

@@ -208,11 +408,11 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
         include_edit_similarity=True,

@@ -241,11 +441,11 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
         include_edit_similarity=True,

@@ -268,17 +468,45 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     )


+@run_spec_function("math_vista")
+def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.math_vista_scenario.MathVistaScenario",
+        args={"grade": grade, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "free_form":
+        adapter_spec = _get_short_answer_generation_adapter_spec()
+    elif question_type == "multi_choice":
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "math_vista"
+    return RunSpec(
+        name=f"{run_spec_name}:grade={grade},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("image2musicsheet")
 def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
         args={"subset": "music", "recompile_prompt": False}, # There os only one subset for music sheets
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="lilypond",
         args=args,
         include_edit_similarity=False, # No ground truth for music sheets

@@ -292,7 +520,7 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:

     run_spec_name: str = "image2musicsheet"
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,

@@ -310,10 +538,14 @@ def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec =
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec =
-            input_noun=None,
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None,
+            output_noun="Answer",
+            max_train_instances=0,
+            # instructions="Refer to the figure(s) and answer the multiple choice question by responding with just "
+            # "the letter of the correct answer (e.g., A, B, C, D, E).",
         )
     else:
         raise ValueError(f"Invalid question type: {question_type}")

@@ -335,7 +567,7 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Only give numerical or boolean answer without an explanation."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

@@ -355,8 +587,8 @@ def get_bingo_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec =
-    metric_specs: List[MetricSpec] =
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "bingo"
     return RunSpec(

@@ -377,9 +609,9 @@ def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec =
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec =
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:

@@ -401,7 +633,7 @@ def get_pope_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

@@ -422,7 +654,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

@@ -443,7 +675,7 @@ def get_mme_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

@@ -464,7 +696,7 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
         args={"question_type": question_type},
     )
-    adapter_spec: AdapterSpec =
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,

@@ -482,14 +714,38 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
     )


+@run_spec_function("pairs")
+def get_pairs_spec(subset: str, person: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.pairs_scenario.PAIRSScenario",
+        args={"subset": subset, "person": person},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None,
+        output_noun="Answer",
+        num_outputs=1,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "pairs"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset},person={person}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("mementos")
 def get_mementos_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
-    metric_specs: List[MetricSpec] =
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mementos"
     return RunSpec(
helm/benchmark/scenarios/legalbench_scenario.py

@@ -96,8 +96,12 @@ class LegalBenchScenario(Scenario):

         # Download data from Huggingface. LegalBench provides splits for samples to
         # be used for prompt construction and for testing.
-        train_dataset = datasets.load_dataset(
-
+        train_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="train"
+        )
+        test_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="test"
+        )
         assert isinstance(train_dataset, datasets.Dataset)
         assert isinstance(test_dataset, datasets.Dataset)

helm/benchmark/scenarios/math_scenario.py

@@ -368,7 +368,7 @@ class MATHScenario(Scenario):
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         data = (
-            typing.cast(DatasetDict, load_dataset("competition_math", cache_dir=cache_dir))
+            typing.cast(DatasetDict, load_dataset("competition_math", trust_remote_code=True, cache_dir=cache_dir))
             .sort("problem")
             .shuffle(seed=42)
         )
helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py (new file)

@@ -0,0 +1,83 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    ALL_SPLITS,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class AOKVQAScenario(Scenario):
+    """
+    A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense
+    and world knowledge to answer.
+
+    @misc{schwenk2022aokvqa,
+        title={A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge},
+        author={Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi},
+        year={2022},
+        eprint={2206.01718},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2206.01718
+    Website: https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA
+    """
+
+    HF_DATASET_NAME: str = "HuggingFaceM4/A-OKVQA"
+
+    name = "a_okvqa"
+    description = (
+        "A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of "
+        "commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The examples in the test split does not have answers
+                continue
+
+            split = "validation" if helm_split == VALID_SPLIT else helm_split
+
+            for row in tqdm(load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split=split)):
+                image_filename: str = f"{row['question_id']}.jpg"
+                local_image_path: str = os.path.join(images_path, image_filename)
+                image = row["image"]
+                if not os.path.exists(local_image_path):
+                    image.save(local_image_path)
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=row["question"], content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=choice), tags=[CORRECT_TAG] if i == row["correct_choice_idx"] else [])
+                            for i, choice in enumerate(row["choices"])
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances
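
Illustrative only, not part of the released diff: a minimal Python sketch of how one of the run-spec functions registered above (get_gqa_spec in helm/benchmark/run_specs/vlm_run_specs.py) could be called directly to inspect the RunSpec it builds, assuming crfm-helm 0.5.1 is installed.

# Hypothetical usage sketch; not shipped in the wheel.
from helm.benchmark.run_specs.vlm_run_specs import get_gqa_spec

run_spec = get_gqa_spec()
print(run_spec.name)                      # "gqa"
print(run_spec.scenario_spec.class_name)  # helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario
print(run_spec.adapter_spec.max_tokens)   # 20, set via _get_short_answer_generation_adapter_spec
print(len(run_spec.metric_specs))         # exact-match metrics plus the open-ended generation metrics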