crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/vlm_run_specs.py

@@ -10,7 +10,8 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
-    get_open_ended_generation_metric_specs,
+    get_generative_harms_metric_specs,
+    get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -22,7 +23,7 @@ from helm.benchmark.annotation.annotator import AnnotatorSpec
 # Prototypical adapter specs for VLM evaluation


-def get_generation_adapter_spec(
+def _get_generation_adapter_spec(
     instructions: str = "",
     input_prefix: str = "",
     input_suffix: str = "",
@@ -45,18 +46,31 @@ def get_generation_adapter_spec(
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
+        temperature=0.0,
         random=None,
     )


-def get_short_answer_generation_adapter_spec():
-    return get_generation_adapter_spec(
-        instructions="Just give a short answer without answering in a complete sentence.",
+def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions=(
+            "Just give a short answer without answering in a complete sentence."
+            if instructions is None
+            else instructions
+        ),
         max_tokens=20,
     )


-def get_multiple_choice_joint_adapter_spec(
+def _get_captioning_adapter_spec() -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image. The caption should be short and does "
+        "not need to be a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 0,
@@ -84,7 +98,13 @@ def get_multiple_choice_joint_adapter_spec(
 # VHELM metric specs


-def get_image2structure_metric_specs(
+def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(
+        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+    )
+
+
+def _get_image2structure_metric_specs(
     generation_type: str,
     metric_names: Optional[List[str]] = None,
     args: Optional[Dict] = None,
@@ -97,7 +117,9 @@ def get_image2structure_metric_specs(
         metric_names = [
             AnnotatedImageMetrics.PIXEL_SIMILARITY,
             AnnotatedImageMetrics.FID_SIMILARITY,
-            AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY,
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM2,
+            AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM1,
         ]
     if include_edit_similarity:
         metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
@@ -121,13 +143,35 @@ def get_image2structure_metric_specs(
 # VHELM run specs


+@run_spec_function("a_okvqa")
+def get_a_okvqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.a_okvqa_scenario.AOKVQAScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "a_okvqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("chart2csv")
 def get_chart2csv_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
         args={},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
         "Please infer the missing labels based on the surrounding context. "
         "Just give the CSV without any explanation.",
@@ -145,15 +189,69 @@ def get_chart2csv_spec() -> RunSpec:
     )


+@run_spec_function("crossmodal_3600")
+def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
+        args={"location": location, "language": language},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=20)
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "crossmodal_3600"
+    return RunSpec(
+        name=f"{run_spec_name}:location={location},language={language}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("flickr30k")
+def get_flickr30k_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "flickr30k"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("gqa")
+def get_gqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions="Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "gqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("hateful_memes")
 def get_hateful_memes_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
-        instructions="Answer Yes or No without an explanation.",
-        max_tokens=3,
-    )
+    adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "hateful_memes"
@@ -166,13 +264,111 @@ def get_hateful_memes_spec() -> RunSpec:
     )


+@run_spec_function("mm_safety_bench")
+def get_mm_safety_bench_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario.MMSafetyBenchScenario",
+        args={"subset": subset},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "mm_safety_bench"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_captioning")
+def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_captioning_scenario.MSCOCOCaptioningScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec
+    if long:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a long, detailed caption for the following image.",
+            max_tokens=150,
+        )
+    else:
+        adapter_spec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "mscoco_captioning"
+    if long:
+        run_spec_name += "_long"
+
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_categorization")
+def get_mscoco_categorization_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario."
+        "MSCOCOCategorizationScenario",
+        args={},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "mscoco_categorization"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("originality_vlm")
+def get_originality_vlm_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "originality_vlm"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("viz_wiz")
 def get_viz_wiz_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        # Following https://arxiv.org/abs/2310.03744
+        instructions="When the provided information is insufficient, respond with 'Unanswerable'. "
+        "Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "viz_wiz"
     return RunSpec(
@@ -189,8 +385,12 @@ def get_vqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    # Following https://arxiv.org/abs/2310.03744
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions='Answer the question using a single word or phrase. When the question asks "How many...", '
+        "respond with just a number (e.g., 3) and not the word corresponding to the number."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "vqa"
     return RunSpec(
@@ -208,11 +408,11 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
         include_edit_similarity=True,
@@ -241,11 +441,11 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
         include_edit_similarity=True,
@@ -268,17 +468,45 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     )


+@run_spec_function("math_vista")
+def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.math_vista_scenario.MathVistaScenario",
+        args={"grade": grade, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "free_form":
+        adapter_spec = _get_short_answer_generation_adapter_spec()
+    elif question_type == "multi_choice":
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "math_vista"
+    return RunSpec(
+        name=f"{run_spec_name}:grade={grade},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("image2musicsheet")
 def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
         args={"subset": "music", "recompile_prompt": False},  # There os only one subset for music sheets
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="lilypond",
         args=args,
         include_edit_similarity=False,  # No ground truth for music sheets
@@ -292,7 +520,7 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:

     run_spec_name: str = "image2musicsheet"
     return RunSpec(
-        name=f"{run_spec_name}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -310,10 +538,14 @@ def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
-            input_noun=None, output_noun="Answer", max_train_instances=0
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None,
+            output_noun="Answer",
+            max_train_instances=0,
+            # instructions="Refer to the figure(s) and answer the multiple choice question by responding with just "
+            # "the letter of the correct answer (e.g., A, B, C, D, E).",
         )
     else:
         raise ValueError(f"Invalid question type: {question_type}")
@@ -335,7 +567,7 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Only give numerical or boolean answer without an explanation."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -355,8 +587,8 @@ def get_bingo_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "bingo"
     return RunSpec(
@@ -377,9 +609,9 @@ def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:
@@ -401,7 +633,7 @@ def get_pope_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -422,7 +654,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -443,7 +675,7 @@ def get_mme_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -464,7 +696,7 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
         args={"question_type": question_type},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,
@@ -482,14 +714,38 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
     )


+@run_spec_function("pairs")
+def get_pairs_spec(subset: str, person: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.pairs_scenario.PAIRSScenario",
+        args={"subset": subset, "person": person},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None,
+        output_noun="Answer",
+        num_outputs=1,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "pairs"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset},person={person}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("mementos")
 def get_mementos_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mementos"
     return RunSpec(
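
The run spec functions above are registered by name through the @run_spec_function decorator, so each one can also be constructed directly by calling the function. A minimal sketch of this, not part of the diff; the printed values are inferred from the helpers shown above and the attribute names assume HELM's RunSpec/AdapterSpec dataclasses:

# Illustrative sketch: build one of the new 0.5.1 VLM run specs directly.
# Module path follows the file shown above; treat the exact values as assumptions.
from helm.benchmark.run_specs.vlm_run_specs import get_gqa_spec

run_spec = get_gqa_spec()
print(run_spec.name)                     # "gqa"
print(run_spec.groups)                   # ["gqa"]
print(run_spec.adapter_spec.max_tokens)  # 20, from the short-answer generation adapter
print(len(run_spec.metric_specs))        # exact-match plus open-ended generation metrics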
helm/benchmark/scenarios/legalbench_scenario.py

@@ -96,8 +96,12 @@ class LegalBenchScenario(Scenario):

         # Download data from Huggingface. LegalBench provides splits for samples to
         # be used for prompt construction and for testing.
-        train_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="train")
-        test_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="test")
+        train_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="train"
+        )
+        test_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="test"
+        )
         assert isinstance(train_dataset, datasets.Dataset)
         assert isinstance(test_dataset, datasets.Dataset)

helm/benchmark/scenarios/math_scenario.py

@@ -368,7 +368,7 @@ class MATHScenario(Scenario):
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         data = (
-            typing.cast(DatasetDict, load_dataset("competition_math", cache_dir=cache_dir))
+            typing.cast(DatasetDict, load_dataset("competition_math", trust_remote_code=True, cache_dir=cache_dir))
             .sort("problem")
             .shuffle(seed=42)
         )
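
Both scenario loaders above now pass trust_remote_code=True, which recent versions of the datasets library expect before running a dataset's custom loading script. A minimal sketch of the call pattern, mirroring the hunks above; the "abercrombie" subset name is used here purely as an illustration:

# Sketch only: mirrors the load_dataset calls changed in the two hunks above.
import datasets

train_dataset = datasets.load_dataset(
    "nguha/legalbench", "abercrombie", trust_remote_code=True, split="train"
)
print(train_dataset.num_rows)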
helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py (new file)

@@ -0,0 +1,83 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    ALL_SPLITS,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class AOKVQAScenario(Scenario):
+    """
+    A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense
+    and world knowledge to answer.
+
+    @misc{schwenk2022aokvqa,
+        title={A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge},
+        author={Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi},
+        year={2022},
+        eprint={2206.01718},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2206.01718
+    Website: https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA
+    """
+
+    HF_DATASET_NAME: str = "HuggingFaceM4/A-OKVQA"
+
+    name = "a_okvqa"
+    description = (
+        "A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of "
+        "commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The examples in the test split does not have answers
+                continue
+
+            split = "validation" if helm_split == VALID_SPLIT else helm_split
+
+            for row in tqdm(load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split=split)):
+                image_filename: str = f"{row['question_id']}.jpg"
+                local_image_path: str = os.path.join(images_path, image_filename)
+                image = row["image"]
+                if not os.path.exists(local_image_path):
+                    image.save(local_image_path)
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=row["question"], content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=choice), tags=[CORRECT_TAG] if i == row["correct_choice_idx"] else [])
+                            for i, choice in enumerate(row["choices"])
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances
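
For reference, a hand-written sketch of the kind of Instance that get_instances builds above. The image path, question, and choices are fabricated; the classes are exactly the ones imported in the new file:

# Illustrative only, not part of the diff: the multimodal instance shape
# produced by AOKVQAScenario.get_instances, with made-up example values.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, VALID_SPLIT, Instance, Input, Output, Reference
from helm.common.media_object import MediaObject, MultimediaObject

example_instance = Instance(
    Input(
        multimedia_content=MultimediaObject(
            [
                MediaObject(location="images/12345.jpg", content_type="image/jpeg"),
                MediaObject(text="What season does this photo show?", content_type="text/plain"),
            ]
        )
    ),
    references=[
        Reference(Output(text="winter"), tags=[CORRECT_TAG]),  # index matching correct_choice_idx
        Reference(Output(text="summer"), tags=[]),
        Reference(Output(text="spring"), tags=[]),
        Reference(Output(text="autumn"), tags=[]),
    ],
    split=VALID_SPLIT,
)
print(example_instance.references[0].output.text)  # "winter"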