crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/vlm_run_specs.py

@@ -7,10 +7,11 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
 )
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_ALL
 from helm.benchmark.metrics.common_metric_specs import (
-    get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
-    get_open_ended_generation_metric_specs,
+    get_generative_harms_metric_specs,
+    get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -22,13 +23,14 @@ from helm.benchmark.annotation.annotator import AnnotatorSpec
 # Prototypical adapter specs for VLM evaluation


-def get_generation_adapter_spec(
+def _get_generation_adapter_spec(
     instructions: str = "",
     input_prefix: str = "",
     input_suffix: str = "",
     output_prefix: str = "",
     output_suffix: str = "",
     max_tokens: int = 100,
+    max_train_instances: int = 0,
     stop_sequences: Optional[List[str]] = None,
 ) -> AdapterSpec:
     return AdapterSpec(
@@ -40,23 +42,42 @@ def get_generation_adapter_spec(
         output_prefix=output_prefix,
         output_suffix=output_suffix,
         instance_prefix="\n",
-        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
-        max_train_instances=0,
+        max_train_instances=max_train_instances,
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
+        temperature=0.0,
         random=None,
     )


-def get_short_answer_generation_adapter_spec():
-    return get_generation_adapter_spec(
-        instructions="Just give a short answer without answering in a complete sentence.",
+def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions=(
+            "Just give a short answer without answering in a complete sentence."
+            if instructions is None
+            else instructions
+        ),
         max_tokens=20,
     )


-def get_multiple_choice_joint_adapter_spec(
+def _get_captioning_adapter_spec() -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image. The caption should be short and does "
+        "not need to be a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def get_open_end_answer_generation_adapter_spec():
+    return _get_generation_adapter_spec(
+        instructions="Follow the given instruction and give your complete answer.",
+        max_tokens=100,
+    )
+
+
+def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 0,
@@ -84,7 +105,13 @@ def get_multiple_choice_joint_adapter_spec(
 # VHELM metric specs


-def get_image2structure_metric_specs(
+def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(
+        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+    )
+
+
+def _get_image2structure_metric_specs(
     generation_type: str,
     metric_names: Optional[List[str]] = None,
     args: Optional[Dict] = None,
@@ -97,6 +124,7 @@ def get_image2structure_metric_specs(
         metric_names = [
             AnnotatedImageMetrics.PIXEL_SIMILARITY,
             AnnotatedImageMetrics.FID_SIMILARITY,
+            AnnotatedImageMetrics.BLOCK_EMD,
             AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
         ]
         if include_edit_similarity:
@@ -114,20 +142,77 @@ def get_image2structure_metric_specs(
             },
         ),
     ]
-    return metric_specs + get_basic_reference_metric_specs()
+    return metric_specs + get_basic_metric_specs([])
+
+
+def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
+def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+            },
+        )
+    ]
+
+
+def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]


 ############################################################
 # VHELM run specs


+@run_spec_function("a_okvqa")
+def get_a_okvqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.a_okvqa_scenario.AOKVQAScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "a_okvqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("chart2csv")
 def get_chart2csv_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
         args={},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
         "Please infer the missing labels based on the surrounding context. "
         "Just give the CSV without any explanation.",
@@ -145,15 +230,90 @@ def get_chart2csv_spec() -> RunSpec:
     )


+@run_spec_function("crossmodal_3600")
+def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
+        args={"location": location, "language": language},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete sentence in plain words",
+        max_tokens=20,
+    )
+
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "crossmodal_3600"
+    return RunSpec(
+        name=f"{run_spec_name}:location={location},language={language}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("flickr30k")
+def get_flickr30k_spec(num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image in plain words. The caption should "
+        "be short and needs to be a complete sentence.",
+        max_tokens=30,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "flickr30k"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("gqa")
+def get_gqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions="Answer the question using a single word."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "gqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("hateful_memes")
 def get_hateful_memes_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
-        instructions="Answer Yes or No without an explanation.",
-        max_tokens=3,
-    )
+    adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "hateful_memes"
@@ -166,13 +326,115 @@ def get_hateful_memes_spec() -> RunSpec:
     )


+@run_spec_function("mm_safety_bench")
+def get_mm_safety_bench_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario.MMSafetyBenchScenario",
+        args={"subset": subset},
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "mm_safety_bench"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_captioning")
+def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_captioning_scenario.MSCOCOCaptioningScenario",
+        args={},
+    )
+
+    adapter_spec: AdapterSpec
+    if long:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a long, detailed caption for the following image.",
+            max_tokens=200,
+        )
+    else:
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a caption for the following image. The caption should be short and does "
+            "not need to be a complete sentence.",
+            max_tokens=20,
+        )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "mscoco_captioning"
+    if long:
+        run_spec_name += "_long"
+
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mscoco_categorization")
+def get_mscoco_categorization_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario."
+        "MSCOCOCategorizationScenario",
+        args={},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "mscoco_categorization"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("originality_vlm")
+def get_originality_vlm_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={}
+    )
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "originality_vlm"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("viz_wiz")
 def get_viz_wiz_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        # Following https://arxiv.org/abs/2310.03744
+        instructions="When the provided information is insufficient, respond with 'Unanswerable'. "
+        "Answer the question using a single word or phrase."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "viz_wiz"
     return RunSpec(
@@ -189,8 +451,12 @@ def get_vqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    # Following https://arxiv.org/abs/2310.03744
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
+        instructions='Answer the question using a single word or phrase. When the question asks "How many...", '
+        "respond with just a number (e.g., 3) and not the word corresponding to the number."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "vqa"
     return RunSpec(
@@ -203,19 +469,21 @@ def get_vqa_spec() -> RunSpec:


 @run_spec_function("image2latex")
-def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2latex_spec(
+    subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="padding",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -224,31 +492,41 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         )
     ]

-    run_spec_name: str = "image2latex"
+    run_spec_name: str = f"image2latex:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2latex_real"]
+    else:
+        groups = ["image2latex", f"image2latex_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )


 @run_spec_function("image2webpage")
-def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2webpage_spec(
+    subset: str,
+    recompile_prompt: bool = False,
+    difficulty: str = DIFFICULTY_ALL,
+    args: Optional[Dict] = None,
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="none",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -257,28 +535,64 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         )
     ]

-    run_spec_name: str = "image2webpage"
+    run_spec_name: str = f"image2webpage:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2webpage_real"]
+    else:
+        groups = ["image2webpage", f"image2webpage_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )


+@run_spec_function("math_vista")
+def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.math_vista_scenario.MathVistaScenario",
+        args={"grade": grade, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "free_form":
+        adapter_spec = _get_short_answer_generation_adapter_spec(
+            instructions="Just give the numerical answer without showing the steps, the unit, or percentage symbol."
+        )
+    elif question_type == "multi_choice":
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "math_vista"
+    return RunSpec(
+        name=f"{run_spec_name}:grade={grade},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("image2musicsheet")
-def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
-        args={"subset": "music", "recompile_prompt": False},  # There os only one subset for music sheets
+        # There os only one subset for music sheets
+        args={"subset": "music", "recompile_prompt": False, "difficulty": difficulty},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="lilypond",
         args=args,
         include_edit_similarity=False,  # No ground truth for music sheets
@@ -290,13 +604,14 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         )
     ]

-    run_spec_name: str = "image2musicsheet"
+    run_spec_name: str = f"image2musicsheet:difficulty={difficulty}"
+    groups: List[str] = ["image2musicsheet", f"image2musicsheet_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -310,10 +625,14 @@ def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
-            input_noun=None, output_noun="Answer", max_train_instances=0
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
+            input_noun=None,
+            output_noun="Answer",
+            max_train_instances=0,
+            # instructions="Refer to the figure(s) and answer the multiple choice question by responding with just "
+            # "the letter of the correct answer (e.g., A, B, C, D, E).",
         )
     else:
         raise ValueError(f"Invalid question type: {question_type}")
@@ -335,14 +654,15 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
-        instructions="Only give numerical or boolean answer without an explanation."
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Only give a yes/no or numerical answer without an explanation.",
+        max_tokens=1,  # the model may generate answer with a period
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "unicorn"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -351,16 +671,26 @@ def get_unicorn_spec(subject: str) -> RunSpec:


 @run_spec_function("bingo")
-def get_bingo_spec(subject: str) -> RunSpec:
+def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete and clear explanation in sentences without listing it out.",
+        max_tokens=100,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "bingo"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -377,9 +707,9 @@ def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:
@@ -401,7 +731,7 @@ def get_pope_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -422,14 +752,14 @@ def get_seed_bench_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "seed_bench"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -443,14 +773,14 @@ def get_mme_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "mme"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -464,7 +794,7 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
         args={"question_type": question_type},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,
@@ -482,18 +812,67 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
     )


+@run_spec_function("pairs")
+def get_pairs_spec(subset: str, person: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.pairs_scenario.PAIRSScenario",
+        args={"subset": subset, "person": person},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None,
+        output_noun="Answer",
+        num_outputs=1,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "pairs"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset},person={person}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("mementos")
-def get_mementos_spec(subject: str) -> RunSpec:
+def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "mementos"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vibe_eval")
+def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "vibe_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
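
The helpers above were made module-private in 0.5.2, but the decorated run spec functions remain importable. The sketch below (not part of the diff) illustrates how the updated specs compose: it calls two of the registered functions directly and inspects the fields that changed in this release. It assumes crfm-helm 0.5.2 is installed; the "equation" subset and "hard" difficulty values are hypothetical arguments chosen for illustration, and the commented values are what the code above implies rather than verified output.

# Illustrative sketch only: assumes crfm-helm 0.5.2 is installed and that the
# @run_spec_function-decorated functions in vlm_run_specs.py can be called directly.
from helm.benchmark.run_specs.vlm_run_specs import get_vqa_spec, get_image2latex_spec

# VQA now uses the renamed short-answer adapter helper with an explicit instruction.
vqa = get_vqa_spec()
print(vqa.name)                      # "vqa"
print(vqa.adapter_spec.max_tokens)   # 20, set by _get_short_answer_generation_adapter_spec
print(vqa.adapter_spec.temperature)  # 0.0, the new default in _get_generation_adapter_spec

# The image2structure specs gained a "difficulty" argument (default DIFFICULTY_ALL)
# that is folded into both the run name and the groups.
latex = get_image2latex_spec(subset="equation", difficulty="hard")  # hypothetical values
print(latex.name)    # "image2latex:subset=equation:difficulty=hard"
print(latex.groups)  # ["image2latex", "image2latex_hard"]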