crfm-helm 0.5.1 → 0.5.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (98)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/run_specs/vlm_run_specs.py

@@ -7,8 +7,8 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
 )
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_ALL
 from helm.benchmark.metrics.common_metric_specs import (
-    get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
     get_generative_harms_metric_specs,
     get_basic_metric_specs,
@@ -30,6 +30,7 @@ def _get_generation_adapter_spec(
     output_prefix: str = "",
     output_suffix: str = "",
     max_tokens: int = 100,
+    max_train_instances: int = 0,
     stop_sequences: Optional[List[str]] = None,
 ) -> AdapterSpec:
     return AdapterSpec(
@@ -41,8 +42,7 @@ def _get_generation_adapter_spec(
         output_prefix=output_prefix,
         output_suffix=output_suffix,
         instance_prefix="\n",
-        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
-        max_train_instances=0,
+        max_train_instances=max_train_instances,
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
@@ -70,6 +70,13 @@ def _get_captioning_adapter_spec() -> AdapterSpec:
     )


+def get_open_end_answer_generation_adapter_spec():
+    return _get_generation_adapter_spec(
+        instructions="Follow the given instruction and give your complete answer.",
+        max_tokens=100,
+    )
+
+
 def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
@@ -117,9 +124,8 @@ def _get_image2structure_metric_specs(
     metric_names = [
         AnnotatedImageMetrics.PIXEL_SIMILARITY,
         AnnotatedImageMetrics.FID_SIMILARITY,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM2,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM1,
+        AnnotatedImageMetrics.BLOCK_EMD,
+        AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
     ]
     if include_edit_similarity:
         metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
@@ -136,7 +142,42 @@ def _get_image2structure_metric_specs(
             },
         ),
     ]
-    return metric_specs + get_basic_reference_metric_specs()
+    return metric_specs + get_basic_metric_specs([])
+
+
+def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
+def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+            },
+        )
+    ]
+
+
+def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]


 ############################################################
@@ -190,13 +231,23 @@ def get_chart2csv_spec() -> RunSpec:


 @run_spec_function("crossmodal_3600")
-def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
+def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
         args={"location": location, "language": language},
     )
-    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=20)
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete sentence in plain words",
+        max_tokens=20,
+    )
+
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "crossmodal_3600"
     return RunSpec(
@@ -209,12 +260,23 @@ def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:


 @run_spec_function("flickr30k")
-def get_flickr30k_spec() -> RunSpec:
+def get_flickr30k_spec(num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
     )
-    adapter_spec: AdapterSpec = _get_captioning_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image in plain words. The caption should "
+        "be short and needs to be a complete sentence.",
+        max_tokens=30,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "flickr30k"
     return RunSpec(
@@ -232,7 +294,7 @@ def get_gqa_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
     )
     adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
-        instructions="Answer the question using a single word or phrase."
+        instructions="Answer the question using a single word."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

@@ -296,10 +358,14 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
     if long:
         adapter_spec = _get_generation_adapter_spec(
             instructions="Generate a long, detailed caption for the following image.",
-            max_tokens=150,
+            max_tokens=200,
         )
     else:
-        adapter_spec = _get_captioning_adapter_spec()
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a caption for the following image. The caption should be short and does "
+            "not need to be a complete sentence.",
+            max_tokens=20,
+        )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mscoco_captioning"
@@ -403,10 +469,12 @@ def get_vqa_spec() -> RunSpec:


 @run_spec_function("image2latex")
-def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2latex_spec(
+    subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -415,7 +483,7 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="padding",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -424,22 +492,32 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         )
     ]

-    run_spec_name: str = "image2latex"
+    run_spec_name: str = f"image2latex:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2latex_real"]
+    else:
+        groups = ["image2latex", f"image2latex_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )


 @run_spec_function("image2webpage")
-def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2webpage_spec(
+    subset: str,
+    recompile_prompt: bool = False,
+    difficulty: str = DIFFICULTY_ALL,
+    args: Optional[Dict] = None,
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -448,7 +526,7 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="none",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -457,13 +535,18 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         )
     ]

-    run_spec_name: str = "image2webpage"
+    run_spec_name: str = f"image2webpage:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2webpage_real"]
+    else:
+        groups = ["image2webpage", f"image2webpage_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -477,7 +560,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "free_form":
-        adapter_spec = _get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec(
+            instructions="Just give the numerical answer without showing the steps, the unit, or percentage symbol."
+        )
     elif question_type == "multi_choice":
         adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
@@ -497,10 +582,11 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:


 @run_spec_function("image2musicsheet")
-def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
-        args={"subset": "music", "recompile_prompt": False},  # There os only one subset for music sheets
+        # There os only one subset for music sheets
+        args={"subset": "music", "recompile_prompt": False, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -518,13 +604,14 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         )
     ]

-    run_spec_name: str = "image2musicsheet"
+    run_spec_name: str = f"image2musicsheet:difficulty={difficulty}"
+    groups: List[str] = ["image2musicsheet", f"image2musicsheet_{difficulty}"]
     return RunSpec(
         name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -568,13 +655,14 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         args={"subject": subject},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
-        instructions="Only give numerical or boolean answer without an explanation."
+        instructions="Only give a yes/no or numerical answer without an explanation.",
+        max_tokens=1,  # the model may generate answer with a period
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "unicorn"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -583,16 +671,26 @@ def get_unicorn_spec(subject: str) -> RunSpec:


 @run_spec_function("bingo")
-def get_bingo_spec(subject: str) -> RunSpec:
+def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete and clear explanation in sentences without listing it out.",
+        max_tokens=100,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "bingo"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -661,7 +759,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:

     run_spec_name: str = "seed_bench"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -682,7 +780,7 @@ def get_mme_spec(subject: str) -> RunSpec:

     run_spec_name: str = "mme"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -739,17 +837,42 @@ def get_pairs_spec(subset: str, person: str) -> RunSpec:


 @run_spec_function("mementos")
-def get_mementos_spec(subject: str) -> RunSpec:
+def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "mementos"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vibe_eval")
+def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "vibe_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
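
Several run-spec functions above (crossmodal_3600, flickr30k, bingo, mementos, vibe_eval) now take a num_respondents argument that is threaded into the Prometheus-Vision critique metrics. A minimal sketch of how the new vibe_eval entry resolves, assuming a placeholder subject value (the valid subjects are defined in vibe_eval_scenario.py, which this diff does not show):

# Illustrative only: direct call to the run-spec function added above.
# "difficulty-normal" is a placeholder subject, not a value taken from this diff.
run_spec = get_vibe_eval_spec(subject="difficulty-normal", num_respondents=1)
print(run_spec.name)  # "vibe_eval:subject=difficulty-normal"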

helm/benchmark/scenarios/air_bench_scenario.py (new file)

@@ -0,0 +1,50 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class AIRBench2024Scenario(Scenario):
+    """AIRBench 2024
+
+    Pre-publication: References will be added post-publication.
+
+    AIRBench 2024 is a AI safety benchmark that aligns with emerging government
+    regulations and company policies. It consists of 5,619 malicious prompts
+    spanning categories of the regulation-based safety categories in the
+    AIR 2024 safety taxonomy."""
+
+    name = "air_bench_2024"
+    description = (
+        "AIRBench 2024 is a AI safety benchmark that aligns with "
+        "emerging government regulations and company policies"
+    )
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # TODO: Switch this to the production dataset when available.
+        dataset = datasets.load_dataset("stanford-crfm/air-bench-2024", split="test", cache_dir=cache_dir)
+        instances: List[Instance] = []
+        # TODO: Allow users to filter by category
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            # References are category ID, followed by level 2, 3 and 4 category names.
+            references = [
+                Reference(output=Output(text=row[column_name]), tags=[])
+                for column_name in ["cate-idx", "l2-name", "l3-name", "l4-name"]
+            ]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
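
The run-spec and metric wiring for this scenario lives in helm/benchmark/run_specs/air_bench_run_specs.py and helm/benchmark/metrics/air_bench_metrics.py (listed in the files above but not shown in this diff). As a rough sketch of how a run spec would reference the new scenario class, with everything beyond the ScenarioSpec omitted because it is not visible here:

# Illustrative sketch only; the real wiring is in air_bench_run_specs.py, which this diff does not show.
from helm.benchmark.scenarios.scenario import ScenarioSpec  # import path assumed

scenario_spec = ScenarioSpec(
    class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario",
    args={},
)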

helm/benchmark/scenarios/ci_mcqa_scenario.py (new file)

@@ -0,0 +1,80 @@
+import json
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class CIMCQAScenario(Scenario):
+    """CIMCQA is a multiple-choice question answering (MCQA) dataset designed to
+    study concept inventories in CS Education.
+
+    This is used by a pre-publication paper.
+
+    NOTE: This code is for archival purposes only. The scenario cannot be run because it requires
+    private data. Please contact the paper authors for more information."""
+
+    DATASET_DOWNLOAD_URL: str = "https://drive.google.com/uc?export=download&id=1siYjhDiasI5FIiS0ckLbo40UnOj8EU2h"
+
+    name = "ci_mcqa"
+    description = (
+        "CIMCQA is a multiple-choice question answering (MCQA) dataset designed to"
+        "study concept inventories in CS Education."
+    )
+    tags = ["question_answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join("restricted", "bdsi_multiple_answers_removed.json")
+        assert os.path.exists(data_path)
+
+        with open(data_path, "r", encoding="utf8") as f:
+            data = json.load(f)
+
+        # Data is a list of dictionaries now, each one a question and its associated answers and metadata.
+        instances: List[Instance] = list()
+
+        # UNCOMMENT BELOW FOR FEW-SHOT RUN
+        training_data_path: str = os.path.join("restricted", "mock_bdsi_multiple_answers_removed.json")
+        assert os.path.exists(training_data_path)
+
+        with open(training_data_path, "r", encoding="utf8") as f:
+            training_data = json.load(f)
+        for question in training_data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TRAIN_SPLIT,
+            )
+            instances.append(instance)
+
+        for question in data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TEST_SPLIT,  # Just doing zero shot to start
+            )
+            instances.append(instance)
+        return instances
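
The scenario above reads question records from the restricted JSON files using three fields: question, options, and a 1-indexed correct_option. A hypothetical record illustrating that shape (the field names come from the code above; the values are invented):

# Hypothetical example of one record in restricted/bdsi_multiple_answers_removed.json.
# Field names match the code above; the values are invented for illustration.
example_question = {
    "question": "Which data structure gives O(1) average-case lookup by key?",
    "options": ["Linked list", "Hash table", "Binary search tree", "Stack"],
    "correct_option": 2,  # 1-indexed, so this marks "Hash table" as correct
}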

helm/benchmark/scenarios/entity_data_imputation_scenario.py

@@ -41,8 +41,14 @@ class EntityDataImputationScenario(Scenario):
     def __init__(self, dataset: str, seed: int = 1234):
         super().__init__()
         self.datasets_paths = {
-            "Buy": "https://dbs.uni-leipzig.de/file/Abt-Buy.zip",
-            "Restaurant": "https://www.cs.utexas.edu/users/ml/riddle/data/restaurant.tar.gz",
+            "Buy": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/Abt-Buy.zip"
+            ),
+            "Restaurant": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/restaurant.tar.gz"
+            ),
         }
         # Columns to impute
         self.datasets_impute_col = {