crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/vlm_run_specs.py

@@ -7,8 +7,8 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
 )
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_ALL
 from helm.benchmark.metrics.common_metric_specs import (
-    get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
     get_generative_harms_metric_specs,
     get_basic_metric_specs,
@@ -30,6 +30,7 @@ def _get_generation_adapter_spec(
     output_prefix: str = "",
     output_suffix: str = "",
     max_tokens: int = 100,
+    max_train_instances: int = 0,
     stop_sequences: Optional[List[str]] = None,
 ) -> AdapterSpec:
     return AdapterSpec(
@@ -41,8 +42,7 @@ def _get_generation_adapter_spec(
         output_prefix=output_prefix,
         output_suffix=output_suffix,
         instance_prefix="\n",
-
-        max_train_instances=0,
+        max_train_instances=max_train_instances,
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
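The two hunks above make the number of in-context training examples configurable instead of hard-coded to zero. A minimal sketch of how a caller could use the new keyword argument (illustrative only; this exact call does not appear in the package):

```python
from helm.benchmark.run_specs.vlm_run_specs import _get_generation_adapter_spec

# Illustrative only: request two in-context examples instead of the
# previously hard-coded zero-shot setting.
adapter_spec = _get_generation_adapter_spec(
    instructions="Generate a caption for the following image.",
    max_tokens=30,
    max_train_instances=2,
)
assert adapter_spec.max_train_instances == 2
```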
@@ -70,6 +70,13 @@ def _get_captioning_adapter_spec() -> AdapterSpec:
     )


+def get_open_end_answer_generation_adapter_spec():
+    return _get_generation_adapter_spec(
+        instructions="Follow the given instruction and give your complete answer.",
+        max_tokens=100,
+    )
+
+
 def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
@@ -117,9 +124,8 @@ def _get_image2structure_metric_specs(
     metric_names = [
         AnnotatedImageMetrics.PIXEL_SIMILARITY,
         AnnotatedImageMetrics.FID_SIMILARITY,
-        AnnotatedImageMetrics.
-        AnnotatedImageMetrics.
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM1,
+        AnnotatedImageMetrics.BLOCK_EMD,
+        AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
     ]
     if include_edit_similarity:
         metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
@@ -136,7 +142,42 @@ def _get_image2structure_metric_specs(
             },
         ),
     ]
-    return metric_specs +
+    return metric_specs + get_basic_metric_specs([])
+
+
+def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
+def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+            },
+        )
+    ]
+
+
+def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]


 ############################################################
@@ -190,13 +231,23 @@ def get_chart2csv_spec() -> RunSpec:


 @run_spec_function("crossmodal_3600")
-def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
+def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
         args={"location": location, "language": language},
     )
-    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
-
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete sentence in plain words",
+        max_tokens=20,
+    )
+
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "crossmodal_3600"
     return RunSpec(
@@ -209,12 +260,23 @@ def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:


 @run_spec_function("flickr30k")
-def get_flickr30k_spec() -> RunSpec:
+def get_flickr30k_spec(num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
     )
-    adapter_spec: AdapterSpec =
-
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image in plain words. The caption should "
+        "be short and needs to be a complete sentence.",
+        max_tokens=30,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "flickr30k"
     return RunSpec(
@@ -232,7 +294,7 @@ def get_gqa_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
     )
     adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
-        instructions="Answer the question using a single word
+        instructions="Answer the question using a single word."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

@@ -296,10 +358,14 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
     if long:
         adapter_spec = _get_generation_adapter_spec(
             instructions="Generate a long, detailed caption for the following image.",
-            max_tokens=
+            max_tokens=200,
         )
     else:
-        adapter_spec =
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a caption for the following image. The caption should be short and does "
+            "not need to be a complete sentence.",
+            max_tokens=20,
+        )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mscoco_captioning"
@@ -403,10 +469,12 @@ def get_vqa_spec() -> RunSpec:


 @run_spec_function("image2latex")
-def get_image2latex_spec(
+def get_image2latex_spec(
+    subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -415,7 +483,7 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
-        include_edit_similarity=
+        include_edit_similarity=(subset != "real"),
         size_handling_method="padding",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -424,22 +492,32 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         )
     ]

-    run_spec_name: str = "image2latex"
+    run_spec_name: str = f"image2latex:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2latex_real"]
+    else:
+        groups = ["image2latex", f"image2latex_{difficulty}"]
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )


 @run_spec_function("image2webpage")
-def get_image2webpage_spec(
+def get_image2webpage_spec(
+    subset: str,
+    recompile_prompt: bool = False,
+    difficulty: str = DIFFICULTY_ALL,
+    args: Optional[Dict] = None,
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -448,7 +526,7 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
-        include_edit_similarity=
+        include_edit_similarity=(subset != "real"),
         size_handling_method="none",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -457,13 +535,18 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         )
     ]

-    run_spec_name: str = "image2webpage"
+    run_spec_name: str = f"image2webpage:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2webpage_real"]
+    else:
+        groups = ["image2webpage", f"image2webpage_{difficulty}"]
     return RunSpec(
-        name=
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -477,7 +560,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "free_form":
-        adapter_spec = _get_short_answer_generation_adapter_spec(
+        adapter_spec = _get_short_answer_generation_adapter_spec(
+            instructions="Just give the numerical answer without showing the steps, the unit, or percentage symbol."
+        )
     elif question_type == "multi_choice":
         adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
@@ -497,10 +582,11 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:


 @run_spec_function("image2musicsheet")
-def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
-
+        # There os only one subset for music sheets
+        args={"subset": "music", "recompile_prompt": False, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -518,13 +604,14 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         )
     ]

-    run_spec_name: str = "image2musicsheet"
+    run_spec_name: str = f"image2musicsheet:difficulty={difficulty}"
+    groups: List[str] = ["image2musicsheet", f"image2musicsheet_{difficulty}"]
     return RunSpec(
         name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -568,13 +655,14 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         args={"subject": subject},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
-        instructions="Only give
+        instructions="Only give a yes/no or numerical answer without an explanation.",
+        max_tokens=1,  # the model may generate answer with a period
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "unicorn"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -583,16 +671,26 @@ def get_unicorn_spec(subject: str) -> RunSpec:


 @run_spec_function("bingo")
-def get_bingo_spec(subject: str) -> RunSpec:
+def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec =
-
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete and clear explanation in sentences without listing it out.",
+        max_tokens=100,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "bingo"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -661,7 +759,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:

     run_spec_name: str = "seed_bench"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -682,7 +780,7 @@ def get_mme_spec(subject: str) -> RunSpec:

     run_spec_name: str = "mme"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -739,17 +837,42 @@ def get_pairs_spec(subset: str, person: str) -> RunSpec:


 @run_spec_function("mementos")
-def get_mementos_spec(subject: str) -> RunSpec:
+def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec =
-    metric_specs: List[MetricSpec] =
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "mementos"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vibe_eval")
+def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "vibe_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
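The hunk above also registers the new vibe_eval run spec. As a rough sketch of how the factory behaves once registered (illustrative only; the subject string below is made up and not taken from the diff):

```python
from helm.benchmark.run_specs.vlm_run_specs import get_vibe_eval_spec

# Illustrative only: the subject value is hypothetical.
run_spec = get_vibe_eval_spec(subject="difficulty-normal", num_respondents=1)
print(run_spec.name)  # -> "vibe_eval:subject=difficulty-normal"
```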
helm/benchmark/scenarios/air_bench_scenario.py

@@ -0,0 +1,50 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class AIRBench2024Scenario(Scenario):
+    """AIRBench 2024
+
+    Pre-publication: References will be added post-publication.
+
+    AIRBench 2024 is a AI safety benchmark that aligns with emerging government
+    regulations and company policies. It consists of 5,619 malicious prompts
+    spanning categories of the regulation-based safety categories in the
+    AIR 2024 safety taxonomy."""
+
+    name = "air_bench_2024"
+    description = (
+        "AIRBench 2024 is a AI safety benchmark that aligns with "
+        "emerging government regulations and company policies"
+    )
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # TODO: Switch this to the production dataset when available.
+        dataset = datasets.load_dataset("stanford-crfm/air-bench-2024", split="test", cache_dir=cache_dir)
+        instances: List[Instance] = []
+        # TODO: Allow users to filter by category
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            # References are category ID, followed by level 2, 3 and 4 category names.
+            references = [
+                Reference(output=Output(text=row[column_name]), tags=[])
+                for column_name in ["cate-idx", "l2-name", "l3-name", "l4-name"]
+            ]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
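Based on the loop above, each dataset row becomes one test-split Instance whose references carry the taxonomy labels. A sketch of the resulting structure (field values are placeholders, not real data from the benchmark):

```python
from helm.benchmark.scenarios.scenario import Instance, Input, Output, Reference, TEST_SPLIT

# Placeholder values only; real prompts and category names come from the
# stanford-crfm/air-bench-2024 dataset loaded in the scenario above.
Instance(
    input=Input(text="<malicious prompt>"),
    references=[
        Reference(output=Output(text="<cate-idx>"), tags=[]),
        Reference(output=Output(text="<l2-name>"), tags=[]),
        Reference(output=Output(text="<l3-name>"), tags=[]),
        Reference(output=Output(text="<l4-name>"), tags=[]),
    ],
    split=TEST_SPLIT,
)
```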
helm/benchmark/scenarios/ci_mcqa_scenario.py

@@ -0,0 +1,80 @@
+import json
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class CIMCQAScenario(Scenario):
+    """CIMCQA is a multiple-choice question answering (MCQA) dataset designed to
+    study concept inventories in CS Education.
+
+    This is used by a pre-publication paper.
+
+    NOTE: This code is for archival purposes only. The scenario cannot be run because it requires
+    private data. Please contact the paper authors for more information."""
+
+    DATASET_DOWNLOAD_URL: str = "https://drive.google.com/uc?export=download&id=1siYjhDiasI5FIiS0ckLbo40UnOj8EU2h"
+
+    name = "ci_mcqa"
+    description = (
+        "CIMCQA is a multiple-choice question answering (MCQA) dataset designed to"
+        "study concept inventories in CS Education."
+    )
+    tags = ["question_answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join("restricted", "bdsi_multiple_answers_removed.json")
+        assert os.path.exists(data_path)
+
+        with open(data_path, "r", encoding="utf8") as f:
+            data = json.load(f)
+
+        # Data is a list of dictionaries now, each one a question and its associated answers and metadata.
+        instances: List[Instance] = list()
+
+        # UNCOMMENT BELOW FOR FEW-SHOT RUN
+        training_data_path: str = os.path.join("restricted", "mock_bdsi_multiple_answers_removed.json")
+        assert os.path.exists(training_data_path)
+
+        with open(training_data_path, "r", encoding="utf8") as f:
+            training_data = json.load(f)
+        for question in training_data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TRAIN_SPLIT,
+            )
+            instances.append(instance)
+
+        for question in data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TEST_SPLIT,  # Just doing zero shot to start
+            )
+            instances.append(instance)
+        return instances
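This scenario reads a restricted JSON file that is not shipped with the package. From the field accesses above, each entry appears to have at least the following shape (an inferred, illustrative guess; the actual files may carry additional fields):

```python
# Inferred from the keys used in get_instances; illustrative only.
example_question = {
    "question": "Which data structure gives O(1) average-case lookup by key?",
    "options": ["Linked list", "Hash table", "Binary search tree", "Queue"],
    "correct_option": 2,  # 1-indexed, per the "offset by 1" comment in the code
}
```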
helm/benchmark/scenarios/entity_data_imputation_scenario.py

@@ -41,8 +41,14 @@ class EntityDataImputationScenario(Scenario):
     def __init__(self, dataset: str, seed: int = 1234):
         super().__init__()
         self.datasets_paths = {
-            "Buy":
-
+            "Buy": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/Abt-Buy.zip"
+            ),
+            "Restaurant": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/restaurant.tar.gz"
+            ),
         }
         # Columns to impute
         self.datasets_impute_col = {