crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/presentation/schema.py
CHANGED

@@ -1,6 +1,9 @@
+import ast
+import dataclasses
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict
 import dacite
+from inspect import cleandoc
 import mako.template
 import yaml
 import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
 SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"


+_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
 @dataclass(frozen=True)
 class Field:
     """
@@ -198,9 +206,6 @@ class RunGroup(Field):
 class Schema:
     """Specifies information about what to display on the frontend."""

-    # Adapter fields (e.g., temperature)
-    adapter: List[Field]
-
     # Information about each field
     metrics: List[Field]

@@ -213,6 +218,11 @@ class Schema:
     # Group the scenarios
     run_groups: List[RunGroup]

+    # Adapter fields (e.g., temperature)
+    # Automatically populated from the docstrings in the AdapterSpec class definition.
+    # Should not be specified in the user's YAML file.
+    adapter: Optional[List[Field]] = None
+
     def __post_init__(self):
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}


+def get_adapter_fields() -> List[Field]:
+    """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+    # Unfortunately there is no standard library support for getting docstrings of class fields,
+    # so we have to do the parsing outselves. Fortunately, the parsing is quite straightforward.
+    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+    with open(adapter_spec_path, "r") as f:
+        contents = f.read()
+    module_node = ast.parse(contents)
+    adapter_spec_node = [
+        node
+        for node in ast.iter_child_nodes(module_node)
+        if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+    ][0]
+    metadata_fields: List[Field] = []
+    field_name: str = ""
+    for node in ast.iter_child_nodes(adapter_spec_node):
+        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+            # This node is a field definition.
+            # Save the name of the field for later.
+            field_name = node.target.id
+        else:
+            # If this is a docstring that immediately follows a field definition,
+            # output an adapter field with the name set to the field definition and
+            # the description set to the docstring.
+            if (
+                field_name
+                and isinstance(node, ast.Expr)
+                and isinstance(node.value, ast.Constant)
+                and isinstance(node.value.value, str)
+            ):
+                description = cleandoc(node.value.value).replace("\n", " ")
+                metadata_fields.append(Field(name=field_name, description=description))
+                field_name = ""
+
+    return metadata_fields
+
+
 def get_default_schema_path() -> str:
     return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)

@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
     hlog(f"Reading schema file {schema_path}...")
     with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
-    return dacite.from_dict(Schema, raw)
+    schema = dacite.from_dict(Schema, raw)
+    if schema.adapter:
+        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+    return dataclasses.replace(schema, adapter=get_adapter_fields())
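For orientation, a minimal usage sketch (not part of the diff, and only lightly checked): read_schema now ignores any adapter section in the YAML and repopulates Schema.adapter from the AdapterSpec docstrings via the new get_adapter_fields helper; the default schema path is used purely for illustration.

    from helm.benchmark.presentation.schema import get_adapter_fields, get_default_schema_path, read_schema

    # Schema.adapter is now derived from AdapterSpec docstrings rather than from the YAML file.
    schema = read_schema(get_default_schema_path())
    for adapter_field in (schema.adapter or [])[:3]:
        print(adapter_field.name, "-", adapter_field.description)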
helm/benchmark/presentation/test_schema.py
ADDED

@@ -0,0 +1,11 @@
+from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+def test_get_adapter_fields() -> None:
+    adapter_fields = get_adapter_fields()
+    assert adapter_fields
+    assert adapter_fields[0].name == "method"
+    assert (
+        adapter_fields[0].description
+        == "The high-level strategy for converting instances into a prompt for the language model."
+    )
helm/benchmark/run.py
CHANGED
@@ -264,6 +264,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +282,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

         for huggingface_model_name in args.enable_huggingface_models:
-            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

         for huggingface_model_path in args.enable_local_huggingface_models:
-            register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)

     run_entries: List[RunEntry] = []
     if args.conf_paths:
helm/benchmark/run_expander.py
CHANGED
@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -192,6 +194,15 @@ class StopRunExpander(RunExpander):
         self.value = value

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
@@ -322,6 +333,16 @@ class AnthropicClaude3RunExpander(RunExpander):
     name = "claude_3"

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        # Remove all stop sequences that do not contain non-whitespace characters.
+        # This prevents the Anthropic API from returnin the following error:
+        # "stop_sequences: each stop sequence must contain non-whitespace"
+        stop_sequences_with_non_whitespace = [
+            stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+        ]
+        run_spec = replace(
+            run_spec,
+            adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+        )
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
             instructions = "Answer with only a single letter."
             if run_spec.adapter_spec.instructions:
@@ -335,78 +356,37 @@ class AnthropicClaude3RunExpander(RunExpander):
             return [run_spec]


-class
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.

-
-
-
-
-    def __init__(self):
-        pass
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+    The argument controlls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.

+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.

-
-    ""
-
-
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
     """

-
+    name = "follow_format_instructions"

-
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
-
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")

-
-
-
-
-
-    name = "output_format_instructions"
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]

         return [
@@ -539,7 +519,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-        "
+        "vhelm": [0, 1, 2, 4, 8],
     }


@@ -1064,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1409,12 +1390,79 @@ class ChatMLRunExpander(RunExpander):
         ]


+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    def __init__(self, scenario: str):
+        self.scenario = scenario
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+            if run_spec.adapter_spec.output_prefix:
+                instructions = (
+                    f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                )
+
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            else:
+                instructions = f"{instructions}\n"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        raise ValueError(f"Unknown scenario {self.scenario}")
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
     NewlineRunExpander,
     StopRunExpander,
     FormatPromptRunExpander,
+    FollowFormatInstructionsRunExpander,
     AddToStopRunExpander,
     GlobalPrefixRunExpander,
     NumTrainTrialsRunExpander,
@@ -1430,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]

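To make the new expanders concrete, here is a hedged usage sketch (not part of the diff); the scenario class path and field values are illustrative, and the constructions mirror ones appearing elsewhere in this diff.

    from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
    from helm.benchmark.run_expander import OutputFormatInstructions, StopRunExpander
    from helm.benchmark.run_spec import RunSpec
    from helm.benchmark.scenarios.scenario import ScenarioSpec

    run_spec = RunSpec(
        name="narrative_qa",  # illustrative
        scenario_spec=ScenarioSpec(class_name="helm.benchmark.scenarios.narrative_qa_scenario.NarrativeQAScenario"),
        adapter_spec=AdapterSpec(method=ADAPT_GENERATION, output_prefix="Answer: ", stop_sequences=["\n"]),
        metric_specs=[],
        groups=["narrative_qa"],
    )

    # OutputFormatInstructions prepends scenario-specific formatting instructions.
    run_spec = OutputFormatInstructions("narrative_qa").expand(run_spec)[0]
    # StopRunExpander("none") clears all stop sequences, as run_spec_factory.py now does for google/medlm-large.
    run_spec = StopRunExpander("none").expand(run_spec)[0]
    assert run_spec.adapter_spec.stop_sequences == []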
helm/benchmark/run_spec_factory.py
CHANGED

@@ -4,7 +4,6 @@ from typing import List
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
-    ADAPT_GENERATION_MULTIMODAL,
 )
 from helm.benchmark.model_deployment_registry import (
     ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
 from helm.benchmark.model_metadata_registry import (
     ANTHROPIC_CLAUDE_1_MODEL_TAG,
     ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    ANTHROPIC_CLAUDE_3_MODEL_TAG,
     BUGGY_TEMP_0_TAG,
     CHATML_MODEL_TAG,
-
+    GOOGLE_GEMINI_PRO_VISION_V1_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
-    IDEFICS_MODEL_TAG,
     LLAVA_MODEL_TAG,
     OPEN_FLAMINGO_MODEL_TAG,
-    VISION_LANGUAGE_MODEL_TAG,
     NLG_PREFIX_TAG,
     NO_NEWLINES_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
     ModelMetadata,
     get_model_metadata,
 )
 from helm.benchmark.run_expander import (
     RUN_EXPANDERS,
     AnthropicClaude2RunExpander,
+    AnthropicClaude3RunExpander,
     ChatMLRunExpander,
     GlobalPrefixRunExpander,
     IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
             run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))

-        #
+        # Anthropic Claude 3
+        if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+            run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+        # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
         if (
             VISION_LANGUAGE_MODEL_TAG in model.tags
-            and
+            and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
             and run_spec.adapter_spec.max_tokens == 1
         ):
             run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

         # IDEFICS special handling
         if IDEFICS_MODEL_TAG in model.tags:
-            # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-            if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-                run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
             if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
                 run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))

@@ -155,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
             increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
             run_spec = singleton(increase_temperature_expander.expand(run_spec))

+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py
ADDED

@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )
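For context, a hedged sketch (not part of the diff) of what this run spec wires together, assuming the @run_spec_function decorator returns the wrapped function so it can be called directly:

    from helm.benchmark.run_specs.air_bench_run_specs import get_air_bench_2024_spec

    run_spec = get_air_bench_2024_spec()
    # Raw prompts pass through unchanged (all prefixes and suffixes are empty); model outputs are
    # judged by AIRBench2024Annotator and the resulting annotations are scored by the AIR-Bench metrics.
    assert run_spec.adapter_spec.max_tokens == 512
    assert len(run_spec.metric_specs) == 3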
helm/benchmark/run_specs/classic_run_specs.py
CHANGED

@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:

 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]

     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )


 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
     )

+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
         name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )

@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
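Similarly, a hedged sketch (not part of the diff) of the reworked LiveQA wiring, again assuming the decorated run spec function can be called directly:

    from helm.benchmark.run_specs.classic_run_specs import get_live_qa_spec

    run_spec = get_live_qa_spec()
    # LiveQA now carries an annotator that judges model outputs, plus a dedicated score metric.
    annotator_classes = [spec.class_name for spec in run_spec.annotators]
    metric_classes = [spec.class_name for spec in run_spec.metric_specs]
    assert "helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator" in annotator_classes
    assert "helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric" in metric_classes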
helm/benchmark/run_specs/decodingtrust_run_specs.py
CHANGED

@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
         groups=["decodingtrust", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py
ADDED

@@ -0,0 +1,33 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
helm/benchmark/run_specs/finance_run_specs.py
ADDED

@@ -0,0 +1,33 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
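Finally, a hedged sketch (not part of the diff) of using the new FinQA run spec; the assertions reflect the values set above and assume the decorated function can be called directly.

    from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

    run_spec = get_fin_qa_spec()
    # INSTRUCTIONS is imported lazily inside the function and passed to get_generation_adapter_spec;
    # generated programs are scored by FinQAMetric on top of the basic metrics.
    assert run_spec.groups == ["fin_qa"]
    assert run_spec.adapter_spec.max_tokens == 100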