crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py
CHANGED
@@ -264,6 +264,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +282,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

         for huggingface_model_name in args.enable_huggingface_models:
-            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

         for huggingface_model_path in args.enable_local_huggingface_models:
-            register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)

     run_entries: List[RunEntry] = []
     if args.conf_paths:
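For context, the new flag is simply forwarded to the Hugging Face registration helpers. A minimal sketch of the resulting call path, assuming the updated helpers accept the OpenVINO flag as a second positional argument (as the diff above suggests); the model name is a placeholder:

from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

def register_hub_models(model_names: list, openvino: bool = False) -> None:
    # Mirrors the branching added to run.py above.
    for model_name in model_names:
        if openvino:
            register_huggingface_hub_model_from_flag_value(model_name, openvino)
        else:
            register_huggingface_hub_model_from_flag_value(model_name)

# Placeholder model; any Hugging Face Hub causal LM identifier would do.
register_hub_models(["gpt2"], openvino=True)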
helm/benchmark/run_expander.py
CHANGED
@@ -194,6 +194,15 @@ class StopRunExpander(RunExpander):
         self.value = value

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
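For illustration, the effect of the new "none" value is just to clear the adapter's stop sequences (and tag the run name with stop=none). A minimal sketch of that transformation, with a made-up stop sequence:

from dataclasses import replace
from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Hypothetical adapter spec that stops generation at "###".
adapter_spec = AdapterSpec(stop_sequences=["###"])

# StopRunExpander("none") returns a copy of the run spec whose adapter spec
# has no stop sequences, equivalent to:
no_stop = replace(adapter_spec, stop_sequences=[])
assert no_stop.stop_sequences == []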
@@ -1035,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1390,72 @@ class ChatMLRunExpander(RunExpander):
     ]


+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    def __init__(self, scenario: str):
+        self.scenario = scenario
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+            if run_spec.adapter_spec.output_prefix:
+                instructions = (
+                    f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                )
+
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            else:
+                instructions = f"{instructions}\n"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        raise ValueError(f"Unknown scenario {self.scenario}")
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
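As a rough usage sketch, the expander can be applied to a run spec directly. The RunSpec below is a placeholder (the MMLU scenario class name and run name are illustrative); only the expander behavior comes from the diff above:

from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.run_expander import OutputFormatInstructions
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec

# Placeholder multiple-choice run spec.
run_spec = RunSpec(
    name="mmlu:subject=anatomy",
    scenario_spec=ScenarioSpec(
        class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": "anatomy"}
    ),
    adapter_spec=AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT),
    metric_specs=[],
    groups=["mmlu"],
)

expanded = OutputFormatInstructions("mmlu").expand(run_spec)
print(expanded[0].adapter_spec.instructions)  # Answer with only a single letter.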
@@ -1402,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]


helm/benchmark/run_spec_factory.py
CHANGED
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
         run_spec = singleton(increase_temperature_expander.expand(run_spec))

+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py
ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )
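A quick way to see what the new entry wires together, as a sketch (assumes an environment with crfm-helm installed; the values in the comments come from the file above):

from helm.benchmark.run_specs.air_bench_run_specs import get_air_bench_2024_spec

spec = get_air_bench_2024_spec()
print(spec.adapter_spec.max_tokens)              # 512
print([a.class_name for a in spec.annotators])   # the AIRBench2024Annotator
print([m.class_name for m in spec.metric_specs]) # score, generation, and instances-per-split metrics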
helm/benchmark/run_specs/classic_run_specs.py
CHANGED
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:


 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]

     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )


 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@
         max_tokens=512,
     )

+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
         name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )

@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
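One consequence of the thai_exam change is that each run is now also grouped under its exam subset. A small sketch, assuming crfm-helm is installed (the expected output follows from the diff above):

from helm.benchmark.run_specs.classic_run_specs import get_thai_exam_spec

spec = get_thai_exam_spec()  # defaults to exam="onet"
print(spec.groups)  # ['thai_exam', 'thai_exam_onet']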
helm/benchmark/run_specs/decodingtrust_run_specs.py
CHANGED
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
         groups=["decodingtrust", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py
ADDED
@@ -0,0 +1,33 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
helm/benchmark/run_specs/finance_run_specs.py
ADDED
@@ -0,0 +1,33 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
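And a parallel sketch for the new FinQA entry, again purely illustrative and assuming crfm-helm is installed (calling the function imports the new fin_qa_scenario module for its INSTRUCTIONS constant):

from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

spec = get_fin_qa_spec()
print(spec.name)                     # fin_qa
print(spec.adapter_spec.max_tokens)  # 100
print(spec.metric_specs[-1].class_name)
# -> helm.benchmark.metrics.fin_qa_metrics.FinQAMetric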