crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs.py
CHANGED
@@ -14,15 +14,28 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
+from .scenarios.lex_glue_scenario import (
+    get_lex_glue_max_train_instances,
+    get_lex_glue_instructions,
+    get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
+)
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
 from .scenarios.msmarco_scenario import MSMARCOScenario
 from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
 from .scenarios.copyright_scenario import datatag2hash_code
 from .scenarios.raft_scenario import get_raft_instructions
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from .scenarios.lextreme_scenario import (
+    get_lextreme_instructions,
+    get_lextreme_max_train_instances,
+    get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
+)
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton
 
 
@@ -37,7 +50,14 @@ def format_instructions(instructions: str) -> str:
 
 
 def get_multiple_choice_joint_adapter_spec(
-    instructions: str,
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -54,6 +74,7 @@ def get_multiple_choice_joint_adapter_spec(
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -62,10 +83,11 @@ def get_multiple_choice_joint_adapter_spec(
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=
-        max_tokens=
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )
 
@@ -99,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions,
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -304,6 +337,27 @@ def get_summarization_adapter_spec(num_sents: int, **kwargs) -> AdapterSpec:
     )
 
 
+def get_machine_translation_adapter_spec(
+    source_language, target_language, max_train_instances, **kwargs
+) -> AdapterSpec:
+    """
+    Used for machine translation.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=f"Translate {source_language} to {target_language}:",
+        input_prefix="",
+        input_suffix=" = ",
+        output_prefix="",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["\n\n"],
+        temperature=0.0,
+        **kwargs,
+    )
+
+
 ############################################################
 # Examples of scenario and adapter specs
 
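For orientation, a minimal sketch (not HELM's actual prompt-assembly code) of the prompt shape these prefixes and suffixes imply; the sentences are placeholders, and the real joining of in-context examples is done by the in-context learning adapter:

# Illustrative only: approximate prompt for one in-context example plus the eval input.
instructions = "Translate German to English:"
train_block = "<German sentence> = <English translation>\n"   # input_suffix " = ", output_suffix "\n"
eval_block = "<German sentence to translate> = "               # the model completes after " = "
prompt = instructions + "\n" + train_block + eval_block        # decoding stops at "\n\n"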
@@ -354,6 +408,14 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
 
 
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]
+
+
 def get_bbq_metric_specs() -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.bbq_metrics.BBQMetric", args={})] + get_exact_match_metric_specs()
 
@@ -479,6 +541,16 @@ def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.code_metrics.APPSMetric", args=args)]
 
 
+def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
+
+
+def get_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [
+        MetricSpec(class_name="helm.benchmark.machine_translation_metrics.MachineTranslationMetric", args={})
+    ] + get_basic_metric_specs([])
+
+
 ############################################################
 # Run specs
 
@@ -564,7 +636,9 @@ def get_civil_comments_spec(demographic: str) -> RunSpec:
         name=f"civil_comments:demographic={demographic}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs()
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["civil_comments"],
     )
 
@@ -809,7 +883,9 @@ def get_raft_spec(subset: str) -> RunSpec:
         name=f"raft:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs()
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["raft"],
     )
 
@@ -971,7 +1047,7 @@ def get_imdb_spec(only_contrast=False) -> RunSpec:
         name="imdb" + (":only_contrast=True" if only_contrast else ""),
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
         groups=["imdb"],
     )
 
@@ -1182,10 +1258,7 @@ def get_narrativeqa_spec() -> RunSpec:
         name="narrative_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_basic_metric_specs(
-            ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"]
-        )
-        + get_generative_harms_metric_specs(),
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
         groups=["narrative_qa"],
     )
 
@@ -1509,7 +1582,7 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     adapter_spec = AdapterSpec(
         method=get_adaptation_method(big_bench_task["metrics"]),
         model="openai/text-curie-001",  # Can override with the `ModelRunExpander`.
-        max_train_instances=
+        max_train_instances=5,  # Can override with the `MaxTrainInstancesRunExpander`.
         num_outputs=1,  # Can override with the `NumOutputsRunExpander`.
         # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
         # for the BIG-G models tested on BIG-bench, "we use an input context length of 1,024 tokens
@@ -1541,36 +1614,136 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     )
 
 
-def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
+def get_covid_dialog_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.covid_dialog_scenario.COVIDDialogScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Generate a response given a patient's questions and concerns.",
+        input_noun="Patient",
+        output_noun="Doctor",
+        max_tokens=128,
+    )
+
+    return RunSpec(
+        name="covid_dialog",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["COVIDDialog"],
+    )
+
+
+def get_me_q_sum_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.me_q_sum_scenario.MeQSumScenario", args={})
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="me_q_sum",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MeQSum"],
+    )
+
+
+def get_med_dialog_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name=f"med_dialog,subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedDialog"],
+    )
+
+
+def get_med_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedMCQA"],
+    )
+
+
+def get_med_paragraph_simplification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_paragraph_simplification_scenario.MedParagraphSimplificationScenario",
+        args={},
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=10,
+        max_tokens=512,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="med_paragraph_simplification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedParagraphSimplification"],
+    )
+
+
+def get_med_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedQA"],
+    )
+
+
+def get_pubmed_qa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
 
-
-    # "Can large language models reason about medical questions?" (Liévin et al.).
-    # Therefore, specify the values of the fields of `AdapterSpec` based on experiment details of the paper.
-    # Set `output_prefix` based on Table 1 (titled "Prompt templates") of the paper.
-    output_prefix: str = "Answer: "
-    if prompt_answer_choices.lower() == "true":
-        output_prefix += "among A through C, the answer is "
-
-    # Liévin et al. followed what Kojima et al. did in "Large Language Models are Zero-Shot Reasoners."
-    # to extract answers from completions: set the max completion length to a large number and
-    # "...pick up the first large letter encountered in the text." Then they set "'Q:'...as a customized stop
-    # sequence for all the models except for Instruct-GPT3 to stop the models from repeating questions and
-    # answers by themselves." We don't need to do this since our framework has a "multiple_choice_joint"
-    # adaptation method that handles the prompt construction for multiple-choice QA for us.
-    adapter_spec = AdapterSpec(
+    adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-
-
-
-        temperature=0,
-        input_prefix="",
-        output_prefix=output_prefix,
-        # Following the examples in https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html
-        reference_prefix="A) ",
+        instructions="Answer A for yes, B for no or C for maybe.",
+        input_noun="Question",
+        output_noun="Answer",
     )
+
     return RunSpec(
-        name=
+        name="pubmed_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -1578,6 +1751,125 @@ def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
     )
 
 
+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
+def get_lextreme_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lextreme_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lextreme_max_tokens(subset),
+        max_train_instances=get_lextreme_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lextreme:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
+        groups=["lextreme"],
+    )
+
+
+def get_lex_glue_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lex_glue_scenario.LexGLUEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lex_glue_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lex_glue_max_tokens(subset),
+        max_train_instances=get_lex_glue_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lex_glue:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
+        groups=["lex_glue"],
+    )
+
+
+def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
+    FULL_LANGUAGE_NAMES = {
+        "cs": "Czech",
+        "de": "German",
+        "fr": "French",
+        "hi": "Hindi",
+        "ru": "Russian",
+        "en": "English",
+    }
+    source_language, target_language = language_pair.split("-")
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
+        args={"source_language": source_language, "target_language": target_language},
+    )
+
+    adapter_spec = get_machine_translation_adapter_spec(
+        source_language=FULL_LANGUAGE_NAMES[source_language],
+        target_language=FULL_LANGUAGE_NAMES[target_language],
+        max_train_instances=max_train_instances,
+    )
+
+    return RunSpec(
+        name=f"wmt_14:language_pair={language_pair}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_machine_translation_metric_specs(),
+        groups=["wmt_14"],
+    )
+
+
+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################
 
 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
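A quick sketch of how the `build_classification_metrics` helper above dispatches; the `TaskType` members come from the `lextreme_scenario` import, and SLTC/MLTC presumably denote single- and multi-label text classification:

# Illustrative only, mirroring the branches above.
build_classification_metrics(TaskType.SLTC)  # ClassificationMetric with delimiter=None (one label per instance)
build_classification_metrics(TaskType.MLTC)  # ClassificationMetric with delimiter="," (multiple labels per instance)
build_classification_metrics(object())       # [] for any other task type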
@@ -1624,7 +1916,18 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "entity_data_imputation": get_entity_data_imputation_spec,
     "ice": get_ice_spec,
     "big_bench": get_big_bench_spec,
+    "lextreme": get_lextreme_spec,
+    "lex_glue": get_lex_glue_spec,
+    "wmt_14": get_wmt_14_spec,
+    # Biomedical
+    "covid_dialog": get_covid_dialog_spec,
+    "me_q_sum": get_me_q_sum_spec,
+    "med_dialog": get_med_dialog_spec,
+    "med_mcqa": get_med_mcqa_spec,
+    "med_paragraph_simplification": get_med_paragraph_simplification_spec,
+    "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }
 
 
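For context, a minimal sketch of how one of the new registry entries is reached; parsing the "name:key=value" run-spec string into keyword arguments happens elsewhere in construct_run_specs, so only the table lookup is shown:

# Illustrative only: the registry maps a run-spec name to its factory function.
spec_fn = CANONICAL_RUN_SPEC_FUNCS["wmt_14"]
run_spec = spec_fn(language_pair="de-en")  # RunSpec named "wmt_14:language_pair=de-en"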
@@ -1667,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
            global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
            run_spec = singleton(global_prefix_expander.expand(run_spec))
 
+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/runner.py
CHANGED
@@ -1,10 +1,13 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
 from typing import List
 
+from tqdm import tqdm
+
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
@@ -23,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService
 
 
+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -68,15 +77,17 @@ class Runner:
         execution_spec: ExecutionSpec,
         output_path: str,
         suite: str,
-        run_specs: List[RunSpec],
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
-        self.run_specs: List[RunSpec] = run_specs
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error
 
         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -90,10 +101,21 @@ class Runner:
         self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
         ensure_directory_exists(self.eval_cache_path)
 
-    def run_all(self):
-
-
-
+    def run_all(self, run_specs: List[RunSpec]):
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")
 
     def run_one(self, run_spec: RunSpec):
         # Load the scenario
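A rough usage sketch of the new Runner calling convention (the ExecutionSpec and the List[RunSpec] are assumed to be built by the caller, e.g. helm/benchmark/run.py): run specs are now passed to run_all() rather than the constructor, and with exit_on_error=False failures are collected and surfaced at the end as a RunnerError.

# Illustrative only; setup objects are placeholders supplied by the caller.
runner = Runner(
    execution_spec,                 # assumed ExecutionSpec from the caller
    output_path="benchmark_output",
    suite="my-suite",
    skip_instances=False,
    skip_completed_runs=True,       # reuse runs whose scenario_state.json already exists
    exit_on_error=False,            # keep going and collect failures
)
try:
    runner.run_all(run_specs)       # run_specs: List[RunSpec] from the caller
except RunnerError as e:
    print(e)                        # lists the run specs that failed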
@@ -106,6 +128,12 @@ class Runner:
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)
 
+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
 
helm/benchmark/scenarios/copyright_scenario.py
CHANGED
@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):
 
         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),