crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
- helm/benchmark/__init__.py +2 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +28 -23
- helm/benchmark/metrics/test_classification_metrics.py +44 -9
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +23 -1
- helm/benchmark/run_expander.py +161 -47
- helm/benchmark/run_specs.py +84 -10
- helm/benchmark/runner.py +31 -3
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
- helm/benchmark/scenarios/lextreme_scenario.py +37 -25
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +66 -8
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +27 -6
- helm/common/general.py +12 -5
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +28 -24
- helm/proxy/clients/huggingface_client.py +30 -17
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +23 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +82 -2
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py
CHANGED

@@ -7,6 +7,7 @@ from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec
+from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
 from helm.proxy.services.remote_service import create_authentication, add_service_args

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -73,6 +74,8 @@ def run_benchmarking(
     suite: str,
     dry_run: bool,
     skip_instances: bool,
+    skip_completed_runs: bool,
+    exit_on_error: bool,
     mongo_uri: str = "",
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
@@ -89,7 +92,7 @@ def run_benchmarking(
     for run_spec in run_specs:
         hlog(run_spec)

-    runner = Runner(execution_spec, output_path, suite, skip_instances)
+    runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
     runner.run_all(run_specs)
     return run_specs

@@ -197,6 +200,12 @@ def main():
         default=None,
         help="Fail and exit immediately if a particular RunSpec fails.",
     )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        default=None,
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
     parser.add_argument(
         "--priority",
         type=int,
@@ -205,9 +214,20 @@ def main():
         "If a value for --priority is not specified, run on everything",
     )
     parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
+
+    for huggingface_model_name in args.enable_huggingface_models:
+        register_huggingface_model_config(huggingface_model_name)
+
     run_entries: List[RunEntry] = []
     if args.conf_paths:
         run_entries.extend(read_run_entries(args.conf_paths).entries)
@@ -242,6 +262,8 @@ def main():
         suite=args.suite,
         dry_run=args.dry_run,
         skip_instances=args.skip_instances,
+        skip_completed_runs=args.skip_completed_runs,
+        exit_on_error=args.exit_on_error,
         mongo_uri=args.mongo_uri,
     )
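
The run.py changes add two experimental CLI options, `--skip-completed-runs` and `--enable-huggingface-models`; the latter feeds each `namespace/model_name[@revision]` reference into `register_huggingface_model_config` before run entries are read. A minimal sketch of the programmatic equivalent (the model references below are illustrative placeholders, not taken from this diff):

# Programmatic equivalent of passing `--enable-huggingface-models` on the command line.
# The model references are placeholders; any AutoModelForCausalLM-compatible Hub model
# in the `namespace/model_name[@revision]` format the new flag documents should fit.
from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config

for model_reference in ["bigscience/bloom-560m", "EleutherAI/pythia-160m@main"]:
    register_huggingface_model_config(model_reference)
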
helm/benchmark/run_expander.py
CHANGED

@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import replace
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple, Type

 from helm.proxy.models import (
     get_all_code_models,
@@ -302,36 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
     """

     name = "model"
-    values_dict = {
-        "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-        "openai/curie": ["openai/curie"],
-        "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-        "all": get_all_models(),
-        "text_code": get_all_text_models() + get_all_code_models(),
-        "text": get_all_text_models(),
-        "code": get_all_code_models(),
-        "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-        "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-        "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-        "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-        "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-        "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
-        "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-    }

-
-
-
-
-
-
-    for family_name, models in ablation_values_dict.items():
-        if family_name == "ablation_all":
-            values_dict["ablation"] = models
+    def __init__(self, value):
+        """
+        `value` is either the actual value to use or a lookup into the values dict.
+        """
+        if value in self.values_dict:
+            self.values = self.values_dict[value]
         else:
-
+            self.values = [value]
+
+    @property
+    def values_dict(self):
+        values_dict = {
+            "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+            "openai/curie": ["openai/curie"],
+            "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+            "all": get_all_models(),
+            "text_code": get_all_text_models() + get_all_code_models(),
+            "text": get_all_text_models(),
+            "code": get_all_code_models(),
+            "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+            "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+            "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+            "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+            "opinions_qa_openai": [
+                "openai/ada",
+                "openai/davinci",
+                "openai/text-ada-001",
+                "openai/text-davinci-001",
+                "openai/text-davinci-002",
+                "openai/text-davinci-003",
+            ],
+            "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        }
+
+        # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+        # which contains the subset of models with the ablation tag.
+        ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+        ablation_values_dict = {}
+        for family_name, models in values_dict.items():
+            ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+        for family_name, models in ablation_values_dict.items():
+            if family_name == "ablation_all":
+                values_dict["ablation"] = models
+            else:
+                values_dict[family_name] = models
+        return values_dict


 ############################################################
@@ -821,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
     ]


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class ChatMLRunExpander(RunExpander):
+    """
+    Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+    A 1-shot example:
+    <|im_start|>system
+    Translate from English to French
+    <|im_end|>
+    <|im_start|>user
+    How are you?
+    <|im_end|>
+    <|im_start|>user
+    Comment allez-vous?
+    <|im_end|>
+    <|im_start|>user
+    {{user input here}}<|im_end|>
+    """
+
+    name = "chatml"
+
+    def __init__(self):
+        self.name = type(self).name
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = run_spec.adapter_spec
+        # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+        # few-shot examples should do `<|im_start|>system name=example_user`
+        # or `<|im_start|>system name=example_assistant`
+        # but it is also possible to put examples into a user message.
+
+        scenario_name = run_spec.name.split(":")[0]
+
+        if scenario_name in ("msmarco",):
+            # output_prefix:
+            # Does the passage answer the query?
+            # Answer:
+            #
+            # new_output_prefix:
+            # Does the passage answer the query?<|im_end|>
+            # <|im_start|>assistant
+            # Answer:
+
+            new_output_prefix = (
+                adapter_spec.output_prefix.split("\n")[0]
+                + "<|im_end|>\n<|im_start|>assistant\n"
+                + adapter_spec.output_prefix.split("\n")[1]
+            )

+        elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+            # output_prefix:
+            # Summarize the above article in 1 sentence.
+            #
+            # new_output_prefix:
+            # Summarize the above article in 1 sentence.<|im_end|>
+            # <|im_start|>assistant
+            #
+
+            new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+        else:
+            # output_prefix:
+            # {output_prefix}
+            #
+            # new_output_prefix:
+            # <|im_end|>
+            # <|im_start|>assistant
+            # {output_prefix}
+
+            new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+        adapter_spec = replace(
+            adapter_spec,
+            # This is a hack to make sure <|im_start|>user goes before the reference.
+            instructions=(
+                f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                if adapter_spec.instructions != ""
+                else "<|im_start|>user\n"
+            ),
+            instance_prefix="",
+            output_prefix=new_output_prefix,
+            output_suffix="<|im_end|>\n<|im_start|>user\n",
+            stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+        )
+
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
+RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+    InstructionsRunExpander,
+    PromptRunExpander,
+    NewlineRunExpander,
+    StopRunExpander,
+    GlobalPrefixRunExpander,
+    NumTrainTrialsRunExpander,
+    MaxTrainInstancesRunExpander,
+    NumOutputsRunExpander,
+    ModelRunExpander,
+    DataAugmentationRunExpander,
+    TokenizerRunExpander,
+    NumPromptTokensRunExpander,
+    NumOutputTokensRunExpander,
+    ChatMLRunExpander,
+]
+
+
+RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)
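
To make the prefix rewriting in `ChatMLRunExpander.expand` concrete, the sketch below assembles the default-branch prompt by hand. It is an illustration only, not HELM code; the instructions, question, and answer prefix are placeholder strings:

# Illustration of the default (non-msmarco, non-summarization) ChatML rewrite.
# `instructions` and `output_prefix` stand in for whatever the original AdapterSpec carried.
instructions = "Answer the following question."
output_prefix = "Answer: "

chatml_instructions = f"<|im_start|>system\n{instructions}<|im_end|>\n<|im_start|>user\n"
new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + output_prefix

prompt = chatml_instructions + "How many legs does a spider have?" + new_output_prefix
print(prompt)
# <|im_start|>system
# Answer the following question.<|im_end|>
# <|im_start|>user
# How many legs does a spider have?<|im_end|>
# <|im_start|>assistant
# Answer:
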
helm/benchmark/run_specs.py
CHANGED

@@ -14,12 +14,13 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
 from .scenarios.lex_glue_scenario import (
     get_lex_glue_max_train_instances,
     get_lex_glue_instructions,
     get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
 )
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
@@ -31,8 +32,10 @@ from .scenarios.lextreme_scenario import (
     get_lextreme_instructions,
     get_lextreme_max_train_instances,
     get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
 )
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton


@@ -47,7 +50,14 @@ def format_instructions(instructions: str) -> str:


 def get_multiple_choice_joint_adapter_spec(
-    instructions: str,
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -64,6 +74,7 @@ def get_multiple_choice_joint_adapter_spec(
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -72,10 +83,11 @@
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=
-        max_tokens=
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )

@@ -109,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions,
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -385,8 +408,12 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])


-def get_classification_metric_specs() -> List[MetricSpec]:
-    return [
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]


 def get_bbq_metric_specs() -> List[MetricSpec]:
@@ -1724,6 +1751,14 @@ def get_pubmed_qa_spec() -> RunSpec:
     )


+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
 def get_lextreme_spec(subset: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
@@ -1742,7 +1777,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
         name=f"lextreme:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
         groups=["lextreme"],
     )

@@ -1765,7 +1800,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
         name=f"lex_glue:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
         groups=["lex_glue"],
     )

@@ -1801,6 +1836,40 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
     )


+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################

 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
@@ -1858,6 +1927,7 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "med_paragraph_simplification": get_med_paragraph_simplification_spec,
     "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }


@@ -1900,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
             global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
             run_spec = singleton(global_prefix_expander.expand(run_spec))

+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
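
A short usage sketch of the new OpinionsQA run spec constructor (assuming the package is installed; the survey name below is a placeholder, real values come from the OpinionsQA scenario data):

# Hypothetical usage of the new run spec helper added above.
from helm.benchmark.run_specs import get_opinions_qa_spec

run_spec = get_opinions_qa_spec(
    survey_type="Pew_American_Trends_Panel_W26",  # placeholder survey identifier
    num_logprobs="10",
    context="steer-qa",
)
print(run_spec.name)
# -> "opinions_qa:survey=Pew_American_Trends_Panel_W26,num_logprobs=10,context=steer-qa,num_train_trials=1"
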
helm/benchmark/runner.py
CHANGED

@@ -1,5 +1,6 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
@@ -25,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService


+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -71,12 +78,16 @@ class Runner:
         output_path: str,
         suite: str,
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error

         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -91,9 +102,20 @@ class Runner:
         ensure_directory_exists(self.eval_cache_path)

     def run_all(self, run_specs: List[RunSpec]):
-
-
-
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")

     def run_one(self, run_spec: RunSpec):
         # Load the scenario
@@ -106,6 +128,12 @@
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)

+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)

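
The new `skip_completed_runs` / `exit_on_error` arguments change the failure contract of `Runner.run_all`: with `exit_on_error=False`, failures are logged, collected, and surfaced once at the end as a `RunnerError`. A hedged sketch of driving it directly (it assumes `execution_spec` and `run_specs` were built elsewhere, e.g. by the helpers in run.py and run_specs.py; the path and suite name are illustrative):

# Sketch only: `execution_spec` and `run_specs` are assumed to exist from the surrounding harness.
from helm.benchmark.runner import Runner, RunnerError

runner = Runner(
    execution_spec,
    output_path="benchmark_output",  # illustrative path
    suite="v1",  # illustrative suite name
    skip_instances=False,
    skip_completed_runs=True,  # reuse finished runs whose scenario_state.json already exists
    exit_on_error=False,  # keep going and report all failures at the end
)
try:
    runner.run_all(run_specs)
except RunnerError as err:
    print(f"Some runs failed: {err}")
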
helm/benchmark/scenarios/copyright_scenario.py
CHANGED

@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):

         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),