crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
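A central internal change in 0.2.2 is the new helm/common/codec.py module (with tests in helm/common/test_codec.py): run_display.py below drops its hand-rolled dacite configuration and json.dumps calls in favor of to_json/from_json. The sketch below is inferred only from the call sites visible in this diff, where to_json(obj) returns a JSON string and from_json(text, T) parses into a typed target such as ScenarioState or List[PerInstanceStats]; whether the codec accepts an arbitrary user-defined dataclass like the toy Example here is an assumption.

from dataclasses import dataclass
from typing import List

from helm.common.codec import from_json, to_json  # new in helm/common/codec.py


@dataclass(frozen=True)
class Example:
    text: str
    score: float


# Serialize a list of dataclasses to JSON, then parse it back with a typed target,
# mirroring how run_display.py now writes display_predictions.json and how
# _read_per_instance_stats() reads List[PerInstanceStats].
blob = to_json([Example(text="hello", score=0.5), Example(text="world", score=1.0)])
examples = from_json(blob, List[Example])
assert examples[0].text == "hello"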
helm/benchmark/presentation/run_display.py
CHANGED

@@ -1,11 +1,8 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
-import json
 from typing import Dict, Iterable, List, Optional, Set, Tuple

-import dacite
-
 from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -13,48 +10,15 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.scenario_state import ScenarioState
-from helm.benchmark.augmentations.dialect_perturbation import DialectPerturbation
-from helm.benchmark.augmentations.extra_space_perturbation import ExtraSpacePerturbation
-from helm.benchmark.augmentations.filler_words_perturbation import FillerWordsPerturbation
-from helm.benchmark.augmentations.gender_perturbation import GenderPerturbation
-from helm.benchmark.augmentations.misspelling_perturbation import MisspellingPerturbation
-from helm.benchmark.augmentations.person_name_perturbation import PersonNamePerturbation
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
-from helm.benchmark.augmentations.space_perturbation import SpacePerturbation
-from helm.benchmark.augmentations.synonym_perturbation import SynonymPerturbation
-from helm.benchmark.augmentations.typos_perturbation import TyposPerturbation
 from helm.benchmark.metrics.metric import PerInstanceStats
 from helm.benchmark.presentation.schema import Schema
 from helm.benchmark.runner import RunSpec
 from helm.benchmark.scenarios.scenario import Instance
-from helm.common.general import
+from helm.common.general import write
 from helm.common.hierarchical_logger import htrack
 from helm.common.request import Request
-
-
-# TODO(#1251): Add proper class registration
-_PERTURBATION_NAME_TO_DESCRIPTION = {
-    DialectPerturbation.name: DialectPerturbation.Description,
-    ExtraSpacePerturbation.name: ExtraSpacePerturbation.Description,
-    FillerWordsPerturbation.name: FillerWordsPerturbation.Description,
-    GenderPerturbation.name: GenderPerturbation.Description,
-    MisspellingPerturbation.name: MisspellingPerturbation.Description,
-    PersonNamePerturbation.name: PersonNamePerturbation.Description,
-    SpacePerturbation.name: SpacePerturbation.Description,
-    SynonymPerturbation.name: SynonymPerturbation.Description,
-    TyposPerturbation.name: TyposPerturbation.Description,
-}
-
-
-def _deserialize_perturbation_description(raw_perturbation_description: Dict) -> PerturbationDescription:
-    """Convert a raw dictionary to a PerturbationDescription.
-    This uses the name field to look up the correct PerturbationDescription subclass to output.
-    """
-    factory = _PERTURBATION_NAME_TO_DESCRIPTION.get(raw_perturbation_description["name"], PerturbationDescription)
-    return factory(**raw_perturbation_description)
-
-
-_DACITE_CONFIG = dacite.Config(type_hooks={PerturbationDescription: _deserialize_perturbation_description})
+from helm.common.codec import from_json, to_json


 @dataclass(frozen=True)
@@ -117,8 +81,7 @@ def _read_scenario_state(run_path: str) -> ScenarioState:
     if not os.path.exists(scenario_state_path):
         raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
     with open(scenario_state_path) as f:
-        raw_scenario_state = json.load(f)
-        return dacite.from_dict(ScenarioState, raw_scenario_state, config=_DACITE_CONFIG)
+        return from_json(f.read(), ScenarioState)


 def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
@@ -126,8 +89,7 @@ def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
     if not os.path.exists(per_instance_stats_path):
         raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
     with open(per_instance_stats_path) as f:
-        raw_per_instance_stats = json.load(f)
-        return [dacite.from_dict(PerInstanceStats, r, config=_DACITE_CONFIG) for r in raw_per_instance_stats]
+        return from_json(f.read(), List[PerInstanceStats])


 def _truncate_predicted_text(
@@ -286,13 +248,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema):

     write(
         os.path.join(run_path, "instances.json"),
-
-    )
-    write(
-        os.path.join(run_path, "display_predictions.json"),
-        json.dumps(list(map(asdict_without_nones, predictions)), indent=2),
+        to_json(list(instance_id_to_instance.values())),
     )
+    write(os.path.join(run_path, "display_predictions.json"), to_json(predictions))
     write(
         os.path.join(run_path, "display_requests.json"),
-
+        to_json(requests),
     )
helm/benchmark/presentation/summarize.py
CHANGED

@@ -205,6 +205,9 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates


+AGGREGATE_WIN_RATE_COLUMN = 1
+
+
 class Summarizer:
     """Summarize the benchmark results in JSON files to be displayed in the UI."""

@@ -288,7 +291,7 @@ class Summarizer:
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted([p for p in os.listdir(self.run_suite_path) if p != "eval_cache" and p != "groups"])
-        for run_dir_name in tqdm(run_dir_names):
+        for run_dir_name in tqdm(run_dir_names, disable=None):
             run_spec_path: str = os.path.join(self.run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(self.run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
@@ -747,7 +750,6 @@ class Summarizer:
         # add overall win rate as the second column
         WIN_RATE_AGGREGATION = "mean"
         win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-        AGGREGATE_WIN_RATE_COLUMN = 1
         description = "How many models this model outperform on average (over columns)."
         table.header.insert(
             AGGREGATE_WIN_RATE_COLUMN,
helm/benchmark/presentation/test_create_plots.py
ADDED

@@ -0,0 +1,32 @@
+from helm.common.general import asdict_without_nones
+from helm.benchmark.presentation.table import Table, Cell, HeaderCell
+from helm.benchmark.presentation.create_plots import parse_table
+
+
+def test_table_parsing():
+    title = "table"
+    scenarios = ["A", "B", "C", "D"]
+    models = ["X", "Y", "Z"]
+    header = []
+    rows = [[] for m in models]
+
+    header.append(HeaderCell("Models"))
+    header.append(HeaderCell("Mean win rate"))
+    for s in scenarios:
+        header.append(HeaderCell(s, lower_is_better=True, metadata={"run_group": s, "metric": "accuracy"}))
+    for i, model in enumerate(models):
+        rows[i].append(Cell(model))
+        rows[i].append(Cell(0.1 * i))
+        for j, s in enumerate(scenarios):
+            rows[i].append(Cell(i * 10 + j))
+    summarize_table = Table(title, header, rows)
+    table = parse_table(asdict_without_nones(summarize_table))
+    assert table.adapters == models
+    assert list(table.mean_win_rates) == [0.0, 0.1, 0.2]
+    assert len(table.columns) == len(scenarios)
+    for j, c in enumerate(table.columns):
+        assert c.group == scenarios[j]
+        assert c.lower_is_better
+        assert c.metric == "accuracy"
+        for i, v in enumerate(c.values):
+            assert v == i * 10 + j
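For anyone driving the new plotting code directly, the test above pins down the small surface that create_plots.parse_table expects: a summarize-style Table whose column 1 is the mean-win-rate column (matching the AGGREGATE_WIN_RATE_COLUMN = 1 constant that summarize.py now defines at module level). Below is a minimal sketch using only constructors exercised by the test; the scenario name "mmlu" and the model names are placeholders, and the expected outputs are extrapolated from the assertions above rather than taken from create_plots.py itself.

from helm.common.general import asdict_without_nones
from helm.benchmark.presentation.create_plots import parse_table
from helm.benchmark.presentation.table import Table, Cell, HeaderCell

# Column 0: model names, column 1: mean win rate, columns 2+: one per run group.
header = [
    HeaderCell("Models"),
    HeaderCell("Mean win rate"),
    HeaderCell("mmlu", metadata={"run_group": "mmlu", "metric": "accuracy"}),
]
rows = [
    [Cell("model_a"), Cell(0.8), Cell(0.61)],
    [Cell("model_b"), Cell(0.2), Cell(0.47)],
]
table = parse_table(asdict_without_nones(Table("accuracy", header, rows)))
print(table.adapters)              # expected: ["model_a", "model_b"]
print(list(table.mean_win_rates))  # expected: [0.8, 0.2]
print(table.columns[0].metric)     # expected: "accuracy"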
helm/benchmark/run.py
CHANGED
@@ -1,10 +1,13 @@
 import argparse
 from dataclasses import replace
+import os
 from typing import List, Optional

+from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec
+from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
 from helm.proxy.services.remote_service import create_authentication, add_service_args

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -16,8 +19,52 @@ from .run_specs import construct_run_specs
 LATEST_SYMLINK: str = "latest"


+def run_entries_to_run_specs(
+    run_entries: List[RunEntry],
+    max_eval_instances: Optional[int] = None,
+    num_train_trials: Optional[int] = None,
+    models_to_run: Optional[List[str]] = None,
+    groups_to_run: Optional[List[str]] = None,
+    priority: Optional[int] = None,
+) -> List[RunSpec]:
+    """Runs RunSpecs given a list of RunSpec descriptions."""
+    run_specs: List[RunSpec] = []
+    for entry in run_entries:
+        # Filter by priority
+        if priority is not None and entry.priority > priority:
+            continue
+
+        for run_spec in construct_run_specs(parse_object_spec(entry.description)):
+            # Filter by models
+            if models_to_run and run_spec.adapter_spec.model not in models_to_run:
+                continue
+
+            # Filter by groups
+            if groups_to_run and not any(group in groups_to_run for group in run_spec.groups):
+                continue
+
+            # Modify AdapterSpec
+            adapter_spec: AdapterSpec = run_spec.adapter_spec
+            if max_eval_instances is not None:
+                adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
+            if num_train_trials is not None or adapter_spec.max_train_instances == 0:
+                adapter_spec = replace(
+                    adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
+                )
+            run_spec = replace(run_spec, adapter_spec=adapter_spec)
+
+            # Append groups
+            if entry.groups is not None:
+                groups_name: str = "" if len(entry.groups) == 0 else f",groups={'-'.join(sorted(entry.groups))}"
+                run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + entry.groups)
+
+            run_specs.append(run_spec)
+
+    return run_specs
+
+
 def run_benchmarking(
-
+    run_specs: List[RunSpec],
     auth: Authentication,
     url: str,
     local: bool,
@@ -27,15 +74,11 @@ def run_benchmarking(
     suite: str,
     dry_run: bool,
     skip_instances: bool,
-
-
-    groups: Optional[List[str]] = None,
-    models_to_run: Optional[List[str]] = None,
-    groups_to_run: Optional[List[str]] = None,
+    skip_completed_runs: bool,
+    exit_on_error: bool,
     mongo_uri: str = "",
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
-
     execution_spec = ExecutionSpec(
         auth=auth,
         url=url,
@@ -45,47 +88,28 @@ def run_benchmarking(
         dry_run=dry_run,
         mongo_uri=mongo_uri,
     )
-
-    def override(run_spec: RunSpec) -> RunSpec:
-        """Override parts of `run_spec`."""
-        # Modify AdapterSpec
-        adapter_spec: AdapterSpec = run_spec.adapter_spec
-        if max_eval_instances is not None:
-            adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
-        if num_train_trials is not None or adapter_spec.max_train_instances == 0:
-            adapter_spec = replace(
-                adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
-            )
-
-        run_spec = replace(run_spec, adapter_spec=adapter_spec)
-
-        # Append groups
-        if groups is not None:
-            groups_name: str = "" if len(groups) == 0 else f",groups={'-'.join(sorted(groups))}"
-            run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + groups)
-
-        return run_spec
-
-    run_specs = [
-        override(run_spec)
-        for description in run_spec_descriptions
-        for run_spec in construct_run_specs(parse_object_spec(description))
-        if (not models_to_run or run_spec.adapter_spec.model in models_to_run)
-        and (not groups_to_run or any(group in groups_to_run for group in run_spec.groups))
-    ]
-
-    if len(run_specs) == 0:
-        return run_specs
-
     with htrack_block("run_specs"):
         for run_spec in run_specs:
-            hlog(run_spec
+            hlog(run_spec)

-    runner = Runner(execution_spec, output_path, suite,
-    runner.run_all()
+    runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
+    runner.run_all(run_specs)
     return run_specs


+def symlink_latest(output_path: str, suite: str) -> None:
+    # Create a symlink runs/latest -> runs/<name_of_suite>,
+    # so runs/latest always points to the latest run suite.
+    runs_dir: str = os.path.join(output_path, "runs")
+    suite_dir: str = os.path.join(runs_dir, suite)
+    symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
+    hlog(f"Symlinking {suite_dir} to {LATEST_SYMLINK}.")
+    if os.path.islink(symlink_path):
+        # Remove the previous symlink if it exists.
+        os.unlink(symlink_path)
+    os.symlink(os.path.abspath(suite_dir), symlink_path)
+
+
 def add_run_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "-o", "--output-path", type=str, help="Where to save all the output", default="benchmark_output"
@@ -149,19 +173,86 @@ def validate_args(args):

 @htrack(None)
 def main():
-    """
-    Main entry point for running the benchmark.
-    """
     parser = argparse.ArgumentParser()
     add_service_args(parser)
-    parser.add_argument(
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        default=None,
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        default=None,
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)

+    for huggingface_model_name in args.enable_huggingface_models:
+        register_huggingface_model_config(huggingface_model_name)
+
+    run_entries: List[RunEntry] = []
+    if args.conf_paths:
+        run_entries.extend(read_run_entries(args.conf_paths).entries)
+    if args.run_specs:
+        run_entries.extend(
+            [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
+        )
+
+    run_specs = run_entries_to_run_specs(
+        run_entries=run_entries,
+        max_eval_instances=args.max_eval_instances,
+        num_train_trials=args.num_train_trials,
+        models_to_run=args.models_to_run,
+        groups_to_run=args.groups_to_run,
+        priority=args.priority,
+    )
+    hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")
+
+    if len(run_specs) == 0:
+        hlog("There were no RunSpecs or they got filtered out.")
+        return
+
     auth: Authentication = Authentication("") if args.skip_instances or args.local else create_authentication(args)
     run_benchmarking(
-
+        run_specs=run_specs,
         auth=auth,
         url=args.server_url,
         local=args.local,
@@ -171,10 +262,15 @@ def main():
         suite=args.suite,
         dry_run=args.dry_run,
         skip_instances=args.skip_instances,
-
+        skip_completed_runs=args.skip_completed_runs,
+        exit_on_error=args.exit_on_error,
         mongo_uri=args.mongo_uri,
     )

+    symlink_latest(output_path=args.output_path, suite=args.suite)
+
+    hlog("Done.")
+

 if __name__ == "__main__":
     main()
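Taken together, main() now funnels everything through RunEntry objects: conf files passed with -c/--conf-paths and ad-hoc -r/--run-specs descriptions are merged into one list, filtered by --priority, --models-to-run and --groups-to-run, and only then expanded into RunSpecs by run_entries_to_run_specs(). A small programmatic sketch of that same path; the mmlu descriptions are illustrative and assume the corresponding run specs are registered in run_specs.py.

from helm.benchmark.presentation.run_entry import RunEntry
from helm.benchmark.run import run_entries_to_run_specs

entries = [
    RunEntry(description="mmlu:subject=philosophy,model=openai/davinci", priority=1, groups=None),
    RunEntry(description="mmlu:subject=anatomy,model=openai/davinci", priority=3, groups=None),
]
run_specs = run_entries_to_run_specs(
    run_entries=entries,
    max_eval_instances=10,              # overrides AdapterSpec.max_eval_instances on every kept spec
    models_to_run=["openai/davinci"],   # drops specs for any other model
    priority=2,                         # keeps the first entry, filters out the priority-3 one
)
for run_spec in run_specs:
    print(run_spec.name)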
helm/benchmark/run_expander.py
CHANGED
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import replace
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple, Type

 from helm.proxy.models import (
     get_all_code_models,
@@ -302,35 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
     """

     name = "model"
-    values_dict = {
-        "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-        "openai/curie": ["openai/curie"],
-        "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-        "all": get_all_models(),
-        "text_code": get_all_text_models() + get_all_code_models(),
-        "text": get_all_text_models(),
-        "code": get_all_code_models(),
-        "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-        "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-        "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-        "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-        "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-        "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-    }

-
-
-
-
-
-    for family_name, models in ablation_values_dict.items():
-        if family_name == "ablation_all":
-            values_dict["ablation"] = models
+    def __init__(self, value):
+        """
+        `value` is either the actual value to use or a lookup into the values dict.
+        """
+        if value in self.values_dict:
+            self.values = self.values_dict[value]
         else:
-
+            self.values = [value]
+
+    @property
+    def values_dict(self):
+        values_dict = {
+            "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+            "openai/curie": ["openai/curie"],
+            "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+            "all": get_all_models(),
+            "text_code": get_all_text_models() + get_all_code_models(),
+            "text": get_all_text_models(),
+            "code": get_all_code_models(),
+            "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+            "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+            "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+            "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+            "opinions_qa_openai": [
+                "openai/ada",
+                "openai/davinci",
+                "openai/text-ada-001",
+                "openai/text-davinci-001",
+                "openai/text-davinci-002",
+                "openai/text-davinci-003",
+            ],
+            "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        }
+
+        # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+        # which contains the subset of models with the ablation tag.
+        ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+        ablation_values_dict = {}
+        for family_name, models in values_dict.items():
+            ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+        for family_name, models in ablation_values_dict.items():
+            if family_name == "ablation_all":
+                values_dict["ablation"] = models
+            else:
+                values_dict[family_name] = models
+        return values_dict


 ############################################################
@@ -739,6 +762,7 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
         "AlephAlpha/luminous-extended": ["AlephAlpha/luminous-extended"],
         "AlephAlpha/luminous-supreme": ["AlephAlpha/luminous-supreme"],
         "AlephAlpha/luminous-world": ["AlephAlpha/luminous-world"],
+        "huggingface/santacoder": ["bigcode/santacoder"],
     }
     model_tags_and_tokenizers = [
         (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
@@ -768,7 +792,8 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         # Find right tokenizer given model.
         if isinstance(self.all_values, dict):
-
+            model: str = run_spec.adapter_spec.model
+            self.values = self.all_values[model] if model in self.all_values else []
         else:
             self.values = self.all_values
         return super().expand(run_spec)
@@ -818,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
     ]


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class ChatMLRunExpander(RunExpander):
+    """
+    Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+    A 1-shot example:
+    <|im_start|>system
+    Translate from English to French
+    <|im_end|>
+    <|im_start|>user
+    How are you?
+    <|im_end|>
+    <|im_start|>user
+    Comment allez-vous?
+    <|im_end|>
+    <|im_start|>user
+    {{user input here}}<|im_end|>
+    """
+
+    name = "chatml"
+
+    def __init__(self):
+        self.name = type(self).name
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = run_spec.adapter_spec
+        # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+        # few-shot examples should do `<|im_start|>system name=example_user`
+        # or `<|im_start|>system name=example_assistant`
+        # but it is also possible to put examples into a user message.
+
+        scenario_name = run_spec.name.split(":")[0]
+
+        if scenario_name in ("msmarco",):
+            # output_prefix:
+            # Does the passage answer the query?
+            # Answer:
+            #
+            # new_output_prefix:
+            # Does the passage answer the query?<|im_end|>
+            # <|im_start|>assistant
+            # Answer:
+
+            new_output_prefix = (
+                adapter_spec.output_prefix.split("\n")[0]
+                + "<|im_end|>\n<|im_start|>assistant\n"
+                + adapter_spec.output_prefix.split("\n")[1]
+            )
+
+        elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+            # output_prefix:
+            # Summarize the above article in 1 sentence.
+            #
+            # new_output_prefix:
+            # Summarize the above article in 1 sentence.<|im_end|>
+            # <|im_start|>assistant
+            #
+
+            new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+        else:
+            # output_prefix:
+            # {output_prefix}
+            #
+            # new_output_prefix:
+            # <|im_end|>
+            # <|im_start|>assistant
+            # {output_prefix}
+
+            new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+        adapter_spec = replace(
+            adapter_spec,
+            # This is a hack to make sure <|im_start|>user goes before the reference.
+            instructions=(
+                f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                if adapter_spec.instructions != ""
+                else "<|im_start|>user\n"
+            ),
+            instance_prefix="",
+            output_prefix=new_output_prefix,
+            output_suffix="<|im_end|>\n<|im_start|>user\n",
+            stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+        )
+
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
+RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+    InstructionsRunExpander,
+    PromptRunExpander,
+    NewlineRunExpander,
+    StopRunExpander,
+    GlobalPrefixRunExpander,
+    NumTrainTrialsRunExpander,
+    MaxTrainInstancesRunExpander,
+    NumOutputsRunExpander,
+    ModelRunExpander,
+    DataAugmentationRunExpander,
+    TokenizerRunExpander,
+    NumPromptTokensRunExpander,
+    NumOutputTokensRunExpander,
+    ChatMLRunExpander,
+]
+
+
+RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)