crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
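Several of the new modules above (helm/proxy/clients/huggingface_model_registry.py and the new window services) support running models pulled from the Hugging Face Hub. As the helm/benchmark/run.py diff below shows, such models are registered by name before run specs are constructed. A minimal sketch of that call, assuming only what is visible in this diff (a single "namespace/model_name[@revision]" string argument):

from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config

# Make an AutoModelForCausalLM from the Hugging Face Hub available to HELM runs.
# The name format comes from run.py's --enable-huggingface-models help text below.
register_huggingface_model_config("bigcode/santacoder")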
helm/benchmark/presentation/run_display.py CHANGED
@@ -1,11 +1,8 @@
  from collections import OrderedDict, defaultdict
  from dataclasses import dataclass
  import os
- import json
  from typing import Dict, Iterable, List, Optional, Set, Tuple

- import dacite
-
  from helm.benchmark.adaptation.adapters.adapter_factory import (
      ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
      ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -13,48 +10,15 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.adaptation.scenario_state import ScenarioState
- from helm.benchmark.augmentations.dialect_perturbation import DialectPerturbation
- from helm.benchmark.augmentations.extra_space_perturbation import ExtraSpacePerturbation
- from helm.benchmark.augmentations.filler_words_perturbation import FillerWordsPerturbation
- from helm.benchmark.augmentations.gender_perturbation import GenderPerturbation
- from helm.benchmark.augmentations.misspelling_perturbation import MisspellingPerturbation
- from helm.benchmark.augmentations.person_name_perturbation import PersonNamePerturbation
  from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
- from helm.benchmark.augmentations.space_perturbation import SpacePerturbation
- from helm.benchmark.augmentations.synonym_perturbation import SynonymPerturbation
- from helm.benchmark.augmentations.typos_perturbation import TyposPerturbation
  from helm.benchmark.metrics.metric import PerInstanceStats
  from helm.benchmark.presentation.schema import Schema
  from helm.benchmark.runner import RunSpec
  from helm.benchmark.scenarios.scenario import Instance
- from helm.common.general import asdict_without_nones, write
+ from helm.common.general import write
  from helm.common.hierarchical_logger import htrack
  from helm.common.request import Request
-
-
- # TODO(#1251): Add proper class registration
- _PERTURBATION_NAME_TO_DESCRIPTION = {
-     DialectPerturbation.name: DialectPerturbation.Description,
-     ExtraSpacePerturbation.name: ExtraSpacePerturbation.Description,
-     FillerWordsPerturbation.name: FillerWordsPerturbation.Description,
-     GenderPerturbation.name: GenderPerturbation.Description,
-     MisspellingPerturbation.name: MisspellingPerturbation.Description,
-     PersonNamePerturbation.name: PersonNamePerturbation.Description,
-     SpacePerturbation.name: SpacePerturbation.Description,
-     SynonymPerturbation.name: SynonymPerturbation.Description,
-     TyposPerturbation.name: TyposPerturbation.Description,
- }
-
-
- def _deserialize_perturbation_description(raw_perturbation_description: Dict) -> PerturbationDescription:
-     """Convert a raw dictionary to a PerturbationDescription.
-     This uses the name field to look up the correct PerturbationDescription subclass to output.
-     """
-     factory = _PERTURBATION_NAME_TO_DESCRIPTION.get(raw_perturbation_description["name"], PerturbationDescription)
-     return factory(**raw_perturbation_description)
-
-
- _DACITE_CONFIG = dacite.Config(type_hooks={PerturbationDescription: _deserialize_perturbation_description})
+ from helm.common.codec import from_json, to_json


  @dataclass(frozen=True)
@@ -117,8 +81,7 @@ def _read_scenario_state(run_path: str) -> ScenarioState:
      if not os.path.exists(scenario_state_path):
          raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
      with open(scenario_state_path) as f:
-         raw_scenario_state = json.load(f)
-         return dacite.from_dict(ScenarioState, raw_scenario_state, config=_DACITE_CONFIG)
+         return from_json(f.read(), ScenarioState)


  def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
@@ -126,8 +89,7 @@ def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
      if not os.path.exists(per_instance_stats_path):
          raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
      with open(per_instance_stats_path) as f:
-         raw_per_instance_stats = json.load(f)
-         return [dacite.from_dict(PerInstanceStats, r, config=_DACITE_CONFIG) for r in raw_per_instance_stats]
+         return from_json(f.read(), List[PerInstanceStats])


  def _truncate_predicted_text(
@@ -286,13 +248,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema):

      write(
          os.path.join(run_path, "instances.json"),
-         json.dumps(list(map(asdict_without_nones, instance_id_to_instance.values())), indent=2),
-     )
-     write(
-         os.path.join(run_path, "display_predictions.json"),
-         json.dumps(list(map(asdict_without_nones, predictions)), indent=2),
+         to_json(list(instance_id_to_instance.values())),
      )
+     write(os.path.join(run_path, "display_predictions.json"), to_json(predictions))
      write(
          os.path.join(run_path, "display_requests.json"),
-         json.dumps(list(map(asdict_without_nones, requests)), indent=2),
+         to_json(requests),
      )
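The dacite-based deserialization removed above is replaced by the new helm/common/codec.py module (files 49 and 51 in the list). A minimal round-trip sketch based only on the calls visible in this diff, from_json(text, Type) and to_json(obj), and assuming the codec handles plain frozen dataclasses the way run_display.py relies on it to; the Point class is a hypothetical example type, not part of HELM:

from dataclasses import dataclass
from typing import List

from helm.common.codec import from_json, to_json


@dataclass(frozen=True)
class Point:  # hypothetical example type
    x: int
    y: int


serialized: str = to_json([Point(1, 2), Point(3, 4)])
# Parameterized types are supported, mirroring from_json(f.read(), List[PerInstanceStats]) above.
points: List[Point] = from_json(serialized, List[Point])
assert points == [Point(1, 2), Point(3, 4)]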
helm/benchmark/presentation/summarize.py CHANGED
@@ -205,6 +205,9 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
      return aggregate_win_rates


+ AGGREGATE_WIN_RATE_COLUMN = 1
+
+
  class Summarizer:
      """Summarize the benchmark results in JSON files to be displayed in the UI."""

@@ -288,7 +291,7 @@ class Summarizer:
          # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
          # so filter them out.
          run_dir_names = sorted([p for p in os.listdir(self.run_suite_path) if p != "eval_cache" and p != "groups"])
-         for run_dir_name in tqdm(run_dir_names):
+         for run_dir_name in tqdm(run_dir_names, disable=None):
              run_spec_path: str = os.path.join(self.run_suite_path, run_dir_name, "run_spec.json")
              stats_path: str = os.path.join(self.run_suite_path, run_dir_name, "stats.json")
              if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
@@ -747,7 +750,6 @@ class Summarizer:
          # add overall win rate as the second column
          WIN_RATE_AGGREGATION = "mean"
          win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-         AGGREGATE_WIN_RATE_COLUMN = 1
          description = "How many models this model outperform on average (over columns)."
          table.header.insert(
              AGGREGATE_WIN_RATE_COLUMN,
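Besides hoisting AGGREGATE_WIN_RATE_COLUMN to module level, the only behavioral change here is tqdm(..., disable=None), which makes tqdm suppress its progress bar automatically whenever the output stream is not a TTY (e.g. redirected CI logs). A standalone illustration of that tqdm behavior:

from tqdm import tqdm

# With disable=None, the progress bar is only rendered when stderr is attached to a terminal;
# piping the output into a file or CI log keeps it clean.
for run_dir_name in tqdm(["run_1", "run_2", "run_3"], disable=None):
    pass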
helm/benchmark/presentation/test_create_plots.py ADDED
@@ -0,0 +1,32 @@
+ from helm.common.general import asdict_without_nones
+ from helm.benchmark.presentation.table import Table, Cell, HeaderCell
+ from helm.benchmark.presentation.create_plots import parse_table
+
+
+ def test_table_parsing():
+     title = "table"
+     scenarios = ["A", "B", "C", "D"]
+     models = ["X", "Y", "Z"]
+     header = []
+     rows = [[] for m in models]
+
+     header.append(HeaderCell("Models"))
+     header.append(HeaderCell("Mean win rate"))
+     for s in scenarios:
+         header.append(HeaderCell(s, lower_is_better=True, metadata={"run_group": s, "metric": "accuracy"}))
+     for i, model in enumerate(models):
+         rows[i].append(Cell(model))
+         rows[i].append(Cell(0.1 * i))
+         for j, s in enumerate(scenarios):
+             rows[i].append(Cell(i * 10 + j))
+     summarize_table = Table(title, header, rows)
+     table = parse_table(asdict_without_nones(summarize_table))
+     assert table.adapters == models
+     assert list(table.mean_win_rates) == [0.0, 0.1, 0.2]
+     assert len(table.columns) == len(scenarios)
+     for j, c in enumerate(table.columns):
+         assert c.group == scenarios[j]
+         assert c.lower_is_better
+         assert c.metric == "accuracy"
+         for i, v in enumerate(c.values):
+             assert v == i * 10 + j
helm/benchmark/run.py CHANGED
@@ -1,10 +1,13 @@
  import argparse
  from dataclasses import replace
+ import os
  from typing import List, Optional

+ from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
  from helm.common.authentication import Authentication
  from helm.common.object_spec import parse_object_spec
+ from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
  from helm.proxy.services.remote_service import create_authentication, add_service_args

  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -16,8 +19,52 @@ from .run_specs import construct_run_specs
  LATEST_SYMLINK: str = "latest"


+ def run_entries_to_run_specs(
+     run_entries: List[RunEntry],
+     max_eval_instances: Optional[int] = None,
+     num_train_trials: Optional[int] = None,
+     models_to_run: Optional[List[str]] = None,
+     groups_to_run: Optional[List[str]] = None,
+     priority: Optional[int] = None,
+ ) -> List[RunSpec]:
+     """Runs RunSpecs given a list of RunSpec descriptions."""
+     run_specs: List[RunSpec] = []
+     for entry in run_entries:
+         # Filter by priority
+         if priority is not None and entry.priority > priority:
+             continue
+
+         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
+             # Filter by models
+             if models_to_run and run_spec.adapter_spec.model not in models_to_run:
+                 continue
+
+             # Filter by groups
+             if groups_to_run and not any(group in groups_to_run for group in run_spec.groups):
+                 continue
+
+             # Modify AdapterSpec
+             adapter_spec: AdapterSpec = run_spec.adapter_spec
+             if max_eval_instances is not None:
+                 adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
+             if num_train_trials is not None or adapter_spec.max_train_instances == 0:
+                 adapter_spec = replace(
+                     adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
+                 )
+             run_spec = replace(run_spec, adapter_spec=adapter_spec)
+
+             # Append groups
+             if entry.groups is not None:
+                 groups_name: str = "" if len(entry.groups) == 0 else f",groups={'-'.join(sorted(entry.groups))}"
+                 run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + entry.groups)
+
+             run_specs.append(run_spec)
+
+     return run_specs
+
+
  def run_benchmarking(
-     run_spec_descriptions: List[str],
+     run_specs: List[RunSpec],
      auth: Authentication,
      url: str,
      local: bool,
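The new run_entries_to_run_specs helper factors the old per-run-spec override logic out of run_benchmarking so it can also be applied to entries read from conf files. A minimal sketch of calling it directly; the RunEntry constructor is used exactly as main() uses it below, while the run spec description string and the instance cap are illustrative:

from helm.benchmark.presentation.run_entry import RunEntry
from helm.benchmark.run import run_entries_to_run_specs

entries = [RunEntry(description="mmlu:subject=anatomy,model=openai/davinci", priority=1, groups=None)]
# Entries whose priority exceeds the cutoff are dropped; the rest are expanded into RunSpecs
# and their AdapterSpecs are capped at 10 evaluation instances.
run_specs = run_entries_to_run_specs(entries, max_eval_instances=10, priority=2)
print([run_spec.name for run_spec in run_specs])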
@@ -27,15 +74,11 @@ def run_benchmarking(
      suite: str,
      dry_run: bool,
      skip_instances: bool,
-     max_eval_instances: Optional[int] = None,
-     num_train_trials: Optional[int] = None,
-     groups: Optional[List[str]] = None,
-     models_to_run: Optional[List[str]] = None,
-     groups_to_run: Optional[List[str]] = None,
+     skip_completed_runs: bool,
+     exit_on_error: bool,
      mongo_uri: str = "",
  ) -> List[RunSpec]:
      """Runs RunSpecs given a list of RunSpec descriptions."""
-
      execution_spec = ExecutionSpec(
          auth=auth,
          url=url,
@@ -45,47 +88,28 @@ def run_benchmarking(
          dry_run=dry_run,
          mongo_uri=mongo_uri,
      )
-
-     def override(run_spec: RunSpec) -> RunSpec:
-         """Override parts of `run_spec`."""
-         # Modify AdapterSpec
-         adapter_spec: AdapterSpec = run_spec.adapter_spec
-         if max_eval_instances is not None:
-             adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
-         if num_train_trials is not None or adapter_spec.max_train_instances == 0:
-             adapter_spec = replace(
-                 adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
-             )
-
-         run_spec = replace(run_spec, adapter_spec=adapter_spec)
-
-         # Append groups
-         if groups is not None:
-             groups_name: str = "" if len(groups) == 0 else f",groups={'-'.join(sorted(groups))}"
-             run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + groups)
-
-         return run_spec
-
-     run_specs = [
-         override(run_spec)
-         for description in run_spec_descriptions
-         for run_spec in construct_run_specs(parse_object_spec(description))
-         if (not models_to_run or run_spec.adapter_spec.model in models_to_run)
-         and (not groups_to_run or any(group in groups_to_run for group in run_spec.groups))
-     ]
-
-     if len(run_specs) == 0:
-         return run_specs
-
      with htrack_block("run_specs"):
          for run_spec in run_specs:
-             hlog(run_spec.name)
+             hlog(run_spec)

-     runner = Runner(execution_spec, output_path, suite, run_specs, skip_instances)
-     runner.run_all()
+     runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
+     runner.run_all(run_specs)
      return run_specs


+ def symlink_latest(output_path: str, suite: str) -> None:
+     # Create a symlink runs/latest -> runs/<name_of_suite>,
+     # so runs/latest always points to the latest run suite.
+     runs_dir: str = os.path.join(output_path, "runs")
+     suite_dir: str = os.path.join(runs_dir, suite)
+     symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
+     hlog(f"Symlinking {suite_dir} to {LATEST_SYMLINK}.")
+     if os.path.islink(symlink_path):
+         # Remove the previous symlink if it exists.
+         os.unlink(symlink_path)
+     os.symlink(os.path.abspath(suite_dir), symlink_path)
+
+
  def add_run_args(parser: argparse.ArgumentParser):
      parser.add_argument(
          "-o", "--output-path", type=str, help="Where to save all the output", default="benchmark_output"
@@ -149,19 +173,86 @@ def validate_args(args):

  @htrack(None)
  def main():
-     """
-     Main entry point for running the benchmark.
-     """
      parser = argparse.ArgumentParser()
      add_service_args(parser)
-     parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=["simple1"])
+     parser.add_argument(
+         "-c",
+         "--conf-paths",
+         nargs="+",
+         help="Where to read RunSpecs to run from",
+         default=[],
+     )
+     parser.add_argument(
+         "--models-to-run",
+         nargs="+",
+         help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+         default=None,
+     )
+     parser.add_argument(
+         "--groups-to-run",
+         nargs="+",
+         help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+         default=None,
+     )
+     parser.add_argument(
+         "--exit-on-error",
+         action="store_true",
+         default=None,
+         help="Fail and exit immediately if a particular RunSpec fails.",
+     )
+     parser.add_argument(
+         "--skip-completed-runs",
+         action="store_true",
+         default=None,
+         help="Skip RunSpecs that have completed i.e. output files exists.",
+     )
+     parser.add_argument(
+         "--priority",
+         type=int,
+         default=None,
+         help="Run RunSpecs with priority less than or equal to this number. "
+         "If a value for --priority is not specified, run on everything",
+     )
+     parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+     parser.add_argument(
+         "--enable-huggingface-models",
+         nargs="+",
+         default=[],
+         help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+         "Format: namespace/model_name[@revision]",
+     )
      add_run_args(parser)
      args = parser.parse_args()
      validate_args(args)

+     for huggingface_model_name in args.enable_huggingface_models:
+         register_huggingface_model_config(huggingface_model_name)
+
+     run_entries: List[RunEntry] = []
+     if args.conf_paths:
+         run_entries.extend(read_run_entries(args.conf_paths).entries)
+     if args.run_specs:
+         run_entries.extend(
+             [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
+         )
+
+     run_specs = run_entries_to_run_specs(
+         run_entries=run_entries,
+         max_eval_instances=args.max_eval_instances,
+         num_train_trials=args.num_train_trials,
+         models_to_run=args.models_to_run,
+         groups_to_run=args.groups_to_run,
+         priority=args.priority,
+     )
+     hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")
+
+     if len(run_specs) == 0:
+         hlog("There were no RunSpecs or they got filtered out.")
+         return
+
      auth: Authentication = Authentication("") if args.skip_instances or args.local else create_authentication(args)
      run_benchmarking(
-         args.run_specs,
+         run_specs=run_specs,
          auth=auth,
          url=args.server_url,
          local=args.local,
@@ -171,10 +262,15 @@ def main():
          suite=args.suite,
          dry_run=args.dry_run,
          skip_instances=args.skip_instances,
-         max_eval_instances=args.max_eval_instances,
+         skip_completed_runs=args.skip_completed_runs,
+         exit_on_error=args.exit_on_error,
          mongo_uri=args.mongo_uri,
      )

+     symlink_latest(output_path=args.output_path, suite=args.suite)
+
+     hlog("Done.")
+

  if __name__ == "__main__":
      main()
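Taken together, run.py now builds its work list from conf files (-c/--conf-paths) and/or ad hoc descriptions (-r/--run-specs), filters it by model, group, and priority, and finally symlinks runs/latest to the finished suite. A representative invocation, using only flags defined above (--suite and --max-eval-instances come from add_run_args, which is only partially shown here; the conf file name is illustrative):

python -m helm.benchmark.run --conf-paths run_specs.conf --suite v1 --max-eval-instances 10 --priority 2 --skip-completed-runs --enable-huggingface-models bigcode/santacoder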
helm/benchmark/run_expander.py CHANGED
@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from dataclasses import replace
- from typing import List, Dict, Optional, Tuple
+ from typing import List, Dict, Optional, Tuple, Type

  from helm.proxy.models import (
      get_all_code_models,
@@ -302,35 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
      """

      name = "model"
-     values_dict = {
-         "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-         "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-         "openai/curie": ["openai/curie"],
-         "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-         "all": get_all_models(),
-         "text_code": get_all_text_models() + get_all_code_models(),
-         "text": get_all_text_models(),
-         "code": get_all_code_models(),
-         "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-         "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-         "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-         "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-         "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-         "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-         "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-     }

-     # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
-     # which contains the subset of models with the ablation tag.
-     ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
-     ablation_values_dict = {}
-     for family_name, models in values_dict.items():
-         ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
-     for family_name, models in ablation_values_dict.items():
-         if family_name == "ablation_all":
-             values_dict["ablation"] = models
+     def __init__(self, value):
+         """
+         `value` is either the actual value to use or a lookup into the values dict.
+         """
+         if value in self.values_dict:
+             self.values = self.values_dict[value]
          else:
-             values_dict[family_name] = models
+             self.values = [value]
+
+     @property
+     def values_dict(self):
+         values_dict = {
+             "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+             "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+             "openai/curie": ["openai/curie"],
+             "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+             "all": get_all_models(),
+             "text_code": get_all_text_models() + get_all_code_models(),
+             "text": get_all_text_models(),
+             "code": get_all_code_models(),
+             "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+             "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+             "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+             "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+             "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+             "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+             "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+             "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+             "opinions_qa_openai": [
+                 "openai/ada",
+                 "openai/davinci",
+                 "openai/text-ada-001",
+                 "openai/text-davinci-001",
+                 "openai/text-davinci-002",
+                 "openai/text-davinci-003",
+             ],
+             "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+         }
+
+         # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+         # which contains the subset of models with the ablation tag.
+         ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+         ablation_values_dict = {}
+         for family_name, models in values_dict.items():
+             ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+         for family_name, models in ablation_values_dict.items():
+             if family_name == "ablation_all":
+                 values_dict["ablation"] = models
+             else:
+                 values_dict[family_name] = models
+         return values_dict


  ############################################################
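Because values_dict is now computed lazily in a property rather than at class-definition time, the model families reflect models registered after import (for example via --enable-huggingface-models). The lookup behavior introduced by the new __init__ can be shown directly; a sketch assuming only what the diff above defines:

from helm.benchmark.run_expander import ModelRunExpander

# A known family name expands to the whole family; anything else is treated as a literal model name.
print(ModelRunExpander("text").values)             # every registered text model
print(ModelRunExpander("openai/davinci").values)   # ["openai/davinci"]

In run spec descriptions this expander is addressed by its name, so a conf entry such as "mmlu:subject=anatomy,model=text" fans out into one RunSpec per model in the "text" family (the description string is illustrative).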
@@ -739,6 +762,7 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
          "AlephAlpha/luminous-extended": ["AlephAlpha/luminous-extended"],
          "AlephAlpha/luminous-supreme": ["AlephAlpha/luminous-supreme"],
          "AlephAlpha/luminous-world": ["AlephAlpha/luminous-world"],
+         "huggingface/santacoder": ["bigcode/santacoder"],
      }
      model_tags_and_tokenizers = [
          (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
@@ -768,7 +792,8 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
          # Find right tokenizer given model.
          if isinstance(self.all_values, dict):
-             self.values = self.all_values[run_spec.adapter_spec.model]
+             model: str = run_spec.adapter_spec.model
+             self.values = self.all_values[model] if model in self.all_values else []
          else:
              self.values = self.all_values
          return super().expand(run_spec)
@@ -818,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
          ]


- RUN_EXPANDERS = dict(
-     (expander.name, expander)
-     for expander in [
-         InstructionsRunExpander,
-         PromptRunExpander,
-         NewlineRunExpander,
-         StopRunExpander,
-         GlobalPrefixRunExpander,
-         NumTrainTrialsRunExpander,
-         MaxTrainInstancesRunExpander,
-         NumOutputsRunExpander,
-         ModelRunExpander,
-         DataAugmentationRunExpander,
-         TokenizerRunExpander,
-         NumPromptTokensRunExpander,
-         NumOutputTokensRunExpander,
-     ]
- )
+ class ChatMLRunExpander(RunExpander):
+     """
+     Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+     A 1-shot example:
+     <|im_start|>system
+     Translate from English to French
+     <|im_end|>
+     <|im_start|>user
+     How are you?
+     <|im_end|>
+     <|im_start|>user
+     Comment allez-vous?
+     <|im_end|>
+     <|im_start|>user
+     {{user input here}}<|im_end|>
+     """
+
+     name = "chatml"
+
+     def __init__(self):
+         self.name = type(self).name
+
+     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         adapter_spec = run_spec.adapter_spec
+         # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+         # few-shot examples should do `<|im_start|>system name=example_user`
+         # or `<|im_start|>system name=example_assistant`
+         # but it is also possible to put examples into a user message.
+
+         scenario_name = run_spec.name.split(":")[0]
+
+         if scenario_name in ("msmarco",):
+             # output_prefix:
+             #     Does the passage answer the query?
+             #     Answer:
+             #
+             # new_output_prefix:
+             #     Does the passage answer the query?<|im_end|>
+             #     <|im_start|>assistant
+             #     Answer:
+
+             new_output_prefix = (
+                 adapter_spec.output_prefix.split("\n")[0]
+                 + "<|im_end|>\n<|im_start|>assistant\n"
+                 + adapter_spec.output_prefix.split("\n")[1]
+             )
+
+         elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+             # output_prefix:
+             #     Summarize the above article in 1 sentence.
+             #
+             # new_output_prefix:
+             #     Summarize the above article in 1 sentence.<|im_end|>
+             #     <|im_start|>assistant
+             #
+
+             new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+         else:
+             # output_prefix:
+             #     {output_prefix}
+             #
+             # new_output_prefix:
+             #     <|im_end|>
+             #     <|im_start|>assistant
+             #     {output_prefix}
+
+             new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+         adapter_spec = replace(
+             adapter_spec,
+             # This is a hack to make sure <|im_start|>user goes before the reference.
+             instructions=(
+                 f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                 if adapter_spec.instructions != ""
+                 else "<|im_start|>user\n"
+             ),
+             instance_prefix="",
+             output_prefix=new_output_prefix,
+             output_suffix="<|im_end|>\n<|im_start|>user\n",
+             stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+         )
+
+         return [
+             replace(
+                 run_spec,
+                 adapter_spec=adapter_spec,
+             ),
+         ]
+
+
+ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+     InstructionsRunExpander,
+     PromptRunExpander,
+     NewlineRunExpander,
+     StopRunExpander,
+     GlobalPrefixRunExpander,
+     NumTrainTrialsRunExpander,
+     MaxTrainInstancesRunExpander,
+     NumOutputsRunExpander,
+     ModelRunExpander,
+     DataAugmentationRunExpander,
+     TokenizerRunExpander,
+     NumPromptTokensRunExpander,
+     NumOutputTokensRunExpander,
+     ChatMLRunExpander,
+ ]
+
+
+ RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)
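ChatMLRunExpander is the only new entry in the registry; it wraps an existing AdapterSpec's prompt pieces in ChatML turn markers and adds "<|im_end|>" as an extra stop sequence. A self-contained sketch (not HELM code) of the rewrite its default branch applies, using hypothetical instruction and prefix strings:

# Mirror ChatMLRunExpander's else-branch: instructions become a system turn followed by the
# start of a user turn, and the output prefix is pushed into an assistant turn.
instructions = "Answer the following question."
output_prefix = "Answer:"

chatml_instructions = f"<|im_start|>system\n{instructions}<|im_end|>\n<|im_start|>user\n"
chatml_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + output_prefix

print(chatml_instructions + "What is the capital of France?" + chatml_output_prefix)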