crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
  2. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
  3. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
  5. helm/benchmark/__init__.py +2 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/contamination/__init__.py +0 -0
  9. helm/benchmark/metrics/classification_metrics.py +28 -23
  10. helm/benchmark/metrics/test_classification_metrics.py +44 -9
  11. helm/benchmark/presentation/create_plots.py +617 -0
  12. helm/benchmark/presentation/summarize.py +4 -2
  13. helm/benchmark/presentation/test_create_plots.py +32 -0
  14. helm/benchmark/run.py +23 -1
  15. helm/benchmark/run_expander.py +161 -47
  16. helm/benchmark/run_specs.py +84 -10
  17. helm/benchmark/runner.py +31 -3
  18. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  19. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  20. helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  21. helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  22. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  23. helm/benchmark/scenarios/scenario.py +5 -0
  24. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  25. helm/benchmark/static/benchmarking.css +14 -0
  26. helm/benchmark/static/benchmarking.js +43 -0
  27. helm/benchmark/static/index.html +2 -0
  28. helm/benchmark/static/json-urls.js +4 -0
  29. helm/benchmark/static/plot-captions.js +16 -0
  30. helm/benchmark/static/schema.yaml +66 -8
  31. helm/benchmark/window_services/cohere_window_service.py +20 -0
  32. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  33. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  34. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  35. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  36. helm/benchmark/window_services/window_service_factory.py +27 -6
  37. helm/common/general.py +12 -5
  38. helm/proxy/clients/aleph_alpha_client.py +47 -28
  39. helm/proxy/clients/auto_client.py +28 -24
  40. helm/proxy/clients/huggingface_client.py +30 -17
  41. helm/proxy/clients/huggingface_model_registry.py +111 -0
  42. helm/proxy/clients/huggingface_tokenizer.py +23 -7
  43. helm/proxy/clients/openai_client.py +60 -2
  44. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  45. helm/proxy/clients/together_client.py +17 -2
  46. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  47. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  48. helm/proxy/models.py +82 -2
  49. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  50. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py CHANGED
@@ -7,6 +7,7 @@ from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec
+from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
 from helm.proxy.services.remote_service import create_authentication, add_service_args

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -73,6 +74,8 @@ def run_benchmarking(
     suite: str,
     dry_run: bool,
     skip_instances: bool,
+    skip_completed_runs: bool,
+    exit_on_error: bool,
     mongo_uri: str = "",
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
@@ -89,7 +92,7 @@
         for run_spec in run_specs:
             hlog(run_spec)

-    runner = Runner(execution_spec, output_path, suite, skip_instances)
+    runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
     runner.run_all(run_specs)
     return run_specs

@@ -197,6 +200,12 @@
         default=None,
         help="Fail and exit immediately if a particular RunSpec fails.",
     )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        default=None,
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
     parser.add_argument(
         "--priority",
         type=int,
@@ -205,9 +214,20 @@
         "If a value for --priority is not specified, run on everything",
     )
     parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
+
+    for huggingface_model_name in args.enable_huggingface_models:
+        register_huggingface_model_config(huggingface_model_name)
+
     run_entries: List[RunEntry] = []
     if args.conf_paths:
         run_entries.extend(read_run_entries(args.conf_paths).entries)
@@ -242,6 +262,8 @@
         suite=args.suite,
         dry_run=args.dry_run,
         skip_instances=args.skip_instances,
+        skip_completed_runs=args.skip_completed_runs,
+        exit_on_error=args.exit_on_error,
         mongo_uri=args.mongo_uri,
     )

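The new --enable-huggingface-models flag feeds each of its arguments to register_huggingface_model_config before run entries are read. A minimal sketch of that call path, mirroring the loop added to main() above; the model names below are hypothetical placeholders in the namespace/model_name[@revision] format described by the flag's help text, not tested configurations.

# Sketch only: mirrors what helm-run now does for each --enable-huggingface-models argument.
from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config

enabled_models = ["my-org/my-model", "my-org/my-model@v1.0"]  # hypothetical names
for huggingface_model_name in enabled_models:
    register_huggingface_model_config(huggingface_model_name)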
helm/benchmark/run_expander.py CHANGED
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import replace
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple, Type

 from helm.proxy.models import (
     get_all_code_models,
@@ -302,36 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
     """

     name = "model"
-    values_dict = {
-        "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-        "openai/curie": ["openai/curie"],
-        "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-        "all": get_all_models(),
-        "text_code": get_all_text_models() + get_all_code_models(),
-        "text": get_all_text_models(),
-        "code": get_all_code_models(),
-        "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-        "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-        "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-        "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-        "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-        "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
-        "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-    }

-    # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
-    # which contains the subset of models with the ablation tag.
-    ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
-    ablation_values_dict = {}
-    for family_name, models in values_dict.items():
-        ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
-    for family_name, models in ablation_values_dict.items():
-        if family_name == "ablation_all":
-            values_dict["ablation"] = models
+    def __init__(self, value):
+        """
+        `value` is either the actual value to use or a lookup into the values dict.
+        """
+        if value in self.values_dict:
+            self.values = self.values_dict[value]
         else:
-            values_dict[family_name] = models
+            self.values = [value]
+
+    @property
+    def values_dict(self):
+        values_dict = {
+            "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+            "openai/curie": ["openai/curie"],
+            "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+            "all": get_all_models(),
+            "text_code": get_all_text_models() + get_all_code_models(),
+            "text": get_all_text_models(),
+            "code": get_all_code_models(),
+            "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+            "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+            "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+            "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+            "opinions_qa_openai": [
+                "openai/ada",
+                "openai/davinci",
+                "openai/text-ada-001",
+                "openai/text-davinci-001",
+                "openai/text-davinci-002",
+                "openai/text-davinci-003",
+            ],
+            "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        }
+
+        # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+        # which contains the subset of models with the ablation tag.
+        ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+        ablation_values_dict = {}
+        for family_name, models in values_dict.items():
+            ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+        for family_name, models in ablation_values_dict.items():
+            if family_name == "ablation_all":
+                values_dict["ablation"] = models
+            else:
+                values_dict[family_name] = models
+        return values_dict


 ############################################################
@@ -821,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
         ]


-RUN_EXPANDERS = dict(
-    (expander.name, expander)
-    for expander in [
-        InstructionsRunExpander,
-        PromptRunExpander,
-        NewlineRunExpander,
-        StopRunExpander,
-        GlobalPrefixRunExpander,
-        NumTrainTrialsRunExpander,
-        MaxTrainInstancesRunExpander,
-        NumOutputsRunExpander,
-        ModelRunExpander,
-        DataAugmentationRunExpander,
-        TokenizerRunExpander,
-        NumPromptTokensRunExpander,
-        NumOutputTokensRunExpander,
-    ]
-)
+class ChatMLRunExpander(RunExpander):
+    """
+    Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+    A 1-shot example:
+    <|im_start|>system
+    Translate from English to French
+    <|im_end|>
+    <|im_start|>user
+    How are you?
+    <|im_end|>
+    <|im_start|>user
+    Comment allez-vous?
+    <|im_end|>
+    <|im_start|>user
+    {{user input here}}<|im_end|>
+    """
+
+    name = "chatml"
+
+    def __init__(self):
+        self.name = type(self).name
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = run_spec.adapter_spec
+        # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+        # few-shot examples should do `<|im_start|>system name=example_user`
+        # or `<|im_start|>system name=example_assistant`
+        # but it is also possible to put examples into a user message.
+
+        scenario_name = run_spec.name.split(":")[0]
+
+        if scenario_name in ("msmarco",):
+            # output_prefix:
+            # Does the passage answer the query?
+            # Answer:
+            #
+            # new_output_prefix:
+            # Does the passage answer the query?<|im_end|>
+            # <|im_start|>assistant
+            # Answer:
+
+            new_output_prefix = (
+                adapter_spec.output_prefix.split("\n")[0]
+                + "<|im_end|>\n<|im_start|>assistant\n"
+                + adapter_spec.output_prefix.split("\n")[1]
+            )
+
+        elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+            # output_prefix:
+            # Summarize the above article in 1 sentence.
+            #
+            # new_output_prefix:
+            # Summarize the above article in 1 sentence.<|im_end|>
+            # <|im_start|>assistant
+            #
+
+            new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+        else:
+            # output_prefix:
+            # {output_prefix}
+            #
+            # new_output_prefix:
+            # <|im_end|>
+            # <|im_start|>assistant
+            # {output_prefix}
+
+            new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+        adapter_spec = replace(
+            adapter_spec,
+            # This is a hack to make sure <|im_start|>user goes before the reference.
+            instructions=(
+                f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                if adapter_spec.instructions != ""
+                else "<|im_start|>user\n"
+            ),
+            instance_prefix="",
+            output_prefix=new_output_prefix,
+            output_suffix="<|im_end|>\n<|im_start|>user\n",
+            stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+        )
+
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
+RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+    InstructionsRunExpander,
+    PromptRunExpander,
+    NewlineRunExpander,
+    StopRunExpander,
+    GlobalPrefixRunExpander,
+    NumTrainTrialsRunExpander,
+    MaxTrainInstancesRunExpander,
+    NumOutputsRunExpander,
+    ModelRunExpander,
+    DataAugmentationRunExpander,
+    TokenizerRunExpander,
+    NumPromptTokensRunExpander,
+    NumOutputTokensRunExpander,
+    ChatMLRunExpander,
+]
+
+
+RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)
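ModelRunExpander now resolves its argument at construction time: a key found in values_dict expands to that model family, and anything else is passed through as a literal model name. A small sketch under that reading of the __init__ added above; "my-org/my-model" is a hypothetical name that is not a values_dict key.

# Sketch only: demonstrates the family-vs-literal lookup in ModelRunExpander.__init__.
from helm.benchmark.run_expander import ModelRunExpander

family = ModelRunExpander("opinions_qa_ai21")
print(family.values)   # ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"]

literal = ModelRunExpander("my-org/my-model")  # hypothetical, not a values_dict key
print(literal.values)  # ["my-org/my-model"]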
helm/benchmark/run_specs.py CHANGED
@@ -14,12 +14,13 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
 from .scenarios.lex_glue_scenario import (
     get_lex_glue_max_train_instances,
     get_lex_glue_instructions,
     get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
 )
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
@@ -31,8 +32,10 @@ from .scenarios.lextreme_scenario import (
     get_lextreme_instructions,
     get_lextreme_max_train_instances,
     get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
 )
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton


@@ -47,7 +50,14 @@ def format_instructions(instructions: str) -> str:


 def get_multiple_choice_joint_adapter_spec(
-    instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -64,6 +74,7 @@
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -72,10 +83,11 @@
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=1,
-        max_tokens=5,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )

@@ -109,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions, input_noun, output_noun, max_train_instances, **kwargs
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -385,8 +408,12 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])


-def get_classification_metric_specs() -> List[MetricSpec]:
-    return [MetricSpec(class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={})]
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]


 def get_bbq_metric_specs() -> List[MetricSpec]:
@@ -1724,6 +1751,14 @@ def get_pubmed_qa_spec() -> RunSpec:
     )


+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
 def get_lextreme_spec(subset: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
@@ -1742,7 +1777,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
         name=f"lextreme:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_f1_metric_specs(),
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
         groups=["lextreme"],
     )

@@ -1765,7 +1800,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
         name=f"lex_glue:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_f1_metric_specs(),
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
         groups=["lex_glue"],
     )

@@ -1801,6 +1836,40 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
     )


+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################

 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
@@ -1858,6 +1927,7 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "med_paragraph_simplification": get_med_paragraph_simplification_spec,
     "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }


@@ -1900,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
         run_spec = singleton(global_prefix_expander.expand(run_spec))

+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
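The new get_opinions_qa_spec is registered under the "opinions_qa" key of CANONICAL_RUN_SPEC_FUNCS above. A hedged sketch of calling it directly; the survey identifier is a hypothetical placeholder, and only the behavior visible in the hunk above is assumed.

# Sketch only: the survey name below is a placeholder, not a real OpinionsQA survey.
from helm.benchmark.run_specs import get_opinions_qa_spec

run_spec = get_opinions_qa_spec(
    survey_type="American_Trends_Panel_W26",  # hypothetical survey identifier
    num_logprobs="10",                        # string, converted with int() into num_outputs
    context="steer-qa",                       # "steer" in context -> one in-context train example
)
print(run_spec.name)
# opinions_qa:survey=American_Trends_Panel_W26,num_logprobs=10,context=steer-qa,num_train_trials=1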
helm/benchmark/runner.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
@@ -25,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService


+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -71,12 +78,16 @@ class Runner:
         output_path: str,
         suite: str,
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error

         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -91,9 +102,20 @@
         ensure_directory_exists(self.eval_cache_path)

     def run_all(self, run_specs: List[RunSpec]):
-        for run_spec in tqdm(run_specs):
-            with htrack_block(f"Running {run_spec.name}"):
-                self.run_one(run_spec)
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")

     def run_one(self, run_spec: RunSpec):
         # Load the scenario
@@ -106,6 +128,12 @@
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)

+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)

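With exit_on_error=False, run_all now finishes the whole list and raises a single RunnerError naming the failed runs. A sketch of how a caller might surface that; `runner` and `run_specs` are assumed to be constructed as in run.py above.

# Sketch only: assumes `runner` and `run_specs` already exist.
from helm.benchmark.runner import RunnerError

try:
    runner.run_all(run_specs)
except RunnerError as e:
    # Per-run tracebacks were already logged via hlog inside run_all.
    print(f"Some runs failed: {e}")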
helm/benchmark/scenarios/copyright_scenario.py CHANGED
@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):

         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),
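The disable=None argument passed to tqdm here and in runner.py follows tqdm's documented behavior: hide the progress bar when output is not a TTY (e.g., redirected logs) while keeping it for interactive terminals. A minimal illustration:

# Progress bar appears in a terminal, stays silent when output is redirected to a file.
from tqdm import tqdm

for _ in tqdm(range(1000), desc="load instances", disable=None):
    pass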