crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs.py CHANGED
@@ -14,15 +14,28 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
+from .scenarios.lex_glue_scenario import (
+    get_lex_glue_max_train_instances,
+    get_lex_glue_instructions,
+    get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
+)
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
 from .scenarios.msmarco_scenario import MSMARCOScenario
 from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
 from .scenarios.copyright_scenario import datatag2hash_code
 from .scenarios.raft_scenario import get_raft_instructions
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from .scenarios.lextreme_scenario import (
+    get_lextreme_instructions,
+    get_lextreme_max_train_instances,
+    get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
+)
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton
 
 
@@ -37,7 +50,14 @@ def format_instructions(instructions: str) -> str:
 
 
 def get_multiple_choice_joint_adapter_spec(
-    instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -54,6 +74,7 @@ def get_multiple_choice_joint_adapter_spec(
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -62,10 +83,11 @@ def get_multiple_choice_joint_adapter_spec(
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=1,
-        max_tokens=5,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )
 
@@ -99,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions, input_noun, output_noun, max_train_instances, **kwargs
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -304,6 +337,27 @@ def get_summarization_adapter_spec(num_sents: int, **kwargs) -> AdapterSpec:
     )
 
 
+def get_machine_translation_adapter_spec(
+    source_language, target_language, max_train_instances, **kwargs
+) -> AdapterSpec:
+    """
+    Used for machine translation.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=f"Translate {source_language} to {target_language}:",
+        input_prefix="",
+        input_suffix=" = ",
+        output_prefix="",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["\n\n"],
+        temperature=0.0,
+        **kwargs,
+    )
+
+
 ############################################################
 # Examples of scenario and adapter specs
 
@@ -354,6 +408,14 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
 
 
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]
+
+
 def get_bbq_metric_specs() -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.bbq_metrics.BBQMetric", args={})] + get_exact_match_metric_specs()
 
@@ -479,6 +541,16 @@ def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.code_metrics.APPSMetric", args=args)]
 
 
+def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
+
+
+def get_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [
+        MetricSpec(class_name="helm.benchmark.machine_translation_metrics.MachineTranslationMetric", args={})
+    ] + get_basic_metric_specs([])
+
+
 ############################################################
 # Run specs
 
@@ -564,7 +636,9 @@ def get_civil_comments_spec(demographic: str) -> RunSpec:
         name=f"civil_comments:demographic={demographic}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["civil_comments"],
     )
 
@@ -809,7 +883,9 @@ def get_raft_spec(subset: str) -> RunSpec:
         name=f"raft:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["raft"],
     )
 
@@ -971,7 +1047,7 @@ def get_imdb_spec(only_contrast=False) -> RunSpec:
         name="imdb" + (":only_contrast=True" if only_contrast else ""),
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
         groups=["imdb"],
     )
 
@@ -1182,10 +1258,7 @@ def get_narrativeqa_spec() -> RunSpec:
         name="narrative_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_basic_metric_specs(
-            ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"]
-        )
-        + get_generative_harms_metric_specs(),
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
         groups=["narrative_qa"],
     )
 
@@ -1509,7 +1582,7 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     adapter_spec = AdapterSpec(
         method=get_adaptation_method(big_bench_task["metrics"]),
         model="openai/text-curie-001",  # Can override with the `ModelRunExpander`.
-        max_train_instances=0,  # Can override with the `MaxTrainInstancesRunExpander`.
+        max_train_instances=5,  # Can override with the `MaxTrainInstancesRunExpander`.
         num_outputs=1,  # Can override with the `NumOutputsRunExpander`.
         # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
         # for the BIG-G models tested on BIG-bench, "we use an input context length of 1,024 tokens
@@ -1541,36 +1614,136 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     )
 
 
-def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
+def get_covid_dialog_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.covid_dialog_scenario.COVIDDialogScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Generate a response given a patient's questions and concerns.",
+        input_noun="Patient",
+        output_noun="Doctor",
+        max_tokens=128,
+    )
+
+    return RunSpec(
+        name="covid_dialog",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["COVIDDialog"],
+    )
+
+
+def get_me_q_sum_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.me_q_sum_scenario.MeQSumScenario", args={})
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="me_q_sum",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MeQSum"],
+    )
+
+
+def get_med_dialog_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name=f"med_dialog,subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedDialog"],
+    )
+
+
+def get_med_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedMCQA"],
+    )
+
+
+def get_med_paragraph_simplification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_paragraph_simplification_scenario.MedParagraphSimplificationScenario",
+        args={},
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=10,
+        max_tokens=512,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="med_paragraph_simplification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedParagraphSimplification"],
+    )
+
+
+def get_med_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedQA"],
+    )
+
+
+def get_pubmed_qa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
 
-    # We are trying to reproduce the Instruct-GPT3's zero-shot performance of 73.2% from
-    # "Can large language models reason about medical questions?" (Liévin et al.).
-    # Therefore, specify the values of the fields of `AdapterSpec` based on experiment details of the paper.
-    # Set `output_prefix` based on Table 1 (titled "Prompt templates") of the paper.
-    output_prefix: str = "Answer: "
-    if prompt_answer_choices.lower() == "true":
-        output_prefix += "among A through C, the answer is "
-
-    # Liévin et al. followed what Kojima et al. did in "Large Language Models are Zero-Shot Reasoners."
-    # to extract answers from completions: set the max completion length to a large number and
-    # "...pick up the first large letter encountered in the text." Then they set "'Q:'...as a customized stop
-    # sequence for all the models except for Instruct-GPT3 to stop the models from repeating questions and
-    # answers by themselves." We don't need to do this since our framework has a "multiple_choice_joint"
-    # adaptation method that handles the prompt construction for multiple-choice QA for us.
-    adapter_spec = AdapterSpec(
+    adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        max_train_instances=0,  # We want to reproduce the zero-shot performance.
-        # "We sampled one completion per prompt with a temperature of zero..."
-        num_outputs=1,
-        temperature=0,
-        input_prefix="",
-        output_prefix=output_prefix,
-        # Following the examples in https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html
-        reference_prefix="A) ",
+        instructions="Answer A for yes, B for no or C for maybe.",
+        input_noun="Question",
+        output_noun="Answer",
     )
+
     return RunSpec(
-        name=f"pubmed_qa:prompt_answer_choices={prompt_answer_choices}",
+        name="pubmed_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -1578,6 +1751,125 @@ def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
     )
 
 
+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
+def get_lextreme_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lextreme_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lextreme_max_tokens(subset),
+        max_train_instances=get_lextreme_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lextreme:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
+        groups=["lextreme"],
+    )
+
+
+def get_lex_glue_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lex_glue_scenario.LexGLUEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lex_glue_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lex_glue_max_tokens(subset),
+        max_train_instances=get_lex_glue_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lex_glue:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
+        groups=["lex_glue"],
+    )
+
+
+def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
+    FULL_LANGUAGE_NAMES = {
+        "cs": "Czech",
+        "de": "German",
+        "fr": "French",
+        "hi": "Hindi",
+        "ru": "Russian",
+        "en": "English",
+    }
+    source_language, target_language = language_pair.split("-")
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
+        args={"source_language": source_language, "target_language": target_language},
+    )
+
+    adapter_spec = get_machine_translation_adapter_spec(
+        source_language=FULL_LANGUAGE_NAMES[source_language],
+        target_language=FULL_LANGUAGE_NAMES[target_language],
+        max_train_instances=max_train_instances,
+    )
+
+    return RunSpec(
+        name=f"wmt_14:language_pair={language_pair}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_machine_translation_metric_specs(),
+        groups=["wmt_14"],
+    )
+
+
+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################
 
 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
@@ -1624,7 +1916,18 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "entity_data_imputation": get_entity_data_imputation_spec,
     "ice": get_ice_spec,
     "big_bench": get_big_bench_spec,
+    "lextreme": get_lextreme_spec,
+    "lex_glue": get_lex_glue_spec,
+    "wmt_14": get_wmt_14_spec,
+    # Biomedical
+    "covid_dialog": get_covid_dialog_spec,
+    "me_q_sum": get_me_q_sum_spec,
+    "med_dialog": get_med_dialog_spec,
+    "med_mcqa": get_med_mcqa_spec,
+    "med_paragraph_simplification": get_med_paragraph_simplification_spec,
+    "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }
 
 
@@ -1667,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
             global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
             run_spec = singleton(global_prefix_expander.expand(run_spec))
 
+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
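
Illustrative usage (not part of the diff): a minimal sketch of how two of the run-spec factories added above can be exercised, assuming crfm-helm 0.2.2 is installed. The chosen language pair is just an example; the printed names follow the formats visible in the hunks.

from helm.benchmark.run_specs import get_med_qa_spec, get_wmt_14_spec

med_qa = get_med_qa_spec()
print(med_qa.name)    # "med_qa", grouped under "MedQA"

wmt = get_wmt_14_spec(language_pair="de-en", max_train_instances=1)
print(wmt.name)       # "wmt_14:language_pair=de-en"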
helm/benchmark/runner.py CHANGED
@@ -1,10 +1,13 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
 from typing import List
 
+from tqdm import tqdm
+
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
@@ -23,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService
 
 
+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -68,15 +77,17 @@ class Runner:
         execution_spec: ExecutionSpec,
         output_path: str,
         suite: str,
-        run_specs: List[RunSpec],
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
-        self.run_specs: List[RunSpec] = run_specs
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error
 
         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -90,10 +101,21 @@ class Runner:
         self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
         ensure_directory_exists(self.eval_cache_path)
 
-    def run_all(self):
-        for run_spec in self.run_specs:
-            with htrack_block(f"Running {run_spec.name}"):
-                self.run_one(run_spec)
+    def run_all(self, run_specs: List[RunSpec]):
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")
 
     def run_one(self, run_spec: RunSpec):
         # Load the scenario
@@ -106,6 +128,12 @@ class Runner:
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)
 
+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
 
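Illustrative usage (not part of the diff): a minimal sketch of how the reworked Runner interface fits together. The output path, suite name, and the ExecutionSpec import path are assumptions based on the 0.2.x layout; only the constructor arguments, the run_all signature, and the RunnerError behavior come from the hunks above.

from typing import List

from helm.benchmark.executor import ExecutionSpec  # assumed module path
from helm.benchmark.runner import Runner, RunnerError, RunSpec


def run_benchmarks(execution_spec: ExecutionSpec, run_specs: List[RunSpec]) -> None:
    runner = Runner(
        execution_spec=execution_spec,
        output_path="benchmark_output",  # placeholder output directory
        suite="my-suite",                # placeholder suite name
        skip_instances=False,
        skip_completed_runs=True,  # new: skip runs whose scenario_state.json already exists
        exit_on_error=False,       # new: log failures and keep going instead of stopping
    )
    try:
        runner.run_all(run_specs)  # run specs are now passed here rather than to __init__
    except RunnerError as e:
        print(e)  # raised once at the end, naming every failed run spec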
helm/benchmark/scenarios/copyright_scenario.py CHANGED
@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):
 
         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),
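
Context for the one-line change above: tqdm's disable=None shows the progress bar only when the output stream is attached to a TTY, so redirected batch logs are not flooded. A standalone illustration:

from tqdm import tqdm

# With disable=None, the bar appears in an interactive terminal but is
# suppressed when stderr is redirected to a file (tqdm's documented non-TTY behavior).
for _ in tqdm(range(3), desc="load instances", disable=None):
    pass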