crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
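
The listing above can be reproduced locally by comparing the file manifests of the two wheels. Below is a minimal sketch using only the Python standard library; the wheel filenames are placeholders for copies downloaded beforehand (for example with `pip download crfm-helm==0.2.0 --no-deps`) and are not part of this diff.

```python
import zipfile

# Placeholder paths to locally downloaded wheels (assumed, not part of this diff).
OLD_WHEEL = "crfm_helm-0.2.0-py3-none-any.whl"
NEW_WHEEL = "crfm_helm-0.2.2-py3-none-any.whl"

old_files = set(zipfile.ZipFile(OLD_WHEEL).namelist())
new_files = set(zipfile.ZipFile(NEW_WHEEL).namelist())

# Files present only in the new wheel (e.g. the new scenarios and window services).
for path in sorted(new_files - old_files):
    print("+", path)

# Files removed in the new wheel (e.g. helm/benchmark/presentation/present.py).
for path in sorted(old_files - new_files):
    print("-", path)
```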
helm/benchmark/scenarios/lex_glue_scenario.py (new file)
@@ -0,0 +1,253 @@
+import random
+from pathlib import Path
+from typing import List, Any
+
+import datasets
+from datasets import load_dataset
+
+from .lextreme_scenario import TaskType
+from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Input, Output
+
+ECTHR_A = "ecthr_a"
+ECTHR_B = "ecthr_b"
+SCOTUS = "scotus"
+EURLEX = "eurlex"
+LEDGAR = "ledgar"
+UNFAIR_TOS = "unfair_tos"
+CASE_HOLD = "case_hold"
+
+TASK_CODE_MAPPING = {
+    ECTHR_A: TaskType.MLTC,
+    ECTHR_B: TaskType.MLTC,
+    SCOTUS: TaskType.SLTC,
+    EURLEX: TaskType.MLTC,
+    LEDGAR: TaskType.SLTC,
+    UNFAIR_TOS: TaskType.MLTC,
+    CASE_HOLD: TaskType.QA,
+}
+
+
+def get_lex_glue_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
+TASK_MAX_TRAIN_INSTANCES_MAPPING = {
+    ECTHR_A: 1,  # ~ max 4096 tokens
+    ECTHR_B: 1,  # ~ max 4096 tokens
+    SCOTUS: 1,  # ~ max 8192 tokens
+    EURLEX: 5,  # ~ max 512 tokens
+    LEDGAR: 5,  # ~ max 512 tokens
+    UNFAIR_TOS: 5,  # ~ max 128 tokens
+    CASE_HOLD: 5,  # ~ max 512 tokens
+}
+
+
+def get_lex_glue_max_train_instances(subset):
+    return TASK_MAX_TRAIN_INSTANCES_MAPPING[subset]
+
+
+TASK_MAX_TOKENS_MAPPING = {
+    ECTHR_A: 20,  # sequence of numbers
+    ECTHR_B: 20,  # sequence of numbers
+    SCOTUS: 5,  # one number
+    EURLEX: 20,  # sequence of numbers
+    LEDGAR: 20,  # multiple words
+    UNFAIR_TOS: 20,  # sequence of numbers
+    CASE_HOLD: 5,  # one number
+}
+
+
+def get_lex_glue_max_tokens(subset):
+    return TASK_MAX_TOKENS_MAPPING[subset]
+
+
+INSTRUCTIONS = {
+    ECTHR_A: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
+    "Predict the articles of the ECtHR that were violated (if any) out of the following: "
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
+    ECTHR_B: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
+    "Predict the articles of ECtHR that were allegedly violated (considered by the court) out of the following:"
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
+    SCOTUS: "In this task, you are given a case heard at the Supreme Court of the United States (SCOTUS). "
+    "Predict the relevant issue area out of the following: "
+    "0: Criminal Procedure, "
+    "1: Civil Rights, "
+    "2: First Amendment, "
+    "3: Due Process, "
+    "4: Privacy, "
+    "5: Attorneys, "
+    "6: Unions, "
+    "7: Economic Activity, "
+    "8: Judicial Power, "
+    "9: Federalism, "
+    "10: Interstate Relations, "
+    "11: Federal Taxation, "
+    "12: Miscellaneous, "
+    "13: Private Action.",
+    EURLEX: "In this task, you are given an EU law document published in the EUR-Lex portal. "
+    "Predict the relevant EuroVoc concepts. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
+    LEDGAR: "In this task, you are given a contract provision "
+    "from contracts obtained from US Securities and Exchange Commission (SEC) filings."
+    "Predict the main topic. ",
+    UNFAIR_TOS: "In this task, you are given a sentence "
+    "from a Terms of Service (ToS) document from online platforms. "
+    "Predict the types of unfair contractual terms out of the following: "
+    "0: Limitation of liability, "
+    "1: Unilateral termination, "
+    "2: Unilateral change, "
+    "3: Content removal, "
+    "4: Contract by using, "
+    "5: Choice of law, "
+    "6: Jurisdiction, "
+    "7: Arbitration. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
+    CASE_HOLD: "In this task, you are given an excerpt from a court decision, "
+    "containing a reference to a particular case, while the holding statement is masked out. "
+    "Predict the index of the holding statement fitting in the context at <HOLDING> from a selection of five choices.",
+}
+
+
+def get_lex_glue_instructions(subset):
+    return INSTRUCTIONS[subset]
+
+
+class LexGLUEScenario(Scenario):
+    """
+    Inspired by the recent widespread use of the GLUE multi-task benchmark NLP dataset (Wang et al., 2018),
+    the subsequent more difficult SuperGLUE (Wang et al., 2019),
+    other previous multi-task NLP benchmarks (Conneau and Kiela, 2018; McCann et al., 2018),
+    and similar initiatives in other domains (Peng et al., 2019),
+    we introduce the Legal General Language Understanding Evaluation (LexGLUE) benchmark,
+    a benchmark dataset to evaluate the performance of NLP methods in legal tasks.
+    LexGLUE is based on seven existing legal NLP datasets, selected using criteria largely from SuperGLUE.
+    Find more information on the dataset here: https://huggingface.co/datasets/lex_glue
+
+    We prompt models using the following format (example for unfair_tos)
+
+    <sentence>
+    Unfair Contractual Term Type:
+
+    Target completion:
+    <sentence> (<sentence>:"Limitation of liability", "Unilateral termination", "Unilateral change",
+    "Content removal", "Contract by using", "Choice of law", "Jurisdiction", "Arbitration")
+
+    Using an example from the training dataset, we have
+
+    ```
+    "tinder may terminate your account at any time without notice if it believes that you have violated this agreement."
+
+    Unfair Contractual Term Type:
+    Target completion:
+    "Unilateral change"
+    ```
+
+    """
+
+    name = "lex_glue"
+    description = "A Benchmark Dataset for Legal Language Understanding in English."
+    tags = ["single_label_text_classification", "multi_label_text_classification", "question_answering"]
+
+    # Mapping from HELM splits to HF splits
+    splits_mapping = {
+        TRAIN_SPLIT: datasets.Split.TRAIN,
+        VALID_SPLIT: datasets.Split.VALIDATION,
+        TEST_SPLIT: datasets.Split.TEST,
+    }
+
+    dataset_name = "lex_glue"
+    max_number_of_wrong_answers = 30
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in list(TASK_CODE_MAPPING.keys()) + ["all"], f"Unknown subset: {subset}"
+        self.subsets = [subset] if subset != "all" else list(TASK_CODE_MAPPING.keys())
+        self.random: random.Random = random.Random(42)
+
+    def get_instances_for_subset(self, config: str) -> List[Instance]:
+        task_code = TASK_CODE_MAPPING[config]
+        # Load dataset
+        cache_dir = str(Path(self.output_path) / "data")
+        dataset: Any = load_dataset(self.dataset_name, config, cache_dir=cache_dir)
+
+        if task_code in [TaskType.SLTC, TaskType.QA]:
+            class_label = dataset["train"].features["label"]
+            label_classes = class_label.names
+        elif task_code == TaskType.MLTC:
+            # construct the label classes
+            label_classes = set()
+            for split in self.splits_mapping.values():
+                for example in dataset[split]:
+                    label_classes |= set(example["labels"])  # add all new labels to the set
+            label_classes = sorted(list(map(str, label_classes)))  # convert everything to a string
+
+        def generate_instance(example, split: str):
+            # get correct labels
+            if task_code in [TaskType.SLTC, TaskType.QA]:
+                correct_label = class_label.int2str(example["label"])  # get label name for correct label
+                correct_labels = correct_label if isinstance(correct_label, list) else [correct_label]
+            elif task_code == TaskType.MLTC:
+                correct_labels = list(map(str, example["labels"]))  # here we don't have any mapping to label names
+
+            # construct wrong references
+            wrong_references = []
+            for label_name in label_classes:
+                if label_name not in correct_labels:
+                    wrong_reference = Reference(output=Output(label_name), tags=[])  # Wrong output
+                    wrong_references.append(wrong_reference)
+
+            wrong_references = reduce_wrong_reference_count(wrong_references)
+
+            # construct correct references and input
+            if task_code in [TaskType.SLTC, TaskType.MLTC]:
+                input_text = example["text"]
+                if "ecthr" in config:
+                    input_text = " ".join(input_text)
+            elif task_code == TaskType.QA:
+                endings = [f"{i}: {end}" for i, end in enumerate(example["endings"])]
+                input_text = example["context"] + " Holdings: " + " ".join(endings)
+
+            # construct correct references
+            correct_references = [
+                Reference(output=Output(correct_label), tags=[CORRECT_TAG]) for correct_label in correct_labels
+            ]  # for MLTC we have multiple correct ones
+            return Instance(input=Input(input_text), references=wrong_references + correct_references, split=split)
+
+        def reduce_wrong_reference_count(wrong_references):
+            self.random.shuffle(wrong_references)  # shuffle wrong references
+            if len(wrong_references) > self.max_number_of_wrong_answers:
+                # if there are too many wrong references, only take a subset
+                wrong_references = wrong_references[: self.max_number_of_wrong_answers]
+            return wrong_references

+        def generate_instances(split: str):
+            split_dataset = dataset[self.splits_mapping[split]]
+            return [generate_instance(example, split) for example in split_dataset]
+
+        return generate_instances(TRAIN_SPLIT) + generate_instances(VALID_SPLIT) + generate_instances(TEST_SPLIT)
+
+    def get_instances(self) -> List[Instance]:
+        instances = []
+        for subset in self.subsets:
+            instances.extend(self.get_instances_for_subset(subset))
+        return instances
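
For orientation, here is a minimal sketch (not part of the released code) of how the new `LexGLUEScenario` and its helper functions could be exercised directly to inspect the generated instances. It assumes `output_path`, which the HELM runner normally sets before calling `get_instances()`, is assigned by hand, and that `Input`/`Output` wrap their string in a `text` field, as suggested by the constructor calls above.

```python
from helm.benchmark.scenarios.lex_glue_scenario import (
    LexGLUEScenario,
    get_lex_glue_instructions,
    get_lex_glue_max_train_instances,
    get_lex_glue_max_tokens,
)
from helm.benchmark.scenarios.scenario import CORRECT_TAG

# "unfair_tos" has the shortest documents; "all" would load every LexGLUE subset.
scenario = LexGLUEScenario(subset="unfair_tos")
scenario.output_path = "benchmark_output/scenarios/lex_glue"  # assumed local cache dir; normally set by the runner

instances = scenario.get_instances()
print(f"{len(instances)} instances across train/valid/test")
print("Instructions:", get_lex_glue_instructions("unfair_tos"))
print("Few-shot examples:", get_lex_glue_max_train_instances("unfair_tos"))
print("Max output tokens:", get_lex_glue_max_tokens("unfair_tos"))

first = instances[0]
print("Split:", first.split)
print("Input:", first.input.text[:200])  # assumes Input exposes a `text` field
print("Gold labels:", [ref.output.text for ref in first.references if CORRECT_TAG in ref.tags])
```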