crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/lex_glue_scenario.py
@@ -0,0 +1,253 @@
import random
from pathlib import Path
from typing import List, Any

import datasets
from datasets import load_dataset

from .lextreme_scenario import TaskType
from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Input, Output

ECTHR_A = "ecthr_a"
ECTHR_B = "ecthr_b"
SCOTUS = "scotus"
EURLEX = "eurlex"
LEDGAR = "ledgar"
UNFAIR_TOS = "unfair_tos"
CASE_HOLD = "case_hold"

TASK_CODE_MAPPING = {
    ECTHR_A: TaskType.MLTC,
    ECTHR_B: TaskType.MLTC,
    SCOTUS: TaskType.SLTC,
    EURLEX: TaskType.MLTC,
    LEDGAR: TaskType.SLTC,
    UNFAIR_TOS: TaskType.MLTC,
    CASE_HOLD: TaskType.QA,
}


def get_lex_glue_task_type(subset):
    return TASK_CODE_MAPPING[subset]


TASK_MAX_TRAIN_INSTANCES_MAPPING = {
    ECTHR_A: 1,  # ~ max 4096 tokens
    ECTHR_B: 1,  # ~ max 4096 tokens
    SCOTUS: 1,  # ~ max 8192 tokens
    EURLEX: 5,  # ~ max 512 tokens
    LEDGAR: 5,  # ~ max 512 tokens
    UNFAIR_TOS: 5,  # ~ max 128 tokens
    CASE_HOLD: 5,  # ~ max 512 tokens
}


def get_lex_glue_max_train_instances(subset):
    return TASK_MAX_TRAIN_INSTANCES_MAPPING[subset]


TASK_MAX_TOKENS_MAPPING = {
    ECTHR_A: 20,  # sequence of numbers
    ECTHR_B: 20,  # sequence of numbers
    SCOTUS: 5,  # one number
    EURLEX: 20,  # sequence of numbers
    LEDGAR: 20,  # multiple words
    UNFAIR_TOS: 20,  # sequence of numbers
    CASE_HOLD: 5,  # one number
}


def get_lex_glue_max_tokens(subset):
    return TASK_MAX_TOKENS_MAPPING[subset]


INSTRUCTIONS = {
    ECTHR_A: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
    "Predict the articles of the ECtHR that were violated (if any) out of the following: "
    "0: Article 2, "
    "1: Article 3, "
    "2: Article 5, "
    "3: Article 6, "
    "4: Article 8, "
    "5: Article 9, "
    "6: Article 10, "
    "7: Article 11, "
    "8: Article 14, "
    "9: Article 1 of Protocol 1. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    ECTHR_B: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
    "Predict the articles of ECtHR that were allegedly violated (considered by the court) out of the following:"
    "0: Article 2, "
    "1: Article 3, "
    "2: Article 5, "
    "3: Article 6, "
    "4: Article 8, "
    "5: Article 9, "
    "6: Article 10, "
    "7: Article 11, "
    "8: Article 14, "
    "9: Article 1 of Protocol 1. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    SCOTUS: "In this task, you are given a case heard at the Supreme Court of the United States (SCOTUS). "
    "Predict the relevant issue area out of the following: "
    "0: Criminal Procedure, "
    "1: Civil Rights, "
    "2: First Amendment, "
    "3: Due Process, "
    "4: Privacy, "
    "5: Attorneys, "
    "6: Unions, "
    "7: Economic Activity, "
    "8: Judicial Power, "
    "9: Federalism, "
    "10: Interstate Relations, "
    "11: Federal Taxation, "
    "12: Miscellaneous, "
    "13: Private Action.",
    EURLEX: "In this task, you are given an EU law document published in the EUR-Lex portal. "
    "Predict the relevant EuroVoc concepts. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    LEDGAR: "In this task, you are given a contract provision "
    "from contracts obtained from US Securities and Exchange Commission (SEC) filings."
    "Predict the main topic. ",
    UNFAIR_TOS: "In this task, you are given a sentence "
    "from a Terms of Service (ToS) document from online platforms. "
    "Predict the types of unfair contractual terms out of the following: "
    "0: Limitation of liability, "
    "1: Unilateral termination, "
    "2: Unilateral change, "
    "3: Content removal, "
    "4: Contract by using, "
    "5: Choice of law, "
    "6: Jurisdiction, "
    "7: Arbitration. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    CASE_HOLD: "In this task, you are given an excerpt from a court decision, "
    "containing a reference to a particular case, while the holding statement is masked out. "
    "Predict the index of the holding statement fitting in the context at <HOLDING> from a selection of five choices.",
}


def get_lex_glue_instructions(subset):
    return INSTRUCTIONS[subset]


class LexGLUEScenario(Scenario):
    """
    Inspired by the recent widespread use of the GLUE multi-task benchmark NLP dataset (Wang et al., 2018),
    the subsequent more difficult SuperGLUE (Wang et al., 2019),
    other previous multi-task NLP benchmarks (Conneau and Kiela, 2018; McCann et al., 2018),
    and similar initiatives in other domains (Peng et al., 2019),
    we introduce the Legal General Language Understanding Evaluation (LexGLUE) benchmark,
    a benchmark dataset to evaluate the performance of NLP methods in legal tasks.
    LexGLUE is based on seven existing legal NLP datasets, selected using criteria largely from SuperGLUE.
    Find more information on the dataset here: https://huggingface.co/datasets/lex_glue

    We prompt models using the following format (example for unfair_tos)

        <sentence>
        Unfair Contractual Term Type:

        Target completion:
            <sentence> (<sentence>:"Limitation of liability", "Unilateral termination", "Unilateral change",
            "Content removal", "Contract by using", "Choice of law", "Jurisdiction", "Arbitration")

    Using an example from the training dataset, we have

    ```
    "tinder may terminate your account at any time without notice if it believes that you have violated this agreement."

    Unfair Contractual Term Type:
    Target completion:
        "Unilateral change"
    ```
    """

    name = "lex_glue"
    description = "A Benchmark Dataset for Legal Language Understanding in English."
    tags = ["single_label_text_classification", "multi_label_text_classification", "question_answering"]

    # Mapping from HELM splits to HF splits
    splits_mapping = {
        TRAIN_SPLIT: datasets.Split.TRAIN,
        VALID_SPLIT: datasets.Split.VALIDATION,
        TEST_SPLIT: datasets.Split.TEST,
    }

    dataset_name = "lex_glue"
    max_number_of_wrong_answers = 30

    def __init__(self, subset: str):
        super().__init__()
        assert subset in list(TASK_CODE_MAPPING.keys()) + ["all"], f"Unknown subset: {subset}"
        self.subsets = [subset] if subset != "all" else list(TASK_CODE_MAPPING.keys())
        self.random: random.Random = random.Random(42)

    def get_instances_for_subset(self, config: str) -> List[Instance]:
        task_code = TASK_CODE_MAPPING[config]
        # Load dataset
        cache_dir = str(Path(self.output_path) / "data")
        dataset: Any = load_dataset(self.dataset_name, config, cache_dir=cache_dir)

        if task_code in [TaskType.SLTC, TaskType.QA]:
            class_label = dataset["train"].features["label"]
            label_classes = class_label.names
        elif task_code == TaskType.MLTC:
            # construct the label classes
            label_classes = set()
            for split in self.splits_mapping.values():
                for example in dataset[split]:
                    label_classes |= set(example["labels"])  # add all new labels to the set
            label_classes = sorted(list(map(str, label_classes)))  # convert everything to a string

        def generate_instance(example, split: str):
            # get correct labels
            if task_code in [TaskType.SLTC, TaskType.QA]:
                correct_label = class_label.int2str(example["label"])  # get label name for correct label
                correct_labels = correct_label if isinstance(correct_label, list) else [correct_label]
            elif task_code == TaskType.MLTC:
                correct_labels = list(map(str, example["labels"]))  # here we don't have any mapping to label names

            # construct wrong references
            wrong_references = []
            for label_name in label_classes:
                if label_name not in correct_labels:
                    wrong_reference = Reference(output=Output(label_name), tags=[])  # Wrong output
                    wrong_references.append(wrong_reference)

            wrong_references = reduce_wrong_reference_count(wrong_references)

            # construct correct references and input
            if task_code in [TaskType.SLTC, TaskType.MLTC]:
                input_text = example["text"]
                if "ecthr" in config:
                    input_text = " ".join(input_text)
            elif task_code == TaskType.QA:
                endings = [f"{i}: {end}" for i, end in enumerate(example["endings"])]
                input_text = example["context"] + " Holdings: " + " ".join(endings)

            # construct correct references
            correct_references = [
                Reference(output=Output(correct_label), tags=[CORRECT_TAG]) for correct_label in correct_labels
            ]  # for MLTC we have multiple correct ones
            return Instance(input=Input(input_text), references=wrong_references + correct_references, split=split)

        def reduce_wrong_reference_count(wrong_references):
            self.random.shuffle(wrong_references)  # shuffle wrong references
            if len(wrong_references) > self.max_number_of_wrong_answers:
                # if there are too many wrong references, only take a subset
                wrong_references = wrong_references[: self.max_number_of_wrong_answers]
            return wrong_references

        def generate_instances(split: str):
            split_dataset = dataset[self.splits_mapping[split]]
            return [generate_instance(example, split) for example in split_dataset]

        return generate_instances(TRAIN_SPLIT) + generate_instances(VALID_SPLIT) + generate_instances(TEST_SPLIT)

    def get_instances(self) -> List[Instance]:
        instances = []
        for subset in self.subsets:
            instances.extend(self.get_instances_for_subset(subset))
        return instances