crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry. A sketch of how to reproduce the comparison locally follows the file list below.
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
- helm/benchmark/__init__.py +2 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +28 -23
- helm/benchmark/metrics/test_classification_metrics.py +44 -9
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +23 -1
- helm/benchmark/run_expander.py +161 -47
- helm/benchmark/run_specs.py +84 -10
- helm/benchmark/runner.py +31 -3
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
- helm/benchmark/scenarios/lextreme_scenario.py +37 -25
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +66 -8
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +27 -6
- helm/common/general.py +12 -5
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +28 -24
- helm/proxy/clients/huggingface_client.py +30 -17
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +23 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +82 -2
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
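
As a quick sanity check on the listing above, the comparison can be reproduced locally by unpacking both wheels and diffing the files they share. The sketch below is illustrative only: the local wheel filenames and the restriction to .py files are assumptions, not part of this diff, and it uses only the Python standard library.

# Hypothetical reproduction sketch: diff the Python files shared by the two wheels.
# Assumes both wheels were already downloaded, e.g. via `pip download crfm-helm==0.2.1 --no-deps`.
import difflib
import zipfile

OLD = "crfm_helm-0.2.1-py3-none-any.whl"  # assumed local filename
NEW = "crfm_helm-0.2.2-py3-none-any.whl"  # assumed local filename

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    shared = sorted(set(old_whl.namelist()) & set(new_whl.namelist()))
    for name in shared:
        if not name.endswith(".py"):
            continue  # limit the sketch to Python sources
        old_lines = old_whl.read(name).decode("utf-8", errors="replace").splitlines(keepends=True)
        new_lines = new_whl.read(name).decode("utf-8", errors="replace").splitlines(keepends=True)
        hunks = list(difflib.unified_diff(old_lines, new_lines, fromfile=f"0.2.1/{name}", tofile=f"0.2.2/{name}"))
        if hunks:
            print("".join(hunks), end="")
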
helm/benchmark/scenarios/lex_glue_scenario.py
CHANGED

@@ -26,6 +26,11 @@ TASK_CODE_MAPPING = {
     CASE_HOLD: TaskType.QA,
 }
 
+
+def get_lex_glue_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     ECTHR_A: 1,  # ~ max 4096 tokens
     ECTHR_B: 1,  # ~ max 4096 tokens
@@ -58,19 +63,65 @@ def get_lex_glue_max_tokens(subset):
 
 INSTRUCTIONS = {
     ECTHR_A: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of the ECtHR that were violated (if any)
+    "Predict the articles of the ECtHR that were violated (if any) out of the following: "
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     ECTHR_B: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of ECtHR that were allegedly violated (considered by the court)
+    "Predict the articles of ECtHR that were allegedly violated (considered by the court) out of the following:"
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     SCOTUS: "In this task, you are given a case heard at the Supreme Court of the United States (SCOTUS). "
-    "Predict the relevant issue area
+    "Predict the relevant issue area out of the following: "
+    "0: Criminal Procedure, "
+    "1: Civil Rights, "
+    "2: First Amendment, "
+    "3: Due Process, "
+    "4: Privacy, "
+    "5: Attorneys, "
+    "6: Unions, "
+    "7: Economic Activity, "
+    "8: Judicial Power, "
+    "9: Federalism, "
+    "10: Interstate Relations, "
+    "11: Federal Taxation, "
+    "12: Miscellaneous, "
+    "13: Private Action.",
     EURLEX: "In this task, you are given an EU law document published in the EUR-Lex portal. "
-    "Predict the relevant EuroVoc concepts."
+    "Predict the relevant EuroVoc concepts. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     LEDGAR: "In this task, you are given a contract provision "
     "from contracts obtained from US Securities and Exchange Commission (SEC) filings."
-    "Predict the main topic.",
+    "Predict the main topic. ",
     UNFAIR_TOS: "In this task, you are given a sentence "
-    "from a Terms of Service (ToS) document from
-    "Predict the types of unfair contractual terms"
+    "from a Terms of Service (ToS) document from online platforms. "
+    "Predict the types of unfair contractual terms out of the following: "
+    "0: Limitation of liability, "
+    "1: Unilateral termination, "
+    "2: Unilateral change, "
+    "3: Content removal, "
+    "4: Contract by using, "
+    "5: Choice of law, "
+    "6: Jurisdiction, "
+    "7: Arbitration. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     CASE_HOLD: "In this task, you are given an excerpt from a court decision, "
     "containing a reference to a particular case, while the holding statement is masked out. "
     "Predict the index of the holding statement fitting in the context at <HOLDING> from a selection of five choices.",
@@ -126,7 +177,6 @@ class LexGLUEScenario(Scenario):
 
     dataset_name = "lex_glue"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
 
     def __init__(self, subset: str):
         super().__init__()
@@ -168,15 +218,6 @@ class LexGLUEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
             input_text = example["text"]
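
For orientation, here is a minimal usage sketch for the new get_lex_glue_task_type helper alongside the per-task mappings already defined in this module. It is not part of the diff: the choice of ECTHR_A as the subset and the assumption that these names are importable from the module are illustrative.

# Minimal usage sketch (illustrative; assumes ECTHR_A and the mappings shown above
# are importable from helm.benchmark.scenarios.lex_glue_scenario).
from helm.benchmark.scenarios.lex_glue_scenario import (
    ECTHR_A,
    TASK_MAX_TRAIN_INSTANCES_MAPPING,
    get_lex_glue_task_type,
)

subset = ECTHR_A
task_type = get_lex_glue_task_type(subset)            # task family for this subset
max_train = TASK_MAX_TRAIN_INSTANCES_MAPPING[subset]  # 1, since ECtHR documents run ~4096 tokens
print(f"{subset}: task_type={task_type}, max_train_instances={max_train}")
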
helm/benchmark/scenarios/lextreme_scenario.py
CHANGED

@@ -55,6 +55,11 @@ TASK_CODE_MAPPING = {
     MAPA_FINE: TaskType.NER,
 }
 
+
+def get_lextreme_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     BRAZILIAN_COURT_DECISIONS_JUDGMENT: 4,  # ~ max 1024 tokens
     BRAZILIAN_COURT_DECISIONS_UNANIMITY: 4,  # ~ max 1024 tokens
@@ -134,14 +139,14 @@ INSTRUCTIONS = {
     "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
     SWISS_JUDGMENT_PREDICTION: "In this task, you are given the facts description "
     "from a decision heard at the Swiss Federal Supreme Court. "
-    "Predict the judgment of the case (approval or dismissal)",
+    "Predict the judgment of the case (approval: The appeal was approved, or dismissal: The appeal was denied)",
     ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
     "Predict the unfairness level of the sentence (potentially_unfair, clearly_unfair, clearly_fair, untagged)",
     ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
-    "Predict the clause topics of the sentence "
-    "
+    "Predict the clause topics of the sentence out of the following: "
+    "0: Arbitration, "
     "1: Unilateral change, "
     "2: Content removal, "
     "3: Jurisdiction, "
@@ -149,34 +154,51 @@ INSTRUCTIONS = {
     "5: Limitation of liability, "
     "6: Unilateral termination, "
     "7: Contract by using, "
-    "8: Privacy included
+    "8: Privacy included. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     COVID19_EMERGENCY_EVENT: "In this task, you are given a sentence from a European legislative document. "
-    "Predict the applicable measurements against COVID-19 "
-    "
+    "Predict the applicable measurements against COVID-19 out of the following: "
+    "0: State of Emergency, "
     "1: Restrictions of fundamental rights and civil liberties, "
     "2: Restrictions of daily liberties, "
     "3: Closures / lockdown, "
     "4: Suspension of international cooperation and commitments, "
     "5: Police mobilization, "
     "6: Army mobilization, "
-    "7: Government oversight
+    "7: Government oversight. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_1: "In this task, you are given a document from an EU law. "
-    "Predict the level 1 concept in the EUROVOC taxonomy."
+    "Predict the level 1 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_2: "In this task, you are given a document from an EU law. "
-    "Predict the level 2 concept in the EUROVOC taxonomy."
+    "Predict the level 2 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_3: "In this task, you are given a document from an EU law. "
-    "Predict the level 3 concept in the EUROVOC taxonomy."
+    "Predict the level 3 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     GREEK_LEGAL_NER: "In this task, you are given a sentence from Greek legislation. "
-    "Predict the named entity type for each token
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORG, I-ORG, B-GPE, I-GPE, B-LEG-REFS, I-LEG-REFS, B-PUBLIC-DOCS, I-PUBLIC-DOCS, B-PERSON, I-PERSON, "
+    "B-FACILITY, I-FACILITY, B-LOCATION-UNK, I-LOCATION-UNK, B-LOCATION-NAT, I-LOCATION-NAT",
     LEGALNERO: "In this task, you are given a sentence from Romanian legislation. "
-    "Predict the named entity type for each token
+    "Predict the named entity type for each token out of the following: "
+    "O, B-TIME, I-TIME, B-LEGAL, I-LEGAL, B-ORG, I-ORG, B-LOC, I-LOC, B-PER, I-PER",
     LENER_BR: "In this task, you are given a sentence "
     "from Brazilian legal documents (court decisions and legislation). "
-    "Predict the named entity type for each token
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORGANIZACAO, I-ORGANIZACAO, B-PESSOA, I-PESSOA, B-TEMPO, I-TEMPO, B-LOCAL, I-LOCAL, "
+    "B-LEGISLACAO, I-LEGISLACAO, B-JURISPRUDENCIA, I-JURISPRUDENCIA",
     MAPA_COARSE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the coarse grained named entity type for each token
+    "Predict the coarse grained named entity type for each token out of the following: "
+    "O, B-ORGANISATION, I-ORGANISATION, B-ADDRESS, I-ADDRESS, B-DATE, I-DATE, "
+    "B-PERSON, I-PERSON, B-AMOUNT, I-AMOUNT, B-TIME, I-TIME",
     MAPA_FINE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the fine grained named entity type for each token
+    "Predict the fine grained named entity type for each token out of the following: "
+    "O, B-BUILDING, I-BUILDING, B-CITY, I-CITY, B-COUNTRY, I-COUNTRY, B-PLACE, I-PLACE, B-TERRITORY, I-TERRITORY, "
+    "I-UNIT, B-UNIT, B-VALUE, I-VALUE, B-YEAR, I-YEAR, B-STANDARD ABBREVIATION, I-STANDARD ABBREVIATION, "
+    "B-MONTH, I-MONTH, B-DAY, I-DAY, B-AGE, I-AGE, B-ETHNIC CATEGORY, I-ETHNIC CATEGORY, B-FAMILY NAME, I-FAMILY NAME, "
+    "B-INITIAL NAME, I-INITIAL NAME, B-MARITAL STATUS, I-MARITAL STATUS, B-PROFESSION, I-PROFESSION, B-ROLE, I-ROLE, "
+    "B-NATIONALITY, I-NATIONALITY, B-TITLE, I-TITLE, B-URL, I-URL, B-TYPE, I-TYPE",
 }
 
 
@@ -226,7 +248,6 @@ class LEXTREMEScenario(Scenario):
 
     dataset_name = "joelito/lextreme"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
     delimiter = '" "'  # we choose quotes and whitespace as a delimiter because this is what worked for gpt3
 
     ner_class_mapping = {
@@ -396,15 +417,6 @@ class LEXTREMEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
            input_text = example["input"]
helm/benchmark/scenarios/opinions_qa_scenario.py
ADDED

@@ -0,0 +1,194 @@
+import os
+import pandas as pd
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class OpinionsQAScenario(Scenario):
+    """
+    The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?"
+    [Santurkar et al., 2023].
+
+    OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently
+    subjective, there isn't a single ground truth response. Instead, the object of interest is how
+    the distribution of model responses compares to those obtained from human survey participants.
+
+    As discussed in Santurkar et al., we consider prompting an LM:
+    1. Without any context (zero-shot) to evaluate the "default" opinions reflected
+       by it.
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+       the model towards using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+       example below for an illustration fo the difference between the three steering modes.
+
+
+    We prompt models using the following format
+
+        <optional context>
+
+        Question: <question>
+        A. <reference>
+        B. <reference>
+        C. <reference>
+        D. <reference>
+        Answer: <A/B/C/D>
+
+    For instance:
+
+        <optional context>
+
+        Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes
+        to gun violence in the country today?
+        A. A great deal
+        B. A fair amount
+        C. Not too much
+        D. Not at all
+        E. Refused
+        Answer:
+
+
+    In the example above, the <optional context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <optional context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+        Question: In politics today, do you consider yourself a
+        A. Republican
+        B. Democrat
+        C. Independent
+        D. Something else
+        E. Refused
+        Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+       to a prompt asking for demographic information, e.g.:
+
+        Below you will be asked to provide a short description of your political affiliation and
+        then answer some questions.
+
+        Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+       said group, e.g.:
+
+        Answer the following question as if in politics today, you considered yourself a Democrat.
+
+    """
+
+    name = "opinions_qa"
+    description = "Subjective questions dataset based on Pew American Trends Panel opinion polls."
+    tags = ["multiple_choice"]
+
+    """ Information needed to download the dataset """
+    CODALAB_URI_TEMPLATE: str = (
+        "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}"
+    )
+    CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669"
+    FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv"
+    PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92]
+
+    def __init__(self, survey_type: str, context: str):
+        super().__init__()
+        assert context in ["default", "steer-qa", "steer-bio", "steer-portray"]
+
+        self.survey_type: str = survey_type
+        self.context: str = context
+
+    def download_data(self):
+
+        self.output_path: str = os.path.join(self.output_path, "data")
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+
+        DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
+        DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
+        DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
+
+        for filename in DOWNLOAD_FILENAMES:
+            data_path: str = os.path.join(self.output_path, filename)
+
+            source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
+            ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
+
+    def read_survey_questions(self, csv_path):
+        df = pd.read_csv(csv_path, sep="\t")
+        df["options"] = df.apply(lambda x: eval(x["options"]), axis=1)
+        return df
+
+    def get_instances(self) -> List[Instance]:
+        self.download_data()
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"]
+        csv_dict = {
+            "dev": os.path.join(self.output_path, f"{self.context}.csv"),
+            "test": os.path.join(self.output_path, f"{self.survey_type}.csv"),
+        }
+
+        bios_df = None
+        if self.context in ["steer-bio", "steer-portray"]:
+            bios_path = os.path.join(self.output_path, f"{self.context}.csv")
+            bios_df = pd.read_csv(bios_path, sep="\t")
+
+        for split in all_splits:
+
+            csv_path: str = csv_dict[split]
+            assert os.path.exists(csv_path)
+
+            question_df = self.read_survey_questions(csv_path)
+
+            for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
+
+                # Opinions QA test questions have no correct answer and thus we set it to be None by default
+                # for all test instances.
+                # In the case where context = steer-qa, we add demographic information in the form of a
+                # in-context question answer pair as shown in the example above.
+
+                correct_answer = None if split == "test" else question_df["correct"][qidx]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(
+                        Output(text=answer),
+                        tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [],
+                    )
+
+                if bios_df is None:
+                    # context = "default" or "steer-qa"
+                    instance = Instance(
+                        Input(text=question),
+                        references=list(map(answer_to_reference, answers)),
+                        split=splits[split],
+                    )
+                    instances.append(instance)
+                else:
+                    # context = "steer-bio"or "steer-portray"
+                    for bio in bios_df["question"].values:
+
+                        context = PassageQuestionInput(passage=bio, question=question + "\n")
+                        instance = Instance(
+                            context,
+                            references=list(map(answer_to_reference, answers)),
+                            split=splits[split],
+                        )
+                        instances.append(instance)
+
+        return instances
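
A short, hedged sketch of how this new scenario might be instantiated, following the docstring above. The survey_type value is an assumption derived from the FILE_NAME template, and get_instances() is left commented out because output_path is normally assigned by the benchmark runner before instances are built.

# Hypothetical usage sketch for OpinionsQAScenario (not part of the diff).
from helm.benchmark.scenarios.opinions_qa_scenario import OpinionsQAScenario

# Zero-shot ("default") prompting over one Pew survey wave; the survey_type string is
# assumed to match the downloaded CSV name Pew_American_Trends_Panel_W26.csv.
scenario = OpinionsQAScenario(survey_type="Pew_American_Trends_Panel_W26", context="default")

# The runner sets scenario.output_path before building instances, so a direct call only
# works once that attribute points at a writable directory:
# scenario.output_path = "benchmark_output/scenarios/opinions_qa"  # illustrative path
# instances = scenario.get_instances()  # downloads the CodaLab CSVs and returns test-split Instances
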
helm/benchmark/scenarios/scenario.py
CHANGED

@@ -147,6 +147,11 @@ class Instance:
                 return reference
         return None
 
+    @property
+    def all_correct_references(self) -> List[Reference]:
+        """Return all correct references."""
+        return [reference for reference in self.references if reference.is_correct]
+
     def render_lines(self) -> List[str]:
         info = [f"input: {format_text(self.input.text)}"]
         if self.sub_split:
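
To illustrate the new property in isolation, here is a small sketch built from the same constructors that appear elsewhere in this diff; the example input and labels are invented for illustration.

# Illustrative sketch of Instance.all_correct_references (labels are made up).
from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Instance,
    Output,
    Reference,
)

instance = Instance(
    Input(text="Which ECtHR articles were violated (if any)?"),
    references=[
        Reference(Output(text="Article 3"), tags=[CORRECT_TAG]),
        Reference(Output(text="Article 6"), tags=[CORRECT_TAG]),
        Reference(Output(text="Article 8"), tags=[]),
    ],
    split=TEST_SPLIT,
)

# Both gold references come back without manually filtering on CORRECT_TAG:
print([reference.output.text for reference in instance.all_correct_references])
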
helm/benchmark/scenarios/the_pile_scenario.py
CHANGED

@@ -78,7 +78,7 @@ class ThePileScenario(Scenario):
         # Download the raw data
         data_path = os.path.join(self.output_path, "data")
         ensure_file_downloaded(
-            source_url="https://
+            source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
             target_path=data_path,
             unpack=True,
         )
helm/benchmark/static/benchmarking.css
CHANGED

@@ -136,3 +136,17 @@ tbody .table-sort-column {
   background-color: #f5f5f5;
   white-space: pre-wrap;
 }
+
+.plot {
+  margin: 15px;
+}
+
+.plot img {
+  margin: 10px;
+}
+
+.plot-caption {
+  color: #555;
+  font-style: italic;
+  margin: 5px;
+}
helm/benchmark/static/benchmarking.js
CHANGED

@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }
 
+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});
@@ -1170,6 +1208,11 @@ $(function () {
       $main.empty()
       $main.append(renderHeader('Scenarios', renderScenarios()));
       refreshHashLocation();
+    } else if (urlParams.plots) {
+      // Plots
+      $main.empty()
+      $main.append(renderHeader('Plots', renderPlots()));
+      refreshHashLocation();
     } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
       // Predictions for a set of run specs (matching a regular expression)
       $main.text('Loading runs...');
helm/benchmark/static/index.html
CHANGED

@@ -22,6 +22,7 @@
         <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
         <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
         <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+        <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
         <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
       </ul>
     </div>
@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>
helm/benchmark/static/json-urls.js
CHANGED

@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}
helm/benchmark/static/plot-captions.js
ADDED

@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};
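
Finally, a small sketch tying the caption keys above to the URL layout added in json-urls.js: it checks which of the captioned plots are present in a local benchmark output tree. The directory names are illustrative assumptions, not part of the diff.

# Hypothetical check of which captioned plots exist locally, mirroring the
# runs/<suite>/plots/<name>.png layout used by plotUrl() in json-urls.js.
import os

PLOT_NAMES = [
    "generic_summary", "model_ranking_all", "accuracy_v_x", "metric_correlation",
    "accuracy_v_access", "accuracy_over_num_parameters", "accuracy_over_release_date",
    "accuracy_over_the_pile_perplexity", "targeted_evals", "in_context_ablations", "mc_ablations",
]

suite_dir = os.path.join("benchmark_output", "runs", "my-suite")  # illustrative paths
for name in PLOT_NAMES:
    path = os.path.join(suite_dir, "plots", f"{name}.png")
    print(f"{name}: {'found' if os.path.exists(path) else 'missing'}")
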