crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (50)
  1. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
  2. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
  3. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
  5. helm/benchmark/__init__.py +2 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/contamination/__init__.py +0 -0
  9. helm/benchmark/metrics/classification_metrics.py +28 -23
  10. helm/benchmark/metrics/test_classification_metrics.py +44 -9
  11. helm/benchmark/presentation/create_plots.py +617 -0
  12. helm/benchmark/presentation/summarize.py +4 -2
  13. helm/benchmark/presentation/test_create_plots.py +32 -0
  14. helm/benchmark/run.py +23 -1
  15. helm/benchmark/run_expander.py +161 -47
  16. helm/benchmark/run_specs.py +84 -10
  17. helm/benchmark/runner.py +31 -3
  18. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  19. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  20. helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  21. helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  22. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  23. helm/benchmark/scenarios/scenario.py +5 -0
  24. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  25. helm/benchmark/static/benchmarking.css +14 -0
  26. helm/benchmark/static/benchmarking.js +43 -0
  27. helm/benchmark/static/index.html +2 -0
  28. helm/benchmark/static/json-urls.js +4 -0
  29. helm/benchmark/static/plot-captions.js +16 -0
  30. helm/benchmark/static/schema.yaml +66 -8
  31. helm/benchmark/window_services/cohere_window_service.py +20 -0
  32. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  33. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  34. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  35. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  36. helm/benchmark/window_services/window_service_factory.py +27 -6
  37. helm/common/general.py +12 -5
  38. helm/proxy/clients/aleph_alpha_client.py +47 -28
  39. helm/proxy/clients/auto_client.py +28 -24
  40. helm/proxy/clients/huggingface_client.py +30 -17
  41. helm/proxy/clients/huggingface_model_registry.py +111 -0
  42. helm/proxy/clients/huggingface_tokenizer.py +23 -7
  43. helm/proxy/clients/openai_client.py +60 -2
  44. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  45. helm/proxy/clients/together_client.py +17 -2
  46. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  47. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  48. helm/proxy/models.py +82 -2
  49. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  50. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/lex_glue_scenario.py

@@ -26,6 +26,11 @@ TASK_CODE_MAPPING = {
     CASE_HOLD: TaskType.QA,
 }
 
+
+def get_lex_glue_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     ECTHR_A: 1,  # ~ max 4096 tokens
     ECTHR_B: 1,  # ~ max 4096 tokens
@@ -58,19 +63,65 @@ def get_lex_glue_max_tokens(subset):
 
 INSTRUCTIONS = {
     ECTHR_A: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of the ECtHR that were violated (if any).",
+    "Predict the articles of the ECtHR that were violated (if any) out of the following: "
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     ECTHR_B: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of ECtHR that were allegedly violated (considered by the court).",
+    "Predict the articles of ECtHR that were allegedly violated (considered by the court) out of the following:"
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     SCOTUS: "In this task, you are given a case heard at the Supreme Court of the United States (SCOTUS). "
-    "Predict the relevant issue area.",
+    "Predict the relevant issue area out of the following: "
+    "0: Criminal Procedure, "
+    "1: Civil Rights, "
+    "2: First Amendment, "
+    "3: Due Process, "
+    "4: Privacy, "
+    "5: Attorneys, "
+    "6: Unions, "
+    "7: Economic Activity, "
+    "8: Judicial Power, "
+    "9: Federalism, "
+    "10: Interstate Relations, "
+    "11: Federal Taxation, "
+    "12: Miscellaneous, "
+    "13: Private Action.",
     EURLEX: "In this task, you are given an EU law document published in the EUR-Lex portal. "
-    "Predict the relevant EuroVoc concepts.",
+    "Predict the relevant EuroVoc concepts. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     LEDGAR: "In this task, you are given a contract provision "
     "from contracts obtained from US Securities and Exchange Commission (SEC) filings."
-    "Predict the main topic.",
+    "Predict the main topic. ",
     UNFAIR_TOS: "In this task, you are given a sentence "
-    "from a Terms of Service (ToS) document from on-line platforms. "
-    "Predict the types of unfair contractual terms",
+    "from a Terms of Service (ToS) document from online platforms. "
+    "Predict the types of unfair contractual terms out of the following: "
+    "0: Limitation of liability, "
+    "1: Unilateral termination, "
+    "2: Unilateral change, "
+    "3: Content removal, "
+    "4: Contract by using, "
+    "5: Choice of law, "
+    "6: Jurisdiction, "
+    "7: Arbitration. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     CASE_HOLD: "In this task, you are given an excerpt from a court decision, "
     "containing a reference to a particular case, while the holding statement is masked out. "
     "Predict the index of the holding statement fitting in the context at <HOLDING> from a selection of five choices.",
@@ -126,7 +177,6 @@ class LexGLUEScenario(Scenario):
 
     dataset_name = "lex_glue"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
 
     def __init__(self, subset: str):
         super().__init__()
@@ -168,15 +218,6 @@ class LexGLUEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
             input_text = example["text"]
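The helper added in the first hunk above, get_lex_glue_task_type, simply exposes the TASK_CODE_MAPPING lookup to other modules. As a rough sketch of how calling code might use it (run_specs.py also changes in this release, but its diff is not shown here, so the function below is a hypothetical consumer, not the actual implementation):

    # Hypothetical usage sketch; TaskType is assumed to be importable from the same
    # module because TASK_CODE_MAPPING and get_instances reference TaskType.QA / TaskType.MLTC.
    from helm.benchmark.scenarios.lex_glue_scenario import TaskType, get_lex_glue_task_type

    def uses_multi_label_output(subset: str) -> bool:
        """Return True if the given LexGLUE subset is a multi-label (MLTC) task."""
        return get_lex_glue_task_type(subset) == TaskType.MLTC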
helm/benchmark/scenarios/lextreme_scenario.py

@@ -55,6 +55,11 @@ TASK_CODE_MAPPING = {
     MAPA_FINE: TaskType.NER,
 }
 
+
+def get_lextreme_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     BRAZILIAN_COURT_DECISIONS_JUDGMENT: 4,  # ~ max 1024 tokens
     BRAZILIAN_COURT_DECISIONS_UNANIMITY: 4,  # ~ max 1024 tokens
@@ -134,14 +139,14 @@ INSTRUCTIONS = {
     "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
     SWISS_JUDGMENT_PREDICTION: "In this task, you are given the facts description "
     "from a decision heard at the Swiss Federal Supreme Court. "
-    "Predict the judgment of the case (approval or dismissal)",
+    "Predict the judgment of the case (approval: The appeal was approved, or dismissal: The appeal was denied)",
     ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
     "Predict the unfairness level of the sentence (potentially_unfair, clearly_unfair, clearly_fair, untagged)",
     ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
-    "Predict the clause topics of the sentence "
-    "(0: Arbitration, "
+    "Predict the clause topics of the sentence out of the following: "
+    "0: Arbitration, "
     "1: Unilateral change, "
     "2: Content removal, "
     "3: Jurisdiction, "
@@ -149,34 +154,51 @@ INSTRUCTIONS = {
     "5: Limitation of liability, "
     "6: Unilateral termination, "
     "7: Contract by using, "
-    "8: Privacy included)",
+    "8: Privacy included. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     COVID19_EMERGENCY_EVENT: "In this task, you are given a sentence from a European legislative document. "
-    "Predict the applicable measurements against COVID-19 "
-    "(0: State of Emergency, "
+    "Predict the applicable measurements against COVID-19 out of the following: "
+    "0: State of Emergency, "
     "1: Restrictions of fundamental rights and civil liberties, "
     "2: Restrictions of daily liberties, "
     "3: Closures / lockdown, "
     "4: Suspension of international cooperation and commitments, "
     "5: Police mobilization, "
     "6: Army mobilization, "
-    "7: Government oversight)",
+    "7: Government oversight. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_1: "In this task, you are given a document from an EU law. "
-    "Predict the level 1 concept in the EUROVOC taxonomy.",
+    "Predict the level 1 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_2: "In this task, you are given a document from an EU law. "
-    "Predict the level 2 concept in the EUROVOC taxonomy.",
+    "Predict the level 2 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_3: "In this task, you are given a document from an EU law. "
-    "Predict the level 3 concept in the EUROVOC taxonomy.",
+    "Predict the level 3 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     GREEK_LEGAL_NER: "In this task, you are given a sentence from Greek legislation. "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORG, I-ORG, B-GPE, I-GPE, B-LEG-REFS, I-LEG-REFS, B-PUBLIC-DOCS, I-PUBLIC-DOCS, B-PERSON, I-PERSON, "
+    "B-FACILITY, I-FACILITY, B-LOCATION-UNK, I-LOCATION-UNK, B-LOCATION-NAT, I-LOCATION-NAT",
     LEGALNERO: "In this task, you are given a sentence from Romanian legislation. "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-TIME, I-TIME, B-LEGAL, I-LEGAL, B-ORG, I-ORG, B-LOC, I-LOC, B-PER, I-PER",
     LENER_BR: "In this task, you are given a sentence "
     "from Brazilian legal documents (court decisions and legislation). "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORGANIZACAO, I-ORGANIZACAO, B-PESSOA, I-PESSOA, B-TEMPO, I-TEMPO, B-LOCAL, I-LOCAL, "
+    "B-LEGISLACAO, I-LEGISLACAO, B-JURISPRUDENCIA, I-JURISPRUDENCIA",
     MAPA_COARSE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the coarse grained named entity type for each token.",
+    "Predict the coarse grained named entity type for each token out of the following: "
+    "O, B-ORGANISATION, I-ORGANISATION, B-ADDRESS, I-ADDRESS, B-DATE, I-DATE, "
+    "B-PERSON, I-PERSON, B-AMOUNT, I-AMOUNT, B-TIME, I-TIME",
     MAPA_FINE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the fine grained named entity type for each token.",
+    "Predict the fine grained named entity type for each token out of the following: "
+    "O, B-BUILDING, I-BUILDING, B-CITY, I-CITY, B-COUNTRY, I-COUNTRY, B-PLACE, I-PLACE, B-TERRITORY, I-TERRITORY, "
+    "I-UNIT, B-UNIT, B-VALUE, I-VALUE, B-YEAR, I-YEAR, B-STANDARD ABBREVIATION, I-STANDARD ABBREVIATION, "
+    "B-MONTH, I-MONTH, B-DAY, I-DAY, B-AGE, I-AGE, B-ETHNIC CATEGORY, I-ETHNIC CATEGORY, B-FAMILY NAME, I-FAMILY NAME, "
+    "B-INITIAL NAME, I-INITIAL NAME, B-MARITAL STATUS, I-MARITAL STATUS, B-PROFESSION, I-PROFESSION, B-ROLE, I-ROLE, "
+    "B-NATIONALITY, I-NATIONALITY, B-TITLE, I-TITLE, B-URL, I-URL, B-TYPE, I-TYPE",
 }
 
 
@@ -226,7 +248,6 @@ class LEXTREMEScenario(Scenario):
 
     dataset_name = "joelito/lextreme"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
     delimiter = '" "'  # we choose quotes and whitespace as a delimiter because this is what worked for gpt3
 
     ner_class_mapping = {
@@ -396,15 +417,6 @@ class LEXTREMEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
             input_text = example["input"]
helm/benchmark/scenarios/opinions_qa_scenario.py (new file)

@@ -0,0 +1,194 @@
+import os
+import pandas as pd
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class OpinionsQAScenario(Scenario):
+    """
+    The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?"
+    [Santurkar et al., 2023].
+
+    OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently
+    subjective, there isn't a single ground truth response. Instead, the object of interest is how
+    the distribution of model responses compares to those obtained from human survey participants.
+
+    As discussed in Santurkar et al., we consider prompting an LM:
+    1. Without any context (zero-shot) to evaluate the "default" opinions reflected
+       by it.
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+       the model towards using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+       example below for an illustration fo the difference between the three steering modes.
+
+
+    We prompt models using the following format
+
+        <optional context>
+
+        Question: <question>
+        A. <reference>
+        B. <reference>
+        C. <reference>
+        D. <reference>
+        Answer: <A/B/C/D>
+
+    For instance:
+
+        <optional context>
+
+        Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes
+        to gun violence in the country today?
+        A. A great deal
+        B. A fair amount
+        C. Not too much
+        D. Not at all
+        E. Refused
+        Answer:
+
+
+    In the example above, the <optional context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <optional context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+        Question: In politics today, do you consider yourself a
+        A. Republican
+        B. Democrat
+        C. Independent
+        D. Something else
+        E. Refused
+        Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+       to a prompt asking for demographic information, e.g.:
+
+        Below you will be asked to provide a short description of your political affiliation and
+        then answer some questions.
+
+        Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+       said group, e.g.:
+
+        Answer the following question as if in politics today, you considered yourself a Democrat.
+
+    """
+
+    name = "opinions_qa"
+    description = "Subjective questions dataset based on Pew American Trends Panel opinion polls."
+    tags = ["multiple_choice"]
+
+    """ Information needed to download the dataset """
+    CODALAB_URI_TEMPLATE: str = (
+        "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}"
+    )
+    CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669"
+    FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv"
+    PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92]
+
+    def __init__(self, survey_type: str, context: str):
+        super().__init__()
+        assert context in ["default", "steer-qa", "steer-bio", "steer-portray"]
+
+        self.survey_type: str = survey_type
+        self.context: str = context
+
+    def download_data(self):
+
+        self.output_path: str = os.path.join(self.output_path, "data")
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+
+        DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
+        DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
+        DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
+
+        for filename in DOWNLOAD_FILENAMES:
+            data_path: str = os.path.join(self.output_path, filename)
+
+            source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
+            ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
+
+    def read_survey_questions(self, csv_path):
+        df = pd.read_csv(csv_path, sep="\t")
+        df["options"] = df.apply(lambda x: eval(x["options"]), axis=1)
+        return df
+
+    def get_instances(self) -> List[Instance]:
+        self.download_data()
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"]
+        csv_dict = {
+            "dev": os.path.join(self.output_path, f"{self.context}.csv"),
+            "test": os.path.join(self.output_path, f"{self.survey_type}.csv"),
+        }
+
+        bios_df = None
+        if self.context in ["steer-bio", "steer-portray"]:
+            bios_path = os.path.join(self.output_path, f"{self.context}.csv")
+            bios_df = pd.read_csv(bios_path, sep="\t")
+
+        for split in all_splits:
+
+            csv_path: str = csv_dict[split]
+            assert os.path.exists(csv_path)
+
+            question_df = self.read_survey_questions(csv_path)
+
+            for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
+
+                # Opinions QA test questions have no correct answer and thus we set it to be None by default
+                # for all test instances.
+                # In the case where context = steer-qa, we add demographic information in the form of a
+                # in-context question answer pair as shown in the example above.
+
+                correct_answer = None if split == "test" else question_df["correct"][qidx]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(
+                        Output(text=answer),
+                        tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [],
+                    )
+
+                if bios_df is None:
+                    # context = "default" or "steer-qa"
+                    instance = Instance(
+                        Input(text=question),
+                        references=list(map(answer_to_reference, answers)),
+                        split=splits[split],
+                    )
+                    instances.append(instance)
+                else:
+                    # context = "steer-bio"or "steer-portray"
+                    for bio in bios_df["question"].values:
+
+                        context = PassageQuestionInput(passage=bio, question=question + "\n")
+                        instance = Instance(
+                            context,
+                            references=list(map(answer_to_reference, answers)),
+                            split=splits[split],
+                        )
+                        instances.append(instance)
+
+        return instances
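A minimal sketch of how the new OpinionsQAScenario could be exercised directly, outside the HELM runner. The survey_type value and the manual output_path assignment are assumptions for illustration only (the runner normally supplies the output path, and the downloaded CSVs follow the Pew_American_Trends_Panel_W{wave} naming shown above):

    # Hypothetical usage sketch; the values below are assumptions, not taken from run_specs.py.
    from helm.benchmark.scenarios.opinions_qa_scenario import OpinionsQAScenario

    scenario = OpinionsQAScenario(
        survey_type="Pew_American_Trends_Panel_W26",  # assumed: matches one of the downloaded wave CSVs
        context="steer-qa",  # one of "default", "steer-qa", "steer-bio", "steer-portray"
    )
    scenario.output_path = "benchmark_output/scenarios/opinions_qa"  # assumed: normally set by the runner
    instances = scenario.get_instances()  # downloads the CSVs, then builds dev (steering) and test instances
    print(len(instances), instances[0].split, instances[0].input.text[:60])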
helm/benchmark/scenarios/scenario.py

@@ -147,6 +147,11 @@ class Instance:
                 return reference
         return None
 
+    @property
+    def all_correct_references(self) -> List[Reference]:
+        """Return all correct references."""
+        return [reference for reference in self.references if reference.is_correct]
+
     def render_lines(self) -> List[str]:
         info = [f"input: {format_text(self.input.text)}"]
         if self.sub_split:
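The new all_correct_references property complements the existing single-reference lookup shown in the context lines above; a small illustration of its intended use, with field names taken from the code in this diff:

    # Illustration only: collect the output text of every reference tagged as correct.
    def correct_answer_texts(instance) -> list:
        return [reference.output.text for reference in instance.all_correct_references]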
helm/benchmark/scenarios/the_pile_scenario.py

@@ -78,7 +78,7 @@ class ThePileScenario(Scenario):
         # Download the raw data
         data_path = os.path.join(self.output_path, "data")
         ensure_file_downloaded(
-            source_url="https://mystic.the-eye.eu/public/AI/pile/test.jsonl.zst",
+            source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
             target_path=data_path,
             unpack=True,
         )
helm/benchmark/static/benchmarking.css

@@ -136,3 +136,17 @@ tbody .table-sort-column {
   background-color: #f5f5f5;
   white-space: pre-wrap;
 }
+
+.plot {
+  margin: 15px;
+}
+
+.plot img {
+  margin: 10px;
+}
+
+.plot-caption {
+  color: #555;
+  font-style: italic;
+  margin: 5px;
+}
helm/benchmark/static/benchmarking.js

@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }
 
+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});
@@ -1170,6 +1208,11 @@ $(function () {
     $main.empty()
     $main.append(renderHeader('Scenarios', renderScenarios()));
     refreshHashLocation();
+  } else if (urlParams.plots) {
+    // Plots
+    $main.empty()
+    $main.append(renderHeader('Plots', renderPlots()));
+    refreshHashLocation();
   } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
     // Predictions for a set of run specs (matching a regular expression)
     $main.text('Loading runs...');
helm/benchmark/static/index.html

@@ -22,6 +22,7 @@
         <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
         <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
         <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+        <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
         <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
       </ul>
     </div>
@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>
helm/benchmark/static/json-urls.js

@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}
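The new plotUrl helper assumes that plot images live under a plots/ directory inside each suite's run output. A hedged Python sketch of the corresponding on-disk layout (the path below is an inference from the URL template, not taken from create_plots.py, whose diff is not shown):

    # Hypothetical sketch: where the front end would look for a plot image,
    # mirroring `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`.
    import os

    def plot_output_path(output_base: str, suite: str, plot_name: str) -> str:
        return os.path.join(output_base, "runs", suite, "plots", f"{plot_name}.png")

    print(plot_output_path("benchmark_output", "v1", "generic_summary"))
    # benchmark_output/runs/v1/plots/generic_summary.png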
helm/benchmark/static/plot-captions.js (new file)

@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};