ai2-olmo-eval 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/METADATA +3 -2
- {ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/RECORD +20 -8
- {ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/WHEEL +1 -1
- olmo_eval/metrics.py +164 -9
- olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/config.json +23 -0
- olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/requests.jsonl.gz +0 -0
- olmo_eval/tasks.py +80 -6
- olmo_eval/version.py +1 -1
- {ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info/licenses}/LICENSE +0 -0
- {ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/top_level.txt +0 -0
{ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.7.0
+Version: 0.7.2
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License: Apache License

@@ -234,6 +234,7 @@ Requires-Dist: boto3; extra == "dev"
 Requires-Dist: google-cloud-storage; extra == "dev"
 Provides-Extra: all
 Requires-Dist: ai2-olmo-eval[dev]; extra == "all"
+Dynamic: license-file
 
 # OLMo-in-loop-evals
 
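A quick way to confirm the metadata changes above is to read METADATA straight out of each wheel, since a wheel is just a zip archive. A minimal sketch, assuming both wheel files sit in the working directory:

    import zipfile

    def metadata_lines(wheel_path: str, dist_info: str) -> list:
        # METADATA lives under <name>-<version>.dist-info/ inside the wheel zip
        with zipfile.ZipFile(wheel_path) as wheel:
            return wheel.read(f"{dist_info}/METADATA").decode("utf-8").splitlines()

    for wheel, info in [
        ("ai2_olmo_eval-0.7.0-py3-none-any.whl", "ai2_olmo_eval-0.7.0.dist-info"),
        ("ai2_olmo_eval-0.7.2-py3-none-any.whl", "ai2_olmo_eval-0.7.2.dist-info"),
    ]:
        print(wheel, [line for line in metadata_lines(wheel, info)
                      if line.startswith(("Metadata-Version:", "Version:", "Dynamic:"))])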
{ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/RECORD
CHANGED

@@ -1,9 +1,10 @@
+ai2_olmo_eval-0.7.2.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
 olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
-olmo_eval/metrics.py,sha256=
-olmo_eval/tasks.py,sha256=
+olmo_eval/metrics.py,sha256=NcI_1B3BV-DC9RXjsSIftU-2GeF8vvU6SNyJnlYlKwU,18705
+olmo_eval/tasks.py,sha256=QGLyF7JA2-T9mkh-N4cZGNOQp9si90yQSS41T3x5Lak,79630
 olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
 olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
-olmo_eval/version.py,sha256=
+olmo_eval/version.py,sha256=QWjPfx79C2NOQw2G7iDEsM4FKsLiGLCLNDzEx7EImf8,308
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255

@@ -599,6 +600,18 @@ olmo_eval/oe_eval_tasks/arc_easy/val_mc_5shot/config.json,sha256=CEgPNm226vxmMim
 olmo_eval/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz,sha256=LZ7XuWwDo6zJTqhgpZgHNj6yi-xOXb-TQxl9yxB9gVg,114271
 olmo_eval/oe_eval_tasks/arc_easy/val_rc_5shot/config.json,sha256=LeNP534voujfcp9ph8SKHfnfYPjfSu8ik3HWiXt3TFM,761
 olmo_eval/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz,sha256=28UmHQnAB2DIlfYbqhuhJ4AjAVLDAHWWoEmaHlI-UKU,202290
+olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json,sha256=_gSH-miyIWms4r3TSLCMihc42v7kt8tEPnqQJcgux-4,616
+olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/requests.jsonl.gz,sha256=iiVqzSVTiEk5lbq0WAiR8ujvBHHv73azRpvfuCIrEfI,215180
+olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/config.json,sha256=19NZFpCouu7oEidoUBthKUekW87pT5pzR1bX1NJV77g,592
+olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/requests.jsonl.gz,sha256=suXkEgQLUT-XK_EDyQKIoniNYNJvo4vUpe8-jyeNe-w,274302
+olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/config.json,sha256=DOHsmlO6_OMBIl-oEfKT8O0yIj89I1gTV_uvOxdiT8M,652
+olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/requests.jsonl.gz,sha256=ie673jV3ShxUhrqux3Y8YRNfAazKa8ayGEjo7hxEp1Y,237402
+olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/config.json,sha256=OB20jgxj00v3bvfsc1M1zyWGlEJvZdXBlg4L9NeGsZY,658
+olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/requests.jsonl.gz,sha256=5ElEBHtBq6tBQ1hqEbg9---XkUFV3GjcMGHFXxP_urs,284843
+olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/config.json,sha256=OAZyUX7pw7cEguIsSbs_fKXiuHh1sbEkpF7x9v6ZI80,598
+olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/requests.jsonl.gz,sha256=FY1pf-fTh5BNnN5H7uN0ksm21tdC6ewKsOhaOpN3760,71330
+olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/config.json,sha256=TfwWhRHC_G17uqk60-pNROMNzzmd0rMTY5nPP0dje00,658
+olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/requests.jsonl.gz,sha256=FIBdOQSDoQ99gDEpHYHYTmhW5qfClVP-rh3ll2I0fDQ,231341
 olmo_eval/oe_eval_tasks/boolq/mc_5shot/config.json,sha256=87GTyDGser1tWfSWmktZ1X17jKXU1EZzHOJLMSbVspA,632
 olmo_eval/oe_eval_tasks/boolq/mc_5shot/requests.jsonl.gz,sha256=uZ9ZkbFkiUn4XcCzypgPscTFTrVDexVC1L-e6zBiEMg,393249
 olmo_eval/oe_eval_tasks/boolq/rc_0shot/config.json,sha256=d1GKQMIX1cUgnZHlUe9kgAZsgkMc1N2GnMlyhccO9pE,509

@@ -703,8 +716,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
 olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
 olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
 olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
-ai2_olmo_eval-0.7.
-ai2_olmo_eval-0.7.
-ai2_olmo_eval-0.7.
-ai2_olmo_eval-0.7.
-ai2_olmo_eval-0.7.0.dist-info/RECORD,,
+ai2_olmo_eval-0.7.2.dist-info/METADATA,sha256=PKJfkoDu4hrLzb6NA1MDfXOjZnUxQ4WFpJouWU1Cr_4,14398
+ai2_olmo_eval-0.7.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+ai2_olmo_eval-0.7.2.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
+ai2_olmo_eval-0.7.2.dist-info/RECORD,,
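RECORD is a CSV of (path, hash, size) rows, so the additions above can be reproduced by diffing the two wheels' path sets. A sketch under the same filename assumptions as the previous snippet:

    import csv
    import io
    import zipfile

    def record_paths(wheel_path: str, dist_info: str) -> set:
        with zipfile.ZipFile(wheel_path) as wheel:
            text = wheel.read(f"{dist_info}/RECORD").decode("utf-8")
        # only the path column matters for a presence diff
        return {row[0] for row in csv.reader(io.StringIO(text)) if row}

    old = record_paths("ai2_olmo_eval-0.7.0-py3-none-any.whl", "ai2_olmo_eval-0.7.0.dist-info")
    new = record_paths("ai2_olmo_eval-0.7.2-py3-none-any.whl", "ai2_olmo_eval-0.7.2.dist-info")
    print(sorted(new - old))  # the basic_skills_* files plus the renamed dist-info entries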
olmo_eval/metrics.py
CHANGED
@@ -37,12 +37,26 @@ class ICLMetric(Metric):
         self.add_state("bpbs", default=[], dist_reduce_fx=dist_combine_lists)
         self.add_state("labels", default=[], dist_reduce_fx=dist_combine_lists)
 
+        self.add_state(
+            "loglikelihoods_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists
+        )
+        self.add_state("celosses_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
+        self.add_state("bpbs_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
+
     def reset(self):
         self.loglikelihoods: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.celosses: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.bpbs: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.labels: List[Tuple[Optional[int], Optional[int], Optional[int]]] = []
 
+        self.loglikelihoods_no_leading_space: List[
+            Tuple[Optional[int], Optional[int], Optional[float]]
+        ] = []
+        self.celosses_no_leading_space: List[
+            Tuple[Optional[int], Optional[int], Optional[float]]
+        ] = []
+        self.bpbs_no_leading_space: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
+
     def update(
         self,
         batch: Dict[str, Any],
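The five new states mirror the existing ones: list-valued torchmetrics states whose per-rank contents get concatenated when states are synced. A rough sketch of what a list-combining reducer such as dist_combine_lists typically does (the real one lives in olmo_eval and may differ in detail):

    def dist_combine_lists(per_rank):
        # hypothetical reducer: flatten the per-rank lists into one, in rank order
        return [item for rank_list in per_rank for item in rank_list]

    print(dist_combine_lists([[(0, 0, -1.2)], [(1, 0, -0.8)]]))
    # [(0, 0, -1.2), (1, 0, -0.8)]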
@@ -56,6 +70,11 @@ class ICLMetric(Metric):
             self.loglikelihoods.append((None, None, None))
             self.celosses.append((None, None, None))
             self.bpbs.append((None, None, None))
+
+            self.loglikelihoods_no_leading_space.append((None, None, None))
+            self.celosses_no_leading_space.append((None, None, None))
+            self.bpbs_no_leading_space.append((None, None, None))
+
             self.labels.append((None, None, None))
             return
 

@@ -82,6 +101,9 @@ class ICLMetric(Metric):
             log_likelihood: torch.Tensor
             celoss: torch.Tensor
             bpb: torch.Tensor
+            log_likelihood_no_leading_space: torch.Tensor
+            celoss_no_leading_space: torch.Tensor
+            bpb_no_leading_space: torch.Tensor
             if self.metric_type == "pmi_dc":
                 assert dc_lm_logits is not None
                 # get domain conditional continuation logits: [cont_len, vocab]

@@ -96,6 +118,10 @@ class ICLMetric(Metric):
                 )
                 celoss = -log_likelihood
                 bpb = -log_likelihood  # the normalization factors cancel out
+
+                log_likelihood_no_leading_space = log_likelihood
+                celoss_no_leading_space = celoss
+                bpb_no_leading_space = bpb
             elif self.metric_type == "acc" or self.metric_type == "f1":
                 # gather log-probs at continuation token indices
                 log_likelihood = torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
@@ -108,6 +134,19 @@ class ICLMetric(Metric):
                     / batch["cont_byte_len"][idx]
                     * LOG_2_OF_E
                 )
+
+                log_likelihood_no_leading_space = torch.gather(
+                    lm_cont_logits, 1, cont_tokens.unsqueeze(-1)
+                ).sum()
+                celoss_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                bpb_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_byte_len_no_leading_space"][idx]
+                    * LOG_2_OF_E
+                )
             elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
                 log_likelihood = (
                     torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()

@@ -122,23 +161,46 @@ class ICLMetric(Metric):
                     / batch["cont_byte_len"][idx]
                     * LOG_2_OF_E
                 )
+
+                log_likelihood_no_leading_space = (
+                    torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                celoss_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                bpb_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_byte_len_no_leading_space"][idx]
+                    * LOG_2_OF_E
+                )
             else:
                 raise ValueError(self.metric_type)
 
-            self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
             self.labels.append((doc_id, cont_id, int(batch["label_id"][idx])))
+            self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
             self.celosses.append((doc_id, cont_id, float(celoss)))
             self.bpbs.append((doc_id, cont_id, float(bpb)))
 
+            self.loglikelihoods_no_leading_space.append(
+                (doc_id, cont_id, float(log_likelihood_no_leading_space))
+            )
+            self.celosses_no_leading_space.append((doc_id, cont_id, float(celoss_no_leading_space)))
+            self.bpbs_no_leading_space.append((doc_id, cont_id, float(bpb_no_leading_space)))
+
     def compute(self) -> Dict[str, torch.Tensor]:
         # Task "suffix" -> tensor
 
         # states should have been synced from all accelerators at this point
         # account for duplicates here because of DistributedSampler compensating for drop_last=False
         loglikelihood_dict: Dict[int, Dict[int, float]] = {}
+        loglikelihood_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
         label_dict: Dict[int, int] = {}
         celoss_dict: Dict[int, Dict[int, float]] = {}
+        celoss_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
        bpb_dict: Dict[int, Dict[int, float]] = {}
+        bpb_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
 
         # collect labels
         for doc_id, cont_id, label_id in self.labels:
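Both variants of each quantity share the same summed log-likelihood and differ only in the normalizer: the *_no_leading_space values (reported later as the v1 metrics, matching the legacy behavior) drop the first character before counting, while the plain values (v2, the OLMES convention) count the full continuation. A worked example with toy numbers:

    import math

    LOG_2_OF_E = math.log2(math.e)  # the same constant metrics.py multiplies by

    continuation = " Paris"  # hypothetical continuation string
    log_likelihood = -4.2    # hypothetical summed natural-log likelihood

    bpb_v1 = -log_likelihood / len(continuation[1:].encode("utf-8")) * LOG_2_OF_E  # 5 bytes
    bpb_v2 = -log_likelihood / len(continuation.encode("utf-8")) * LOG_2_OF_E      # 6 bytes
    print(round(bpb_v1, 3), round(bpb_v2, 3))  # 1.212 1.01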
@@ -159,6 +221,17 @@ class ICLMetric(Metric):
             if cont_id not in loglikelihood_dict[doc_id]:
                 loglikelihood_dict[doc_id][cont_id] = loglikelihood
 
+        # collect loglikelihoods no leading space
+        for doc_id, cont_id, loglikelihood in self.loglikelihoods_no_leading_space:
+            if doc_id is None or cont_id is None or loglikelihood is None:
+                continue
+
+            if doc_id not in loglikelihood_no_leading_space_dict:
+                loglikelihood_no_leading_space_dict[doc_id] = {}
+
+            if cont_id not in loglikelihood_no_leading_space_dict[doc_id]:
+                loglikelihood_no_leading_space_dict[doc_id][cont_id] = loglikelihood
+
         # collect celosses
         for doc_id, cont_id, celoss_val in self.celosses:
             if doc_id is None or cont_id is None or celoss_val is None:

@@ -170,6 +243,17 @@ class ICLMetric(Metric):
             if cont_id not in celoss_dict[doc_id]:
                 celoss_dict[doc_id][cont_id] = celoss_val
 
+        # collect celosses no leading space
+        for doc_id, cont_id, celoss_val in self.celosses_no_leading_space:
+            if doc_id is None or cont_id is None or celoss_val is None:
+                continue
+
+            if doc_id not in celoss_no_leading_space_dict:
+                celoss_no_leading_space_dict[doc_id] = {}
+
+            if cont_id not in celoss_no_leading_space_dict[doc_id]:
+                celoss_no_leading_space_dict[doc_id][cont_id] = celoss_val
+
         # collect bpbs
         for doc_id, cont_id, bpb_val in self.bpbs:
             if doc_id is None or cont_id is None or bpb_val is None:

@@ -181,13 +265,30 @@ class ICLMetric(Metric):
             if cont_id not in bpb_dict[doc_id]:
                 bpb_dict[doc_id][cont_id] = bpb_val
 
+        # collect bpbs no leading space
+        for doc_id, cont_id, bpb_val in self.bpbs_no_leading_space:
+            if doc_id is None or cont_id is None or bpb_val is None:
+                continue
+
+            if doc_id not in bpb_no_leading_space_dict:
+                bpb_no_leading_space_dict[doc_id] = {}
+
+            if cont_id not in bpb_no_leading_space_dict[doc_id]:
+                bpb_no_leading_space_dict[doc_id][cont_id] = bpb_val
+
         # compute acc
+        correct_no_leading_space = []
         correct = []
         celoss = []
+        celoss_no_leading_space = []
         bpb = []
+        bpb_no_leading_space = []
         soft_score = []
         soft_log_score = []
+        soft_score_no_leading_space = []
+        soft_log_score_no_leading_space = []
         preds: Optional[List[float]] = None
+        preds_no_leading_space: Optional[List[float]] = None
         labels: Optional[List[int]] = None
         if self.metric_type == "f1":
             preds = []
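Each of the new collect loops repeats the first-write-wins de-duplication already used for the existing states: DistributedSampler can emit the same (doc_id, cont_id) pair on several ranks when compensating for drop_last=False, and only the first score seen is kept. The pattern in isolation, with toy data:

    scores = [(0, 0, -1.2), (0, 1, -0.7), (0, 0, -9.9)]  # last entry duplicates the first key
    by_doc = {}
    for doc_id, cont_id, value in scores:
        by_doc.setdefault(doc_id, {}).setdefault(cont_id, value)
    print(by_doc)  # {0: {0: -1.2, 1: -0.7}} -- the duplicate's value is ignored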
@@ -197,15 +298,25 @@ class ICLMetric(Metric):
             # each doc_id might have a different number of continuation
             num_continuations = len(loglikelihood_dict[doc_id].keys())
             loglikelihoods = torch.tensor([-float("inf")] * num_continuations)
+            loglikelihoods_no_leading_space = torch.tensor([-float("inf")] * num_continuations)
             celosses = torch.tensor([float("inf")] * num_continuations)
+            celosses_no_leading_space = torch.tensor([float("inf")] * num_continuations)
             bpbs = torch.tensor([float("inf")] * num_continuations)
+            bpbs_no_leading_space = torch.tensor([float("inf")] * num_continuations)
 
             skip_document = False
             for cont_id in loglikelihood_dict[doc_id]:
                 try:
                     loglikelihoods[cont_id] = loglikelihood_dict[doc_id][cont_id]
+                    loglikelihoods_no_leading_space[cont_id] = loglikelihood_no_leading_space_dict[
+                        doc_id
+                    ][cont_id]
                     celosses[cont_id] = celoss_dict[doc_id][cont_id]
+                    celosses_no_leading_space[cont_id] = celoss_no_leading_space_dict[doc_id][
+                        cont_id
+                    ]
                     bpbs[cont_id] = bpb_dict[doc_id][cont_id]
+                    bpbs_no_leading_space[cont_id] = bpb_no_leading_space_dict[doc_id][cont_id]
                 except IndexError:
                     # We didn't process all of the continuations, so skip this document.
                     skip_document = True
@@ -216,39 +327,83 @@ class ICLMetric(Metric):
 
             if self.metric_type == "ce_loss":
                 celoss.append(celosses[0])  # Only one answer is scored
+                celoss_no_leading_space.append(celosses_no_leading_space[0])
             elif self.metric_type == "bpb":
                 bpb.append(bpbs[0])  # Only one answer is scored
+                bpb_no_leading_space.append(bpbs_no_leading_space[0])
             elif self.metric_type == "f1":
                 assert preds is not None
+                assert preds_no_leading_space is not None
                 assert labels is not None
                 preds.append(torch.argmax(loglikelihoods).item())
+                preds_no_leading_space.append(torch.argmax(loglikelihoods_no_leading_space).item())
                 labels.append(label_dict[doc_id])
             else:
                 correct.append(
                     1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0
                 )
+                correct_no_leading_space.append(
+                    1.0
+                    if torch.argmax(loglikelihoods_no_leading_space).item() == label_dict[doc_id]
+                    else 0.0
+                )
                 celoss.append(celosses[label_dict[doc_id]].item())
+                celoss_no_leading_space.append(celosses_no_leading_space[label_dict[doc_id]].item())
                 bpb.append(bpbs[label_dict[doc_id]].item())
+                bpb_no_leading_space.append(bpbs_no_leading_space[label_dict[doc_id]].item())
                 soft_score.append(torch.softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item())
                 soft_log_score.append(
                     torch.log_softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()
                 )
+                soft_score_no_leading_space.append(
+                    torch.softmax(loglikelihoods_no_leading_space, dim=0)[label_dict[doc_id]].item()
+                )
+                soft_log_score_no_leading_space.append(
+                    torch.log_softmax(loglikelihoods_no_leading_space, dim=0)[
+                        label_dict[doc_id]
+                    ].item()
+                )
+
+        # v1 vs. v2 corresponds to whether we add a 1 to the num chars or num bytes when normalizing the answer length. See https://github.com/allenai/OLMo-in-loop-evals/pull/6
 
         if self.metric_type == "f1":
             assert preds is not None
             assert labels is not None
             # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
             score = f1_score(labels, preds, pos_label=0)
-
+            score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+            return {
+                "f1_v1": torch.tensor(score),
+                "f1_v2": torch.tensor(score_no_leading_space),
+            }
         elif self.metric_type == "ce_loss":
-            return {
+            return {
+                "ce_loss_v1": torch.tensor(
+                    sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                ),
+                "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+            }
         elif self.metric_type == "bpb":
-            return {
+            return {
+                "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+            }
         else:
             return {
-                self.metric_type: torch.tensor(sum(correct) / len(correct)),
-                "
-                "
-
-
+                f"{self.metric_type}_v1": torch.tensor(sum(correct) / len(correct)),
+                f"{self.metric_type}_v2": torch.tensor(sum(correct) / len(correct)),
+                "ce_loss_v1": torch.tensor(
+                    sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                ),
+                "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+                "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+                "soft_v1": torch.tensor(
+                    sum(soft_score_no_leading_space) / len(soft_score_no_leading_space)
+                ),
+                "soft_v2": torch.tensor(sum(soft_score) / len(soft_score)),
+                "soft_log_v1": torch.tensor(
+                    sum(soft_log_score_no_leading_space) / len(soft_log_score_no_leading_space)
+                ),
+                "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
             }
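The net effect on compute() is that every metric is now reported twice. Note that, as written in this diff, the accuracy-style keys feed the same `correct` list into both variants, so acc_v1 and acc_v2 come out identical, while the loss-style keys genuinely differ. A sketch of the returned dict for an acc task, with hypothetical values:

    import torch

    example = {
        "acc_v1": torch.tensor(0.62),
        "acc_v2": torch.tensor(0.62),      # same list feeds both keys in this diff
        "ce_loss_v1": torch.tensor(1.84),  # normalized without the leading space
        "ce_loss_v2": torch.tensor(1.73),  # normalized over the full continuation
        "bpb_v1": torch.tensor(1.12),
        "bpb_v2": torch.tensor(1.05),
        "soft_v1": torch.tensor(0.41),
        "soft_v2": torch.tensor(0.41),
        "soft_log_v1": torch.tensor(-0.89),
        "soft_log_v2": torch.tensor(-0.89),
    }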
olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_arithmetic",
+  "task_hash": "56711b967c78d896ef51ba00aef5cfb0",
+  "task_config": {
+    "dataset_path": "basic_skills_arithmetic",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_arithmetic:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "arithmetic",
+    "task_name": "basic_skills_arithmetic",
+    "version": 0,
+    "task_core": "basic_skills_arithmetic"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}
olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_coding",
+  "task_hash": "d748d1d8ba506d3d234eed529ef62c3e",
+  "task_config": {
+    "dataset_path": "basic_skills_coding",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_coding:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "coding",
+    "task_name": "basic_skills_coding",
+    "version": 0,
+    "task_core": "basic_skills_coding"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}

olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/requests.jsonl.gz
ADDED

Binary file
olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_common_knowledge",
+  "task_hash": "51e88e759602f9085a8c779da375d833",
+  "task_config": {
+    "dataset_path": "basic_skills_common_knowledge",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_common_knowledge:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "common_knowledge",
+    "task_name": "basic_skills_common_knowledge",
+    "version": 0,
+    "task_core": "basic_skills_common_knowledge"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}
olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_logical_reasoning",
+  "task_hash": "a3d406a2f4224604b7e6bbf68050691d",
+  "task_config": {
+    "dataset_path": "basic_skills_logical_reasoning",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_logical_reasoning:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "logical_reasoning",
+    "task_name": "basic_skills_logical_reasoning",
+    "version": 0,
+    "task_core": "basic_skills_logical_reasoning"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}
olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_pattern",
+  "task_hash": "67983750bfb70a3b5cc34dcd67ee3c6a",
+  "task_config": {
+    "dataset_path": "basic_skills_pattern",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_pattern:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "pattern",
+    "task_name": "basic_skills_pattern",
+    "version": 0,
+    "task_core": "basic_skills_pattern"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}

olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/requests.jsonl.gz
ADDED

Binary file
olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/config.json
ADDED

@@ -0,0 +1,23 @@
+{
+  "task_name": "basic_skills_string_operations",
+  "task_hash": "8e5fdc7697f1bc7b0c9487a6fa682e45",
+  "task_config": {
+    "dataset_path": "basic_skills_string_operations",
+    "primary_metric": "acc_per_token",
+    "split": "validation",
+    "num_shots": 5,
+    "metadata": {
+      "regimes": [
+        "OLMES-v0.1"
+      ],
+      "alias": "basic_skills_string_operations:rc::olmes"
+    },
+    "generation_kwargs": {},
+    "context_kwargs": {},
+    "dataset_name": "string_operations",
+    "task_name": "basic_skills_string_operations",
+    "version": 0,
+    "task_core": "basic_skills_string_operations"
+  },
+  "current_date": "2025-05-12 00:06:28 UTC"
+}
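The six configs above ship inside the wheel, so they can be read back with importlib.resources; the resource path mirrors the RECORD entries earlier in this diff. A minimal sketch:

    import json
    from importlib import resources

    cfg_file = resources.files("olmo_eval").joinpath(
        "oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json"
    )
    cfg = json.loads(cfg_file.read_text())
    print(cfg["task_config"]["primary_metric"])  # acc_per_token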
olmo_eval/tasks.py
CHANGED
@@ -103,8 +103,15 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             )
 
         for cont_id, continuation_str in enumerate(continuations):
-
-
+            # The original implementation did not count the first character (usually the leading space) as
+            # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+            # do this, but we track both for backwards compatibility.
+            cont_str_len_no_leading_space = len(continuation_str) - 1
+            cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+            cont_str_len = len(continuation_str)
+            cont_byte_len = len(continuation_str.encode("utf-8"))
+
             continuation = self.token_encode(continuation_str)
 
             # query, remove last token from continuation, truncate from left is longer than model ctx length
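The two length conventions described in the new comment, side by side for a typical single-letter continuation:

    s = " A"  # continuations usually carry a leading space
    print(len(s) - 1, len(s[1:].encode("utf-8")))  # 1 1 -> legacy count, leading space dropped
    print(len(s), len(s.encode("utf-8")))          # 2 2 -> OLMES count, full string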
@@ -131,6 +138,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                 ),  # even if query has last token removed, LM will output same cont len
                 "cont_str_len": cont_str_len,
                 "cont_byte_len": cont_byte_len,
+                "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                 "query": query,  # remove last token from continuation
                 "dc_query": dc_query,
                 "label_id": label_id,

@@ -209,6 +218,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         cont_lens = []
         cont_str_lens = []
         cont_byte_lens = []
+        cont_str_len_no_leading_space = []
+        cont_byte_len_no_leading_space = []
         queries = []
         dc_queries = []
         label_ids = []

@@ -232,6 +243,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             cont_lens.append(sample["cont_len"])
             cont_str_lens.append(sample["cont_str_len"])
             cont_byte_lens.append(sample["cont_byte_len"])
+            cont_str_len_no_leading_space.append(sample["cont_str_len_no_leading_space"])
+            cont_byte_len_no_leading_space.append(sample["cont_byte_len_no_leading_space"])
 
             queries.append(
                 torch.LongTensor(

@@ -261,6 +274,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             ),  # since query has last token removed from continuation
             "cont_str_len": torch.LongTensor(cont_str_lens),
             "cont_byte_len": torch.LongTensor(cont_byte_lens),
+            "cont_str_len_no_leading_space": torch.LongTensor(cont_str_len_no_leading_space),
+            "cont_byte_len_no_leading_space": torch.LongTensor(cont_byte_len_no_leading_space),
             "input_ids": torch.stack(queries),
             "dc_input_ids": torch.stack(dc_queries),
             "label_id": torch.LongTensor(label_ids),
@@ -456,8 +471,15 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
 
         continuation_str = self.doc_to_continuations(doc)
         label_id = self.doc_to_label(doc)
-
-
+
+        # The original implementation did not count the first character (usually the leading space) as
+        # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+        # do this, but we track both for backwards compatibility.
+        cont_str_len_no_leading_space = len(continuation_str) - 1
+        cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+        cont_str_len = len(continuation_str)
+        cont_byte_len = len(continuation_str.encode("utf-8"))
 
         # tokenize
         continuation = self.token_encode(continuation_str)

@@ -488,6 +510,8 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
             ),  # even if query has last token removed, LM will output same cont len
             "cont_str_len": cont_str_len,
             "cont_byte_len": cont_byte_len,
+            "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+            "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
             "query": query,  # remove last token from continuation
             "dc_query": dc_query,
             "label_id": label_id,
@@ -1524,8 +1548,16 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                     f"Sample doc from ({self.dataset_path}, {ds_name}):"
                     + f"\ndoc_text: {doc_text}\ncontinuation: {continuation_str}"
                 )
-
-
+
+            # The original implementation did not count the first character (usually the leading space) as
+            # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+            # do this, but we track both for backwards compatibility.
+            cont_str_len_no_leading_space = len(continuation_str) - 1
+            cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+            cont_str_len = len(continuation_str)
+            cont_byte_len = len(continuation_str.encode("utf-8"))
+
             continuation = self.token_encode(continuation_str)
 
             # query, remove last token from continuation, truncate from left is longer than model ctx length

@@ -1552,6 +1584,8 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                 ),  # even if query has last token removed, LM will output same cont len
                 "cont_str_len": cont_str_len,
                 "cont_byte_len": cont_byte_len,
+                "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                 "query": query,  # remove last token from continuation
                 "dc_query": dc_query,
                 "label_id": label_id,
@@ -1670,6 +1704,46 @@ LABEL_TO_TASK_MAP_ORIG = {
         OEEvalTask,
         {"dataset_path": "arc_easy", "dataset_name": "rc_5shot", "metric_type": "acc"},
     ),
+    "basic_skills_arithmetic_rc_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "basic_skills_arithmetic",
+            "dataset_name": "rc_5shot",
+            "metric_type": "acc",
+        },
+    ),
+    "basic_skills_coding_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "basic_skills_coding", "dataset_name": "rc_5shot", "metric_type": "acc"},
+    ),
+    "basic_skills_common_knowledge_rc_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "basic_skills_common_knowledge",
+            "dataset_name": "rc_5shot",
+            "metric_type": "acc",
+        },
+    ),
+    "basic_skills_logical_reasoning_rc_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "basic_skills_logical_reasoning",
+            "dataset_name": "rc_5shot",
+            "metric_type": "acc",
+        },
+    ),
+    "basic_skills_pattern_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "basic_skills_pattern", "dataset_name": "rc_5shot", "metric_type": "acc"},
+    ),
+    "basic_skills_string_operations_rc_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "basic_skills_string_operations",
+            "dataset_name": "rc_5shot",
+            "metric_type": "acc",
+        },
+    ),
     "boolq_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "boolq", "dataset_name": "mc_5shot", "metric_type": "acc"},
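Each new LABEL_TO_TASK_MAP_ORIG entry follows the existing (task class, kwargs) layout; exactly how callers split the kwargs between dataset construction and metric selection is outside this diff, so this only sketches reading an entry:

    from olmo_eval.tasks import LABEL_TO_TASK_MAP_ORIG, OEEvalTask

    task_class, task_kwargs = LABEL_TO_TASK_MAP_ORIG["basic_skills_arithmetic_rc_5shot"]
    assert task_class is OEEvalTask
    print(task_kwargs["metric_type"])  # "acc" for all six new basic_skills tasks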
olmo_eval/version.py
CHANGED
{ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info/licenses}/LICENSE
File without changes

{ai2_olmo_eval-0.7.0.dist-info → ai2_olmo_eval-0.7.2.dist-info}/top_level.txt
File without changes