ai2-olmo-eval 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: ai2-olmo-eval
- Version: 0.7.0
+ Version: 0.7.2
  Summary: In-loop evaluation tasks for language modeling
  Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
  License: Apache License
@@ -234,6 +234,7 @@ Requires-Dist: boto3; extra == "dev"
  Requires-Dist: google-cloud-storage; extra == "dev"
  Provides-Extra: all
  Requires-Dist: ai2-olmo-eval[dev]; extra == "all"
+ Dynamic: license-file

  # OLMo-in-loop-evals
@@ -1,9 +1,10 @@
+ ai2_olmo_eval-0.7.2.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
  olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
- olmo_eval/metrics.py,sha256=FUiILL5NimzW8pW8IAhQaJXH7QXOOJ3axEfbK4u98UA,10818
- olmo_eval/tasks.py,sha256=kTISmy7Mc066g1koVaFz-gyNkN-HjUoNvAGV5g7gxEA,76135
+ olmo_eval/metrics.py,sha256=NcI_1B3BV-DC9RXjsSIftU-2GeF8vvU6SNyJnlYlKwU,18705
+ olmo_eval/tasks.py,sha256=QGLyF7JA2-T9mkh-N4cZGNOQp9si90yQSS41T3x5Lak,79630
  olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
  olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
- olmo_eval/version.py,sha256=iFGTs1ocPCU9s8k4bXCFrMg3w_e9hrZV2YFjSF3wgcE,308
+ olmo_eval/version.py,sha256=QWjPfx79C2NOQw2G7iDEsM4FKsLiGLCLNDzEx7EImf8,308
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -599,6 +600,18 @@ olmo_eval/oe_eval_tasks/arc_easy/val_mc_5shot/config.json,sha256=CEgPNm226vxmMim
  olmo_eval/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz,sha256=LZ7XuWwDo6zJTqhgpZgHNj6yi-xOXb-TQxl9yxB9gVg,114271
  olmo_eval/oe_eval_tasks/arc_easy/val_rc_5shot/config.json,sha256=LeNP534voujfcp9ph8SKHfnfYPjfSu8ik3HWiXt3TFM,761
  olmo_eval/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz,sha256=28UmHQnAB2DIlfYbqhuhJ4AjAVLDAHWWoEmaHlI-UKU,202290
+ olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json,sha256=_gSH-miyIWms4r3TSLCMihc42v7kt8tEPnqQJcgux-4,616
+ olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/requests.jsonl.gz,sha256=iiVqzSVTiEk5lbq0WAiR8ujvBHHv73azRpvfuCIrEfI,215180
+ olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/config.json,sha256=19NZFpCouu7oEidoUBthKUekW87pT5pzR1bX1NJV77g,592
+ olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/requests.jsonl.gz,sha256=suXkEgQLUT-XK_EDyQKIoniNYNJvo4vUpe8-jyeNe-w,274302
+ olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/config.json,sha256=DOHsmlO6_OMBIl-oEfKT8O0yIj89I1gTV_uvOxdiT8M,652
+ olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/requests.jsonl.gz,sha256=ie673jV3ShxUhrqux3Y8YRNfAazKa8ayGEjo7hxEp1Y,237402
+ olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/config.json,sha256=OB20jgxj00v3bvfsc1M1zyWGlEJvZdXBlg4L9NeGsZY,658
+ olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/requests.jsonl.gz,sha256=5ElEBHtBq6tBQ1hqEbg9---XkUFV3GjcMGHFXxP_urs,284843
+ olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/config.json,sha256=OAZyUX7pw7cEguIsSbs_fKXiuHh1sbEkpF7x9v6ZI80,598
+ olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/requests.jsonl.gz,sha256=FY1pf-fTh5BNnN5H7uN0ksm21tdC6ewKsOhaOpN3760,71330
+ olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/config.json,sha256=TfwWhRHC_G17uqk60-pNROMNzzmd0rMTY5nPP0dje00,658
+ olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/requests.jsonl.gz,sha256=FIBdOQSDoQ99gDEpHYHYTmhW5qfClVP-rh3ll2I0fDQ,231341
  olmo_eval/oe_eval_tasks/boolq/mc_5shot/config.json,sha256=87GTyDGser1tWfSWmktZ1X17jKXU1EZzHOJLMSbVspA,632
  olmo_eval/oe_eval_tasks/boolq/mc_5shot/requests.jsonl.gz,sha256=uZ9ZkbFkiUn4XcCzypgPscTFTrVDexVC1L-e6zBiEMg,393249
  olmo_eval/oe_eval_tasks/boolq/rc_0shot/config.json,sha256=d1GKQMIX1cUgnZHlUe9kgAZsgkMc1N2GnMlyhccO9pE,509
@@ -703,8 +716,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
  olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
  olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
  olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
- ai2_olmo_eval-0.7.0.dist-info/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
- ai2_olmo_eval-0.7.0.dist-info/METADATA,sha256=pv8KUZwaSxYUsZk59yyv_ffc3iXzHPf48BuNUNkgweE,14376
- ai2_olmo_eval-0.7.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
- ai2_olmo_eval-0.7.0.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
- ai2_olmo_eval-0.7.0.dist-info/RECORD,,
+ ai2_olmo_eval-0.7.2.dist-info/METADATA,sha256=PKJfkoDu4hrLzb6NA1MDfXOjZnUxQ4WFpJouWU1Cr_4,14398
+ ai2_olmo_eval-0.7.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+ ai2_olmo_eval-0.7.2.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
+ ai2_olmo_eval-0.7.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.0.0)
+ Generator: setuptools (80.7.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
olmo_eval/metrics.py CHANGED
@@ -37,12 +37,26 @@ class ICLMetric(Metric):
          self.add_state("bpbs", default=[], dist_reduce_fx=dist_combine_lists)
          self.add_state("labels", default=[], dist_reduce_fx=dist_combine_lists)

+         self.add_state(
+             "loglikelihoods_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists
+         )
+         self.add_state("celosses_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
+         self.add_state("bpbs_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
+
      def reset(self):
          self.loglikelihoods: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
          self.celosses: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
          self.bpbs: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
          self.labels: List[Tuple[Optional[int], Optional[int], Optional[int]]] = []

+         self.loglikelihoods_no_leading_space: List[
+             Tuple[Optional[int], Optional[int], Optional[float]]
+         ] = []
+         self.celosses_no_leading_space: List[
+             Tuple[Optional[int], Optional[int], Optional[float]]
+         ] = []
+         self.bpbs_no_leading_space: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
+
      def update(
          self,
          batch: Dict[str, Any],
@@ -56,6 +70,11 @@ class ICLMetric(Metric):
              self.loglikelihoods.append((None, None, None))
              self.celosses.append((None, None, None))
              self.bpbs.append((None, None, None))
+
+             self.loglikelihoods_no_leading_space.append((None, None, None))
+             self.celosses_no_leading_space.append((None, None, None))
+             self.bpbs_no_leading_space.append((None, None, None))
+
              self.labels.append((None, None, None))
              return

@@ -82,6 +101,9 @@ class ICLMetric(Metric):
              log_likelihood: torch.Tensor
              celoss: torch.Tensor
              bpb: torch.Tensor
+             log_likelihood_no_leading_space: torch.Tensor
+             celoss_no_leading_space: torch.Tensor
+             bpb_no_leading_space: torch.Tensor
              if self.metric_type == "pmi_dc":
                  assert dc_lm_logits is not None
                  # get domain conditional continuation logits: [cont_len, vocab]
@@ -96,6 +118,10 @@ class ICLMetric(Metric):
                  )
                  celoss = -log_likelihood
                  bpb = -log_likelihood # the normalization factors cancel out
+
+                 log_likelihood_no_leading_space = log_likelihood
+                 celoss_no_leading_space = celoss
+                 bpb_no_leading_space = bpb
              elif self.metric_type == "acc" or self.metric_type == "f1":
                  # gather log-probs at continuation token indices
                  log_likelihood = torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
@@ -108,6 +134,19 @@ class ICLMetric(Metric):
                      / batch["cont_byte_len"][idx]
                      * LOG_2_OF_E
                  )
+
+                 log_likelihood_no_leading_space = torch.gather(
+                     lm_cont_logits, 1, cont_tokens.unsqueeze(-1)
+                 ).sum()
+                 celoss_no_leading_space = (
+                     -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                     / batch["cont_str_len_no_leading_space"][idx]
+                 )
+                 bpb_no_leading_space = (
+                     -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                     / batch["cont_byte_len_no_leading_space"][idx]
+                     * LOG_2_OF_E
+                 )
              elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
                  log_likelihood = (
                      torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
@@ -122,23 +161,46 @@ class ICLMetric(Metric):
                      / batch["cont_byte_len"][idx]
                      * LOG_2_OF_E
                  )
+
+                 log_likelihood_no_leading_space = (
+                     torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                     / batch["cont_str_len_no_leading_space"][idx]
+                 )
+                 celoss_no_leading_space = (
+                     -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                     / batch["cont_str_len_no_leading_space"][idx]
+                 )
+                 bpb_no_leading_space = (
+                     -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                     / batch["cont_byte_len_no_leading_space"][idx]
+                     * LOG_2_OF_E
+                 )
              else:
                  raise ValueError(self.metric_type)

-             self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
              self.labels.append((doc_id, cont_id, int(batch["label_id"][idx])))
+             self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
              self.celosses.append((doc_id, cont_id, float(celoss)))
              self.bpbs.append((doc_id, cont_id, float(bpb)))

+             self.loglikelihoods_no_leading_space.append(
+                 (doc_id, cont_id, float(log_likelihood_no_leading_space))
+             )
+             self.celosses_no_leading_space.append((doc_id, cont_id, float(celoss_no_leading_space)))
+             self.bpbs_no_leading_space.append((doc_id, cont_id, float(bpb_no_leading_space)))
+
      def compute(self) -> Dict[str, torch.Tensor]:
          # Task "suffix" -> tensor

          # states should have been synced from all accelerators at this point
          # account for duplicates here because of DistributedSampler compensating for drop_last=False
          loglikelihood_dict: Dict[int, Dict[int, float]] = {}
+         loglikelihood_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
          label_dict: Dict[int, int] = {}
          celoss_dict: Dict[int, Dict[int, float]] = {}
+         celoss_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
          bpb_dict: Dict[int, Dict[int, float]] = {}
+         bpb_no_leading_space_dict: Dict[int, Dict[int, float]] = {}

          # collect labels
          for doc_id, cont_id, label_id in self.labels:
@@ -159,6 +221,17 @@ class ICLMetric(Metric):
              if cont_id not in loglikelihood_dict[doc_id]:
                  loglikelihood_dict[doc_id][cont_id] = loglikelihood

+         # collect loglikelihoods no leading space
+         for doc_id, cont_id, loglikelihood in self.loglikelihoods_no_leading_space:
+             if doc_id is None or cont_id is None or loglikelihood is None:
+                 continue
+
+             if doc_id not in loglikelihood_no_leading_space_dict:
+                 loglikelihood_no_leading_space_dict[doc_id] = {}
+
+             if cont_id not in loglikelihood_no_leading_space_dict[doc_id]:
+                 loglikelihood_no_leading_space_dict[doc_id][cont_id] = loglikelihood
+
          # collect celosses
          for doc_id, cont_id, celoss_val in self.celosses:
              if doc_id is None or cont_id is None or celoss_val is None:
@@ -170,6 +243,17 @@ class ICLMetric(Metric):
              if cont_id not in celoss_dict[doc_id]:
                  celoss_dict[doc_id][cont_id] = celoss_val

+         # collect celosses no leading space
+         for doc_id, cont_id, celoss_val in self.celosses_no_leading_space:
+             if doc_id is None or cont_id is None or celoss_val is None:
+                 continue
+
+             if doc_id not in celoss_no_leading_space_dict:
+                 celoss_no_leading_space_dict[doc_id] = {}
+
+             if cont_id not in celoss_no_leading_space_dict[doc_id]:
+                 celoss_no_leading_space_dict[doc_id][cont_id] = celoss_val
+
          # collect bpbs
          for doc_id, cont_id, bpb_val in self.bpbs:
              if doc_id is None or cont_id is None or bpb_val is None:
@@ -181,13 +265,30 @@ class ICLMetric(Metric):
              if cont_id not in bpb_dict[doc_id]:
                  bpb_dict[doc_id][cont_id] = bpb_val

+         # collect bpbs no leading space
+         for doc_id, cont_id, bpb_val in self.bpbs_no_leading_space:
+             if doc_id is None or cont_id is None or bpb_val is None:
+                 continue
+
+             if doc_id not in bpb_no_leading_space_dict:
+                 bpb_no_leading_space_dict[doc_id] = {}
+
+             if cont_id not in bpb_no_leading_space_dict[doc_id]:
+                 bpb_no_leading_space_dict[doc_id][cont_id] = bpb_val
+
          # compute acc
+         correct_no_leading_space = []
          correct = []
          celoss = []
+         celoss_no_leading_space = []
          bpb = []
+         bpb_no_leading_space = []
          soft_score = []
          soft_log_score = []
+         soft_score_no_leading_space = []
+         soft_log_score_no_leading_space = []
          preds: Optional[List[float]] = None
+         preds_no_leading_space: Optional[List[float]] = None
          labels: Optional[List[int]] = None
          if self.metric_type == "f1":
              preds = []
@@ -197,15 +298,25 @@ class ICLMetric(Metric):
              # each doc_id might have a different number of continuation
              num_continuations = len(loglikelihood_dict[doc_id].keys())
              loglikelihoods = torch.tensor([-float("inf")] * num_continuations)
+             loglikelihoods_no_leading_space = torch.tensor([-float("inf")] * num_continuations)
              celosses = torch.tensor([float("inf")] * num_continuations)
+             celosses_no_leading_space = torch.tensor([float("inf")] * num_continuations)
              bpbs = torch.tensor([float("inf")] * num_continuations)
+             bpbs_no_leading_space = torch.tensor([float("inf")] * num_continuations)

              skip_document = False
              for cont_id in loglikelihood_dict[doc_id]:
                  try:
                      loglikelihoods[cont_id] = loglikelihood_dict[doc_id][cont_id]
+                     loglikelihoods_no_leading_space[cont_id] = loglikelihood_no_leading_space_dict[
+                         doc_id
+                     ][cont_id]
                      celosses[cont_id] = celoss_dict[doc_id][cont_id]
+                     celosses_no_leading_space[cont_id] = celoss_no_leading_space_dict[doc_id][
+                         cont_id
+                     ]
                      bpbs[cont_id] = bpb_dict[doc_id][cont_id]
+                     bpbs_no_leading_space[cont_id] = bpb_no_leading_space_dict[doc_id][cont_id]
                  except IndexError:
                      # We didn't process all of the continuations, so skip this document.
                      skip_document = True
@@ -216,39 +327,83 @@ class ICLMetric(Metric):

              if self.metric_type == "ce_loss":
                  celoss.append(celosses[0]) # Only one answer is scored
+                 celoss_no_leading_space.append(celosses_no_leading_space[0])
              elif self.metric_type == "bpb":
                  bpb.append(bpbs[0]) # Only one answer is scored
+                 bpb_no_leading_space.append(bpbs_no_leading_space[0])
              elif self.metric_type == "f1":
                  assert preds is not None
+                 assert preds_no_leading_space is not None
                  assert labels is not None
                  preds.append(torch.argmax(loglikelihoods).item())
+                 preds_no_leading_space.append(torch.argmax(loglikelihoods_no_leading_space).item())
                  labels.append(label_dict[doc_id])
              else:
                  correct.append(
                      1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0
                  )
+                 correct_no_leading_space.append(
+                     1.0
+                     if torch.argmax(loglikelihoods_no_leading_space).item() == label_dict[doc_id]
+                     else 0.0
+                 )
                  celoss.append(celosses[label_dict[doc_id]].item())
+                 celoss_no_leading_space.append(celosses_no_leading_space[label_dict[doc_id]].item())
                  bpb.append(bpbs[label_dict[doc_id]].item())
+                 bpb_no_leading_space.append(bpbs_no_leading_space[label_dict[doc_id]].item())
                  soft_score.append(torch.softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item())
                  soft_log_score.append(
                      torch.log_softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()
                  )
+                 soft_score_no_leading_space.append(
+                     torch.softmax(loglikelihoods_no_leading_space, dim=0)[label_dict[doc_id]].item()
+                 )
+                 soft_log_score_no_leading_space.append(
+                     torch.log_softmax(loglikelihoods_no_leading_space, dim=0)[
+                         label_dict[doc_id]
+                     ].item()
+                 )
+
+         # v1 vs. v2 corresponds to whether we add a 1 to the num chars or num bytes when normalizing the answer length. See https://github.com/allenai/OLMo-in-loop-evals/pull/6

          if self.metric_type == "f1":
              assert preds is not None
              assert labels is not None
              # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
              score = f1_score(labels, preds, pos_label=0)
-             return {"f1": torch.tensor(score)}
+             score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+             return {
+                 "f1_v1": torch.tensor(score),
+                 "f1_v2": torch.tensor(score_no_leading_space),
+             }
          elif self.metric_type == "ce_loss":
-             return {"ce_loss": torch.tensor(sum(celoss) / len(celoss))}
+             return {
+                 "ce_loss_v1": torch.tensor(
+                     sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                 ),
+                 "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+             }
          elif self.metric_type == "bpb":
-             return {"bpb": torch.tensor(sum(bpb) / len(bpb))}
+             return {
+                 "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                 "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+             }
          else:
              return {
-                 self.metric_type: torch.tensor(sum(correct) / len(correct)),
-                 "ce_loss": torch.tensor(sum(celoss) / len(celoss)),
-                 "bpb": torch.tensor(sum(bpb) / len(bpb)),
-                 "soft": torch.tensor(sum(soft_score) / len(soft_score)),
-                 "soft_log": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
+                 f"{self.metric_type}_v1": torch.tensor(sum(correct) / len(correct)),
+                 f"{self.metric_type}_v2": torch.tensor(sum(correct) / len(correct)),
+                 "ce_loss_v1": torch.tensor(
+                     sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                 ),
+                 "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+                 "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                 "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+                 "soft_v1": torch.tensor(
+                     sum(soft_score_no_leading_space) / len(soft_score_no_leading_space)
+                 ),
+                 "soft_v2": torch.tensor(sum(soft_score) / len(soft_score)),
+                 "soft_log_v1": torch.tensor(
+                     sum(soft_log_score_no_leading_space) / len(soft_log_score_no_leading_space)
+                 ),
+                 "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
              }
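
The metrics.py changes above score every task under two length normalizations: the existing states now use the full continuation string (OLMES-style), while the new "*_no_leading_space" states reproduce the 0.7.0 behaviour of excluding the leading space, and compute() reports both under "_v1" (leading space excluded) and "_v2" (full string) keys. A minimal sketch of the difference follows; the continuation string and the summed log-probability are made-up values for illustration only, not taken from the package.

import math

continuation_str = " Paris"
sum_logprob = -3.2  # hypothetical sum of continuation-token log-probs, in nats

# v1 (0.7.0 behaviour): the leading space is excluded from the length normalization.
str_len_v1 = len(continuation_str) - 1                    # 5 characters
byte_len_v1 = len(continuation_str[1:].encode("utf-8"))   # 5 bytes

# v2 (full-string normalization, added in 0.7.2): the whole continuation is counted.
str_len_v2 = len(continuation_str)                        # 6 characters
byte_len_v2 = len(continuation_str.encode("utf-8"))       # 6 bytes

LOG_2_OF_E = 1 / math.log(2)  # converts nats to bits, as in metrics.py

ce_loss_v1 = -sum_logprob / str_len_v1            # 0.64 nats per character
ce_loss_v2 = -sum_logprob / str_len_v2            # ~0.53 nats per character
bpb_v1 = -sum_logprob / byte_len_v1 * LOG_2_OF_E  # bits per byte, old normalization
bpb_v2 = -sum_logprob / byte_len_v2 * LOG_2_OF_E  # bits per byte, full-string normalization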
olmo_eval/oe_eval_tasks/basic_skills_arithmetic/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_arithmetic",
+   "task_hash": "56711b967c78d896ef51ba00aef5cfb0",
+   "task_config": {
+     "dataset_path": "basic_skills_arithmetic",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_arithmetic:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "arithmetic",
+     "task_name": "basic_skills_arithmetic",
+     "version": 0,
+     "task_core": "basic_skills_arithmetic"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/oe_eval_tasks/basic_skills_coding/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_coding",
+   "task_hash": "d748d1d8ba506d3d234eed529ef62c3e",
+   "task_config": {
+     "dataset_path": "basic_skills_coding",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_coding:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "coding",
+     "task_name": "basic_skills_coding",
+     "version": 0,
+     "task_core": "basic_skills_coding"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/oe_eval_tasks/basic_skills_common_knowledge/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_common_knowledge",
+   "task_hash": "51e88e759602f9085a8c779da375d833",
+   "task_config": {
+     "dataset_path": "basic_skills_common_knowledge",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_common_knowledge:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "common_knowledge",
+     "task_name": "basic_skills_common_knowledge",
+     "version": 0,
+     "task_core": "basic_skills_common_knowledge"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/oe_eval_tasks/basic_skills_logical_reasoning/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_logical_reasoning",
+   "task_hash": "a3d406a2f4224604b7e6bbf68050691d",
+   "task_config": {
+     "dataset_path": "basic_skills_logical_reasoning",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_logical_reasoning:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "logical_reasoning",
+     "task_name": "basic_skills_logical_reasoning",
+     "version": 0,
+     "task_core": "basic_skills_logical_reasoning"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/oe_eval_tasks/basic_skills_pattern/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_pattern",
+   "task_hash": "67983750bfb70a3b5cc34dcd67ee3c6a",
+   "task_config": {
+     "dataset_path": "basic_skills_pattern",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_pattern:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "pattern",
+     "task_name": "basic_skills_pattern",
+     "version": 0,
+     "task_core": "basic_skills_pattern"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/oe_eval_tasks/basic_skills_string_operations/rc_5shot/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "task_name": "basic_skills_string_operations",
+   "task_hash": "8e5fdc7697f1bc7b0c9487a6fa682e45",
+   "task_config": {
+     "dataset_path": "basic_skills_string_operations",
+     "primary_metric": "acc_per_token",
+     "split": "validation",
+     "num_shots": 5,
+     "metadata": {
+       "regimes": [
+         "OLMES-v0.1"
+       ],
+       "alias": "basic_skills_string_operations:rc::olmes"
+     },
+     "generation_kwargs": {},
+     "context_kwargs": {},
+     "dataset_name": "string_operations",
+     "task_name": "basic_skills_string_operations",
+     "version": 0,
+     "task_core": "basic_skills_string_operations"
+   },
+   "current_date": "2025-05-12 00:06:28 UTC"
+ }
olmo_eval/tasks.py CHANGED
@@ -103,8 +103,15 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                  )

              for cont_id, continuation_str in enumerate(continuations):
-                 cont_str_len = len(continuation_str) - 1 # continuation contain leading blank
-                 cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+                 # The original implementation did not count the first character (usually the leading space) as
+                 # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+                 # do this, but we track both for backwards compatibility.
+                 cont_str_len_no_leading_space = len(continuation_str) - 1
+                 cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+                 cont_str_len = len(continuation_str)
+                 cont_byte_len = len(continuation_str.encode("utf-8"))
+
                  continuation = self.token_encode(continuation_str)

                  # query, remove last token from continuation, truncate from left is longer than model ctx length
@@ -131,6 +138,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                          ), # even if query has last token removed, LM will output same cont len
                          "cont_str_len": cont_str_len,
                          "cont_byte_len": cont_byte_len,
+                         "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                         "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                          "query": query, # remove last token from continuation
                          "dc_query": dc_query,
                          "label_id": label_id,
@@ -209,6 +218,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
          cont_lens = []
          cont_str_lens = []
          cont_byte_lens = []
+         cont_str_len_no_leading_space = []
+         cont_byte_len_no_leading_space = []
          queries = []
          dc_queries = []
          label_ids = []
@@ -232,6 +243,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
              cont_lens.append(sample["cont_len"])
              cont_str_lens.append(sample["cont_str_len"])
              cont_byte_lens.append(sample["cont_byte_len"])
+             cont_str_len_no_leading_space.append(sample["cont_str_len_no_leading_space"])
+             cont_byte_len_no_leading_space.append(sample["cont_byte_len_no_leading_space"])

              queries.append(
                  torch.LongTensor(
@@ -261,6 +274,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
              ), # since query has last token removed from continuation
              "cont_str_len": torch.LongTensor(cont_str_lens),
              "cont_byte_len": torch.LongTensor(cont_byte_lens),
+             "cont_str_len_no_leading_space": torch.LongTensor(cont_str_len_no_leading_space),
+             "cont_byte_len_no_leading_space": torch.LongTensor(cont_byte_len_no_leading_space),
              "input_ids": torch.stack(queries),
              "dc_input_ids": torch.stack(dc_queries),
              "label_id": torch.LongTensor(label_ids),
@@ -456,8 +471,15 @@ class WinoGrande(ICLMultiChoiceTaskDataset):

              continuation_str = self.doc_to_continuations(doc)
              label_id = self.doc_to_label(doc)
-             cont_str_len = len(continuation_str) - 1 # continuations contain leading blank space
-             cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+
+             # The original implementation did not count the first character (usually the leading space) as
+             # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+             # do this, but we track both for backwards compatibility.
+             cont_str_len_no_leading_space = len(continuation_str) - 1
+             cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+             cont_str_len = len(continuation_str)
+             cont_byte_len = len(continuation_str.encode("utf-8"))

              # tokenize
              continuation = self.token_encode(continuation_str)
@@ -488,6 +510,8 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
                          ), # even if query has last token removed, LM will output same cont len
                          "cont_str_len": cont_str_len,
                          "cont_byte_len": cont_byte_len,
+                         "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                         "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                          "query": query, # remove last token from continuation
                          "dc_query": dc_query,
                          "label_id": label_id,
@@ -1524,8 +1548,16 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                      f"Sample doc from ({self.dataset_path}, {ds_name}):"
                      + f"\ndoc_text: {doc_text}\ncontinuation: {continuation_str}"
                  )
-             cont_str_len = len(continuation_str) - 1 # continuation contain leading blank
-             cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+
+             # The original implementation did not count the first character (usually the leading space) as
+             # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+             # do this, but we track both for backwards compatibility.
+             cont_str_len_no_leading_space = len(continuation_str) - 1
+             cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+
+             cont_str_len = len(continuation_str)
+             cont_byte_len = len(continuation_str.encode("utf-8"))
+
              continuation = self.token_encode(continuation_str)

              # query, remove last token from continuation, truncate from left is longer than model ctx length
@@ -1552,6 +1584,8 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                      ), # even if query has last token removed, LM will output same cont len
                      "cont_str_len": cont_str_len,
                      "cont_byte_len": cont_byte_len,
+                     "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                     "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                      "query": query, # remove last token from continuation
                      "dc_query": dc_query,
                      "label_id": label_id,
@@ -1670,6 +1704,46 @@ LABEL_TO_TASK_MAP_ORIG = {
          OEEvalTask,
          {"dataset_path": "arc_easy", "dataset_name": "rc_5shot", "metric_type": "acc"},
      ),
+     "basic_skills_arithmetic_rc_5shot": (
+         OEEvalTask,
+         {
+             "dataset_path": "basic_skills_arithmetic",
+             "dataset_name": "rc_5shot",
+             "metric_type": "acc",
+         },
+     ),
+     "basic_skills_coding_rc_5shot": (
+         OEEvalTask,
+         {"dataset_path": "basic_skills_coding", "dataset_name": "rc_5shot", "metric_type": "acc"},
+     ),
+     "basic_skills_common_knowledge_rc_5shot": (
+         OEEvalTask,
+         {
+             "dataset_path": "basic_skills_common_knowledge",
+             "dataset_name": "rc_5shot",
+             "metric_type": "acc",
+         },
+     ),
+     "basic_skills_logical_reasoning_rc_5shot": (
+         OEEvalTask,
+         {
+             "dataset_path": "basic_skills_logical_reasoning",
+             "dataset_name": "rc_5shot",
+             "metric_type": "acc",
+         },
+     ),
+     "basic_skills_pattern_rc_5shot": (
+         OEEvalTask,
+         {"dataset_path": "basic_skills_pattern", "dataset_name": "rc_5shot", "metric_type": "acc"},
+     ),
+     "basic_skills_string_operations_rc_5shot": (
+         OEEvalTask,
+         {
+             "dataset_path": "basic_skills_string_operations",
+             "dataset_name": "rc_5shot",
+             "metric_type": "acc",
+         },
+     ),
      "boolq_mc_5shot": (
          OEEvalTask,
          {"dataset_path": "boolq", "dataset_name": "mc_5shot", "metric_type": "acc"},
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
  _MAJOR = "0"
  _MINOR = "7"
- _PATCH = "0"
+ _PATCH = "2"
  _SUFFIX = ""

  VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)