ai2-olmo-eval 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
ai2_olmo_eval-0.8.3.dist-info/METADATA → ai2_olmo_eval-0.8.5.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.8.3
+Version: 0.8.5
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License: Apache License
@@ -210,12 +210,10 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy<2.0
 Requires-Dist: torch
 Requires-Dist: torchmetrics
 Requires-Dist: datasets
 Requires-Dist: tokenizers
-Requires-Dist: scikit-learn
 Requires-Dist: cached-path
 Requires-Dist: requests
 Requires-Dist: packaging
ai2_olmo_eval-0.8.3.dist-info/RECORD → ai2_olmo_eval-0.8.5.dist-info/RECORD RENAMED
@@ -1,10 +1,10 @@
-ai2_olmo_eval-0.8.3.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
+ai2_olmo_eval-0.8.5.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
 olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
-olmo_eval/metrics.py,sha256=xUnFUGho1Y99595G79chqv2iFZU6LU5KVACHRYcUI1k,20046
-olmo_eval/tasks.py,sha256=eecUt07ww7lDuh9w974QXMIykV7RX6GhsI5iVoG4eQk,96636
+olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
+olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
 olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
 olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
-olmo_eval/version.py,sha256=2WwAQD_9rfYlFOdUcW7n-z_8LFN-v_CznrmwPxkrjbQ,308
+olmo_eval/version.py,sha256=iwQwdb2iosjj77YAsresOe6y-ozxOyIEo7J74bi8Z0g,308
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
 olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
 olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
 olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
-ai2_olmo_eval-0.8.3.dist-info/METADATA,sha256=yEIyjzmw8MXnBMMpXEYy2N8WDwoQajTyZpiJiBvlPzM,14398
-ai2_olmo_eval-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ai2_olmo_eval-0.8.3.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
-ai2_olmo_eval-0.8.3.dist-info/RECORD,,
+ai2_olmo_eval-0.8.5.dist-info/METADATA,sha256=ifnySTY3NJaBEflpbAlmeQcf8NwZe0n7_WdhsDRhbII,14345
+ai2_olmo_eval-0.8.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ai2_olmo_eval-0.8.5.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
+ai2_olmo_eval-0.8.5.dist-info/RECORD,,
olmo_eval/metrics.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import torch
 import torch.nn.functional as F
-from sklearn.metrics import f1_score
 from torchmetrics import Metric
 
 from .util import all_gather_object
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
             assert preds is not None
             assert labels is not None
             # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
-            score = f1_score(labels, preds, pos_label=0)
-            score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+            score = self.custom_f1_score(labels, preds, pos_label=0)
+            score_no_leading_space = self.custom_f1_score(
+                labels, preds_no_leading_space, pos_label=0
+            )
             return {
                 "f1_v1": torch.tensor(score),
                 "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
                 ),
                 "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
             }
+
+    def custom_f1_score(self, y_true, y_pred, pos_label=1):
+        y_true = list(y_true)
+        y_pred = list(y_pred)
+        tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
+
+        if tp + fp == 0 or tp + fn == 0:
+            return 0.0
+
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+
+        if precision + recall == 0:
+            return 0.0
+
+        return 2 * precision * recall / (precision + recall)
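Since `custom_f1_score` replaces `sklearn.metrics.f1_score` for binary labels, its arithmetic is easy to check by hand. A minimal standalone sanity check follows; the sample labels and predictions are made up for illustration and are not package data:

```python
# Standalone copy of the binary F1 logic added above, for verification
# (hypothetical sample data; not part of the package).
def custom_f1_score(y_true, y_pred, pos_label=1):
    pairs = list(zip(y_true, y_pred))
    tp = sum(yt == pos_label and yp == pos_label for yt, yp in pairs)  # true positives
    fp = sum(yt != pos_label and yp == pos_label for yt, yp in pairs)  # false positives
    fn = sum(yt == pos_label and yp != pos_label for yt, yp in pairs)  # false negatives
    if tp + fp == 0 or tp + fn == 0:
        return 0.0  # no predicted or no actual positives: F1 defined as 0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

labels = [0, 0, 1, 1, 0]  # pos_label=0, matching the NLI usage above
preds = [0, 1, 1, 0, 0]
print(custom_f1_score(labels, preds, pos_label=0))  # tp=2, fp=1, fn=1 -> 0.666...
```

For strictly binary label sets this should match `sklearn.metrics.f1_score(labels, preds, pos_label=0)`, which is what lets the wheel drop its scikit-learn dependency.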
olmo_eval/tasks.py CHANGED
@@ -94,6 +94,17 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             label_id = self.doc_to_label(doc)
             doc_text = self.doc_to_text(doc)
             ctx = self.token_encode(doc_text)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(self.doc_to_domain_conditional(doc))
             if self.log_instances > 0:
                 self.log_instances -= 1
@@ -552,6 +563,17 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
 
         for cont_id, (ctx, dc) in enumerate(zip(ctxs, dcs)):
             ctx = self.token_encode(ctx)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(dc)
 
             # query, remove last token from continuation, truncate from left if longer than model ctx length
@@ -1608,6 +1630,17 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
             label_id = 0
             doc_text = request_dict["context"]
             ctx = self.token_encode(doc_text)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(self.doc_to_domain_conditional(doc))
             if self.log_instances > 0:
                 self.log_instances -= 1
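All three hunks add the same guard, so one illustration covers them. Below is a minimal sketch of the BOS-prepend behavior using a stand-in tokenizer; `StubTokenizer` and `maybe_prepend_bos` are hypothetical helpers, not package code, and the package relies on the tokenizer's real `bos_token_id` attribute:

```python
# Illustrates the guarded BOS prepend added in the hunks above.
class StubTokenizer:
    # Hypothetical stand-in; e.g. Llama-style tokenizers define a BOS id,
    # while some GPT-NeoX-style tokenizers set bos_token_id to None.
    bos_token_id = 1

def maybe_prepend_bos(tokenizer, ctx):
    # Prepend BOS only when the tokenizer defines one and the encoded
    # context does not already start with it.
    if tokenizer.bos_token_id is not None and ctx[0] != tokenizer.bos_token_id:
        ctx = [tokenizer.bos_token_id] + ctx
    return ctx

print(maybe_prepend_bos(StubTokenizer(), [17, 42, 99]))  # [1, 17, 42, 99]
print(maybe_prepend_bos(StubTokenizer(), [1, 17, 42]))   # unchanged, BOS already present
```

The `doc_id == 0` log line then surfaces the first few context tokens once per task, making it easy to confirm in training logs whether BOS was actually prepended.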
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "8"
-_PATCH = "3"
+_PATCH = "5"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
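Only the first six lines of `version.py` appear in the diff. Assuming the conventional pattern that `VERSION_SHORT` suggests, the full version string is presumably assembled along these lines (a sketch, not the file's verbatim tail):

```python
_MAJOR = "0"
_MINOR = "8"
_PATCH = "5"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)  # "0.8"
# Presumed continuation (not shown in the diff above):
VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)  # "0.8.5"
```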