ai2-olmo-eval 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai2-olmo-eval
3
- Version: 0.8.4
3
+ Version: 0.8.5
4
4
  Summary: In-loop evaluation tasks for language modeling
5
5
  Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
6
6
  License: Apache License
@@ -210,12 +210,10 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
210
210
  Requires-Python: >=3.9
211
211
  Description-Content-Type: text/markdown
212
212
  License-File: LICENSE
213
- Requires-Dist: numpy<2.0
214
213
  Requires-Dist: torch
215
214
  Requires-Dist: torchmetrics
216
215
  Requires-Dist: datasets
217
216
  Requires-Dist: tokenizers
218
- Requires-Dist: scikit-learn
219
217
  Requires-Dist: cached-path
220
218
  Requires-Dist: requests
221
219
  Requires-Dist: packaging
@@ -1,10 +1,10 @@
1
- ai2_olmo_eval-0.8.4.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
1
+ ai2_olmo_eval-0.8.5.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
2
2
  olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
3
- olmo_eval/metrics.py,sha256=xUnFUGho1Y99595G79chqv2iFZU6LU5KVACHRYcUI1k,20046
3
+ olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
4
4
  olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
5
5
  olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
6
6
  olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
7
- olmo_eval/version.py,sha256=wUkgiv1wrgtooOky_Dd4BYqHwcJ850V_jdiQ649cm9s,308
7
+ olmo_eval/version.py,sha256=iwQwdb2iosjj77YAsresOe6y-ozxOyIEo7J74bi8Z0g,308
8
8
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
9
9
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
10
10
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
756
756
  olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
757
757
  olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
758
758
  olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
759
- ai2_olmo_eval-0.8.4.dist-info/METADATA,sha256=vbYd0LaHbtdOtA9NAxjJVlV2EmZt4ch2QhP5D1OYk3k,14398
760
- ai2_olmo_eval-0.8.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
761
- ai2_olmo_eval-0.8.4.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
762
- ai2_olmo_eval-0.8.4.dist-info/RECORD,,
759
+ ai2_olmo_eval-0.8.5.dist-info/METADATA,sha256=ifnySTY3NJaBEflpbAlmeQcf8NwZe0n7_WdhsDRhbII,14345
760
+ ai2_olmo_eval-0.8.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
761
+ ai2_olmo_eval-0.8.5.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
762
+ ai2_olmo_eval-0.8.5.dist-info/RECORD,,
olmo_eval/metrics.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
3
3
 
4
4
  import torch
5
5
  import torch.nn.functional as F
6
- from sklearn.metrics import f1_score
7
6
  from torchmetrics import Metric
8
7
 
9
8
  from .util import all_gather_object
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
395
394
  assert preds is not None
396
395
  assert labels is not None
397
396
  # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
398
- score = f1_score(labels, preds, pos_label=0)
399
- score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
397
+ score = self.custom_f1_score(labels, preds, pos_label=0)
398
+ score_no_leading_space = self.custom_f1_score(
399
+ labels, preds_no_leading_space, pos_label=0
400
+ )
400
401
  return {
401
402
  "f1_v1": torch.tensor(score),
402
403
  "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
432
433
  ),
433
434
  "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
434
435
  }
436
+
437
def custom_f1_score(self, y_true, y_pred, pos_label=1):
    """Binary F1 score for a single positive class.

    Dependency-free replacement for ``sklearn.metrics.f1_score`` (removed along
    with the scikit-learn requirement in 0.8.5). Matches sklearn's default
    ``zero_division=0`` behaviour: when there are no predicted positives and no
    actual positives, the score is defined as 0.0 instead of raising.

    :param y_true: iterable of gold labels.
    :param y_pred: iterable of predicted labels, aligned with ``y_true``.
    :param pos_label: the label counted as the positive class (default ``1``;
        NLI tasks in this repo pass ``pos_label=0``).
    :return: F1 as a Python float in ``[0.0, 1.0]``.
    """
    pairs = list(zip(y_true, y_pred))
    tp = sum(yt == pos_label and yp == pos_label for yt, yp in pairs)
    fp = sum(yt != pos_label and yp == pos_label for yt, yp in pairs)
    fn = sum(yt == pos_label and yp != pos_label for yt, yp in pairs)

    # F1 = 2*precision*recall / (precision + recall) simplifies algebraically to
    # 2*tp / (2*tp + fp + fn), which also subsumes the separate zero-division
    # guards: the denominator is 0 only when tp, fp and fn are all 0.
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  _MAJOR = "0"
2
2
  _MINOR = "8"
3
- _PATCH = "4"
3
+ _PATCH = "5"
4
4
  _SUFFIX = ""
5
5
 
6
6
  VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)