ai2-olmo-eval 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai2-olmo-eval
3
- Version: 0.8.4
3
+ Version: 0.8.6
4
4
  Summary: In-loop evaluation tasks for language modeling
5
5
  Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
6
6
  License: Apache License
@@ -210,16 +210,15 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
210
210
  Requires-Python: >=3.9
211
211
  Description-Content-Type: text/markdown
212
212
  License-File: LICENSE
213
- Requires-Dist: numpy<2.0
214
213
  Requires-Dist: torch
215
214
  Requires-Dist: torchmetrics
216
- Requires-Dist: datasets
217
- Requires-Dist: tokenizers
218
- Requires-Dist: scikit-learn
215
+ Requires-Dist: datasets<4,>=3.6.0
219
216
  Requires-Dist: cached-path
220
217
  Requires-Dist: requests
221
218
  Requires-Dist: packaging
222
219
  Requires-Dist: importlib_resources
220
+ Requires-Dist: tokenizers<0.20,>=0.19.1
221
+ Requires-Dist: pyarrow<20,>=19.0
223
222
  Provides-Extra: dev
224
223
  Requires-Dist: ruff; extra == "dev"
225
224
  Requires-Dist: mypy<1.4,>=1.0; extra == "dev"
@@ -245,3 +244,21 @@ Code for in-loop evaluation tasks used by the OLMo training team.
245
244
  ```
246
245
  pip install ai2-olmo-eval
247
246
  ```
247
+
248
+ ## Release process
249
+
250
+ ### Steps
251
+
252
+ 1. Update the version in `src/olmo_eval/version.py`.
253
+ 2. Run the release script:
254
+
255
+ ```bash
256
+ ./src/scripts/release.sh
257
+ ```
258
+
259
+ This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
260
+ which will trigger a workflow on GitHub Actions that handles the rest.
261
+
262
+ ### Fixing a failed release
263
+
264
+ If for some reason the GitHub Actions release workflow fails with an error that needs to be fixed, you'll have to delete the tag on GitHub. Once you've pushed a fix, you can simply repeat the steps above.
@@ -1,10 +1,10 @@
1
- ai2_olmo_eval-0.8.4.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
1
+ ai2_olmo_eval-0.8.6.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
2
2
  olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
3
- olmo_eval/metrics.py,sha256=xUnFUGho1Y99595G79chqv2iFZU6LU5KVACHRYcUI1k,20046
3
+ olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
4
4
  olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
5
5
  olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
6
6
  olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
7
- olmo_eval/version.py,sha256=wUkgiv1wrgtooOky_Dd4BYqHwcJ850V_jdiQ649cm9s,308
7
+ olmo_eval/version.py,sha256=5NSu_mJwJQNUZFR-Lsj3oN27hRtPhou4KCg_AT1RRHs,308
8
8
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
9
9
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
10
10
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
756
756
  olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
757
757
  olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
758
758
  olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
759
- ai2_olmo_eval-0.8.4.dist-info/METADATA,sha256=vbYd0LaHbtdOtA9NAxjJVlV2EmZt4ch2QhP5D1OYk3k,14398
760
- ai2_olmo_eval-0.8.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
761
- ai2_olmo_eval-0.8.4.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
762
- ai2_olmo_eval-0.8.4.dist-info/RECORD,,
759
+ ai2_olmo_eval-0.8.6.dist-info/METADATA,sha256=Fhcb0yWL95vEhgAYnki2yOMveKSeGZOLVM-8Z_GmUKY,14981
760
+ ai2_olmo_eval-0.8.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
761
+ ai2_olmo_eval-0.8.6.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
762
+ ai2_olmo_eval-0.8.6.dist-info/RECORD,,
olmo_eval/metrics.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
3
3
 
4
4
  import torch
5
5
  import torch.nn.functional as F
6
- from sklearn.metrics import f1_score
7
6
  from torchmetrics import Metric
8
7
 
9
8
  from .util import all_gather_object
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
395
394
  assert preds is not None
396
395
  assert labels is not None
397
396
  # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
398
- score = f1_score(labels, preds, pos_label=0)
399
- score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
397
+ score = self.custom_f1_score(labels, preds, pos_label=0)
398
+ score_no_leading_space = self.custom_f1_score(
399
+ labels, preds_no_leading_space, pos_label=0
400
+ )
400
401
  return {
401
402
  "f1_v1": torch.tensor(score),
402
403
  "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
432
433
  ),
433
434
  "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
434
435
  }
436
+
437
def custom_f1_score(self, y_true, y_pred, pos_label=1):
    """Compute the binary F1 score of ``y_pred`` against ``y_true``.

    Only ``pos_label`` is treated as the positive class; every other label
    value counts as negative. This is a dependency-free helper used where
    ``sklearn.metrics.f1_score(..., pos_label=...)`` was called before.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, aligned element-wise with
            ``y_true``.
        pos_label: Label value regarded as the positive class.

    Returns:
        The F1 score as a Python ``float``. ``0.0`` is returned whenever
        there are no true positives, which covers empty input, no positive
        predictions, and no positive ground-truth labels.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Materialize so that one-shot iterables can be measured and then walked.
    y_true = list(y_true)
    y_pred = list(y_pred)

    # ``zip`` would silently drop the unmatched tail and yield a plausible but
    # wrong score, so reject misaligned inputs explicitly.
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    tp = fp = fn = 0
    for yt, yp in zip(y_true, y_pred):
        # ``bool(...)`` keeps the counters plain ints even if the elements'
        # ``==`` returns a non-bool truthy object.
        actual_pos = bool(yt == pos_label)
        predicted_pos = bool(yp == pos_label)
        if actual_pos and predicted_pos:
            tp += 1
        elif predicted_pos:
            fp += 1
        elif actual_pos:
            fn += 1

    # Without true positives precision and/or recall is zero or undefined;
    # the score is defined as 0.0 in all of those cases.
    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  _MAJOR = "0"
2
2
  _MINOR = "8"
3
- _PATCH = "4"
3
+ _PATCH = "6"
4
4
  _SUFFIX = ""
5
5
 
6
6
  VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)