ai2-olmo-eval 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/METADATA +22 -5
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/RECORD +7 -7
- olmo_eval/metrics.py +22 -3
- olmo_eval/version.py +1 -1
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/WHEEL +0 -0
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/licenses/LICENSE +0 -0
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai2-olmo-eval
|
|
3
|
-
Version: 0.8.4
|
|
3
|
+
Version: 0.8.6
|
|
4
4
|
Summary: In-loop evaluation tasks for language modeling
|
|
5
5
|
Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
|
|
6
6
|
License: Apache License
|
|
@@ -210,16 +210,15 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
|
|
|
210
210
|
Requires-Python: >=3.9
|
|
211
211
|
Description-Content-Type: text/markdown
|
|
212
212
|
License-File: LICENSE
|
|
213
|
-
Requires-Dist: numpy<2.0
|
|
214
213
|
Requires-Dist: torch
|
|
215
214
|
Requires-Dist: torchmetrics
|
|
216
|
-
Requires-Dist: datasets
|
|
217
|
-
Requires-Dist: tokenizers
|
|
218
|
-
Requires-Dist: scikit-learn
|
|
215
|
+
Requires-Dist: datasets<4,>=3.6.0
|
|
219
216
|
Requires-Dist: cached-path
|
|
220
217
|
Requires-Dist: requests
|
|
221
218
|
Requires-Dist: packaging
|
|
222
219
|
Requires-Dist: importlib_resources
|
|
220
|
+
Requires-Dist: tokenizers<0.20,>=0.19.1
|
|
221
|
+
Requires-Dist: pyarrow<20,>=19.0
|
|
223
222
|
Provides-Extra: dev
|
|
224
223
|
Requires-Dist: ruff; extra == "dev"
|
|
225
224
|
Requires-Dist: mypy<1.4,>=1.0; extra == "dev"
|
|
@@ -245,3 +244,21 @@ Code for in-loop evaluation tasks used by the OLMo training team.
|
|
|
245
244
|
```
|
|
246
245
|
pip install ai2-olmo-eval
|
|
247
246
|
```
|
|
247
|
+
|
|
248
|
+
## Release process
|
|
249
|
+
|
|
250
|
+
### Steps
|
|
251
|
+
|
|
252
|
+
1. Update the version in `src/olmo_eval/version.py`.
|
|
253
|
+
2. Run the release script:
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
./src/scripts/release.sh
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
|
|
260
|
+
which will trigger a workflow on GitHub Actions that handles the rest.
|
|
261
|
+
|
|
262
|
+
### Fixing a failed release
|
|
263
|
+
|
|
264
|
+
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete the tag on GitHub. Once you've pushed a fix you can simply repeat the steps above.
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
ai2_olmo_eval-0.8.4.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
1
|
+
ai2_olmo_eval-0.8.6.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
2
2
|
olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
|
|
3
|
-
olmo_eval/metrics.py,sha256=
|
|
3
|
+
olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
|
|
4
4
|
olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
|
|
5
5
|
olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
|
|
6
6
|
olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
|
|
7
|
-
olmo_eval/version.py,sha256=
|
|
7
|
+
olmo_eval/version.py,sha256=5NSu_mJwJQNUZFR-Lsj3oN27hRtPhou4KCg_AT1RRHs,308
|
|
8
8
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
|
|
9
9
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
|
|
10
10
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
|
|
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
|
|
|
756
756
|
olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
|
|
757
757
|
olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
|
|
758
758
|
olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
|
|
759
|
-
ai2_olmo_eval-0.8.
|
|
760
|
-
ai2_olmo_eval-0.8.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
761
|
-
ai2_olmo_eval-0.8.4.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
762
|
-
ai2_olmo_eval-0.8.4.dist-info/RECORD,,
|
|
759
|
+
ai2_olmo_eval-0.8.6.dist-info/METADATA,sha256=Fhcb0yWL95vEhgAYnki2yOMveKSeGZOLVM-8Z_GmUKY,14981
|
|
760
|
+
ai2_olmo_eval-0.8.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
761
|
+
ai2_olmo_eval-0.8.6.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
762
|
+
ai2_olmo_eval-0.8.6.dist-info/RECORD,,
|
olmo_eval/metrics.py
CHANGED
|
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
|
|
|
3
3
|
|
|
4
4
|
import torch
|
|
5
5
|
import torch.nn.functional as F
|
|
6
|
-
from sklearn.metrics import f1_score
|
|
7
6
|
from torchmetrics import Metric
|
|
8
7
|
|
|
9
8
|
from .util import all_gather_object
|
|
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
|
|
|
395
394
|
assert preds is not None
|
|
396
395
|
assert labels is not None
|
|
397
396
|
# for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
|
|
398
|
-
score = f1_score(labels, preds, pos_label=0)
|
|
399
|
-
score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
|
|
397
|
+
score = self.custom_f1_score(labels, preds, pos_label=0)
|
|
398
|
+
score_no_leading_space = self.custom_f1_score(
|
|
399
|
+
labels, preds_no_leading_space, pos_label=0
|
|
400
|
+
)
|
|
400
401
|
return {
|
|
401
402
|
"f1_v1": torch.tensor(score),
|
|
402
403
|
"f1_v2": torch.tensor(score_no_leading_space),
|
|
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
|
|
|
432
433
|
),
|
|
433
434
|
"soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
|
|
434
435
|
}
|
|
436
|
+
|
|
437
|
+
def custom_f1_score(self, y_true, y_pred, pos_label=1):
|
|
438
|
+
y_true = list(y_true)
|
|
439
|
+
y_pred = list(y_pred)
|
|
440
|
+
tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
|
|
441
|
+
fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
|
|
442
|
+
fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
|
|
443
|
+
|
|
444
|
+
if tp + fp == 0 or tp + fn == 0:
|
|
445
|
+
return 0.0
|
|
446
|
+
|
|
447
|
+
precision = tp / (tp + fp)
|
|
448
|
+
recall = tp / (tp + fn)
|
|
449
|
+
|
|
450
|
+
if precision + recall == 0:
|
|
451
|
+
return 0.0
|
|
452
|
+
|
|
453
|
+
return 2 * precision * recall / (precision + recall)
|
olmo_eval/version.py
CHANGED
|
File without changes
|
|
File without changes
|
|
File without changes
|