ai2-olmo-eval 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
ai2_olmo_eval-0.8.3.dist-info/METADATA → ai2_olmo_eval-0.8.5.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.8.3
+Version: 0.8.5
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License: Apache License
@@ -210,12 +210,10 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy<2.0
 Requires-Dist: torch
 Requires-Dist: torchmetrics
 Requires-Dist: datasets
 Requires-Dist: tokenizers
-Requires-Dist: scikit-learn
 Requires-Dist: cached-path
 Requires-Dist: requests
 Requires-Dist: packaging
ai2_olmo_eval-0.8.3.dist-info/RECORD → ai2_olmo_eval-0.8.5.dist-info/RECORD RENAMED
@@ -1,10 +1,10 @@
-ai2_olmo_eval-0.8.3.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
+ai2_olmo_eval-0.8.5.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
 olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
-olmo_eval/metrics.py,sha256=xUnFUGho1Y99595G79chqv2iFZU6LU5KVACHRYcUI1k,20046
-olmo_eval/tasks.py,sha256=eecUt07ww7lDuh9w974QXMIykV7RX6GhsI5iVoG4eQk,96636
+olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
+olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
 olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
 olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
-olmo_eval/version.py,sha256=2WwAQD_9rfYlFOdUcW7n-z_8LFN-v_CznrmwPxkrjbQ,308
+olmo_eval/version.py,sha256=iwQwdb2iosjj77YAsresOe6y-ozxOyIEo7J74bi8Z0g,308
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
 olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
 olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
 olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
-ai2_olmo_eval-0.8.3.dist-info/METADATA,sha256=yEIyjzmw8MXnBMMpXEYy2N8WDwoQajTyZpiJiBvlPzM,14398
-ai2_olmo_eval-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ai2_olmo_eval-0.8.3.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
-ai2_olmo_eval-0.8.3.dist-info/RECORD,,
+ai2_olmo_eval-0.8.5.dist-info/METADATA,sha256=ifnySTY3NJaBEflpbAlmeQcf8NwZe0n7_WdhsDRhbII,14345
+ai2_olmo_eval-0.8.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ai2_olmo_eval-0.8.5.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
+ai2_olmo_eval-0.8.5.dist-info/RECORD,,
olmo_eval/metrics.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import torch
 import torch.nn.functional as F
-from sklearn.metrics import f1_score
 from torchmetrics import Metric
 
 from .util import all_gather_object
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
             assert preds is not None
             assert labels is not None
             # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
-            score = f1_score(labels, preds, pos_label=0)
-            score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+            score = self.custom_f1_score(labels, preds, pos_label=0)
+            score_no_leading_space = self.custom_f1_score(
+                labels, preds_no_leading_space, pos_label=0
+            )
             return {
                 "f1_v1": torch.tensor(score),
                 "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
                 ),
                 "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
             }
+
+    def custom_f1_score(self, y_true, y_pred, pos_label=1):
+        y_true = list(y_true)
+        y_pred = list(y_pred)
+        tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
+
+        if tp + fp == 0 or tp + fn == 0:
+            return 0.0
+
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+
+        if precision + recall == 0:
+            return 0.0
+
+        return 2 * precision * recall / (precision + recall)
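Since `custom_f1_score` replaces `sklearn.metrics.f1_score` for binary labels, its arithmetic is easy to check by hand. A minimal standalone sanity check follows; the sample labels and predictions are made up for illustration and are not package data:

```python
# Standalone copy of the binary F1 logic added above, for verification
# (hypothetical sample data; not part of the package).
def custom_f1_score(y_true, y_pred, pos_label=1):
    pairs = list(zip(y_true, y_pred))
    tp = sum(yt == pos_label and yp == pos_label for yt, yp in pairs)  # true positives
    fp = sum(yt != pos_label and yp == pos_label for yt, yp in pairs)  # false positives
    fn = sum(yt == pos_label and yp != pos_label for yt, yp in pairs)  # false negatives
    if tp + fp == 0 or tp + fn == 0:
        return 0.0  # no predicted or no actual positives: F1 defined as 0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

labels = [0, 0, 1, 1, 0]  # pos_label=0, matching the NLI usage above
preds = [0, 1, 1, 0, 0]
print(custom_f1_score(labels, preds, pos_label=0))  # tp=2, fp=1, fn=1 -> 0.666...
```

For strictly binary label sets this should match `sklearn.metrics.f1_score(labels, preds, pos_label=0)`, which is what lets the wheel drop its scikit-learn dependency.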
olmo_eval/tasks.py CHANGED
@@ -94,6 +94,17 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             label_id = self.doc_to_label(doc)
             doc_text = self.doc_to_text(doc)
             ctx = self.token_encode(doc_text)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(self.doc_to_domain_conditional(doc))
             if self.log_instances > 0:
                 self.log_instances -= 1
@@ -552,6 +563,17 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
 
         for cont_id, (ctx, dc) in enumerate(zip(ctxs, dcs)):
             ctx = self.token_encode(ctx)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(dc)
 
             # query, remove last token from continuation, truncate from left if longer than model ctx length
@@ -1608,6 +1630,17 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
             label_id = 0
             doc_text = request_dict["context"]
             ctx = self.token_encode(doc_text)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(self.doc_to_domain_conditional(doc))
             if self.log_instances > 0:
                 self.log_instances -= 1
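All three hunks add the same guard, so one illustration covers them. Below is a minimal sketch of the BOS-prepend behavior using a stand-in tokenizer; `StubTokenizer` and `maybe_prepend_bos` are hypothetical helpers, not package code, and the package relies on the tokenizer's real `bos_token_id` attribute:

```python
# Illustrates the guarded BOS prepend added in the hunks above.
class StubTokenizer:
    # Hypothetical stand-in; e.g. Llama-style tokenizers define a BOS id,
    # while some GPT-NeoX-style tokenizers set bos_token_id to None.
    bos_token_id = 1

def maybe_prepend_bos(tokenizer, ctx):
    # Prepend BOS only when the tokenizer defines one and the encoded
    # context does not already start with it.
    if tokenizer.bos_token_id is not None and ctx[0] != tokenizer.bos_token_id:
        ctx = [tokenizer.bos_token_id] + ctx
    return ctx

print(maybe_prepend_bos(StubTokenizer(), [17, 42, 99]))  # [1, 17, 42, 99]
print(maybe_prepend_bos(StubTokenizer(), [1, 17, 42]))   # unchanged, BOS already present
```

The `doc_id == 0` log line then surfaces the first few context tokens once per task, making it easy to confirm in training logs whether BOS was actually prepended.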
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "8"
-_PATCH = "3"
+_PATCH = "5"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
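Only the first six lines of `version.py` appear in the diff. Assuming the conventional pattern that `VERSION_SHORT` suggests, the full version string is presumably assembled along these lines (a sketch, not the file's verbatim tail):

```python
_MAJOR = "0"
_MINOR = "8"
_PATCH = "5"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)  # "0.8"
# Presumed continuation (not shown in the diff above):
VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)  # "0.8.5"
```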