ai2-olmo-eval 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/METADATA +22 -5
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/RECORD +7 -7
- olmo_eval/metrics.py +22 -3
- olmo_eval/version.py +1 -1
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/WHEEL +0 -0
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/licenses/LICENSE +0 -0
- {ai2_olmo_eval-0.8.4.dist-info → ai2_olmo_eval-0.8.6.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai2-olmo-eval
|
|
3
|
-
Version: 0.8.4
|
|
3
|
+
Version: 0.8.6
|
|
4
4
|
Summary: In-loop evaluation tasks for language modeling
|
|
5
5
|
Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
|
|
6
6
|
License: Apache License
|
|
@@ -210,16 +210,15 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
|
|
|
210
210
|
Requires-Python: >=3.9
|
|
211
211
|
Description-Content-Type: text/markdown
|
|
212
212
|
License-File: LICENSE
|
|
213
|
-
Requires-Dist: numpy<2.0
|
|
214
213
|
Requires-Dist: torch
|
|
215
214
|
Requires-Dist: torchmetrics
|
|
216
|
-
Requires-Dist: datasets
|
|
217
|
-
Requires-Dist: tokenizers
|
|
218
|
-
Requires-Dist: scikit-learn
|
|
215
|
+
Requires-Dist: datasets<4,>=3.6.0
|
|
219
216
|
Requires-Dist: cached-path
|
|
220
217
|
Requires-Dist: requests
|
|
221
218
|
Requires-Dist: packaging
|
|
222
219
|
Requires-Dist: importlib_resources
|
|
220
|
+
Requires-Dist: tokenizers<0.20,>=0.19.1
|
|
221
|
+
Requires-Dist: pyarrow<20,>=19.0
|
|
223
222
|
Provides-Extra: dev
|
|
224
223
|
Requires-Dist: ruff; extra == "dev"
|
|
225
224
|
Requires-Dist: mypy<1.4,>=1.0; extra == "dev"
|
|
@@ -245,3 +244,21 @@ Code for in-loop evaluation tasks used by the OLMo training team.
|
|
|
245
244
|
```
|
|
246
245
|
pip install ai2-olmo-eval
|
|
247
246
|
```
|
|
247
|
+
|
|
248
|
+
## Release process
|
|
249
|
+
|
|
250
|
+
### Steps
|
|
251
|
+
|
|
252
|
+
1. Update the version in `src/olmo_eval/version.py`.
|
|
253
|
+
2. Run the release script:
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
./src/scripts/release.sh
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
|
|
260
|
+
which will trigger a workflow on GitHub Actions that handles the rest.
|
|
261
|
+
|
|
262
|
+
### Fixing a failed release
|
|
263
|
+
|
|
264
|
+
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete the tag on GitHub. Once you've pushed a fix you can simply repeat the steps above.
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
ai2_olmo_eval-0.8.4.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
1
|
+
ai2_olmo_eval-0.8.6.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
2
2
|
olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
|
|
3
|
-
olmo_eval/metrics.py,sha256=
|
|
3
|
+
olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
|
|
4
4
|
olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
|
|
5
5
|
olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
|
|
6
6
|
olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
|
|
7
|
-
olmo_eval/version.py,sha256=
|
|
7
|
+
olmo_eval/version.py,sha256=5NSu_mJwJQNUZFR-Lsj3oN27hRtPhou4KCg_AT1RRHs,308
|
|
8
8
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
|
|
9
9
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
|
|
10
10
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
|
|
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
|
|
|
756
756
|
olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
|
|
757
757
|
olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
|
|
758
758
|
olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
|
|
759
|
-
ai2_olmo_eval-0.8.
|
|
760
|
-
ai2_olmo_eval-0.8.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
761
|
-
ai2_olmo_eval-0.8.4.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
762
|
-
ai2_olmo_eval-0.8.4.dist-info/RECORD,,
|
|
759
|
+
ai2_olmo_eval-0.8.6.dist-info/METADATA,sha256=Fhcb0yWL95vEhgAYnki2yOMveKSeGZOLVM-8Z_GmUKY,14981
|
|
760
|
+
ai2_olmo_eval-0.8.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
761
|
+
ai2_olmo_eval-0.8.6.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
762
|
+
ai2_olmo_eval-0.8.6.dist-info/RECORD,,
|
olmo_eval/metrics.py
CHANGED
|
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
|
|
|
3
3
|
|
|
4
4
|
import torch
|
|
5
5
|
import torch.nn.functional as F
|
|
6
|
-
from sklearn.metrics import f1_score
|
|
7
6
|
from torchmetrics import Metric
|
|
8
7
|
|
|
9
8
|
from .util import all_gather_object
|
|
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
|
|
|
395
394
|
assert preds is not None
|
|
396
395
|
assert labels is not None
|
|
397
396
|
# for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
|
|
398
|
-
score = f1_score(labels, preds, pos_label=0)
|
|
399
|
-
score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
|
|
397
|
+
score = self.custom_f1_score(labels, preds, pos_label=0)
|
|
398
|
+
score_no_leading_space = self.custom_f1_score(
|
|
399
|
+
labels, preds_no_leading_space, pos_label=0
|
|
400
|
+
)
|
|
400
401
|
return {
|
|
401
402
|
"f1_v1": torch.tensor(score),
|
|
402
403
|
"f1_v2": torch.tensor(score_no_leading_space),
|
|
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
|
|
|
432
433
|
),
|
|
433
434
|
"soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
|
|
434
435
|
}
|
|
436
|
+
|
|
437
|
+
def custom_f1_score(self, y_true, y_pred, pos_label=1):
|
|
438
|
+
y_true = list(y_true)
|
|
439
|
+
y_pred = list(y_pred)
|
|
440
|
+
tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
|
|
441
|
+
fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
|
|
442
|
+
fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
|
|
443
|
+
|
|
444
|
+
if tp + fp == 0 or tp + fn == 0:
|
|
445
|
+
return 0.0
|
|
446
|
+
|
|
447
|
+
precision = tp / (tp + fp)
|
|
448
|
+
recall = tp / (tp + fn)
|
|
449
|
+
|
|
450
|
+
if precision + recall == 0:
|
|
451
|
+
return 0.0
|
|
452
|
+
|
|
453
|
+
return 2 * precision * recall / (precision + recall)
|
olmo_eval/version.py
CHANGED
|
File without changes
|
|
File without changes
|
|
File without changes
|