ai2-olmo-eval 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/METADATA +1 -3
- {ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/RECORD +8 -8
- olmo_eval/metrics.py +22 -3
- olmo_eval/tasks.py +33 -0
- olmo_eval/version.py +1 -1
- {ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/WHEEL +0 -0
- {ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/top_level.txt +0 -0
{ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.8.3
+Version: 0.8.5
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License: Apache License
@@ -210,12 +210,10 @@ Project-URL: Changelog, https://github.com/allenai/OLMo-in-loop-evals/blob/main/
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy<2.0
 Requires-Dist: torch
 Requires-Dist: torchmetrics
 Requires-Dist: datasets
 Requires-Dist: tokenizers
-Requires-Dist: scikit-learn
 Requires-Dist: cached-path
 Requires-Dist: requests
 Requires-Dist: packaging
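
The net change here: 0.8.5 drops the numpy<2.0 pin and removes the scikit-learn requirement entirely. A minimal sketch for confirming the declared dependencies of an installed copy, using only the standard library (assumes ai2-olmo-eval is installed in the active environment):

    # Sketch: list the declared dependencies of the installed distribution.
    from importlib.metadata import requires

    deps = requires("ai2-olmo-eval") or []
    print("\n".join(deps))

    # Neither dropped requirement should appear in 0.8.5
    assert not any(d.startswith("numpy") for d in deps)
    assert not any(d.startswith("scikit-learn") for d in deps)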
{ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/RECORD CHANGED

@@ -1,10 +1,10 @@
-ai2_olmo_eval-0.8.3.dist-info/licenses/LICENSE,sha256=…
+ai2_olmo_eval-0.8.5.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
 olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
-olmo_eval/metrics.py,sha256=…
-olmo_eval/tasks.py,sha256=…
+olmo_eval/metrics.py,sha256=8oHD5RXmIOkMDIxFIAmBghbRl-Rg42dyhJd6hn4sH-o,20715
+olmo_eval/tasks.py,sha256=yONOV2rI8rDkmaUetdXgeub-shZaNBo9j6Pslu1fKXA,97851
 olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
 olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
-olmo_eval/version.py,sha256=…
+olmo_eval/version.py,sha256=iwQwdb2iosjj77YAsresOe6y-ozxOyIEo7J74bi8Z0g,308
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
 olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -756,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC…
 olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
 olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
 olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
-ai2_olmo_eval-0.8.3.dist-info/METADATA,sha256=…
-ai2_olmo_eval-0.8.3.dist-info/WHEEL,sha256=…
-ai2_olmo_eval-0.8.3.dist-info/top_level.txt,sha256=…
-ai2_olmo_eval-0.8.3.dist-info/RECORD,,
+ai2_olmo_eval-0.8.5.dist-info/METADATA,sha256=ifnySTY3NJaBEflpbAlmeQcf8NwZe0n7_WdhsDRhbII,14345
+ai2_olmo_eval-0.8.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ai2_olmo_eval-0.8.5.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
+ai2_olmo_eval-0.8.5.dist-info/RECORD,,
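
Each RECORD entry has the form path,sha256=<digest>,<size>, where the digest is urlsafe base64 with the trailing '=' padding stripped, per the wheel spec (PEP 427). A minimal sketch for re-checking one entry against a file on disk; the path in the comment is illustrative:

    # Sketch: recompute a RECORD-style sha256 for a file on disk.
    import base64
    import hashlib
    from pathlib import Path

    def record_hash(path: str) -> str:
        digest = hashlib.sha256(Path(path).read_bytes()).digest()
        # RECORD digests are urlsafe base64 with '=' padding removed (PEP 427)
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # For an unmodified 0.8.5 install this should match the entry above:
    #   record_hash(".../site-packages/olmo_eval/util.py")
    #   == "sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo"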
olmo_eval/metrics.py CHANGED

@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import torch
 import torch.nn.functional as F
-from sklearn.metrics import f1_score
 from torchmetrics import Metric
 
 from .util import all_gather_object
@@ -395,8 +394,10 @@ class ICLMetric(Metric):
         assert preds is not None
         assert labels is not None
         # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
-        score = …
-        score_no_leading_space = …
+        score = self.custom_f1_score(labels, preds, pos_label=0)
+        score_no_leading_space = self.custom_f1_score(
+            labels, preds_no_leading_space, pos_label=0
+        )
         return {
             "f1_v1": torch.tensor(score),
             "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ class ICLMetric(Metric):
             ),
             "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
         }
+
+    def custom_f1_score(self, y_true, y_pred, pos_label=1):
+        y_true = list(y_true)
+        y_pred = list(y_pred)
+        tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
+
+        if tp + fp == 0 or tp + fn == 0:
+            return 0.0
+
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+
+        if precision + recall == 0:
+            return 0.0
+
+        return 2 * precision * recall / (precision + recall)
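
The added custom_f1_score is what replaces the removed sklearn.metrics.f1_score call, which in turn is what lets the scikit-learn requirement disappear from METADATA above. A minimal parity check on binary labels, with scikit-learn used only as the comparison baseline:

    # Sketch: verify the hand-rolled binary F1 agrees with sklearn.
    import random

    from sklearn.metrics import f1_score  # baseline for comparison only

    def custom_f1_score(y_true, y_pred, pos_label=1):
        # Same arithmetic as the ICLMetric method above, without `self`.
        tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
        fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
        fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
        if tp + fp == 0 or tp + fn == 0:
            return 0.0
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    random.seed(0)
    for _ in range(100):
        y_true = [random.randint(0, 1) for _ in range(50)]
        y_pred = [random.randint(0, 1) for _ in range(50)]
        expected = f1_score(y_true, y_pred, pos_label=0)
        assert abs(custom_f1_score(y_true, y_pred, pos_label=0) - expected) < 1e-12

The degenerate cases also line up: where sklearn's zero_division default returns 0.0 (no positive predictions or no positive labels), the guards in custom_f1_score return 0.0 as well.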
olmo_eval/tasks.py CHANGED

@@ -94,6 +94,17 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         label_id = self.doc_to_label(doc)
         doc_text = self.doc_to_text(doc)
         ctx = self.token_encode(doc_text)
+
+        # Add BOS token if it exists in the tokenizer
+        if (
+            self.tokenizer.bos_token_id is not None
+            and ctx[0] != self.tokenizer.bos_token_id
+        ):
+            ctx = [self.tokenizer.bos_token_id] + ctx
+
+        if doc_id == 0:
+            log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
         dc = self.token_encode(self.doc_to_domain_conditional(doc))
         if self.log_instances > 0:
             self.log_instances -= 1
@@ -552,6 +563,17 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
 
         for cont_id, (ctx, dc) in enumerate(zip(ctxs, dcs)):
             ctx = self.token_encode(ctx)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(dc)
 
             # query, remove last token from continuation, truncate from left if longer than model ctx length
@@ -1608,6 +1630,17 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
             label_id = 0
             doc_text = request_dict["context"]
             ctx = self.token_encode(doc_text)
+
+            # Add BOS token if it exists in the tokenizer
+            if (
+                self.tokenizer.bos_token_id is not None
+                and ctx[0] != self.tokenizer.bos_token_id
+            ):
+                ctx = [self.tokenizer.bos_token_id] + ctx
+
+            if doc_id == 0:
+                log.info(f"First tokens of in-loop eval context: {ctx[:5]}")
+
             dc = self.token_encode(self.doc_to_domain_conditional(doc))
             if self.log_instances > 0:
                 self.log_instances -= 1
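
All three hunks add the same guard: prepend the tokenizer's BOS token to the encoded context, but only when the tokenizer actually defines one and encoding did not already emit it. The pattern in isolation, as a minimal sketch with an illustrative HuggingFace tokenizer ("gpt2" is an arbitrary choice; any object exposing bos_token_id and an encode method behaves the same):

    # Sketch of the BOS guard outside the task classes.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative choice
    ctx = tokenizer.encode("The quick brown fox", add_special_tokens=False)

    # Prepend BOS only when defined and not already present (assumes a non-empty
    # ctx, as the package code does; an empty document would raise IndexError)
    if tokenizer.bos_token_id is not None and ctx[0] != tokenizer.bos_token_id:
        ctx = [tokenizer.bos_token_id] + ctx

    print(ctx[:5])  # mirrors the "First tokens of in-loop eval context" log line

The ctx[0] check is what keeps the operation idempotent: tokenizers whose encode already inserts a BOS (Llama-style, for example) are not given a second one.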
olmo_eval/version.py CHANGED

(diff body not rendered by the viewer; per the summary above this is the +1 -1 change bumping the version string from 0.8.3 to 0.8.5)

{ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/WHEEL: File without changes
{ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/licenses/LICENSE: File without changes
{ai2_olmo_eval-0.8.3.dist-info → ai2_olmo_eval-0.8.5.dist-info}/top_level.txt: File without changes