PyPI - ai2-olmo-eval - Versions diffs - 0.7.0__tar.gz → 0.7.1__tar.gz - Mend

ai2-olmo-eval 0.7.0tar.gz → 0.7.1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (715) hide show

{ai2_olmo_eval-0.7.0 → ai2_olmo_eval-0.7.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.7.0
+Version: 0.7.1
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License:                                  Apache License
@@ -234,6 +234,7 @@ Requires-Dist: boto3; extra == "dev"
 Requires-Dist: google-cloud-storage; extra == "dev"
 Provides-Extra: all
 Requires-Dist: ai2-olmo-eval[dev]; extra == "all"
+Dynamic: license-file
 # OLMo-in-loop-evals

{ai2_olmo_eval-0.7.0 → ai2_olmo_eval-0.7.1}/src/ai2_olmo_eval.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.7.0
+Version: 0.7.1
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License:                                  Apache License
@@ -234,6 +234,7 @@ Requires-Dist: boto3; extra == "dev"
 Requires-Dist: google-cloud-storage; extra == "dev"
 Provides-Extra: all
 Requires-Dist: ai2-olmo-eval[dev]; extra == "all"
+Dynamic: license-file
 # OLMo-in-loop-evals

{ai2_olmo_eval-0.7.0 → ai2_olmo_eval-0.7.1}/src/olmo_eval/metrics.py RENAMED Viewed

@@ -37,12 +37,26 @@ class ICLMetric(Metric):
         self.add_state("bpbs", default=[], dist_reduce_fx=dist_combine_lists)
         self.add_state("labels", default=[], dist_reduce_fx=dist_combine_lists)
+        self.add_state(
+            "loglikelihoods_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists
+        )
+        self.add_state("celosses_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
+        self.add_state("bpbs_no_leading_space", default=[], dist_reduce_fx=dist_combine_lists)
     def reset(self):
         self.loglikelihoods: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.celosses: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.bpbs: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
         self.labels: List[Tuple[Optional[int], Optional[int], Optional[int]]] = []
+        self.loglikelihoods_no_leading_space: List[
+            Tuple[Optional[int], Optional[int], Optional[float]]
+        ] = []
+        self.celosses_no_leading_space: List[
+            Tuple[Optional[int], Optional[int], Optional[float]]
+        ] = []
+        self.bpbs_no_leading_space: List[Tuple[Optional[int], Optional[int], Optional[float]]] = []
     def update(
         self,
         batch: Dict[str, Any],
@@ -56,6 +70,11 @@ class ICLMetric(Metric):
             self.loglikelihoods.append((None, None, None))
             self.celosses.append((None, None, None))
             self.bpbs.append((None, None, None))
+            self.loglikelihoods_no_leading_space.append((None, None, None))
+            self.celosses_no_leading_space.append((None, None, None))
+            self.bpbs_no_leading_space.append((None, None, None))
             self.labels.append((None, None, None))
             return
@@ -82,6 +101,9 @@ class ICLMetric(Metric):
             log_likelihood: torch.Tensor
             celoss: torch.Tensor
             bpb: torch.Tensor
+            log_likelihood_no_leading_space: torch.Tensor
+            celoss_no_leading_space: torch.Tensor
+            bpb_no_leading_space: torch.Tensor
             if self.metric_type == "pmi_dc":
                 assert dc_lm_logits is not None
                 # get domain conditional continuation logits: [cont_len, vocab]
@@ -96,6 +118,10 @@ class ICLMetric(Metric):
                 )
                 celoss = -log_likelihood
                 bpb = -log_likelihood  # the normalization factors cancel out
+                log_likelihood_no_leading_space = log_likelihood
+                celoss_no_leading_space = celoss
+                bpb_no_leading_space = bpb
             elif self.metric_type == "acc" or self.metric_type == "f1":
                 # gather log-probs at continuation token indices
                 log_likelihood = torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
@@ -108,6 +134,19 @@ class ICLMetric(Metric):
                     / batch["cont_byte_len"][idx]
                     * LOG_2_OF_E
                 )
+                log_likelihood_no_leading_space = torch.gather(
+                    lm_cont_logits, 1, cont_tokens.unsqueeze(-1)
+                ).sum()
+                celoss_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                bpb_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_byte_len_no_leading_space"][idx]
+                    * LOG_2_OF_E
+                )
             elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
                 log_likelihood = (
                     torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
@@ -122,23 +161,46 @@ class ICLMetric(Metric):
                     / batch["cont_byte_len"][idx]
                     * LOG_2_OF_E
                 )
+                log_likelihood_no_leading_space = (
+                    torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                celoss_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_str_len_no_leading_space"][idx]
+                )
+                bpb_no_leading_space = (
+                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
+                    / batch["cont_byte_len_no_leading_space"][idx]
+                    * LOG_2_OF_E
+                )
             else:
                 raise ValueError(self.metric_type)
-            self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
             self.labels.append((doc_id, cont_id, int(batch["label_id"][idx])))
+            self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
             self.celosses.append((doc_id, cont_id, float(celoss)))
             self.bpbs.append((doc_id, cont_id, float(bpb)))
+            self.loglikelihoods_no_leading_space.append(
+                (doc_id, cont_id, float(log_likelihood_no_leading_space))
+            )
+            self.celosses_no_leading_space.append((doc_id, cont_id, float(celoss_no_leading_space)))
+            self.bpbs_no_leading_space.append((doc_id, cont_id, float(bpb_no_leading_space)))
     def compute(self) -> Dict[str, torch.Tensor]:
         # Task "suffix" -> tensor
         # states should have been synced from all accelerators at this point
         # account for duplicates here because of DistributedSampler compensating for drop_last=False
         loglikelihood_dict: Dict[int, Dict[int, float]] = {}
+        loglikelihood_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
         label_dict: Dict[int, int] = {}
         celoss_dict: Dict[int, Dict[int, float]] = {}
+        celoss_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
         bpb_dict: Dict[int, Dict[int, float]] = {}
+        bpb_no_leading_space_dict: Dict[int, Dict[int, float]] = {}
         # collect labels
         for doc_id, cont_id, label_id in self.labels:
@@ -159,6 +221,17 @@ class ICLMetric(Metric):
             if cont_id not in loglikelihood_dict[doc_id]:
                 loglikelihood_dict[doc_id][cont_id] = loglikelihood
+        # collect loglikelihoods no leading space
+        for doc_id, cont_id, loglikelihood in self.loglikelihoods_no_leading_space:
+            if doc_id is None or cont_id is None or loglikelihood is None:
+                continue
+            if doc_id not in loglikelihood_no_leading_space_dict:
+                loglikelihood_no_leading_space_dict[doc_id] = {}
+            if cont_id not in loglikelihood_no_leading_space_dict[doc_id]:
+                loglikelihood_no_leading_space_dict[doc_id][cont_id] = loglikelihood
         # collect celosses
         for doc_id, cont_id, celoss_val in self.celosses:
             if doc_id is None or cont_id is None or celoss_val is None:
@@ -170,6 +243,17 @@ class ICLMetric(Metric):
             if cont_id not in celoss_dict[doc_id]:
                 celoss_dict[doc_id][cont_id] = celoss_val
+        # collect celosses no leading space
+        for doc_id, cont_id, celoss_val in self.celosses_no_leading_space:
+            if doc_id is None or cont_id is None or celoss_val is None:
+                continue
+            if doc_id not in celoss_no_leading_space_dict:
+                celoss_no_leading_space_dict[doc_id] = {}
+            if cont_id not in celoss_no_leading_space_dict[doc_id]:
+                celoss_no_leading_space_dict[doc_id][cont_id] = celoss_val
         # collect bpbs
         for doc_id, cont_id, bpb_val in self.bpbs:
             if doc_id is None or cont_id is None or bpb_val is None:
@@ -181,13 +265,30 @@ class ICLMetric(Metric):
             if cont_id not in bpb_dict[doc_id]:
                 bpb_dict[doc_id][cont_id] = bpb_val
+        # collect bpbs no leading space
+        for doc_id, cont_id, bpb_val in self.bpbs_no_leading_space:
+            if doc_id is None or cont_id is None or bpb_val is None:
+                continue
+            if doc_id not in bpb_no_leading_space_dict:
+                bpb_no_leading_space_dict[doc_id] = {}
+            if cont_id not in bpb_no_leading_space_dict[doc_id]:
+                bpb_no_leading_space_dict[doc_id][cont_id] = bpb_val
         # compute acc
+        correct_no_leading_space = []
         correct = []
         celoss = []
+        celoss_no_leading_space = []
         bpb = []
+        bpb_no_leading_space = []
         soft_score = []
         soft_log_score = []
+        soft_score_no_leading_space = []
+        soft_log_score_no_leading_space = []
         preds: Optional[List[float]] = None
+        preds_no_leading_space: Optional[List[float]] = None
         labels: Optional[List[int]] = None
         if self.metric_type == "f1":
             preds = []
@@ -197,15 +298,25 @@ class ICLMetric(Metric):
             # each doc_id might have a different number of continuation
             num_continuations = len(loglikelihood_dict[doc_id].keys())
             loglikelihoods = torch.tensor([-float("inf")] * num_continuations)
+            loglikelihoods_no_leading_space = torch.tensor([-float("inf")] * num_continuations)
             celosses = torch.tensor([float("inf")] * num_continuations)
+            celosses_no_leading_space = torch.tensor([float("inf")] * num_continuations)
             bpbs = torch.tensor([float("inf")] * num_continuations)
+            bpbs_no_leading_space = torch.tensor([float("inf")] * num_continuations)
             skip_document = False
             for cont_id in loglikelihood_dict[doc_id]:
                 try:
                     loglikelihoods[cont_id] = loglikelihood_dict[doc_id][cont_id]
+                    loglikelihoods_no_leading_space[cont_id] = loglikelihood_no_leading_space_dict[
+                        doc_id
+                    ][cont_id]
                     celosses[cont_id] = celoss_dict[doc_id][cont_id]
+                    celosses_no_leading_space[cont_id] = celoss_no_leading_space_dict[doc_id][
+                        cont_id
+                    ]
                     bpbs[cont_id] = bpb_dict[doc_id][cont_id]
+                    bpbs_no_leading_space[cont_id] = bpb_no_leading_space_dict[doc_id][cont_id]
                 except IndexError:
                     # We didn't process all of the continuations, so skip this document.
                     skip_document = True
@@ -216,39 +327,83 @@ class ICLMetric(Metric):
             if self.metric_type == "ce_loss":
                 celoss.append(celosses[0])  # Only one answer is scored
+                celoss_no_leading_space.append(celosses_no_leading_space[0])
             elif self.metric_type == "bpb":
                 bpb.append(bpbs[0])  # Only one answer is scored
+                bpb_no_leading_space.append(bpbs_no_leading_space[0])
             elif self.metric_type == "f1":
                 assert preds is not None
+                assert preds_no_leading_space is not None
                 assert labels is not None
                 preds.append(torch.argmax(loglikelihoods).item())
+                preds_no_leading_space.append(torch.argmax(loglikelihoods_no_leading_space).item())
                 labels.append(label_dict[doc_id])
             else:
                 correct.append(
                     1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0
                 )
+                correct_no_leading_space.append(
+                    1.0
+                    if torch.argmax(loglikelihoods_no_leading_space).item() == label_dict[doc_id]
+                    else 0.0
+                )
                 celoss.append(celosses[label_dict[doc_id]].item())
+                celoss_no_leading_space.append(celosses_no_leading_space[label_dict[doc_id]].item())
                 bpb.append(bpbs[label_dict[doc_id]].item())
+                bpb_no_leading_space.append(bpbs_no_leading_space[label_dict[doc_id]].item())
                 soft_score.append(torch.softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item())
                 soft_log_score.append(
                     torch.log_softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()
                 )
+                soft_score_no_leading_space.append(
+                    torch.softmax(loglikelihoods_no_leading_space, dim=0)[label_dict[doc_id]].item()
+                )
+                soft_log_score_no_leading_space.append(
+                    torch.log_softmax(loglikelihoods_no_leading_space, dim=0)[
+                        label_dict[doc_id]
+                    ].item()
+                )
+        # v1 vs. v2 corresponds to whether we add a 1 to the num chars or num bytes when normalizing the answer length. See https://github.com/allenai/OLMo-in-loop-evals/pull/6
         if self.metric_type == "f1":
             assert preds is not None
             assert labels is not None
             # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
             score = f1_score(labels, preds, pos_label=0)
-            return {"f1": torch.tensor(score)}
+            score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+            return {
+                "f1_v1": torch.tensor(score),
+                "f1_v2": torch.tensor(score_no_leading_space),
+            }
         elif self.metric_type == "ce_loss":
-            return {"ce_loss": torch.tensor(sum(celoss) / len(celoss))}
+            return {
+                "ce_loss_v1": torch.tensor(
+                    sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                ),
+                "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+            }
         elif self.metric_type == "bpb":
-            return {"bpb": torch.tensor(sum(bpb) / len(bpb))}
+            return {
+                "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+            }
         else:
             return {
-                self.metric_type: torch.tensor(sum(correct) / len(correct)),
-                "ce_loss": torch.tensor(sum(celoss) / len(celoss)),
-                "bpb": torch.tensor(sum(bpb) / len(bpb)),
-                "soft": torch.tensor(sum(soft_score) / len(soft_score)),
-                "soft_log": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
+                f"{self.metric_type}_v1": torch.tensor(sum(correct) / len(correct)),
+                f"{self.metric_type}_v2": torch.tensor(sum(correct) / len(correct)),
+                "ce_loss_v1": torch.tensor(
+                    sum(celoss_no_leading_space) / len(celoss_no_leading_space)
+                ),
+                "ce_loss_v2": torch.tensor(sum(celoss) / len(celoss)),
+                "bpb_v1": torch.tensor(sum(bpb_no_leading_space) / len(bpb_no_leading_space)),
+                "bpb_v2": torch.tensor(sum(bpb) / len(bpb)),
+                "soft_v1": torch.tensor(
+                    sum(soft_score_no_leading_space) / len(soft_score_no_leading_space)
+                ),
+                "soft_v2": torch.tensor(sum(soft_score) / len(soft_score)),
+                "soft_log_v1": torch.tensor(
+                    sum(soft_log_score_no_leading_space) / len(soft_log_score_no_leading_space)
+                ),
+                "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
             }

{ai2_olmo_eval-0.7.0 → ai2_olmo_eval-0.7.1}/src/olmo_eval/tasks.py RENAMED Viewed

@@ -103,8 +103,15 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                     )
                 for cont_id, continuation_str in enumerate(continuations):
-                    cont_str_len = len(continuation_str) - 1  # continuation contain leading blank
-                    cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+                    # The original implementation did not count the first character (usually the leading space) as
+                    # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+                    # do this, but we track both for backwards compatibility.
+                    cont_str_len_no_leading_space = len(continuation_str) - 1
+                    cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+                    cont_str_len = len(continuation_str)
+                    cont_byte_len = len(continuation_str.encode("utf-8"))
                     continuation = self.token_encode(continuation_str)
                     # query, remove last token from continuation, truncate from left is longer than model ctx length
@@ -131,6 +138,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                             ),  # even if query has last token removed, LM will output same cont len
                             "cont_str_len": cont_str_len,
                             "cont_byte_len": cont_byte_len,
+                            "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                            "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                             "query": query,  # remove last token from continuation
                             "dc_query": dc_query,
                             "label_id": label_id,
@@ -209,6 +218,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         cont_lens = []
         cont_str_lens = []
         cont_byte_lens = []
+        cont_str_len_no_leading_space = []
+        cont_byte_len_no_leading_space = []
         queries = []
         dc_queries = []
         label_ids = []
@@ -232,6 +243,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             cont_lens.append(sample["cont_len"])
             cont_str_lens.append(sample["cont_str_len"])
             cont_byte_lens.append(sample["cont_byte_len"])
+            cont_str_len_no_leading_space.append(sample["cont_str_len_no_leading_space"])
+            cont_byte_len_no_leading_space.append(sample["cont_byte_len_no_leading_space"])
             queries.append(
                 torch.LongTensor(
@@ -261,6 +274,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             ),  # since query has last token removed from continuation
             "cont_str_len": torch.LongTensor(cont_str_lens),
             "cont_byte_len": torch.LongTensor(cont_byte_lens),
+            "cont_str_len_no_leading_space": torch.LongTensor(cont_str_len_no_leading_space),
+            "cont_byte_len_no_leading_space": torch.LongTensor(cont_byte_len_no_leading_space),
             "input_ids": torch.stack(queries),
             "dc_input_ids": torch.stack(dc_queries),
             "label_id": torch.LongTensor(label_ids),
@@ -456,8 +471,15 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
             continuation_str = self.doc_to_continuations(doc)
             label_id = self.doc_to_label(doc)
-            cont_str_len = len(continuation_str) - 1  # continuations contain leading blank space
-            cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+            # The original implementation did not count the first character (usually the leading space) as
+            # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+            # do this, but we track both for backwards compatibility.
+            cont_str_len_no_leading_space = len(continuation_str) - 1
+            cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+            cont_str_len = len(continuation_str)
+            cont_byte_len = len(continuation_str.encode("utf-8"))
             # tokenize
             continuation = self.token_encode(continuation_str)
@@ -488,6 +510,8 @@ class WinoGrande(ICLMultiChoiceTaskDataset):
                         ),  # even if query has last token removed, LM will output same cont len
                         "cont_str_len": cont_str_len,
                         "cont_byte_len": cont_byte_len,
+                        "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                        "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                         "query": query,  # remove last token from continuation
                         "dc_query": dc_query,
                         "label_id": label_id,
@@ -1524,8 +1548,16 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                         f"Sample doc from ({self.dataset_path}, {ds_name}):"
                         + f"\ndoc_text: {doc_text}\ncontinuation: {continuation_str}"
                     )
-                cont_str_len = len(continuation_str) - 1  # continuation contain leading blank
-                cont_byte_len = len(continuation_str[1:].encode("utf-8"))
+                # The original implementation did not count the first character (usually the leading space) as
+                # part of the continuation length (e.g., " A", " " is not counted). The OLMES standard does not
+                # do this, but we track both for backwards compatibility.
+                cont_str_len_no_leading_space = len(continuation_str) - 1
+                cont_byte_len_no_leading_space = len(continuation_str[1:].encode("utf-8"))
+                cont_str_len = len(continuation_str)
+                cont_byte_len = len(continuation_str.encode("utf-8"))
                 continuation = self.token_encode(continuation_str)
                 # query, remove last token from continuation, truncate from left is longer than model ctx length
@@ -1552,6 +1584,8 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                         ),  # even if query has last token removed, LM will output same cont len
                         "cont_str_len": cont_str_len,
                         "cont_byte_len": cont_byte_len,
+                        "cont_str_len_no_leading_space": cont_str_len_no_leading_space,
+                        "cont_byte_len_no_leading_space": cont_byte_len_no_leading_space,
                         "query": query,  # remove last token from continuation
                         "dc_query": dc_query,
                         "label_id": label_id,