ai2-olmo-eval 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ai2_olmo_eval-0.7.2.dist-info/METADATA → ai2_olmo_eval-0.8.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai2-olmo-eval
-Version: 0.7.2
+Version: 0.8.0
 Summary: In-loop evaluation tasks for language modeling
 Author-email: Allen Institute for Artificial Intelligence <olmo@allenai.org>
 License: Apache License
@@ -1,10 +1,10 @@
1
- ai2_olmo_eval-0.7.2.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
1
+ ai2_olmo_eval-0.8.0.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
2
2
  olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
3
- olmo_eval/metrics.py,sha256=NcI_1B3BV-DC9RXjsSIftU-2GeF8vvU6SNyJnlYlKwU,18705
4
- olmo_eval/tasks.py,sha256=QGLyF7JA2-T9mkh-N4cZGNOQp9si90yQSS41T3x5Lak,79630
3
+ olmo_eval/metrics.py,sha256=zc4IOZ8rUhxPyXVk6fOYzVKjJ4Lzq4tYeoyurxYWqY0,20034
4
+ olmo_eval/tasks.py,sha256=DF4-2MS5dkGgZSjNrRkjEoWShrAsGO7tiB6mqcTQnE8,93219
5
5
  olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
6
6
  olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
7
- olmo_eval/version.py,sha256=QWjPfx79C2NOQw2G7iDEsM4FKsLiGLCLNDzEx7EImf8,308
7
+ olmo_eval/version.py,sha256=ucNFr1ahYQCmPHuM8Qq6kPbT7lmTnsZQuSxG1jpqphI,308
8
8
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
9
9
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
10
10
  olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
@@ -716,7 +716,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
716
716
  olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
717
717
  olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
718
718
  olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
719
- ai2_olmo_eval-0.7.2.dist-info/METADATA,sha256=PKJfkoDu4hrLzb6NA1MDfXOjZnUxQ4WFpJouWU1Cr_4,14398
720
- ai2_olmo_eval-0.7.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
721
- ai2_olmo_eval-0.7.2.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
722
- ai2_olmo_eval-0.7.2.dist-info/RECORD,,
719
+ ai2_olmo_eval-0.8.0.dist-info/METADATA,sha256=TZmOipbol7scpsNfiFVximYmOONNlOg-J_bhbn0a-FE,14398
720
+ ai2_olmo_eval-0.8.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
721
+ ai2_olmo_eval-0.8.0.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
722
+ ai2_olmo_eval-0.8.0.dist-info/RECORD,,
olmo_eval/metrics.py CHANGED
@@ -98,96 +98,121 @@ class ICLMetric(Metric):
                 batch["ctx_len"][idx] - 1 : batch["ctx_len"][idx] + batch["cont_len"][idx] - 1
             ]
 
-            log_likelihood: torch.Tensor
-            celoss: torch.Tensor
-            bpb: torch.Tensor
-            log_likelihood_no_leading_space: torch.Tensor
-            celoss_no_leading_space: torch.Tensor
-            bpb_no_leading_space: torch.Tensor
-            if self.metric_type == "pmi_dc":
-                assert dc_lm_logits is not None
-                # get domain conditional continuation logits: [cont_len, vocab]
-                dc_lm_cont_logits = dc_lm_logits[idx][
-                    batch["dc_len"][idx] - 1 : batch["dc_len"][idx] + batch["cont_len"][idx] - 1
-                ]
-
-                # gather log-probs at continuation token indices but divide by domain conditional prob
-                log_likelihood = (
-                    torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / torch.gather(dc_lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                )
-                celoss = -log_likelihood
-                bpb = -log_likelihood  # the normalization factors cancel out
-
-                log_likelihood_no_leading_space = log_likelihood
-                celoss_no_leading_space = celoss
-                bpb_no_leading_space = bpb
-            elif self.metric_type == "acc" or self.metric_type == "f1":
-                # gather log-probs at continuation token indices
-                log_likelihood = torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                celoss = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_str_len"][idx]
-                )
-                bpb = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_byte_len"][idx]
-                    * LOG_2_OF_E
-                )
-
-                log_likelihood_no_leading_space = torch.gather(
-                    lm_cont_logits, 1, cont_tokens.unsqueeze(-1)
-                ).sum()
-                celoss_no_leading_space = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_str_len_no_leading_space"][idx]
-                )
-                bpb_no_leading_space = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_byte_len_no_leading_space"][idx]
-                    * LOG_2_OF_E
-                )
-            elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
-                log_likelihood = (
-                    torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_str_len"][idx]
-                )
-                celoss = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_str_len"][idx]
-                )
-                bpb = (
-                    -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                    / batch["cont_byte_len"][idx]
-                    * LOG_2_OF_E
-                )
+            if "choice_ids" in batch:
+                fast_mc = True
+                choice_ids = batch["choice_ids"][idx]
+            else:
+                fast_mc = False
+                choice_ids = cont_tokens
+
+            # For each choice token, calculate metrics and append as separate entries
+            for choice_idx, choice_token in enumerate(choice_ids):
+                if fast_mc:
+                    _cont_id = choice_idx
+                    _cont_tokens = choice_token.unsqueeze(-1)
+                else:
+                    _cont_id = cont_id
+                    _cont_tokens = cont_tokens
+
+                # Skip choices for Qs with fewer than the max choices (for questions w/ different numbers of choices)
+                is_empty_choice = (choice_token.unsqueeze(-1).unsqueeze(-1) == -1).all().item()
+                if is_empty_choice:
+                    continue
+
+                log_likelihood: torch.Tensor
+                celoss: torch.Tensor
+                bpb: torch.Tensor
+                log_likelihood_no_leading_space: torch.Tensor
+                celoss_no_leading_space: torch.Tensor
+                bpb_no_leading_space: torch.Tensor
+                if self.metric_type == "pmi_dc":
+                    assert dc_lm_logits is not None
+                    # get domain conditional continuation logits: [cont_len, vocab]
+                    dc_lm_cont_logits = dc_lm_logits[idx][
+                        batch["dc_len"][idx] - 1 : batch["dc_len"][idx] + batch["cont_len"][idx] - 1
+                    ]
 
-            log_likelihood_no_leading_space = (
-                torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                / batch["cont_str_len_no_leading_space"][idx]
-            )
-            celoss_no_leading_space = (
-                -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                / batch["cont_str_len_no_leading_space"][idx]
+                    # gather log-probs at continuation token indices but divide by domain conditional prob
+                    log_likelihood = (
+                        torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / torch.gather(dc_lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                    )
+                    celoss = -log_likelihood
+                    bpb = -log_likelihood  # the normalization factors cancel out
+
+                    log_likelihood_no_leading_space = log_likelihood
+                    celoss_no_leading_space = celoss
+                    bpb_no_leading_space = bpb
+                elif self.metric_type == "acc" or self.metric_type == "f1":
+                    # gather log-probs at continuation token indices
+                    log_likelihood = torch.gather(
+                        lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)
+                    ).sum()
+                    celoss = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len"][idx]
+                    )
+                    bpb = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_byte_len"][idx]
+                        * LOG_2_OF_E
+                    )
+
+                    log_likelihood_no_leading_space = torch.gather(
+                        lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)
+                    ).sum()
+                    celoss_no_leading_space = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len_no_leading_space"][idx]
+                    )
+                    bpb_no_leading_space = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_byte_len_no_leading_space"][idx]
+                        * LOG_2_OF_E
+                    )
+                elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
+                    log_likelihood = (
+                        torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len"][idx]
+                    )
+                    celoss = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len"][idx]
+                    )
+                    bpb = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_byte_len"][idx]
+                        * LOG_2_OF_E
+                    )
+
+                    log_likelihood_no_leading_space = (
+                        torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len_no_leading_space"][idx]
+                    )
+                    celoss_no_leading_space = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_str_len_no_leading_space"][idx]
+                    )
+                    bpb_no_leading_space = (
+                        -torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
+                        / batch["cont_byte_len_no_leading_space"][idx]
+                        * LOG_2_OF_E
+                    )
+                else:
+                    raise ValueError(self.metric_type)
+
+                self.labels.append((doc_id, _cont_id, int(batch["label_id"][idx])))
+                self.loglikelihoods.append((doc_id, _cont_id, float(log_likelihood)))
+                self.celosses.append((doc_id, _cont_id, float(celoss)))
+                self.bpbs.append((doc_id, _cont_id, float(bpb)))
+
+                self.loglikelihoods_no_leading_space.append(
+                    (doc_id, _cont_id, float(log_likelihood_no_leading_space))
                 )
-            bpb_no_leading_space = (
-                -torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
-                / batch["cont_byte_len_no_leading_space"][idx]
-                * LOG_2_OF_E
+                self.celosses_no_leading_space.append(
+                    (doc_id, _cont_id, float(celoss_no_leading_space))
                 )
-            else:
-                raise ValueError(self.metric_type)
-
-            self.labels.append((doc_id, cont_id, int(batch["label_id"][idx])))
-            self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
-            self.celosses.append((doc_id, cont_id, float(celoss)))
-            self.bpbs.append((doc_id, cont_id, float(bpb)))
-
-            self.loglikelihoods_no_leading_space.append(
-                (doc_id, cont_id, float(log_likelihood_no_leading_space))
-            )
-            self.celosses_no_leading_space.append((doc_id, cont_id, float(celoss_no_leading_space)))
-            self.bpbs_no_leading_space.append((doc_id, cont_id, float(bpb_no_leading_space)))
+                self.bpbs_no_leading_space.append((doc_id, _cont_id, float(bpb_no_leading_space)))
 
     def compute(self) -> Dict[str, torch.Tensor]:
         # Task "suffix" -> tensor
olmo_eval/tasks.py CHANGED
@@ -33,6 +33,7 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         dataset_name: Union[str, Sequence[str], None] = None,
         model_ctx_len: int = 2048,
         fixed_ctx_len: bool = False,
+        fast_mc: bool = False,
         split="validation",
         metric_type=None,  # Override default metric type
         prompts: Optional[List[Optional[str]]] = None,  # List of prompt variants to use
@@ -44,6 +45,7 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         self.dataset_name = dataset_name
         self.model_ctx_len = model_ctx_len
         self.fixed_ctx_len = fixed_ctx_len
+        self.fast_mc = fast_mc
         self.prompts = prompts or [None]
         self.current_prompt: Optional[str] = None
         if metric_type is not None:
@@ -76,6 +78,7 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
     def prep_examples(self):
         """Append doc_ids to each example so that they are processed together in the metric"""
         doc_id = 0
+        new_samples = []
         for doc in self.dataset:
             for prompt in self.prompts:
                 self.current_prompt = prompt
@@ -125,7 +128,7 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
                 dc_query = dc + continuation[:-1]
 
                 # form a sample
-                self.samples.append(
+                new_samples.append(
                     {
                         "doc_id": doc_id,
                         "cont_id": cont_id,
@@ -148,6 +151,56 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
 
                 doc_id += 1
 
+        # Fast MCQA:
+        # Only pass a single request, and group together all continuations as tokens
+        if self.fast_mc:
+            # Get unique doc IDs
+            unique_doc_ids = {
+                sample["doc_id"] for sample in new_samples if isinstance(sample["doc_id"], int)
+            }
+
+            # Create new samples list for fast MC
+            fast_mc_samples = []
+
+            # Process each unique document
+            for doc_id in unique_doc_ids:
+                # Get all samples for this doc_id
+                doc_samples = [s for s in new_samples if s["doc_id"] == doc_id]
+
+                # Sort by continuation ID
+                doc_samples.sort(
+                    key=lambda x: float(x["cont_id"])
+                    if isinstance(x["cont_id"], (int, float))
+                    else 0.0
+                )
+
+                # Create new sample with distractor continuations
+                base_sample = doc_samples[0].copy()
+                choices = [s["continuation"] for s in doc_samples]
+
+                # Assert all continuations are length 1
+                for choice in choices:
+                    if not isinstance(choice, (list, tuple)):
+                        raise TypeError(
+                            f"Expected continuation to be a list or tuple, got {type(choice)}"
+                        )
+                    assert len(choice) == 1, f"Expected continuation length 1, got {len(choice)}"
+
+                # Take first token of each continuation
+                choices = [
+                    choice[0] if isinstance(choice, (list, tuple)) else choice for choice in choices
+                ]
+
+                base_sample["choices"] = choices
+                base_sample["fast_mc"] = True
+
+                fast_mc_samples.append(base_sample)
+
+            # Add fast MC samples to main samples list
+            new_samples = fast_mc_samples
+
+        self.samples = new_samples
+
     def pad_tokens_until_max(self, tokens, max_len=2048):
         """truncate from left if len(tokens) > model_ctx_len, max_len is not considered then
         queries are already truncated at max length of model_ctx_len
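The block above turns the per-choice samples built earlier in prep_examples into one sample per document, keeping only the first token of each (asserted single-token) continuation. A self-contained sketch of the same grouping on hypothetical pre-tokenized samples (token ids invented for illustration):

# One sample per answer choice, each with a single-token continuation.
samples = [
    {"doc_id": 0, "cont_id": 0, "continuation": [317]},
    {"doc_id": 0, "cont_id": 1, "continuation": [426]},
    {"doc_id": 1, "cont_id": 0, "continuation": [317]},
    {"doc_id": 1, "cont_id": 1, "continuation": [598]},
]

fast_mc_samples = []
for doc_id in sorted({s["doc_id"] for s in samples}):
    doc_samples = sorted(
        (s for s in samples if s["doc_id"] == doc_id), key=lambda s: s["cont_id"]
    )
    base = doc_samples[0].copy()
    assert all(len(s["continuation"]) == 1 for s in doc_samples)
    base["choices"] = [s["continuation"][0] for s in doc_samples]
    base["fast_mc"] = True
    fast_mc_samples.append(base)
# fast_mc_samples now holds one request per document instead of one per choice,
# which is where the "fast" in fast MCQA comes from.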
@@ -214,6 +267,7 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
         ctxs = []
         continuations = []
         ctx_lens = []
+        choice_ids = []
         dc_lens = []
         cont_lens = []
         cont_str_lens = []
@@ -245,6 +299,8 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             cont_byte_lens.append(sample["cont_byte_len"])
             cont_str_len_no_leading_space.append(sample["cont_str_len_no_leading_space"])
             cont_byte_len_no_leading_space.append(sample["cont_byte_len_no_leading_space"])
+            if self.fast_mc:
+                choice_ids.append(sample["choices"])
 
             queries.append(
                 torch.LongTensor(
@@ -281,6 +337,16 @@ class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta):
             "label_id": torch.LongTensor(label_ids),
         }
 
+        if self.fast_mc:
+            # Pad choice_ids with -1 (for Qs with different numbers of choices)
+            max_choices_len = max(len(choices) for choices in choice_ids)
+            padded_choice_ids = []
+            for choices in choice_ids:
+                padding = [-1] * (max_choices_len - len(choices))
+                padded_choice_ids.append(choices + padding)
+            choice_ids = padded_choice_ids
+            batch["choice_ids"] = torch.LongTensor(choice_ids)
+
         return batch
 
     def token_encode(self, string: str) -> List[int]:
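Because different questions can have different numbers of choices, the collate_fn change above right-pads each choice list with -1 before stacking the batch into a LongTensor. A minimal sketch of that padding, with invented token ids:

import torch

# Hypothetical ragged choice lists collected by collate_fn.
choice_ids = [[11, 22, 33, 44], [55, 66]]

max_choices_len = max(len(choices) for choices in choice_ids)
padded = [choices + [-1] * (max_choices_len - len(choices)) for choices in choice_ids]
batch_choice_ids = torch.LongTensor(padded)
# tensor([[11, 22, 33, 44],
#         [55, 66, -1, -1]])  -- the -1 slots are skipped by ICLMetric.update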
@@ -1446,6 +1512,7 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
         dataset_name: Union[str, Sequence[str], None] = None,
         model_ctx_len: int = 2048,
         fixed_ctx_len: bool = False,
+        fast_mc: bool = False,
         split=None,
         metric_type=None,
         prompts: Optional[List[Optional[str]]] = None,  # List of prompt variants to use
@@ -1457,6 +1524,7 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
         self.dataset_name = dataset_name
         self.model_ctx_len = model_ctx_len
         self.fixed_ctx_len = fixed_ctx_len
+        self.fast_mc = fast_mc
         self.log_instances = 0  # Set to > 0 to log the first few instances as a sanity check
 
         self.samples: List[Dict[str, Any]] = []
@@ -1500,6 +1568,8 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
         for requests in self.dataset:
             current_doc_id_offset += max_doc_id
             max_doc_id = 0  # Max doc id seen in this dataset
+
+            new_samples = []
             for request in requests:
                 doc = request["doc"]
                 doc_id = request["doc_id"]
@@ -1571,7 +1641,7 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                 dc_query = dc + continuation[:-1]
 
                 # form a sample
-                self.samples.append(
+                new_samples.append(
                     {
                         "doc_id": doc_id + current_doc_id_offset,
                         "cont_id": cont_id,
@@ -1592,6 +1662,46 @@ class OEEvalTask(ICLMultiChoiceTaskDataset):
                     }
                 )
 
+            # Fast MCQA:
+            # Only pass a single request, and group together all continuations as tokens
+            if self.fast_mc:
+                # Get unique doc IDs
+                unique_doc_ids = set(sample["doc_id"] for sample in new_samples)
+
+                # Create new samples list for fast MC
+                fast_mc_samples = []
+
+                # Process each unique document
+                for doc_id in unique_doc_ids:
+                    # Get all samples for this doc_id
+                    doc_samples = [s for s in new_samples if s["doc_id"] == doc_id]
+
+                    # Sort by continuation ID
+                    doc_samples.sort(key=lambda x: x["cont_id"])
+
+                    # Create new sample with distractor continuations
+                    base_sample = doc_samples[0].copy()
+                    choices = [s["continuation"] for s in doc_samples]
+
+                    # Assert all continuations are length 1
+                    for choice in choices:
+                        assert (
+                            len(choice) == 1
+                        ), f"Expected continuation length 1, got {len(choice)}"
+
+                    # Take first token of each continuation
+                    choices = [choice[0] for choice in choices]
+
+                    base_sample["choices"] = choices
+                    base_sample["fast_mc"] = True
+
+                    fast_mc_samples.append(base_sample)
+
+                # Add fast MC samples to main samples list
+                new_samples = fast_mc_samples
+
+            self.samples = new_samples
+
     def doc_to_text(self, doc) -> str:
         del doc
         raise NotImplementedError
@@ -1768,6 +1878,24 @@ LABEL_TO_TASK_MAP_ORIG = {
         OEEvalTask,
         {"dataset_path": "copycolors", "dataset_name": "xl_10way", "metric_type": "acc"},
     ),
+    "copycolors_10way_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "copycolors",
+            "dataset_name": "10way",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
+    "copycolors_xl_10way_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "copycolors",
+            "dataset_name": "xl_10way",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "csqa_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "csqa", "dataset_name": "mc_5shot", "metric_type": "acc"},
@@ -1792,6 +1920,10 @@ LABEL_TO_TASK_MAP_ORIG = {
         OEEvalTask,
         {"dataset_path": "hellaswag", "dataset_name": "rc_5shot", "metric_type": "len_norm"},
     ),
+    "hellaswag_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "rc_5shot", "metric_type": "bpb"},
+    ),
     "openbookqa_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "openbookqa", "dataset_name": "mc_5shot", "metric_type": "acc"},
@@ -2001,6 +2133,14 @@ LABEL_TO_TASK_MAP_LADDER = {
             "metric_type": "len_norm",
         },
     ),
+    "arc_challenge_val_bpb_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "arc_challenge",
+            "dataset_name": "val_rc_5shot",
+            "metric_type": "bpb",
+        },
+    ),
     "arc_challenge_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_challenge", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
@@ -2013,114 +2153,299 @@ LABEL_TO_TASK_MAP_LADDER = {
             "metric_type": "len_norm",
         },
     ),
+    "arc_challenge_test_bpb_5shot": (
+        OEEvalTask,
+        {
+            "dataset_path": "arc_challenge",
+            "dataset_name": "test_rc_5shot",
+            "metric_type": "bpb",
+        },
+    ),
     "arc_challenge_test_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_challenge", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
     ),
+    "arc_challenge_test_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "arc_challenge",
+            "dataset_name": "test_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "arc_easy_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "arc_easy_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "arc_easy_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_easy", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "arc_easy_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "arc_easy",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "arc_easy_test_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"},
     ),
+    "arc_easy_test_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "bpb"},
+    ),
     "arc_easy_test_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "arc_easy", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
     ),
+    "arc_easy_test_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "arc_easy",
+            "dataset_name": "test_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "boolq_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "acc"},
     ),
+    "boolq_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "boolq_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "boolq", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "boolq_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "boolq",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "csqa_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "csqa_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "csqa_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "csqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "csqa_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "csqa",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "hellaswag_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "hellaswag_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "hellaswag_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "hellaswag_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "hellaswag",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "openbookqa_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "openbookqa_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "openbookqa_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "openbookqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "openbookqa_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "openbookqa",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "openbookqa_test_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"},
     ),
+    "openbookqa_test_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "bpb"},
+    ),
     "openbookqa_test_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "openbookqa", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
     ),
+    "openbookqa_test_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "openbookqa",
+            "dataset_name": "test_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "piqa_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "piqa_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "piqa_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "piqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "piqa_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "piqa",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "socialiqa_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "socialiqa_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "socialiqa_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "socialiqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "socialiqa_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "socialiqa",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "winogrande_val_rc_5shot": (
         OEEvalTask,
         {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
     ),
+    "winogrande_val_bpb_5shot": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
     "winogrande_val_mc_5shot": (
         OEEvalTask,
         {"dataset_path": "winogrande", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
     ),
+    "winogrande_val_mc_5shot_fast": (
+        OEEvalTask,
+        {
+            "dataset_path": "winogrande",
+            "dataset_name": "val_mc_5shot",
+            "metric_type": "acc",
+            "fast_mc": True,
+        },
+    ),
     "mmlu_stem_val_rc_var": (MMLU, {"dataset_name": "stem", "prompt_variations": 1}),
     "mmlu_stem_val_rc_5shot": (MMLU, {"dataset_name": "stem", "prompt_variations": 2}),
+    "mmlu_stem_val_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "stem", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_stem_val_mc_5shot": (
         MMLU,
         {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_stem_val_mc_5shot_fast": (
+        MMLU,
+        {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True, "fast_mc": True},
+    ),
     "mmlu_stem_test_rc_var": (
         MMLU,
         {"dataset_name": "stem", "split": "test", "prompt_variations": 1},
     ),
+    "mmlu_stem_test_bpb_var": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_stem_test_rc_5shot": (
         MMLU,
         {"dataset_name": "stem", "split": "test", "prompt_variations": 2},
     ),
+    "mmlu_stem_test_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_stem_test_mc_5shot": (
         MMLU,
         {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_stem_test_mc_5shot_fast": (
+        MMLU,
+        {
+            "dataset_name": "stem",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "fast_mc": True,
+        },
+    ),
     "mmlu_humanities_val_rc_var": (MMLU, {"dataset_name": "humanities", "prompt_variations": 1}),
     "mmlu_humanities_val_rc_5shot": (MMLU, {"dataset_name": "humanities", "prompt_variations": 2}),
+    "mmlu_humanities_val_bpb_var": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_val_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_humanities_val_mc_5shot": (
         MMLU,
         {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_humanities_val_mc_5shot_fast": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True, "fast_mc": True},
+    ),
     "mmlu_humanities_test_rc_var": (
         MMLU,
         {"dataset_name": "humanities", "split": "test", "prompt_variations": 1},
@@ -2129,10 +2454,38 @@ LABEL_TO_TASK_MAP_LADDER = {
         MMLU,
         {"dataset_name": "humanities", "split": "test", "prompt_variations": 2},
     ),
+    "mmlu_humanities_test_bpb_var": (
+        MMLU,
+        {
+            "dataset_name": "humanities",
+            "split": "test",
+            "prompt_variations": 2,
+            "metric_type": "bpb",
+        },
+    ),
+    "mmlu_humanities_test_bpb_5shot": (
+        MMLU,
+        {
+            "dataset_name": "humanities",
+            "split": "test",
+            "prompt_variations": 2,
+            "metric_type": "bpb",
+        },
+    ),
     "mmlu_humanities_test_mc_5shot": (
         MMLU,
         {"dataset_name": "humanities", "split": "test", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_humanities_test_mc_5shot_fast": (
+        MMLU,
+        {
+            "dataset_name": "humanities",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "fast_mc": True,
+        },
+    ),
     "mmlu_social_sciences_val_rc_var": (
         MMLU,
         {"dataset_name": "social_sciences", "prompt_variations": 1},
@@ -2141,10 +2494,27 @@ LABEL_TO_TASK_MAP_LADDER = {
         MMLU,
         {"dataset_name": "social_sciences", "prompt_variations": 2},
     ),
+    "mmlu_social_sciences_val_bpb_var": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_val_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_social_sciences_val_mc_5shot": (
         MMLU,
         {"dataset_name": "social_sciences", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_social_sciences_val_mc_5shot_fast": (
+        MMLU,
+        {
+            "dataset_name": "social_sciences",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "fast_mc": True,
+        },
+    ),
     "mmlu_social_sciences_test_rc_var": (
         MMLU,
         {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 1},
@@ -2153,6 +2523,24 @@ LABEL_TO_TASK_MAP_LADDER = {
         MMLU,
         {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2},
     ),
+    "mmlu_social_sciences_test_bpb_var": (
+        MMLU,
+        {
+            "dataset_name": "social_sciences",
+            "split": "test",
+            "prompt_variations": 2,
+            "metric_type": "bpb",
+        },
+    ),
+    "mmlu_social_sciences_test_bpb_5shot": (
+        MMLU,
+        {
+            "dataset_name": "social_sciences",
+            "split": "test",
+            "prompt_variations": 2,
+            "metric_type": "bpb",
+        },
+    ),
     "mmlu_social_sciences_test_mc_5shot": (
         MMLU,
         {
@@ -2162,12 +2550,34 @@ LABEL_TO_TASK_MAP_LADDER = {
             "mc_labels": True,
         },
     ),
+    "mmlu_social_sciences_test_mc_5shot_fast": (
+        MMLU,
+        {
+            "dataset_name": "social_sciences",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "fast_mc": True,
+        },
+    ),
     "mmlu_other_val_rc_var": (MMLU, {"dataset_name": "other", "prompt_variations": 1}),
     "mmlu_other_val_rc_5shot": (MMLU, {"dataset_name": "other", "prompt_variations": 2}),
+    "mmlu_other_val_bpb_var": (
+        MMLU,
+        {"dataset_name": "other", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_other_val_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "other", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_other_val_mc_5shot": (
         MMLU,
         {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_other_val_mc_5shot_fast": (
+        MMLU,
+        {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True, "fast_mc": True},
+    ),
     "mmlu_other_test_rc_var": (
         MMLU,
         {"dataset_name": "other", "split": "test", "prompt_variations": 1},
@@ -2176,10 +2586,28 @@ LABEL_TO_TASK_MAP_LADDER = {
         MMLU,
         {"dataset_name": "other", "split": "test", "prompt_variations": 2},
     ),
+    "mmlu_other_test_bpb_var": (
+        MMLU,
+        {"dataset_name": "other", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_other_test_bpb_5shot": (
+        MMLU,
+        {"dataset_name": "other", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
     "mmlu_other_test_mc_5shot": (
         MMLU,
         {"dataset_name": "other", "split": "test", "prompt_variations": 2, "mc_labels": True},
     ),
+    "mmlu_other_test_mc_5shot_fast": (
+        MMLU,
+        {
+            "dataset_name": "other",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "fast_mc": True,
+        },
+    ),
 }
 
 # Expanded tasks for BPB on some generative tasks
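The new *_bpb and *_fast labels are ordinary entries in the existing label-to-task maps, so selecting one follows the usual lookup pattern. A hedged usage sketch; the commented-out construction mirrors the surrounding constructors' signatures, and the tokenizer argument is an assumption rather than something this diff shows:

from olmo_eval.tasks import LABEL_TO_TASK_MAP_LADDER

task_class, task_kwargs = LABEL_TO_TASK_MAP_LADDER["hellaswag_val_mc_5shot_fast"]
# task_kwargs == {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot",
#                 "metric_type": "acc", "fast_mc": True}
# Illustrative only (tokenizer is assumed, not defined by this diff):
# task = task_class(tokenizer=tokenizer, **task_kwargs)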
olmo_eval/version.py CHANGED
@@ -1,6 +1,6 @@
 _MAJOR = "0"
-_MINOR = "7"
-_PATCH = "2"
+_MINOR = "8"
+_PATCH = "0"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)