PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py ADDED Viewed

@@ -0,0 +1,132 @@
+import logging
+from typing import List
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+log = logging.getLogger(__name__)
+def remove_special_tokens(tokenizer, token_list: list):
+    """
+    This function removes special tokens from a list of tokens. It also stops processing
+    when it encounters a token with a value of -100.
+    Parameters:
+        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.
+        token_list (list): The list of tokens to be processed.
+    Returns:
+        list: The list of tokens after removing special tokens.
+    """
+    ret = []
+    for token in token_list:
+        if token not in tokenizer.all_special_ids and token > 0:
+            ret.append(token)
+        if token == -100:
+            break
+    return ret
+def evaluate_accuracy(model, val_loader: DataLoader, tokenizer):
+    """
+    This function evaluates the accuracy of a language model on a validation set.
+    Parameters:
+        model (nn.Module): The language model to be evaluated.
+        val_loader (DataLoader): The DataLoader object containing the validation data.
+        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.
+    Returns:
+        float: The accuracy of the model on the validation set.
+    """
+    from tqdm import tqdm
+    correct = 0
+    total = 0
+    model = model.eval()
+    for batch_idx, batch in enumerate(
+        tqdm(
+            val_loader, desc="Evaluate Exact Accuracy", leave=False, dynamic_ncols=True
+        )
+    ):
+        with torch.no_grad():
+            outputs = model.generate(batch["input_ids"], max_length=10)
+            output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            labels = [
+                remove_special_tokens(tokenizer, label_token)
+                for label_token in batch["labels"]
+            ]
+            labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+            # compare output_text and labels
+            for i, j in zip(output_text, labels):
+                if i == j:
+                    correct += 1
+                total += 1
+    # return accuracy
+    return correct / total
+def evaluate_spearman_rho(model, val_loader: DataLoader, tokenizer):
+    """
+    This function evaluates the Spearman's rank correlation coefficient (rho) between the model's predictions and the actual labels on a validation set.
+    Parameters:
+        model (nn.Module): The language model to be evaluated.
+        val_loader (DataLoader): The DataLoader object containing the validation data.
+        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.
+    Returns:
+        float: The Spearman's rho between the model's predictions and the actual labels.
+    """
+    from tqdm import tqdm
+    model = model.eval()
+    all_preds: List[str] = []
+    all_labels: List[str] = []
+    for batch_idx, batch in enumerate(
+        tqdm(val_loader, desc="Evaluate Spearman Rho", leave=False, dynamic_ncols=True)
+    ):
+        with torch.no_grad():
+            outputs = model.generate(batch["input_ids"], max_length=10)
+            output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            labels = [
+                remove_special_tokens(tokenizer, label_token)
+                for label_token in batch["labels"]
+            ]
+            labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+            all_preds.extend(output_text)
+            all_labels.extend(labels)
+    # save `all_preds` and `all_labels`
+    # with open("temp/all_preds.txt", "w") as f:
+    #     for preds in all_preds:
+    #         for pred in preds:
+    #             f.write(pred + "\n")
+    # with open("temp/all_labels.txt", "w") as f:
+    #     for labels in all_labels:
+    #         for label in labels:
+    #             f.write(label + "\n")
+    # calculate spearman's rho
+    # 1. convert string list `all_preds` and `all_labels` to numpy array
+    # 2. compute spearman's rho
+    from scipy.stats import spearmanr
+    def parse_flost(s: str):
+        try:
+            return float(s)
+        except Exception:
+            return 0.0
+    all_preds = np.array([parse_flost(pred) for pred in all_preds])
+    all_labels = np.array([parse_flost(label) for label in all_labels])
+    rho = spearmanr(all_preds, all_labels)[0]
+    return rho

fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py ADDED Viewed

@@ -0,0 +1,64 @@
+import logging
+import os
+from typing import Optional
+from datasets import load_dataset, load_from_disk
+from omegaconf import DictConfig
+from fusion_bench.utils import instantiate, timeit_context
+from .glue_preprocessors import glue_processors
+from .glue_prompt_templates import glue_prompt_templates
+log = logging.getLogger(__name__)
+def _load_glue_dataset(name, tokenizer):
+    if isinstance(tokenizer, (DictConfig, dict)):
+        tokenizer = instantiate(tokenizer)
+    dataset = load_dataset("glue", name)
+    preprocessor = glue_processors[name](
+        template=glue_prompt_templates[name],
+        tokenizer=tokenizer,
+        tokenizer_kwargs={
+            "padding": "max_length",
+            "truncation": True,
+            "return_tensors": "pt",
+        },
+    )
+    dataset = dataset.map(
+        preprocessor,
+        batched=True,
+        remove_columns=dataset["train"].column_names,
+        num_proc=1,
+    )
+    return dataset
+def load_glue_dataset(
+    name,
+    tokenizer,
+    cache_dir: Optional[str] = "outputs/cache",
+    split: Optional[str] = None,
+):
+    with timeit_context(f"Loading {name} dataset"):
+        if cache_dir is not None:
+            if not os.path.exists(cache_dir):
+                os.makedirs(cache_dir)
+            cache_path = os.path.join(
+                cache_dir, "flan-t5", f"_load_{name}_dataset_cached"
+            )
+            if os.path.exists(cache_path):
+                dataset = load_from_disk(cache_path)
+            else:
+                dataset = _load_glue_dataset(name, tokenizer)
+                log.info(f"Saving {name} dataset to {cache_path}")
+                dataset.save_to_disk(cache_path)
+        else:
+            dataset = _load_glue_dataset(name, tokenizer)
+    if split is not None:
+        return dataset[split]
+    else:
+        return dataset

fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py ADDED Viewed

@@ -0,0 +1,379 @@
+from typing import Any, Dict
+from .datasets_preprocess import DatasetPreprocessor, preprocess
+class CoLA_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/cola
+    """
+    def preprocess(self, sentence: str, label: int):
+        assert isinstance(sentence, str)
+        assert isinstance(label, int)
+        input_text = self.template["input_text"].format(sentence=sentence)
+        if label in [0, 1]:
+            target_text = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example: Dict[str, Any]):
+        """
+        Preprocess the CoLA dataset into a text-to-text format.
+        """
+        if isinstance(example["sentence"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["sentence"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for sentence, label in zip(example["sentence"], example["label"]):
+                _input_text, _target_text = self.preprocess(sentence, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class RTE_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/rte
+    """
+    def preprocess(self, sentence1, sentence2, label):
+        assert isinstance(sentence1, str)
+        assert isinstance(sentence2, str)
+        assert isinstance(label, int)
+        input_text: str = self.template["input_text"].format(
+            sentence1=sentence1, sentence2=sentence2
+        )
+        if label in [0, 1]:
+            target_text: str = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the RTE dataset into a text-to-text format.
+        """
+        if isinstance(example["sentence1"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["sentence1"], example["sentence2"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for sentence1, sentence2, label in zip(
+                example["sentence1"], example["sentence2"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class MNLI_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/mnli/
+    """
+    def preprocess(self, hypothesis, premise, label):
+        assert isinstance(hypothesis, str)
+        assert isinstance(premise, str)
+        assert isinstance(label, int)
+        input_text = self.template["input_text"].format(
+            hypothesis=hypothesis, premise=premise
+        )
+        if label in [0, 1, 2]:
+            target_text = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the MNLI dataset into a text-to-text format.
+        """
+        if isinstance(example["hypothesis"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["hypothesis"], example["premise"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for hypothesis, premise, label in zip(
+                example["hypothesis"], example["premise"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(hypothesis, premise, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class MRPC_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/mrpc
+    """
+    def preprocess(self, sentence1: str, sentence2: str, label: int):
+        assert isinstance(sentence1, str)
+        assert isinstance(sentence2, str)
+        assert isinstance(label, int)
+        input_text = self.template["input_text"].format(
+            sentence1=sentence1, sentence2=sentence2
+        )
+        if label in [0, 1]:
+            target_text = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the MRPC dataset into a text-to-text format.
+        """
+        if isinstance(example["sentence1"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["sentence1"], example["sentence2"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for sentence1, sentence2, label in zip(
+                example["sentence1"], example["sentence2"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class QNLI_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/qnli
+    """
+    def preprocess(self, question: str, sentence: str, label: int):
+        assert isinstance(question, str)
+        assert isinstance(sentence, str)
+        assert isinstance(label, int)
+        input_text = self.template["input_text"].format(
+            question=question, sentence=sentence
+        )
+        if label in [0, 1]:
+            target_text = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the QNLI dataset into a text-to-text format.
+        """
+        if isinstance(example["question"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["question"], example["sentence"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for question, sentence, label in zip(
+                example["question"], example["sentence"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(question, sentence, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class QQP_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/qqp
+    """
+    def preprocess(self, question1, question2, label):
+        assert isinstance(
+            question1, str
+        ), f"question1 must be a string, got {type(question1)}, question1={question1}"
+        assert isinstance(
+            question2, str
+        ), f"question2 must be a string, got {type(question2)}, question2={question2}"
+        assert isinstance(
+            label, int
+        ), f"label must be an int, got {type(label)}, label={label}"
+        input_text: str = self.template["input_text"].format(
+            question1=question1, question2=question2
+        )
+        if label in [0, 1]:
+            target_text: str = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the QQP dataset into a text-to-text format.
+        """
+        if isinstance(example["question1"], str):
+            # batched
+            input_text, target_text = self.preprocess(
+                example["question1"], example["question2"], example["label"]
+            )
+        else:
+            # not batched
+            input_text, target_text = [], []
+            for question1, question2, label in zip(
+                example["question1"], example["question2"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(question1, question2, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class SST2_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/sst2
+    """
+    def preprocess(self, sentence: str, label: int):
+        assert isinstance(
+            sentence, str
+        ), f"sentence must be a string, got {type(sentence)}, sentence={sentence}"
+        assert isinstance(
+            label, int
+        ), f"label must be an integer, got {type(label)}, label={label}"
+        input_text = self.template["input_text"].format(sentence=sentence)
+        if label in [0, 1]:
+            target_text = self.template["target_text"][str(label)]
+        else:
+            target_text = ""
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the SST2 dataset into a text-to-text format.
+        """
+        if isinstance(example["sentence"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["sentence"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for sentence, label in zip(example["sentence"], example["label"]):
+                _input_text, _target_text = self.preprocess(sentence, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+class STSB_Preprocessor(DatasetPreprocessor):
+    """
+    dataset URL: https://huggingface.co/datasets/glue/viewer/stsb
+    """
+    def preprocess(self, sentence1, sentence2, label):
+        assert isinstance(
+            sentence1, str
+        ), f"sentence1 must be a string, got {type(sentence1)}, sentence1={sentence1}"
+        assert isinstance(
+            sentence2, str
+        ), f"sentence2 must be a string, got {type(sentence2)}, sentence2={sentence2}"
+        assert isinstance(
+            label, (float, int)
+        ), f"label must be a float or an integer, got {type(label)}, label={label}"
+        input_text = self.template["input_text"].format(
+            sentence1=sentence1, sentence2=sentence2
+        )
+        target_text = self.template["target_text"].format(label)
+        return input_text, target_text
+    def __call__(self, example):
+        """
+        Preprocess the STSB dataset into a text-to-text format.
+        """
+        if isinstance(example["sentence1"], str):
+            # not batched
+            input_text, target_text = self.preprocess(
+                example["sentence1"], example["sentence2"], example["label"]
+            )
+        else:
+            # batched
+            input_text, target_text = [], []
+            for sentence1, sentence2, label in zip(
+                example["sentence1"], example["sentence2"], example["label"]
+            ):
+                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
+                input_text.append(_input_text)
+                target_text.append(_target_text)
+        return preprocess(
+            tokenizer=self.tokenizer,
+            input_text=input_text,
+            target_text=target_text,
+            tokenizer_kwawgs=self.tokenizer_kwargs,
+        )
+# Maps a lower-case GLUE task name to the preprocessor class defined above
+# for that task. The values are classes, not instances.
+glue_processors = {
+    "cola": CoLA_Preprocessor,
+    "mnli": MNLI_Preprocessor,
+    "mrpc": MRPC_Preprocessor,
+    "qnli": QNLI_Preprocessor,
+    "qqp": QQP_Preprocessor,
+    "rte": RTE_Preprocessor,
+    "sst2": SST2_Preprocessor,
+    "stsb": STSB_Preprocessor,
+}

fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py ADDED Viewed

@@ -0,0 +1,52 @@
+# Prompt templates for the GLUE tasks, one dict per task.
+#   "input_text":  a str.format pattern whose named fields are the task's
+#                  input columns.
+#   "target_text": for classification tasks, a dict from the stringified
+#                  integer label to the answer text; for stsb, a numeric
+#                  format pattern applied to the label.
+# NOTE(review): several prompts spell "Answere". These literals are runtime
+# prompt text, so confirm the effect on existing models and cached datasets
+# before correcting the spelling.
+cola = {
+    "description": "template used by GLUE-CoLA",
+    "input_text": "Indicate if the following sentence is grammatically correct or not: \"{sentence}\". Answere 'acceptable' or 'unacceptable'.",
+    "target_text": {"0": "unacceptable", "1": "acceptable"},
+}
+mnli = {
+    "input_text": "Does the premise: '{premise}' logically imply, contradict, or is neutral to the hypothesis: '{hypothesis}'? Answere with 'entailment', 'contradiction', or 'neutral'.",
+    "target_text": {"0": "entailment", "1": "neutral", "2": "contradiction"},
+}
+mrpc = {
+    "input_text": "Are the following sentences '{sentence1}' and '{sentence2}' conveying the same meaning? Answere with 'yes' or 'no'.",
+    "target_text": {"0": "no", "1": "yes"},
+}
+qnli = {
+    "input_text": "Given the context: '{sentence}', does the question '{question}' have an answer based on the information provided? Answer with 'yes' or 'no'.",
+    "target_text": {"0": "yes", "1": "no"},
+}
+qqp = {
+    "input_text": "Do the questions '{question1}' and '{question2}' have the same intent? Answere with 'yes' or 'no'.",
+    "target_text": {"0": "no", "1": "yes"},
+}
+rte = {
+    "description": "Template used by GLUE-RTE",
+    "input_text": "Does the text: '{sentence1}' entail that '{sentence2}' is true? Provide 'yes' or 'no'.",
+    "target_text": {"0": "yes", "1": "no"},
+}
+sst2 = {
+    "input_text": "Given the sentence '{sentence}', determine the sentiment. Is it positive or negative?",
+    "target_text": {"0": "negative", "1": "positive"},
+}
+# NOTE(review): the prompt states a 1-5 scale; confirm that this matches the
+# range of the dataset's labels.
+stsb = {
+    "input_text": "Consider the sentences '{sentence1}' and '{sentence2}'. On a scale from 1 (completely different) to 5 (completely similar), rate the similarity.",
+    # one decimal place, e.g. a label of 2.5 is rendered as "2.5"
+    "target_text": "{:.1f}",
+}
+# Task name -> template dict.
+glue_prompt_templates = {
+    "cola": cola,
+    "mnli": mnli,
+    "mrpc": mrpc,
+    "qnli": qnli,
+    "qqp": qqp,
+    "rte": rte,
+    "stsb": stsb,
+    "sst2": sst2,
+}

fusion_bench/utils/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# flake8: noqa: F401
+import importlib
+from typing import Iterable
+from . import data, functools, path
+from .cache_utils import *
+from .devices import *
+from .dtype import parse_dtype
+from .fabric import seed_everything_by_time
+from .instantiate import instantiate, is_instantiable
+from .misc import *
+from .packages import import_object
+from .parameters import *
+from .timer import timeit_context

fusion_bench/utils/auto.py ADDED Viewed

@@ -0,0 +1,31 @@
+from omegaconf import DictConfig
+from fusion_bench.utils import import_object
+class BaseFactoryClass:
+    _registry = {}
+    @classmethod
+    def from_config(cls, config: DictConfig):
+        name = config.name
+        if name not in cls._registry:
+            raise ValueError(
+                f"Unknown name: {name}, available names: {cls._registry.keys()}. "
+                f"You can register a new item using `{cls.__name__}.register()` method."
+            )
+        item_cls = cls._registry[name]
+        if isinstance(item_cls, str):
+            if item_cls.startswith("."):
+                item_cls = f"{cls.__module__}.{item_cls[1:]}"
+            item_cls = import_object(item_cls)
+        return item_cls(config)
+    @classmethod
+    def register(cls, name: str, item_cls):
+        cls._registry[name] = item_cls
+    @classmethod
+    def available_items(cls):
+        return list(cls._registry.keys())