PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/dataset/llama/sharegpt.py ADDED Viewed

@@ -0,0 +1,141 @@
+import logging
+from typing import Dict, List, Optional, Union
+import numpy as np
+from datasets import Dataset
+from transformers import PreTrainedTokenizer
+log = logging.getLogger(__name__)
+def tokenize_sharegpt_dataset(
+    dataset: Dataset,
+    tokenizer: PreTrainedTokenizer,
+    max_length: int = 2048,
+    padding: bool = True,
+    system_template: str = "### System: {system}\n\n",
+    tools_template: str = "### Tools: {tools}\n\n",
+    human_template: str = "### Human: {message}\n",
+    assistant_template: str = "### Assistant: {message}\n",
+    function_template: str = "### Function Call: {message}\n",
+    observation_template: str = "### Observation: {message}\n",
+) -> Dataset:
+    """
+    Tokenize ShareGPT format dataset with support for system prompts, tools, and tool calls.
+    Args:
+        dataset: Input dataset in ShareGPT format.
+        tokenizer: The tokenizer to use.
+        max_length: Maximum sequence length.
+        padding: Whether to pad the tokenized inputs to `max_length`.
+        system_template: Template for system messages.
+        tools_template: Template for tool descriptions.
+        human_template: Template for human messages.
+        assistant_template: Template for assistant responses.
+        function_template: Template for function calls.
+        observation_template: Template for function observations.
+    Returns:
+        Tokenized dataset
+    """
+    def build_conversation(
+        conversations: List[Dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> tuple[List[int], List[int]]:
+        """
+        Build prompt and response token ids from conversations.
+        Returns (prompt_tokens, response_tokens) for the last assistant message.
+        """
+        # Initialize conversation history
+        history = ""
+        # Add system prompt if provided
+        if system:
+            history += system_template.format(system=system.strip())
+        # Add tools description if provided
+        if tools:
+            history += tools_template.format(tools=tools.strip())
+        prompt_tokens = []
+        response_tokens = []
+        for i, message in enumerate(conversations):
+            msg_from = message["from"]
+            msg_value = message["value"].strip()
+            # If this is the last assistant message
+            if msg_from == "gpt" and i == len(conversations) - 1:
+                # Tokenize the current history as prompt
+                prompt_tokens = tokenizer.encode(history, add_special_tokens=False)
+                # Tokenize the assistant's message as response
+                response_tokens = tokenizer.encode(
+                    assistant_template.format(message=msg_value),
+                    add_special_tokens=False,
+                )
+                break
+            # Build conversation history
+            if msg_from == "human":
+                history += human_template.format(message=msg_value)
+            elif msg_from == "gpt":
+                history += assistant_template.format(message=msg_value)
+            elif msg_from == "function_call":
+                history += function_template.format(message=msg_value)
+            elif msg_from == "observation":
+                history += observation_template.format(message=msg_value)
+            else:
+                log.warning(f"Unkonwn role: {msg_from}")
+        return prompt_tokens, response_tokens
+    def prepare_sample(sample: dict) -> dict:
+        # Get prompt and response tokens
+        prompt_tokens, response_tokens = build_conversation(
+            conversations=sample["conversations"],
+            system=sample.get("system"),  # system prompt is optional
+            tools=sample.get("tools"),  # tools description is optional
+        )
+        # Create input_ids with EOS token
+        input_ids = prompt_tokens + response_tokens + [tokenizer.eos_token_id]
+        # Create attention mask (1 for tokens, 0 for padding)
+        attention_mask = [1] * len(input_ids)
+        # Create labels (-100 for prompt, actual tokens for response)
+        labels = (
+            [-100] * len(prompt_tokens) + response_tokens + [tokenizer.eos_token_id]
+        )
+        # Truncate if exceeds max length
+        if len(input_ids) > max_length:
+            input_ids = input_ids[:max_length]
+            attention_mask = attention_mask[:max_length]
+            labels = labels[:max_length]
+        # Pad if necessary
+        if padding:
+            padding_length = max_length - len(input_ids)
+            if padding_length > 0:
+                input_ids.extend([tokenizer.pad_token_id] * padding_length)
+                attention_mask.extend([0] * padding_length)
+                labels.extend([-100] * padding_length)
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+    if tokenizer.pad_token is None:
+        log.warning("Tokenizer does not have a `pad_token`. Set it the `eos_token`.")
+        tokenizer.pad_token = tokenizer.eos_token
+    # Process the dataset
+    tokenized_dataset = dataset.map(
+        prepare_sample, remove_columns=dataset.column_names, desc="Tokenizing dataset"
+    )
+    return tokenized_dataset

fusion_bench/dataset/llama/squad.py ADDED Viewed

@@ -0,0 +1,125 @@
+import logging
+import os
+from typing import Any, Dict, List, Literal, Optional
+from datasets import load_dataset, load_from_disk
+from transformers import PreTrainedTokenizer
+import fusion_bench
+log = logging.getLogger(__name__)
+def load_tokenized_squad_dataset(
+    tokenizer: Optional[PreTrainedTokenizer],
+    path: Literal["squard_v2", "squad"] = "squard_v2",
+    split: Optional[str] = None,
+    max_length: int = 384,  # The maximum length of a feature (question and context)
+    doc_stride: int = 128,  # The authorized overlap between two part of the context when splitting it is needed.
+    datasets: Optional[Any] = None,
+    cache_path: Optional[str] = None,
+):
+    if cache_path is not None and fusion_bench.utils.path.path_is_dir_and_not_empty(
+        cache_path
+    ):
+        datasets = load_from_disk(cache_path)
+        if split is None:
+            return datasets
+        else:
+            return datasets[split]
+    else:
+        assert (
+            tokenizer is not None
+        ), "Cached dataset not found. Need tokenizer to process the raw data."
+    # 1. load raw dataset
+    if datasets is not None:
+        log.info("Use `datasets`, `path` is ignored.")
+    else:
+        datasets = load_dataset(path)
+    # 2. tokenize the dataset
+    pad_on_right = tokenizer.padding_side == "right"
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples["question"] = [q.lstrip() for q in examples["question"]]
+        # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
+        # in one example possible giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples["question" if pad_on_right else "context"],
+            examples["context" if pad_on_right else "question"],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_length,
+            stride=doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length",
+        )
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+        # The offset mappings will give us a map from token to character position in the original context. This will
+        # help us compute the start_positions and end_positions.
+        offset_mapping = tokenized_examples.pop("offset_mapping")
+        # Initialize arrays for start and end positions
+        start_positions = []
+        end_positions = []
+        for i, offset in enumerate(offset_mapping):
+            # Get corresponding example from the original dataset
+            sample_idx = sample_mapping[i]
+            answer = examples["answers"][sample_idx]
+            # Character start/end positions of the answer
+            start_char = answer["answer_start"][0]
+            end_char = start_char + len(answer["text"][0])
+            # Convert character positions to token positions
+            # Find start token position
+            token_start_index = 0
+            while (
+                token_start_index < len(offset)
+                and offset[token_start_index][0] <= start_char
+            ):
+                token_start_index += 1
+            token_start_index -= 1
+            # Find end token position
+            token_end_index = token_start_index
+            while (
+                token_end_index < len(offset) and offset[token_end_index][1] <= end_char
+            ):
+                token_end_index += 1
+            token_end_index -= 1
+            start_positions.append(token_start_index)
+            end_positions.append(token_end_index)
+        tokenized_examples["start_positions"] = start_positions
+        tokenized_examples["end_positions"] = end_positions
+        return tokenized_examples
+    tokenized_datasets = datasets.map(
+        prepare_train_features,
+        batched=True,
+        remove_columns=datasets["train"].column_names,
+    )
+    if cache_path is not None:
+        os.makedirs(cache_path, exist_ok=True)
+        tokenized_datasets.save_to_disk(cache_path)
+    if split is None:
+        return tokenized_datasets
+    else:
+        return tokenized_datasets[split]

fusion_bench/dataset/llama/stanford_shp.py ADDED Viewed

@@ -0,0 +1,90 @@
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Optional
+from datasets import Dataset, load_dataset, load_from_disk
+from lightning.fabric.utilities import rank_zero_only
+from tqdm.auto import tqdm
+from fusion_bench.utils import timeit_context
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+def load_tokenized_stanford_shp_for_rlhf(
+    tokenizer: "PreTrainedTokenizer",
+    path: str = "stanfordnlp/SHP",
+    split: str = "train",
+    num_proc: int = 8,
+    cache_path: Optional[str] = None,
+):
+    if cache_path is not None and os.path.isdir(cache_path):
+        dataset = load_from_disk(cache_path)
+        return dataset
+    dataset = load_dataset(path, split=split)
+    def tokenize(sample):
+        """
+        - history: the post title concatented to the post body (string)
+        - human_ref_A: text of comment A (string)
+        - human_ref_B: text of comment B (string)
+        - labels: the preference label -- it is 1 if A is preferred to B; 0 if B is preferred to A. This was randomized such that the label distribution is roughly 50/50. (integer)
+        """
+        # Create a conversation with the post title and body, followed by comments
+        conversation = [{"role": "user", "content": sample["history"]}]
+        if sample["labels"] == 0:
+            sample["chosen"] = deepcopy(conversation).append(
+                {"role": "assistant", "content": sample["human_ref_B"]}
+            )
+            sample["rejected"] = deepcopy(conversation).append(
+                {"role": "assistant", "content": sample["human_ref_A"]}
+            )
+        else:
+            sample["chosen"] = deepcopy(conversation).append(
+                {"role": "assistant", "content": sample["human_ref_A"]}
+            )
+            sample["rejected"] = deepcopy(conversation).append(
+                {"role": "assistant", "content": sample["human_ref_B"]}
+            )
+        # apply chat template
+        sample["chosen_chat"] = tokenizer.apply_chat_template(
+            sample["chosen"], tokenize=False, add_generation_prompt=False
+        )
+        sample["rejected_chat"] = tokenizer.apply_chat_template(
+            sample["rejected"], tokenize=False, add_generation_prompt=False
+        )
+        # tokenize the conversation
+        tokenized_pos = tokenizer(sample["chosen_chat"], truncation=True)
+        tokenized_neg = tokenizer(sample["rejected_chat"], truncation=True)
+        # Ensure that the chosen response does not contain an EOS token
+        sample["chosen_input_ids"] = tokenized_pos["input_ids"]
+        sample["chosen_attention_mask"] = tokenized_pos["attention_mask"]
+        assert (
+            tokenizer.eos_token_id not in tokenized_pos["input_ids"][:-1]
+        ), f"Prompt contains EOS token: {sample['positive']}"
+        if sample["chosen_input_ids"][-1] != tokenizer.eos_token_id:
+            sample["chosen_input_ids"].append(tokenizer.eos_token_id)
+            sample["chosen_attention_mask"].append(1)
+        sample["rejected_input_ids"] = tokenized_neg["input_ids"]
+        sample["rejected_attention_mask"] = tokenized_neg["attention_mask"]
+        # Ensure that the rejected response does not contain an EOS token
+        assert (
+            tokenizer.eos_token_id not in tokenized_neg["input_ids"][:-1]
+        ), f"Prompt contains EOS token: {sample['rejected']}"
+        if sample["rejected_input_ids"][-1] != tokenizer.eos_token_id:
+            sample["rejected_input_ids"].append(tokenizer.eos_token_id)
+            sample["rejected_attention_mask"].append(1)
+        return sample
+    dataset = dataset.map(tokenize, num_proc=num_proc)
+    if cache_path is not None and rank_zero_only.rank == 0:
+        dataset.save_to_disk(cache_path)
+    return dataset

fusion_bench/dataset/llama/ultrachat.py ADDED Viewed

@@ -0,0 +1,58 @@
+import os
+from typing import TYPE_CHECKING, Optional
+from datasets import Dataset, load_dataset, load_from_disk
+from lightning.fabric.utilities import rank_zero_only
+from tqdm.auto import tqdm
+from fusion_bench.utils import timeit_context
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+def load_tokenized_ultrachat_200k(
+    tokenizer: "PreTrainedTokenizer",
+    path: str = "HuggingFaceH4/ultrachat_200k",
+    split: str = "train_sft",
+    num_proc: int = 8,
+    cache_path: Optional[str] = None,
+):
+    R"""
+    Load and tokenized Ultrachat 200k dataset for Bradley-Terry ranking model.
+    The returned dataset contains the following fields:
+    - input_ids: The input token ids for the winner.
+    - attention_mask: The attention mask for the winner.
+    """
+    if cache_path is not None and os.path.exists(cache_path):
+        dataset = load_from_disk(cache_path)
+        return dataset
+    dataset = load_dataset(path, split=split)
+    def tokenize(sample):
+        # ? is it necessary to `.replace(tokenizer.bos_token, "")`?
+        sample["input_ids"] = tokenizer.apply_chat_template(
+            sample["messages"], tokenize=True, add_generation_prompt=False
+        )
+        sample["attention_mask"] = [1] * len(sample["input_ids"])
+        return sample
+    dataset = dataset.map(tokenize, num_proc=num_proc)
+    if cache_path is not None and rank_zero_only.rank == 0:
+        dataset.save_to_disk(cache_path)
+    return dataset
+if __name__ == "__main__":
+    # Example usage and testing
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
+    dataset = load_tokenized_ultrachat_200k(tokenizer)
+    print(dataset)

fusion_bench/dataset/llama/utils/__init__.py ADDED Viewed

File without changes

fusion_bench/dataset/llama/wikitext.py ADDED Viewed

@@ -0,0 +1,89 @@
+import logging
+import os
+from typing import Any, Dict, List, Optional
+from datasets import load_dataset, load_from_disk
+from transformers import PreTrainedTokenizer
+import fusion_bench
+log = logging.getLogger(__name__)
+def load_tokenized_wiki_dataset(
+    tokenizer: Optional[PreTrainedTokenizer],
+    path: str = "wikitext",
+    name: str = "wikitext-2-raw-v1",
+    split: Optional[str] = None,
+    datasets: Optional[Any] = None,
+    block_size: int = 128,
+    cache_path: Optional[str] = None,
+):
+    """
+    Reference: https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb
+    Args:
+        block_size (int):
+        dataset: If dataset is provided, `path` and `name` will be ignored.
+    """
+    if cache_path is not None and fusion_bench.utils.path.path_is_dir_and_not_empty(
+        cache_path
+    ):
+        datasets = load_from_disk(cache_path)
+        if split is None:
+            return datasets
+        else:
+            return datasets[split]
+    else:
+        assert (
+            tokenizer is not None
+        ), "Cached dataset not found. Need tokenizer to process the raw data."
+    # 1. load raw dataset
+    if datasets is not None:
+        log.info("Use `datasets`, `path` and `name` are ignored.")
+    else:
+        datasets = load_dataset(path, name)
+    # 2. tokenize the dataset
+    def tokenize_function(examples):
+        return tokenizer(examples["text"])
+    tokenized_datasets = datasets.map(
+        tokenize_function, batched=True, num_proc=4, remove_columns=["text"]
+    )
+    # If we now look at an element of our datasets, we will see the text have been replaced by the input_ids the model will need:
+    # { 'attention_mask': <list of int>, 'input_ids': <list of int> }
+    # 3. concat and truncate tokens
+    def group_texts(examples: Dict[str, List]):
+        # Concatenate all texts.
+        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+    lm_datasets = tokenized_datasets.map(
+        group_texts,
+        batched=True,
+        batch_size=1000,
+        num_proc=4,
+    )
+    if cache_path is not None:
+        os.makedirs(cache_path, exist_ok=True)
+        lm_datasets.save_to_disk(cache_path)
+    if split is None:
+        return lm_datasets
+    else:
+        return lm_datasets[split]

fusion_bench/dataset/nyuv2.py ADDED Viewed

@@ -0,0 +1,119 @@
+import fnmatch
+import os
+from typing import Callable, Optional
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+class NYUv2(Dataset):
+    R"""
+    NYUv2 dataset, 3 tasks + 1 generated useless task
+    Included tasks:
+        1. Semantic Segmentation,
+        2. Depth prediction,
+        3. Surface Normal prediction,
+        4. Noise prediction [to test auxiliary learning, purely conflict gradients]
+    Modified from https://github.com/lorenmt/auto-lambda/blob/main/create_dataset.py
+    removed the `augmentation` arg and add `transform` args
+    """
+    num_out_channels = {
+        "segmentation": 13,
+        "depth": 1,
+        "normal": 3,
+        "noise": 1,
+    }
+    def __init__(
+        self,
+        root: str,
+        train: bool = True,
+        transform: Optional[Callable] = None,
+        seg_transform: Optional[Callable] = None,
+        sn_transform: Optional[Callable] = None,
+        depth_transform: Optional[Callable] = None,
+    ):
+        """
+        Initialize the NYUv2 dataset.
+        Args:
+            root (str): The root directory of the dataset.
+            train (bool, optional): If True, use training set. If False, use validation set. Defaults to True.
+            transform (Callable, optional): image transform. Defaults to None.
+            seg_transform (Callable, optional): segmentation transform. Defaults to None.
+            sn_transform (Callable, optional): surface normal transform. Defaults to None.
+            depth_transform (Callable, optional): depth transform. Defaults to None.
+        """
+        self.root = os.path.expanduser(root)
+        self.train = train
+        self.transform = transform
+        self.seg_transform = seg_transform
+        self.sn_transform = sn_transform
+        self.depth_transform = depth_transform
+        if train:
+            self.data_path = self.root + "/train"
+        else:
+            self.data_path = self.root + "/val"
+        # calculate data length
+        self.data_len = len(
+            fnmatch.filter(os.listdir(self.data_path + "/image"), "*.npy")
+        )
+        self.noise = torch.rand(self.data_len, 1, 288, 384)
+    def __getitem__(self, index):
+        """
+        Retrieve an item from the dataset.
+        Args:
+            index (int): The index of the item to retrieve.
+        Returns:
+            tuple: A tuple containing the image and a dictionary of task-specific outputs.
+        """
+        # load data from the pre-processed npy files
+        image = torch.from_numpy(
+            np.moveaxis(
+                np.load(self.data_path + "/image/{:d}.npy".format(index)), -1, 0
+            )
+        ).float()
+        semantic = torch.from_numpy(
+            np.load(self.data_path + "/label/{:d}.npy".format(index))
+        ).float()
+        depth = torch.from_numpy(
+            np.moveaxis(
+                np.load(self.data_path + "/depth/{:d}.npy".format(index)), -1, 0
+            )
+        ).float()
+        normal = torch.from_numpy(
+            np.moveaxis(
+                np.load(self.data_path + "/normal/{:d}.npy".format(index)), -1, 0
+            )
+        ).float()
+        noise = self.noise[index].float()
+        if self.transform is not None:
+            image = self.transform(image)
+        if self.seg_transform is not None:
+            semantic = self.seg_transform(semantic)
+        if self.sn_transform is not None:
+            normal = self.sn_transform(normal)
+        if self.depth_transform is not None:
+            depth = self.depth_transform(depth)
+        return image, {
+            "segmentation": semantic,
+            "depth": depth,
+            "normal": normal,
+            "noise": noise,
+        }
+    def __len__(self):
+        return self.data_len