PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/dataset/llama/alpaca.py ADDED Viewed

@@ -0,0 +1,232 @@
+import logging
+import os
+import warnings
+from typing import Any, Dict, List, Optional
+from datasets import Dataset, load_dataset, load_from_disk
+from lightning.fabric.utilities import rank_zero_only
+from tqdm.auto import tqdm
+from transformers import PreTrainedTokenizer
+import fusion_bench
+from fusion_bench.utils import timeit_context
+log = logging.getLogger(__name__)
+def convert_alpaca_to_conversation(alpaca_data: List[Dict[str, str]]):
+    """
+    Turn Alpaca-style records into two-turn chat conversations.
+    Records whose 'instruction' or 'output' is missing or empty are dropped.
+    A non-blank 'input' is appended to the instruction after a blank line.
+    Args:
+        alpaca_data (list): Records with 'instruction', 'output' and an
+            optional 'input' key.
+    Returns:
+        list: One conversation per kept record; each conversation is a list
+            holding a "user" message followed by an "assistant" message, both
+            as {"role": ..., "content": ...} dictionaries.
+    """
+    # Only the rank-zero process shows the progress bar.
+    progress = tqdm(
+        alpaca_data,
+        "Converting Alpaca to conversations",
+        disable=rank_zero_only.rank != 0,
+    )
+    conversations = []
+    for record in progress:
+        instruction = record.get("instruction")
+        answer = record.get("output")
+        if not (instruction and answer):
+            # Incomplete record: nothing to train on.
+            continue
+        prompt = instruction
+        extra = record.get("input")
+        if extra and extra.strip():
+            prompt += f"\n\n{extra}"
+        conversations.append(
+            [
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": answer},
+            ]
+        )
+    return conversations
+def load_tokenized_alpaca_dataset(
+    tokenizer: PreTrainedTokenizer,
+    path: str = "yahma/alpaca-cleaned",
+    split: str = "train",
+    cache_path: Optional[str] = None,
+):
+    """
+    Load an Alpaca (or Alpaca-like) dataset and tokenize it with the tokenizer's chat template.
+    Args:
+        tokenizer (PreTrainedTokenizer): The tokenizer whose `apply_chat_template` is used.
+        path (str, optional): Dataset path passed to `load_dataset`. Defaults to "yahma/alpaca-cleaned".
+        split (str, optional): The dataset split to load (e.g., "train", "test"). Defaults to "train".
+        cache_path (Optional[str], optional): Where the tokenized dataset is cached. If the path exists,
+            the dataset is loaded from it and the raw data is not touched. Otherwise the tokenized
+            dataset is written there by the rank-zero process only. Defaults to None.
+    Returns:
+        Dataset: The tokenized dataset. When the cache holds a `DatasetDict` that has no `split`
+            entry (or `split` is None), the whole cached object is returned.
+    """
+    # Local import: `DatasetDict` is not among the module-level imports.
+    from datasets import DatasetDict
+    if cache_path is not None and os.path.exists(cache_path):
+        dataset = load_from_disk(cache_path)
+        # Only a DatasetDict can be indexed by split name. Testing `split in dataset`
+        # on a plain Dataset would iterate every row just to answer "no".
+        if (
+            split is not None
+            and isinstance(dataset, DatasetDict)
+            and split in dataset
+        ):
+            return dataset[split]
+        return dataset
+    dataset = load_dataset(path, split=split)
+    alpaca_data = dataset.to_list()
+    conversations = convert_alpaca_to_conversation(alpaca_data)
+    with timeit_context("Tokenizing dataset"):
+        tokenized_dataset = tokenizer.apply_chat_template(
+            conversations, return_dict=True
+        )
+    tokenized_dataset = Dataset.from_dict(tokenized_dataset)
+    # Write the cache from a single process so ranks do not race on the same directory.
+    if cache_path is not None and rank_zero_only.rank == 0:
+        tokenized_dataset.save_to_disk(cache_path)
+    return tokenized_dataset
+def _tokenize_alpaca_dataset_with_template(
+    dataset: Dataset,
+    tokenizer: PreTrainedTokenizer,
+    max_length: int = 2048,
+    input_template: str = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
+    input_no_template: str = "### Instruction:\n{instruction}\n\n### Response:\n",
+    batch_size: int = 1000,
+) -> Dataset:
+    """
+    Tokenize an Alpaca-format dataset in batches using plain-text prompt templates.
+    Every sample becomes `prompt + response + EOS`, truncated and then padded to
+    exactly `max_length`. Labels are -100 on prompt and padding positions so only
+    response tokens (and the EOS, if it survives truncation) contribute to the loss.
+    If the tokenizer has no pad token, its EOS token is assigned as pad token
+    (this modifies the tokenizer that was passed in).
+    Args:
+        dataset: The input dataset with 'instruction', 'output' and an optional 'input' column
+        tokenizer: The tokenizer to use
+        max_length: Sequence length every sample is truncated/padded to
+        input_template: Template for samples with a non-blank input field
+        input_no_template: Template for samples without input field
+        batch_size: Size of batches to process at once
+    Returns:
+        Tokenized dataset with 'input_ids', 'attention_mask' and 'labels' columns
+    """
+    warnings.warn(
+        "This function is deprecated. Use `apply_chat_template` from `transformers` instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    def prepare_samples(samples: Dict[str, List[str]]) -> Dict[str, List[List[int]]]:
+        instructions = samples["instruction"]
+        # The 'input' column is optional. Without this default the zip below would
+        # yield nothing and prompts/responses would no longer line up.
+        inputs = samples.get("input")
+        if inputs is None:
+            inputs = [""] * len(instructions)
+        prompts = []
+        for instruction, input_text in zip(instructions, inputs):
+            # JSON nulls arrive as None; treat them like a blank input.
+            input_text = (input_text or "").strip()
+            if input_text:
+                prompt = input_template.format(
+                    instruction=instruction.strip(), input=input_text
+                )
+            else:
+                prompt = input_no_template.format(instruction=instruction.strip())
+            prompts.append(prompt)
+        responses = [output.strip() for output in samples["output"]]
+        # Tokenize prompts and responses separately so the prompt length is known
+        # when masking the labels.
+        prompt_tokens = tokenizer(
+            prompts, add_special_tokens=False, padding=False, truncation=False
+        )
+        response_tokens = tokenizer(
+            responses, add_special_tokens=False, padding=False, truncation=False
+        )
+        eos = [tokenizer.eos_token_id]
+        input_ids, labels = [], []
+        for prompt_toks, response_toks in zip(
+            prompt_tokens["input_ids"], response_tokens["input_ids"]
+        ):
+            # Slicing is a no-op for sequences that already fit.
+            input_ids.append((prompt_toks + response_toks + eos)[:max_length])
+            labels.append(
+                ([-100] * len(prompt_toks) + response_toks + eos)[:max_length]
+            )
+        # Pad to the fixed `max_length` (not to the longest sample of the batch) so
+        # that input_ids, attention_mask and labels always have the same length.
+        padded_results = tokenizer.pad(
+            {"input_ids": input_ids},
+            padding="max_length",
+            max_length=max_length,
+            return_attention_mask=True,
+        )
+        # Pad labels with -100 on the same side the tokenizer pads input_ids.
+        pad_left = getattr(tokenizer, "padding_side", "right") == "left"
+        padded_labels = []
+        for label in labels:
+            filler = [-100] * (max_length - len(label))
+            padded_labels.append(filler + label if pad_left else label + filler)
+        return {
+            "input_ids": padded_results["input_ids"],
+            "attention_mask": padded_results["attention_mask"],
+            "labels": padded_labels,
+        }
+    if tokenizer.pad_token is None:
+        log.warning("Tokenizer does not have a `pad_token`. Set it the `eos_token`.")
+        tokenizer.pad_token = tokenizer.eos_token
+    # Process the entire dataset in batches, dropping the original text columns.
+    tokenized_dataset = dataset.map(
+        prepare_samples,
+        batched=True,
+        batch_size=batch_size,
+        remove_columns=dataset.column_names,
+        desc="Tokenizing dataset",
+    )
+    return tokenized_dataset
+def load_tokenized_alpaca_dataset_from_json_with_prompt(
+    data_files: str,
+    tokenizer: PreTrainedTokenizer,
+    max_length: int,
+    split: Optional[str] = "train",
+    cache_path: Optional[str] = None,
+):
+    """
+    Load an Alpaca-format JSON dataset and tokenize it with the prompt templates.
+    Args:
+        data_files: JSON file(s) handed to `load_dataset("json", ...)`.
+        tokenizer: The tokenizer to use; must not be None unless a cache is found.
+        max_length: Maximum sequence length forwarded to the tokenization helper.
+        split: Split to select; None keeps everything `load_dataset` returned.
+        cache_path: If this is a non-empty directory, the dataset stored there is
+            returned instead. This function never writes to `cache_path`.
+    Returns:
+        The cached dataset (or its `split`), or the freshly tokenized dataset.
+    """
+    has_cache = (
+        cache_path is not None
+        and fusion_bench.utils.path.path_is_dir_and_not_empty(cache_path)
+    )
+    if has_cache:
+        cached = load_from_disk(cache_path)
+        return cached if split is None else cached[split]
+    assert (
+        tokenizer is not None
+    ), "Cached dataset not found. Need tokenizer to process the raw data."
+    raw = load_dataset("json", data_files=data_files)
+    selected = raw if split is None else raw[split]
+    return _tokenize_alpaca_dataset_with_template(
+        selected, tokenizer, max_length=max_length
+    )

fusion_bench/dataset/llama/collate.py ADDED Viewed

@@ -0,0 +1,120 @@
+from typing import Dict, List, Optional
+import torch
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+def padded_collate_sft(
+    batch: List[Dict[str, List[int]]],
+    pad_token_id: int = 0,
+    input_ids_key: str = "input_ids",
+    attention_mask_key: Optional[str] = "attention_mask",
+    labels_key: Optional[str] = "labels",
+    ignore_idx: int = -100,
+) -> Dict[str, torch.Tensor]:
+    """
+    Pad (right) a batch of sequences to the longest sequence length in the batch, and
+    convert integer lists to tensors. The items of `batch` are not modified.
+    Args:
+        batch (List[Dict[str, List[int]]]): A list of dictionaries containing input, label pairs.
+        pad_token_id (int): Padding index for input ids. Defaults to 0.
+        input_ids_key (str): Key of the token ids in each item and in the result.
+        attention_mask_key (Optional[str]): Key of the attention mask. The mask is only
+            collated (padded with 0) when this is not None and the first item has it.
+        labels_key (Optional[str]): Key of the labels. Items without this key use a copy
+            of their input ids as labels.
+        ignore_idx (int): Padding index for labels. Defaults to -100.
+    Returns:
+        Dict[str, torch.Tensor]: Collated input and label tensors. Any other key found
+            in the first item is passed through as a plain list with one entry per item.
+    """
+    input_ids = pad_sequence(
+        [torch.tensor(x[input_ids_key]) for x in batch],
+        batch_first=True,
+        padding_value=pad_token_id,
+    )
+    if attention_mask_key is not None and attention_mask_key in batch[0]:
+        attention_mask = pad_sequence(
+            [torch.tensor(x[attention_mask_key]) for x in batch],
+            batch_first=True,
+            padding_value=0,
+        )
+    else:
+        attention_mask = None
+    # Fall back to the input ids when an item carries no labels, without writing
+    # the fallback into the caller's dictionaries.
+    labels = pad_sequence(
+        [
+            torch.tensor(x[labels_key] if labels_key in x else x[input_ids_key])
+            for x in batch
+        ],
+        batch_first=True,
+        padding_value=ignore_idx,
+    )
+    collated_batch = {input_ids_key: input_ids}
+    if attention_mask is not None:
+        collated_batch[attention_mask_key] = attention_mask
+    collated_batch[labels_key] = labels
+    handled_keys = (input_ids_key, attention_mask_key, labels_key)
+    for key in batch[0]:
+        if key not in handled_keys:
+            collated_batch[key] = [x[key] for x in batch]
+    return collated_batch
+def bradley_terry_rm_collate(
+    batch: List[Dict[str, List[int]]],
+    pad_token_id: int = 0,
+    padding_side="right",
+):
+    """
+    Collate chosen/rejected pairs for Bradley-Terry reward modeling.
+    Args:
+        batch (List[Dict[str, List[int]]]): Items holding 'chosen_input_ids',
+            'chosen_attention_mask', 'rejected_input_ids' and 'rejected_attention_mask'.
+        pad_token_id (int): Padding index for input ids. Defaults to 0.
+        padding_side: Side on which sequences are padded; forwarded to `pad_sequence`.
+    Returns:
+        Dict[str, torch.Tensor]: 'input_ids' and 'attention_mask' with twice as many rows
+            as `batch`: all chosen sequences first, then all rejected ones. Remaining keys
+            of the first item are passed through as lists with one entry per item.
+    """
+    # Stack the two halves: every chosen sequence, then every rejected sequence.
+    id_rows, mask_rows = [], []
+    for side in ("chosen", "rejected"):
+        for sample in batch:
+            id_rows.append(torch.tensor(sample[f"{side}_input_ids"]))
+            mask_rows.append(torch.tensor(sample[f"{side}_attention_mask"]))
+    collated = {
+        "input_ids": pad_sequence(
+            id_rows,
+            batch_first=True,
+            padding_value=pad_token_id,
+            padding_side=padding_side,
+        ),
+        "attention_mask": pad_sequence(
+            mask_rows,
+            batch_first=True,
+            padding_value=0,
+            padding_side=padding_side,
+        ),
+    }
+    pair_keys = {
+        "chosen_input_ids",
+        "chosen_attention_mask",
+        "rejected_input_ids",
+        "rejected_attention_mask",
+    }
+    for name in batch[0]:
+        if name not in pair_keys:
+            collated[name] = [sample[name] for sample in batch]
+    return collated

fusion_bench/dataset/llama/metamathqa.py ADDED Viewed

@@ -0,0 +1,50 @@
+import os
+from typing import TYPE_CHECKING, Optional
+from datasets import Dataset, load_dataset, load_from_disk
+from lightning.fabric.utilities import rank_zero_only
+from tqdm.auto import tqdm
+from fusion_bench.utils import timeit_context
+from .alpaca import convert_alpaca_to_conversation
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+def load_tokenized_metamathqa(
+    tokenizer: "PreTrainedTokenizer",
+    path: str = "meta-math/MetaMathQA",
+    split: str = "train",
+    cache_path: Optional[str] = None,
+):
+    """
+    Load the MetaMathQA dataset and tokenize it with the tokenizer's chat template.
+    Args:
+        tokenizer (PreTrainedTokenizer): The tokenizer whose `apply_chat_template` is used.
+        path (str, optional): Dataset path passed to `load_dataset`. Defaults to "meta-math/MetaMathQA".
+        split (str, optional): The dataset split to load. Defaults to "train".
+        cache_path (Optional[str], optional): Where the tokenized dataset is cached. If the path exists,
+            the dataset is loaded from it. Otherwise the tokenized dataset is written there by the
+            rank-zero process only. Defaults to None.
+    Returns:
+        Dataset: The tokenized dataset. When the cache holds a `DatasetDict` that has no `split`
+            entry (or `split` is None), the whole cached object is returned.
+    """
+    # Local import: `DatasetDict` is not among the module-level imports.
+    from datasets import DatasetDict
+    if cache_path is not None and os.path.exists(cache_path):
+        dataset = load_from_disk(cache_path)
+        # Only a DatasetDict can be indexed by split name. Testing `split in dataset`
+        # on a plain Dataset would iterate every row just to answer "no".
+        if (
+            split is not None
+            and isinstance(dataset, DatasetDict)
+            and split in dataset
+        ):
+            return dataset[split]
+        return dataset
+    dataset = load_dataset(path, split=split)
+    # Re-shape each example into Alpaca format (in memory only; nothing is written
+    # to disk here) so the shared Alpaca-to-conversation converter can be reused.
+    alpaca_dataset = [
+        {
+            "instruction": example["query"],
+            "input": "",
+            "output": example["response"],
+        }
+        for example in tqdm(dataset, disable=not rank_zero_only.rank == 0)
+    ]
+    conversations = convert_alpaca_to_conversation(alpaca_dataset)
+    with timeit_context("Tokenizing dataset"):
+        tokenized_dataset = tokenizer.apply_chat_template(
+            conversations, return_dict=True
+        )
+    tokenized_dataset = Dataset.from_dict(tokenized_dataset)
+    # Write the cache from a single process so ranks do not race on the same directory.
+    if cache_path is not None and rank_zero_only.rank == 0:
+        tokenized_dataset.save_to_disk(cache_path)
+    return tokenized_dataset

fusion_bench/dataset/llama/openai.py ADDED Viewed

@@ -0,0 +1,160 @@
+import logging
+from typing import Dict, List
+from datasets import Dataset
+from transformers import PreTrainedTokenizer
+log = logging.getLogger(__name__)
+def tokenize_messages_dataset(
+    dataset: Dataset,
+    tokenizer: PreTrainedTokenizer,
+    max_length: int = 2048,
+    padding: bool = True,
+    system_template: str = "### System: {message}\n",
+    user_template: str = "## User: {message}\n",
+    assistant_template: str = "## Assistant: {message}\n",
+) -> Dataset:
+    """
+    Tokenize a dataset of chat messages, training only on the flagged assistant reply.
+    Each sample must have a "messages" list whose entries look like
+    {"role": "system" | "user" | "assistant", "content": "...", "calculate_loss": 0 | 1}.
+    Other sample fields (the original example showed "create_info", "feature_info"
+    and "source_file") are dropped from the output.
+    Messages are rendered with the role templates and concatenated into the prompt.
+    An assistant message with a truthy "calculate_loss" is not added to the prompt;
+    it becomes the response (if several are flagged, the last one wins). Messages
+    with any other role are ignored.
+    If the tokenizer has no pad token, its EOS token is assigned as pad token
+    (this modifies the tokenizer that was passed in).
+    Args:
+        dataset: Input dataset with messages format
+        tokenizer: The tokenizer to use
+        max_length: Maximum sequence length
+        padding: Whether to right-pad every sample up to `max_length`
+        system_template: Template for system messages
+        user_template: Template for user messages
+        assistant_template: Template for assistant messages
+    Returns:
+        Tokenized dataset with 'input_ids', 'attention_mask' and 'labels' columns
+    """
+    role_templates = {
+        "system": system_template,
+        "user": user_template,
+        "assistant": assistant_template,
+    }
+    def build_prompt(messages: List[Dict[str, str]]) -> tuple[str, str]:
+        """
+        Split a conversation into (prompt text, response text to compute the loss on).
+        """
+        rendered = []
+        target = ""
+        for msg in messages:
+            speaker = msg["role"]
+            text = msg["content"].strip()
+            wants_loss = msg.get("calculate_loss", 0)
+            if speaker == "assistant" and wants_loss:
+                # Supervised reply: kept out of the prompt.
+                target = text
+                continue
+            template = role_templates.get(speaker)
+            if template is not None:
+                rendered.append(template.format(message=text))
+        return "".join(rendered), target
+    def prepare_sample(sample: dict) -> dict:
+        prompt_text, target_text = build_prompt(sample["messages"])
+        prompt_ids = tokenizer.encode(prompt_text, add_special_tokens=False)
+        target_ids = tokenizer.encode(target_text, add_special_tokens=False)
+        tail = target_ids + [tokenizer.eos_token_id]
+        # Slicing to max_length leaves shorter sequences untouched.
+        input_ids = (prompt_ids + tail)[:max_length]
+        # -100 hides the prompt from the loss; response and EOS keep their ids.
+        labels = ([-100] * len(prompt_ids) + tail)[:max_length]
+        attention_mask = [1] * len(input_ids)
+        if padding:
+            deficit = max_length - len(input_ids)
+            if deficit > 0:
+                input_ids += [tokenizer.pad_token_id] * deficit
+                attention_mask += [0] * deficit
+                labels += [-100] * deficit
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+    if tokenizer.pad_token is None:
+        log.warning("Tokenizer does not have a `pad_token`. Set it the `eos_token`.")
+        tokenizer.pad_token = tokenizer.eos_token
+    # Tokenize sample by sample, dropping all original columns.
+    return dataset.map(
+        prepare_sample, remove_columns=dataset.column_names, desc="Tokenizing dataset"
+    )

fusion_bench/dataset/llama/preference_700k.py ADDED Viewed

@@ -0,0 +1,70 @@
+import logging
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Optional
+from datasets import Dataset, load_dataset, load_from_disk
+from lightning.fabric.utilities import rank_zero_only
+from tqdm.auto import tqdm
+from fusion_bench.utils import timeit_context
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+log = logging.getLogger(__name__)
+def load_tokenized_preference_700k_for_rlhf(
+    tokenizer: "PreTrainedTokenizer",
+    path: str = "hendrydong/preference_700K",
+    split: str = "train",
+    num_proc: int = 8,
+    cache_path: Optional[str] = None,
+):
+    """
+    Load and tokenize the Preference 700k dataset for a Bradley-Terry ranking model.
+    If `cache_path` exists, whatever is stored there is returned as-is (`split` is
+    not applied). Otherwise the dataset is tokenized with `num_proc` workers and
+    written to `cache_path` by the rank-zero process only.
+    Each sample gains the following fields:
+    - chosen_chat / rejected_chat: The chat-template text of each conversation.
+    - chosen_input_ids: The input token ids for the winner.
+    - chosen_attention_mask: The attention mask for the winner.
+    - rejected_input_ids: The input token ids for the loser.
+    - rejected_attention_mask: The attention mask for the loser.
+    A warning is logged when a tokenized conversation contains the PAD token id.
+    """
+    if cache_path is not None and os.path.exists(cache_path):
+        return load_from_disk(cache_path)
+    dataset = load_dataset(path, split=split)
+    sides = ("chosen", "rejected")
+    def tokenize(sample):
+        # Render both conversations to text first ...
+        for side in sides:
+            sample[f"{side}_chat"] = tokenizer.apply_chat_template(
+                sample[side], tokenize=False, add_generation_prompt=False
+            )
+        # ... then tokenize them; the same text is used in the warning below.
+        for side in sides:
+            chat_key = f"{side}_chat"
+            encoded = tokenizer(sample[chat_key], truncation=True)
+            sample[f"{side}_input_ids"] = encoded["input_ids"]
+            sample[f"{side}_attention_mask"] = encoded["attention_mask"]
+            # PAD ids inside the real text cannot be told apart from padding later on.
+            if tokenizer.pad_token_id in encoded["input_ids"]:
+                log.warning(f"Prompt contains PAD token: {sample[chat_key]}")
+        return sample
+    dataset = dataset.map(tokenize, num_proc=num_proc)
+    if cache_path is not None and rank_zero_only.rank == 0:
+        dataset.save_to_disk(cache_path)
+    return dataset