fusion-bench 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/method/__init__.py +4 -0
- fusion_bench/method/fw_merging/__init__.py +2 -0
- fusion_bench/method/fw_merging/fw_hard.py +448 -0
- fusion_bench/method/fw_merging/fw_soft.py +519 -0
- fusion_bench/method/fw_merging/utils.py +331 -0
- fusion_bench/method/moe_pruner/__init__.py +7 -0
- fusion_bench/method/moe_pruner/hooks/__init__.py +6 -0
- fusion_bench/method/moe_pruner/hooks/deepseek_v2.py +85 -0
- fusion_bench/method/moe_pruner/hooks/hook.py +23 -0
- fusion_bench/method/moe_pruner/hooks/mixtral.py +93 -0
- fusion_bench/method/moe_pruner/moe_pruner.py +304 -0
- fusion_bench/method/moe_pruner/utils/__init__.py +1 -0
- fusion_bench/method/moe_pruner/utils/data.py +154 -0
- fusion_bench/method/moe_pruner/utils/layerwrapper.py +61 -0
- fusion_bench/method/moe_pruner/utils/prune.py +313 -0
- fusion_bench/method/moe_pruner/utils/score.py +41 -0
- fusion_bench/method/pruning/__init__.py +1 -0
- fusion_bench/method/pruning/llama_sparsegpt_prune.py +223 -0
- fusion_bench/method/pruning/sparsegpt_utils/__init__.py +1 -0
- fusion_bench/method/pruning/sparsegpt_utils/sparsegpt.py +128 -0
- fusion_bench/method/pruning/wanda_utils/data.py +33 -14
- fusion_bench/method/randes/__init__.py +15 -0
- fusion_bench/method/randes/base_algorithm.py +1013 -0
- fusion_bench/method/randes/modelsoup.py +126 -0
- fusion_bench/method/randes/task_arithmetic.py +318 -0
- fusion_bench/method/sparselo/sparselo.py +20 -2
- fusion_bench/method/tall_mask/__init__.py +1 -0
- fusion_bench/method/tall_mask/task_arithmetic.py +133 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +73 -10
- fusion_bench/modelpool/lazy_state_dict_pool.py +15 -0
- fusion_bench/models/modeling_deepseek_v2/__init__.py +15 -0
- fusion_bench/models/modeling_deepseek_v2/configuration_deepseek.py +208 -0
- fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +1922 -0
- fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +38 -0
- fusion_bench/programs/fabric_fusion_program.py +5 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +8 -1
- fusion_bench/utils/__init__.py +1 -0
- fusion_bench/utils/data.py +1 -1
- fusion_bench/utils/lazy_state_dict.py +268 -0
- fusion_bench/utils/parameters.py +33 -0
- fusion_bench/utils/state_dict_arithmetic.py +74 -2
- fusion_bench/utils/type.py +1 -0
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/METADATA +10 -3
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/RECORD +86 -22
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/WHEEL +1 -1
- fusion_bench_config/dataset/image_classification/test/TALL10.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/TALL12.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/TALL16.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/TALL18.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/TALL10.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/TALL12.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/TALL16.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/TALL18.yaml +28 -0
- fusion_bench_config/method/fw_merging/fw_hard.yaml +11 -0
- fusion_bench_config/method/fw_merging/fw_soft.yaml +12 -0
- fusion_bench_config/method/moe_pruner/moe_pruner.yaml +15 -0
- fusion_bench_config/method/pruning/llama_sparsegpt_pruning.yaml +16 -0
- fusion_bench_config/method/randes/superposed_model_soup.yaml +18 -0
- fusion_bench_config/method/randes/superposed_task_arithmetic.yaml +20 -0
- fusion_bench_config/method/randes/superposed_task_arithmetic_lora.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +2 -1
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +1 -1
- fusion_bench_config/method/tall_mask/task_arithmetic.yaml +4 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL10.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL12.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL16.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL18.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL10.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL12.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL16.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL18.yaml +8 -0
- fusion_bench_config/modelpool/CausalLMPool/deepseek-v2-lite.yaml +15 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/Llama-3.1-8B-Instruct.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/Llama-3.1-8B.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/Llama-3.2-3B-Instruct.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/Llama-3.2-3B.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/gemma-2-2b-it.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/gemma-2-2b.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/gemma-2-9b-it.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mergebench/gemma-2-9b.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/mixtral-8x7b.yaml +14 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/roberta-base_glue.yaml +69 -0
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.14.dist-info → fusion_bench-0.2.16.dist-info}/top_level.txt +0 -0
fusion_bench/method/moe_pruner/moe_pruner.py (new file)
@@ -0,0 +1,304 @@
import logging
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, TypeVar, Union

import torch
from torch import Tensor, nn
from torch.nn import functional as F
from tqdm.auto import tqdm
from transformers import MixtralForCausalLM
from transformers.models.mixtral.modeling_mixtral import (
    MixtralDecoderLayer,
    MixtralSparseMoeBlock,
)

from fusion_bench import BaseAlgorithm, BaseModelPool
from fusion_bench.method.pruning.prune_utils import (
    PruningType,
    compute_sparsity,
    find_linear_layers,
    semistructured_magnitude_prune_,
    unstructured_magnitude_prune_,
)
from fusion_bench.mixins import LightningFabricMixin, SimpleProfilerMixin
from fusion_bench.modelpool import CausalLMPool
from fusion_bench.models.modeling_deepseek_v2 import (
    DeepseekV2DecoderLayer,
    DeepseekV2ForCausalLM,
    DeepseekV2MLP,
    DeepseekV2MoE,
    DeepseekV2MoEGate,
)
from fusion_bench.utils import timeit_context
from fusion_bench.utils.cache_utils import cache_to_disk
from fusion_bench.utils.devices import to_device

from .hooks.deepseek_v2 import (
    MoEPrunerHookFnForDeepseekV2Gate,
    MoEPrunerHookFnForDeepseekV2Linear,
)
from .hooks.hook import BaseHookFn
from .hooks.mixtral import (
    MoEPrunerHookFnForMixtralGate,
    MoEPrunerHookFnForMixtralLinear,
)
from .utils.data import get_loaders
from .utils.prune import prepare_calibration_input

MoEModel = TypeVar("MoEModel", bound=Union[MixtralForCausalLM, DeepseekV2ForCausalLM])

log = logging.getLogger(__name__)


class MoEPruner(BaseAlgorithm, SimpleProfilerMixin, LightningFabricMixin):

    def __init__(
        self,
        nsamples: int,
        seed: int,
        device: str,
        prune_type: PruningType,
        sparsity_ratio: float,
        n: int,
        m: int,
        max_seqlen: Optional[int] = None,
    ):
        self.nsamples = nsamples
        self.seed = seed
        self.device = device
        self.max_seqlen = max_seqlen
        self.prune_type = prune_type
        self.sparsity_ratio = sparsity_ratio
        self.n = n
        self.m = m
        super().__init__()

    def run(self, modelpool: CausalLMPool):
        # load pre-trained model or the first model in the pool
        with self.profile("load_model"):
            model: MoEModel = modelpool.load_pretrained_or_first_model()
            if self.max_seqlen is not None:
                model.seqlen = min(
                    model.config.max_position_embeddings,
                    self.max_seqlen,
                )
            tokenizer = modelpool.load_tokenizer()

        inps, outs, attention_mask, position_ids, position_embeddings = (
            self.prepare_calibration_data(model, tokenizer)
        )

        self.prune_using_calibration_data_(
            model,
            inps=inps,
            outs=outs,
            attention_mask=attention_mask,
            position_ids=position_ids,
            position_embeddings=position_embeddings,
        )

        return model

    def prepare_calibration_data(self, model: MoEModel, tokenizer):
        """
        Prepare calibration data for pruning with caching.

        Args:
            model (LlamaForCausalLM): Model to be pruned.
            tokenizer: Tokenizer for the model.

        Returns:
            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
        """

        @cache_to_disk(
            f"outputs/cache/{model.config.name_or_path.split('/')[-1]}/calibration_data.pkl"
        )
        def _prepare_calibration_data(model, tokenizer):
            return self._prepare_calibration_data(model, tokenizer)

        return _prepare_calibration_data(model, tokenizer)

    def _prepare_calibration_data(self, model, tokenizer):
        """
        Prepare calibration data for pruning.

        Args:
            model (LlamaForCausalLM): Model to be pruned.
            tokenizer: Tokenizer for the model.

        Returns:
            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
        """
        with timeit_context("loading calibration data"):
            dataloader, _ = get_loaders(
                "c4",
                nsamples=self.nsamples,
                seed=self.seed,
                seqlen=model.seqlen,
                tokenizer=tokenizer,
            )

        with torch.no_grad():
            # collect input to the first layer
            inps, outs, attention_mask, position_ids, position_embeddings = (
                prepare_calibration_input(model, dataloader, self.device)
            )
        return inps, outs, attention_mask, position_ids, position_embeddings

    def prune_using_calibration_data_(
        self,
        model: MoEModel,
        *,
        inps,
        outs,
        attention_mask,
        position_ids,
        position_embeddings,
    ):
        model.eval()
        layers = model.model.layers
        for layer_idx, layer in tqdm(
            enumerate(layers),
            "Pruning Layers",
            total=len(layers),
            dynamic_ncols=True,
        ):
            if (
                hasattr(model, "hf_device_map")
                and f"model.layers.{layer_idx}" in model.hf_device_map
            ):
                # handle the case for large models, when the device map has multiple GPUs;
                dev = model.hf_device_map[f"model.layers.{layer_idx}"]
                inps, outs, attention_mask, position_ids, position_embeddings = (
                    inps.to(dev),
                    outs.to(dev),
                    attention_mask.to(dev) if attention_mask is not None else None,
                    position_ids.to(dev) if position_ids is not None else None,
                    (
                        to_device(position_embeddings, dev)
                        if position_embeddings is not None
                        else None
                    ),
                )

            if isinstance(layer, MixtralDecoderLayer):
                linear_layers = find_linear_layers(layer.block_sparse_moe.experts)
            elif isinstance(layer, DeepseekV2DecoderLayer):
                if isinstance(layer.mlp, DeepseekV2MoE):
                    linear_layers = find_linear_layers(layer.mlp.experts)
                elif isinstance(layer.mlp, DeepseekV2MLP):
                    # compute the input to the next layer
                    with torch.no_grad():
                        for j in range(self.nsamples):
                            outs[j] = layer(
                                inps[j].unsqueeze(0),
                                attention_mask=attention_mask,
                                position_ids=position_ids,
                                position_embeddings=position_embeddings,
                            )[0]
                    inps, outs = outs, inps
                    continue
            else:
                raise ValueError(f"Unsupported layer type: {type(layer)}")

            linear_hooks: Dict[str, BaseHookFn] = {}
            handles: List[torch.utils.hooks.RemovableHandle] = []
            for name, linear in linear_layers.items():
                if isinstance(model, MixtralForCausalLM):
                    hook_fn = MoEPrunerHookFnForMixtralLinear(linear, name)
                elif isinstance(model, DeepseekV2ForCausalLM):
                    hook_fn = MoEPrunerHookFnForDeepseekV2Linear(linear, name)
                else:
                    raise ValueError(f"Unsupported model type: {type(model)}")
                linear_hooks[name] = hook_fn
                handles.append(linear.register_forward_hook(hook_fn))

            if isinstance(model, MixtralForCausalLM):
                gate_hook = MoEPrunerHookFnForMixtralGate(
                    layer.block_sparse_moe.gate,
                    linear_hooks,
                    top_k=layer.block_sparse_moe.top_k,
                    num_experts=layer.block_sparse_moe.num_experts,
                )
                handles.append(
                    layer.block_sparse_moe.gate.register_forward_hook(gate_hook)
                )
            elif isinstance(model, DeepseekV2ForCausalLM):
                gate_hook = MoEPrunerHookFnForDeepseekV2Gate(
                    layer.mlp.gate,
                    linear_hooks,
                    top_k=layer.mlp.gate.top_k,
                    num_experts=layer.mlp.config.n_routed_experts,
                )
                handles.append(layer.mlp.gate.register_forward_hook(gate_hook))
            else:
                raise ValueError(f"Unsupported model type: {type(model)}")

            with torch.no_grad():
                for j in range(self.nsamples):
                    outs[j] = layer(
                        inps[j].unsqueeze(0),
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        position_embeddings=position_embeddings,
                    )[0]

            # compute the importance scores and remove the hooks
            metrics = {}
            for name, hook in linear_hooks.items():
                metrics[name] = hook.compute().detach().cpu()
            for h in handles:
                h.remove()

            # prune the weights based on the importance scores
            if self.prune_type == PruningType.UNSTRUCTURED:
                for name, linear in linear_layers.items():
                    log.info(f"Pruning {name}")
                    unstructured_magnitude_prune_(
                        linear.weight.data,
                        metrics[name].to(linear.weight.device),
                        sparsity_ratio=self.sparsity_ratio,
                    )
                    self.check_sparsity(linear.weight)
            elif self.prune_type == PruningType.SEMISTRUCTURED:
                for name, linear in linear_layers.items():
                    log.info(f"Pruning {name}")
                    semistructured_magnitude_prune_(
                        linear.weight.data,
                        metrics[name].to(linear.weight.device),
                        n=self.n,
                        m=self.m,
                    )
                    self.check_sparsity(linear.weight)
            else:
                raise ValueError(f"Invalid pruning type: {self.prune_type}")

            # compute the input to the next layer
            with torch.no_grad():
                for j in range(self.nsamples):
                    outs[j] = layer(
                        inps[j].unsqueeze(0),
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        position_embeddings=position_embeddings,
                    )[0]
            inps, outs = outs, inps

    @torch.no_grad()
    def check_sparsity(self, weight: Tensor, tol: float = 0.01):
        """
        Check the sparsity of the weight tensor.

        Args:
            weight (Tensor): Weight tensor.
            tol (float): Tolerance for sparsity check.

        Raises:
            ValueError: If the pruning type is invalid.
        """
        if self.prune_type == PruningType.UNSTRUCTURED:
            assert (compute_sparsity(weight) - self.sparsity_ratio).abs() < tol
        elif self.prune_type == PruningType.SEMISTRUCTURED:
            assert (compute_sparsity(weight) - self.n / self.m).abs() < tol
        else:
            raise ValueError(f"Invalid pruning type: {self.prune_type}")
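For context, a minimal sketch of how this algorithm might be invoked programmatically, based only on the constructor and run() signatures shown above. It is not part of the diff: the hyperparameter values are illustrative placeholders, and it assumes that MoEPruner is re-exported from the moe_pruner subpackage __init__ and that the shipped modelpool YAML carries a hydra _target_ entry (as fusion_bench configs typically do).

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    from fusion_bench.method.moe_pruner import MoEPruner
    from fusion_bench.method.pruning.prune_utils import PruningType

    # Assumption: the YAML below describes a CausalLMPool via a hydra `_target_`.
    pool_cfg = OmegaConf.load(
        "fusion_bench_config/modelpool/CausalLMPool/mixtral-8x7b.yaml"
    )
    modelpool = instantiate(pool_cfg)

    pruner = MoEPruner(
        nsamples=128,                           # number of C4 calibration sequences
        seed=0,
        device="cuda",
        prune_type=PruningType.SEMISTRUCTURED,  # or PruningType.UNSTRUCTURED
        sparsity_ratio=0.5,                     # used by unstructured pruning
        n=2,                                    # N:M pattern for semi-structured pruning
        m=4,
        max_seqlen=2048,                        # cap model.seqlen during calibration
    )
    pruned_model = pruner.run(modelpool)        # prunes expert weights in place and returns the model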
fusion_bench/method/moe_pruner/utils/__init__.py (new file)
@@ -0,0 +1 @@
from .score import layer_load_balance_score
fusion_bench/method/moe_pruner/utils/data.py (new file)
@@ -0,0 +1,154 @@
# Code adapted from https://github.com/IST-DASLab/sparsegpt/blob/master/datautils.py

import random
from typing import List, Optional, Tuple, cast  # noqa: F401
import os
from datasets import load_dataset
from torch import Tensor
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer


# Wrapper for tokenized input IDs
class TokenizerWrapper:
    def __init__(self, input_ids):
        self.input_ids = input_ids


# Load and process wikitext2 dataset
def get_wikitext2(
    nsamples: int,
    seed: int,
    seqlen: int,
    tokenizer: PreTrainedTokenizer,
    data_path: str = "wikitext",
):
    """
    Load and preprocess the Wikitext-2 dataset for training and testing.

    Args:
        nsamples (int): Number of samples to generate from the training set.
        seed (int): Random seed for reproducibility.
        seqlen (int): Length of the sequence to be used for training.
        tokenizer (PreTrainedTokenizer): Tokenizer to encode the text data.
        data_path (str, optional): Path to the dataset. Defaults to "wikitext".
    """
    # Load train and test datasets
    traindata = load_dataset(data_path, "wikitext-2-raw-v1", split="train")
    testdata = load_dataset(data_path, "wikitext-2-raw-v1", split="test")

    # Encode datasets
    trainenc = tokenizer(" ".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    # Generate samples from training set
    random.seed(seed)
    trainloader: List[Tuple[Tensor, Tensor]] = []
    for _ in tqdm(range(nsamples), desc="Generating samples"):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp: Tensor = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


# Load and process c4 dataset
def get_c4(
    nsamples: int,
    seed: int,
    seqlen: int,
    tokenizer,
    data_path: str = "allenai/c4",
) -> Tuple[List[Tuple[Tensor, Tensor]], TokenizerWrapper]:
    """
    Load and process the c4 dataset.

    Args:
        nsamples (int): Number of samples to generate from the training set.
        seed (int): Seed for random number generation.
        seqlen (int): Length of each sequence.
        tokenizer: Tokenizer object for encoding the text.
        data_path (str, optional): Path to the c4 dataset. Defaults to "allenai/c4".

    Returns:
        tuple (Tuple[List[Tuple[Tensor, Tensor]], TokenizerWrapper]): Tuple containing the training samples and the validation dataset.
    """
    # Load train and validation datasets
    if os.path.exists(".cache/allenai--c4/en/c4-train.00000-of-01024.json.gz"):
        traindata = load_dataset(
            "json",
            data_files={
                "train": ".cache/allenai--c4/en/c4-train.00000-of-01024.json.gz"
            },
            split="train",
        )
    else:
        traindata = load_dataset(
            data_path,
            # "allenai--c4",  # https://github.com/huggingface/datasets/issues/6559
            data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
            split="train",
        )

    if os.path.exists(".cache/allenai--c4/en/c4-validation.00000-of-00008.json.gz"):
        valdata = load_dataset(
            "json",
            data_files={
                "validation": ".cache/allenai--c4/en/c4-validation.00000-of-00008.json.gz",
            },
            split="validation",
        )
    else:
        valdata = load_dataset(
            data_path,
            # "allenai--c4",
            data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
            split="validation",
        )

    # Generate samples from training set
    if seed is not None:
        random.seed(seed)

    trainloader = []
    for _ in tqdm(range(nsamples), desc="Generating samples"):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # Prepare validation dataset
    valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt")
    valenc = valenc.input_ids[:, : (256 * seqlen)]
    valenc = TokenizerWrapper(valenc)
    return trainloader, valenc


# Function to select the appropriate loader based on dataset name
def get_loaders(
    name: str, nsamples: int = 128, seed: int = 0, seqlen: int = 2048, tokenizer=None
):
    """
    Get the data loaders for the specified dataset.

    Args:
        name (str): The name of the dataset. Supported values are "wikitext2" and "c4".
        nsamples (int, optional): Number of samples to generate from the dataset. Defaults to 128.
        seed (int, optional): Random seed for reproducibility. Defaults to 0.
        seqlen (int, optional): Length of the sequence to be used for training. Defaults to 2048.
        tokenizer (optional): Tokenizer to encode the text data. Defaults to None.
    """
    if "wikitext2" in name:
        return get_wikitext2(nsamples, seed, seqlen, tokenizer)
    if "c4" in name:
        return get_c4(nsamples, seed, seqlen, tokenizer)
    raise ValueError(f"Unknown dataset: {name}")
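For context, a short usage sketch of get_loaders (not part of the diff); the "gpt2" tokenizer checkpoint is only an illustrative placeholder:

    # Minimal sketch, assuming any Hugging Face tokenizer checkpoint.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
    trainloader, valenc = get_loaders(
        "c4", nsamples=4, seed=0, seqlen=512, tokenizer=tokenizer
    )

    # Each calibration sample is an (input_ids, target) pair of shape [1, seqlen];
    # targets are masked with -100 everywhere except the final position.
    inp, tar = trainloader[0]
    print(inp.shape, (tar == -100).float().mean().item())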
fusion_bench/method/moe_pruner/utils/layerwrapper.py (new file)
@@ -0,0 +1,61 @@
import torch
import torch.nn as nn


# Define WrappedGPT class
class WrappedGPT:
    """
    This class wraps a GPT layer for specific operations.

    Attributes:
        layer (nn.Linear | nn.Module): The GPT layer to be wrapped.
        dev (torch.device): The device on which the layer's weights are stored.
        rows (int): The number of rows in the layer's weight matrix.
        columns (int): The number of columns in the layer's weight matrix.
        scaler_row (torch.Tensor): A tensor to store the scaler values for each column.
        nsamples (int): The number of samples processed.
        layer_id (int): The ID of the layer.
        layer_name (str): The name of the layer.
    """

    def __init__(self, layer: nn.Linear | nn.Module, layer_id=0, layer_name="none"):
        """
        Initialize the WrappedGPT class.

        Args:
            layer (nn.Linear | nn.Module): The GPT layer to be wrapped.
            layer_id (int, optional): The ID of the layer. Defaults to 0.
            layer_name (str, optional): The name of the layer. Defaults to "none".
        """
        self.layer = layer
        self.dev = self.layer.weight.device
        self.rows = layer.weight.data.shape[0]
        self.columns = layer.weight.data.shape[1]

        self.scaler_row = torch.zeros((self.columns), device=self.dev)
        self.nsamples = 0

        self.layer_id = layer_id
        self.layer_name = layer_name

    def add_batch(self, inp: torch.Tensor, out: torch.Tensor):
        """
        Add a batch of input and output tensors to the scaler_row.

        Args:
            inp (torch.Tensor): The input tensor.
            out (torch.Tensor): The output tensor.
        """
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()

        self.scaler_row *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp

        inp = inp.type(torch.float32)
        self.scaler_row += torch.norm(inp, p=2, dim=1) ** 2 / self.nsamples
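Usage note (not part of the diff): scaler_row keeps a running average of the squared L2 norm of activations per input channel, the statistic used by Wanda-style importance scores. A minimal sketch with a toy linear layer standing in for a real transformer sublayer; in the actual pruning loop the hook is attached during calibration forward passes:

    import torch
    import torch.nn as nn

    layer = nn.Linear(16, 8)
    wrapped = WrappedGPT(layer, layer_id=0, layer_name="toy.linear")

    def hook(module, inputs, output):
        # inputs[0] has shape [batch, seq, in_features]
        wrapped.add_batch(inputs[0].detach(), output.detach())

    handle = layer.register_forward_hook(hook)
    with torch.no_grad():
        layer(torch.randn(2, 4, 16))  # calibration pass
    handle.remove()

    # Wanda-style importance metric: |W| scaled by the activation norm per input column.
    metric = layer.weight.abs() * torch.sqrt(wrapped.scaler_row).unsqueeze(0)
    print(metric.shape)  # torch.Size([8, 16])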