fusion-bench 0.2.9__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +20 -0
- fusion_bench/__main__.py +4 -0
- fusion_bench/compat/__init__.py +0 -0
- fusion_bench/compat/method/__init__.py +109 -0
- fusion_bench/compat/method/base_algorithm.py +58 -0
- fusion_bench/compat/modelpool/AutoModelForSeq2SeqLM.py +34 -0
- fusion_bench/compat/modelpool/__init__.py +116 -0
- fusion_bench/compat/modelpool/base_pool.py +328 -0
- fusion_bench/compat/modelpool/huggingface_clip_vision.py +178 -0
- fusion_bench/compat/taskpool/__init__.py +95 -0
- fusion_bench/compat/taskpool/base_pool.py +111 -0
- fusion_bench/compat/taskpool/clip_image_classification.py +210 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +175 -0
- fusion_bench/constants/__init__.py +2 -0
- fusion_bench/constants/paths.py +18 -0
- fusion_bench/dataset/__init__.py +29 -0
- fusion_bench/dataset/arc_agi/__init__.py +6 -0
- fusion_bench/dataset/arc_agi/arc.py +308 -0
- fusion_bench/dataset/arc_agi/arc_agi.py +365 -0
- fusion_bench/dataset/arc_agi/augmenters.py +1036 -0
- fusion_bench/dataset/arc_agi/messagers.py +1355 -0
- fusion_bench/dataset/arc_agi/np_cache.py +168 -0
- fusion_bench/dataset/arc_agi/preprocess.py +298 -0
- fusion_bench/dataset/arc_agi/representers.py +1019 -0
- fusion_bench/dataset/clip_dataset.py +71 -0
- fusion_bench/dataset/fer2013.py +12 -0
- fusion_bench/dataset/gpt2_glue.py +300 -0
- fusion_bench/dataset/gsm8k.py +60 -0
- fusion_bench/dataset/image_dataset.py +55 -0
- fusion_bench/dataset/imdb.py +11 -0
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +232 -0
- fusion_bench/dataset/llama/collate.py +120 -0
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/openai.py +160 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/sharegpt.py +141 -0
- fusion_bench/dataset/llama/squad.py +125 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/dataset/llama/wikitext.py +89 -0
- fusion_bench/dataset/nyuv2.py +119 -0
- fusion_bench/method/__init__.py +177 -0
- fusion_bench/method/ada_svd/__init__.py +2 -0
- fusion_bench/method/ada_svd/clip_vision.py +319 -0
- fusion_bench/method/adamerging/__init__.py +6 -0
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +187 -0
- fusion_bench/method/adamerging/entropy_loss.py +25 -0
- fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +332 -0
- fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +351 -0
- fusion_bench/method/adamerging/layer_wise_adamerging.py +252 -0
- fusion_bench/method/adamerging/llama_adamerging.py +335 -0
- fusion_bench/method/adamerging/min_norm_solvers.py +227 -0
- fusion_bench/method/adamerging/task_wise_adamerging.py +174 -0
- fusion_bench/method/adamerging/utils.py +15 -0
- fusion_bench/method/analysis/__init__.py +2 -0
- fusion_bench/method/analysis/task_vector_cos_similarity.py +172 -0
- fusion_bench/method/analysis/task_vector_violin_plot.py +205 -0
- fusion_bench/method/base_algorithm.py +44 -0
- fusion_bench/method/classification/__init__.py +3 -0
- fusion_bench/method/classification/clip_finetune.py +444 -0
- fusion_bench/method/classification/continual_clip_finetune.py +297 -0
- fusion_bench/method/concrete_subspace/__init__.py +6 -0
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +595 -0
- fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py +263 -0
- fusion_bench/method/dare/__init__.py +4 -0
- fusion_bench/method/dare/simple_average.py +31 -0
- fusion_bench/method/dare/task_arithmetic.py +82 -0
- fusion_bench/method/dare/ties_merging.py +100 -0
- fusion_bench/method/dare/utils.py +87 -0
- fusion_bench/method/dawe/__init__.py +2 -0
- fusion_bench/method/dawe/dawe_for_clip.py +274 -0
- fusion_bench/method/dawe/warppers/__init__.py +13 -0
- fusion_bench/method/dawe/warppers/dawe_model.py +256 -0
- fusion_bench/method/depth_upscaling/__init__.py +3 -0
- fusion_bench/method/depth_upscaling/depth_upscaling.py +89 -0
- fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +57 -0
- fusion_bench/method/dummy.py +35 -0
- fusion_bench/method/ensemble.py +98 -0
- fusion_bench/method/fisher_merging/__init__.py +4 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +191 -0
- fusion_bench/method/fisher_merging/fisher_merging.py +484 -0
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +193 -0
- fusion_bench/method/linear/__init__.py +6 -0
- fusion_bench/method/linear/expo.py +118 -0
- fusion_bench/method/linear/linear_interpolation.py +60 -0
- fusion_bench/method/linear/llama_expo.py +229 -0
- fusion_bench/method/linear/simple_average_for_llama.py +54 -0
- fusion_bench/method/linear/task_arithmetic_for_llama.py +57 -0
- fusion_bench/method/lm_finetune/__init__.py +3 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/causal_lm_pretrain.py +7 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +375 -0
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +370 -0
- fusion_bench/method/mixture_of_experts/__init__.py +7 -0
- fusion_bench/method/mixture_of_experts/mixtral_merging.py +112 -0
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +329 -0
- fusion_bench/method/model_recombination.py +121 -0
- fusion_bench/method/opcm/__init__.py +4 -0
- fusion_bench/method/opcm/opcm.py +277 -0
- fusion_bench/method/opcm/task_arithmetic.py +115 -0
- fusion_bench/method/opcm/ties_merging.py +156 -0
- fusion_bench/method/opcm/utils.py +73 -0
- fusion_bench/method/opcm/weight_average.py +120 -0
- fusion_bench/method/pruning/__init__.py +5 -0
- fusion_bench/method/pruning/llama_magnitude_prune.py +202 -0
- fusion_bench/method/pruning/llama_random_prune.py +143 -0
- fusion_bench/method/pruning/llama_wanda_prune.py +359 -0
- fusion_bench/method/pruning/magnitude_diff_pruning.py +180 -0
- fusion_bench/method/pruning/prune_utils.py +165 -0
- fusion_bench/method/pruning/wanda_utils/__init__.py +7 -0
- fusion_bench/method/pruning/wanda_utils/ablate.py +188 -0
- fusion_bench/method/pruning/wanda_utils/data.py +135 -0
- fusion_bench/method/pruning/wanda_utils/eval.py +245 -0
- fusion_bench/method/pruning/wanda_utils/layerwrapper.py +61 -0
- fusion_bench/method/pruning/wanda_utils/prune.py +581 -0
- fusion_bench/method/pruning/wanda_utils/prune_opt.py +539 -0
- fusion_bench/method/pruning/wanda_utils/sparsegpt.py +165 -0
- fusion_bench/method/pwe_moe/__init__.py +5 -0
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +315 -0
- fusion_bench/method/pwe_moe/module.py +316 -0
- fusion_bench/method/pwe_moe/phn/__init__.py +2 -0
- fusion_bench/method/pwe_moe/phn/solvers.py +195 -0
- fusion_bench/method/pwe_moe/utils.py +43 -0
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/regmean/__init__.py +4 -0
- fusion_bench/method/regmean/clip_regmean.py +131 -0
- fusion_bench/method/regmean/gpt2_regmean.py +147 -0
- fusion_bench/method/regmean/regmean.py +375 -0
- fusion_bench/method/simple_average.py +112 -0
- fusion_bench/method/slerp/__init__.py +2 -0
- fusion_bench/method/slerp/slerp.py +101 -0
- fusion_bench/method/slerp/slerp_utils.py +107 -0
- fusion_bench/method/smile_upscaling/__init__.py +3 -0
- fusion_bench/method/smile_upscaling/singular_projection_merging.py +198 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +331 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +573 -0
- fusion_bench/method/sparse_we_moe/__init__.py +2 -0
- fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py +248 -0
- fusion_bench/method/sparse_we_moe/sparse_we_moe.py +301 -0
- fusion_bench/method/sparselo/__init__.py +2 -0
- fusion_bench/method/sparselo/sparselo.py +955 -0
- fusion_bench/method/surgery/__init__.py +1 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/method/tall_mask/__init__.py +0 -0
- fusion_bench/method/tall_mask/utils.py +234 -0
- fusion_bench/method/task_arithmetic/__init__.py +2 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +151 -0
- fusion_bench/method/task_singular_vector/TSVC.py +16 -0
- fusion_bench/method/task_singular_vector/TSVM.py +63 -0
- fusion_bench/method/task_singular_vector/__init__.py +9 -0
- fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
- fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +640 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
- fusion_bench/method/ties_merging/__init__.py +2 -0
- fusion_bench/method/ties_merging/ties_merging.py +117 -0
- fusion_bench/method/ties_merging/ties_merging_utils.py +331 -0
- fusion_bench/method/trust_region/__init__.py +2 -0
- fusion_bench/method/trust_region/clip_task_arithmetic.py +205 -0
- fusion_bench/method/trust_region/utils.py +58 -0
- fusion_bench/method/we_moe/__init__.py +2 -0
- fusion_bench/method/we_moe/clip_we_moe.py +161 -0
- fusion_bench/method/we_moe/we_moe.py +247 -0
- fusion_bench/method/weighted_average/__init__.py +3 -0
- fusion_bench/method/weighted_average/llama.py +113 -0
- fusion_bench/method/weighted_average/weighted_average.py +102 -0
- fusion_bench/metrics/__init__.py +0 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +22 -0
- fusion_bench/metrics/nyuv2/__init__.py +11 -0
- fusion_bench/metrics/nyuv2/depth.py +45 -0
- fusion_bench/metrics/nyuv2/loss.py +31 -0
- fusion_bench/metrics/nyuv2/noise.py +16 -0
- fusion_bench/metrics/nyuv2/normal.py +48 -0
- fusion_bench/metrics/nyuv2/segmentation.py +43 -0
- fusion_bench/metrics/text_to_image_generation/__init__.py +9 -0
- fusion_bench/metrics/text_to_image_generation/aesthetic_scorer.py +123 -0
- fusion_bench/metrics/text_to_image_generation/compressibility.py +49 -0
- fusion_bench/metrics/text_to_image_generation/pickscore_scorer.py +95 -0
- fusion_bench/mixins/__init__.py +28 -0
- fusion_bench/mixins/clip_classification.py +252 -0
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +174 -0
- fusion_bench/mixins/optim/__init__.py +0 -0
- fusion_bench/mixins/optim/adamw_with_warmup.py +42 -0
- fusion_bench/mixins/rich_live.py +21 -0
- fusion_bench/mixins/serialization.py +132 -0
- fusion_bench/mixins/simple_profiler.py +79 -0
- fusion_bench/modelpool/PeftModelForSeq2SeqLM.py +49 -0
- fusion_bench/modelpool/__init__.py +42 -0
- fusion_bench/modelpool/base_pool.py +268 -0
- fusion_bench/modelpool/causal_lm/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +139 -0
- fusion_bench/modelpool/clip_vision/__init__.py +1 -0
- fusion_bench/modelpool/clip_vision/modelpool.py +145 -0
- fusion_bench/modelpool/huggingface_automodel.py +20 -0
- fusion_bench/modelpool/huggingface_gpt2_classification.py +63 -0
- fusion_bench/modelpool/nyuv2_modelpool.py +40 -0
- fusion_bench/modelpool/seq2seq_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +65 -0
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/__init__.py +3 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +199 -0
- fusion_bench/models/linearized/__init__.py +0 -0
- fusion_bench/models/linearized/linearized_model_utils.py +91 -0
- fusion_bench/models/linearized/vision_model.py +122 -0
- fusion_bench/models/llama/__init__.py +16 -0
- fusion_bench/models/llama/model_utils/__init__.py +0 -0
- fusion_bench/models/llama/model_utils/embedding.py +87 -0
- fusion_bench/models/llama/model_utils/liger_kernel.py +86 -0
- fusion_bench/models/llama/model_utils/misc.py +112 -0
- fusion_bench/models/llama/model_utils/mod.py +52 -0
- fusion_bench/models/llama/model_utils/visual.py +241 -0
- fusion_bench/models/llama/patcher.py +78 -0
- fusion_bench/models/llama/tokenizer_loader.py +153 -0
- fusion_bench/models/masks/__init__.py +2 -0
- fusion_bench/models/masks/mask_model.py +160 -0
- fusion_bench/models/modeling_losparse_llama/__init__.py +4 -0
- fusion_bench/models/modeling_losparse_llama/configuration_losparse_llama.py +205 -0
- fusion_bench/models/modeling_losparse_llama/losparse_linear.py +67 -0
- fusion_bench/models/modeling_losparse_llama/modeling_losparse_llama.py +1825 -0
- fusion_bench/models/modeling_losparse_llama/register.py +8 -0
- fusion_bench/models/modeling_losparse_llama/utils.py +60 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +48 -0
- fusion_bench/models/modeling_smile_mistral/configuration_smile_mistral.py +21 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +1034 -0
- fusion_bench/models/modeling_smile_mistral/register.py +8 -0
- fusion_bench/models/nyuv2/__init__.py +0 -0
- fusion_bench/models/nyuv2/aspp.py +82 -0
- fusion_bench/models/nyuv2/lightning_module.py +176 -0
- fusion_bench/models/nyuv2/resnet.py +405 -0
- fusion_bench/models/nyuv2/resnet_dilated.py +99 -0
- fusion_bench/models/parameter_dict.py +75 -0
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/separate_io.py +105 -0
- fusion_bench/models/smile_moe/__init__.py +0 -0
- fusion_bench/models/smile_moe/linear.py +256 -0
- fusion_bench/models/sparse_we_moe.py +459 -0
- fusion_bench/models/surgery/__init__.py +1 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
- fusion_bench/models/utils.py +80 -0
- fusion_bench/models/we_moe.py +247 -0
- fusion_bench/models/wrappers/__init__.py +0 -0
- fusion_bench/models/wrappers/ensemble.py +183 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +336 -0
- fusion_bench/models/wrappers/task_wise_fusion.py +249 -0
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +118 -0
- fusion_bench/programs/__init__.py +20 -0
- fusion_bench/programs/base_program.py +9 -0
- fusion_bench/programs/fabric_fusion_program.py +299 -0
- fusion_bench/scripts/__init__.py +0 -0
- fusion_bench/scripts/cli.py +43 -0
- fusion_bench/scripts/clip/__init__.py +0 -0
- fusion_bench/scripts/clip/convert_checkpoint.py +39 -0
- fusion_bench/scripts/imgui.py +218 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +137 -0
- fusion_bench/scripts/webui.py +405 -0
- fusion_bench/taskpool/__init__.py +39 -0
- fusion_bench/taskpool/base_pool.py +35 -0
- fusion_bench/taskpool/clip_vision/__init__.py +4 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +120 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +392 -0
- fusion_bench/taskpool/dummy.py +58 -0
- fusion_bench/taskpool/gpt2_text_classification.py +149 -0
- fusion_bench/taskpool/llama/__init__.py +1 -0
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/llama/test_generation.py +185 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +65 -0
- fusion_bench/tasks/__init__.py +2 -0
- fusion_bench/tasks/base_task.py +18 -0
- fusion_bench/tasks/classification.py +75 -0
- fusion_bench/tasks/clip_classification/__init__.py +183 -0
- fusion_bench/tasks/clip_classification/cifar10.py +33 -0
- fusion_bench/tasks/clip_classification/cifar100.py +146 -0
- fusion_bench/tasks/clip_classification/clip_dataset.py +1 -0
- fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
- fusion_bench/tasks/clip_classification/dtd.py +60 -0
- fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
- fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
- fusion_bench/tasks/clip_classification/eurosat.py +18 -0
- fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
- fusion_bench/tasks/clip_classification/fer2013.py +18 -0
- fusion_bench/tasks/clip_classification/flower102.py +106 -0
- fusion_bench/tasks/clip_classification/food101.py +105 -0
- fusion_bench/tasks/clip_classification/gtsrb.py +51 -0
- fusion_bench/tasks/clip_classification/imagenet.py +2103 -0
- fusion_bench/tasks/clip_classification/kmnist.py +17 -0
- fusion_bench/tasks/clip_classification/mnist.py +5 -0
- fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
- fusion_bench/tasks/clip_classification/oxford_iiit_pet.py +41 -0
- fusion_bench/tasks/clip_classification/pcam.py +5 -0
- fusion_bench/tasks/clip_classification/rendered_sst2.py +3 -0
- fusion_bench/tasks/clip_classification/resisc45.py +68 -0
- fusion_bench/tasks/clip_classification/stanford_cars.py +209 -0
- fusion_bench/tasks/clip_classification/stl10.py +17 -0
- fusion_bench/tasks/clip_classification/sun397.py +404 -0
- fusion_bench/tasks/clip_classification/svhn.py +5 -0
- fusion_bench/tasks/clip_classification/tiny_imagenet.py +208 -0
- fusion_bench/tasks/flan_t5_text_generation/__init__.py +0 -0
- fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py +71 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py +132 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +64 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py +379 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py +52 -0
- fusion_bench/utils/__init__.py +14 -0
- fusion_bench/utils/auto.py +31 -0
- fusion_bench/utils/cache_utils.py +58 -0
- fusion_bench/utils/data.py +165 -0
- fusion_bench/utils/devices.py +231 -0
- fusion_bench/utils/dict.py +43 -0
- fusion_bench/utils/dtype.py +146 -0
- fusion_bench/utils/expr.py +90 -0
- fusion_bench/utils/fabric.py +17 -0
- fusion_bench/utils/functools.py +37 -0
- fusion_bench/utils/hydra_utils.py +28 -0
- fusion_bench/utils/instantiate.py +450 -0
- fusion_bench/utils/json.py +93 -0
- fusion_bench/utils/lazy_imports.py +74 -0
- fusion_bench/utils/misc.py +18 -0
- fusion_bench/utils/packages.py +84 -0
- fusion_bench/utils/parameters.py +323 -0
- fusion_bench/utils/path.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/color_data.py +1726 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/pylogger.py +55 -0
- fusion_bench/utils/rich_utils.py +201 -0
- fusion_bench/utils/set.py +8 -0
- fusion_bench/utils/state_dict_arithmetic.py +297 -0
- fusion_bench/utils/strenum/__init__.py +326 -0
- fusion_bench/utils/strenum/_name_mangler.py +127 -0
- fusion_bench/utils/strenum/_version.py +556 -0
- fusion_bench/utils/tensorboard.py +51 -0
- fusion_bench/utils/timer.py +49 -0
- fusion_bench/utils/type.py +34 -0
- fusion_bench-0.2.9.dist-info/LICENSE +21 -0
- fusion_bench-0.2.9.dist-info/METADATA +258 -0
- fusion_bench-0.2.9.dist-info/RECORD +727 -0
- fusion_bench-0.2.9.dist-info/WHEEL +5 -0
- fusion_bench-0.2.9.dist-info/entry_points.txt +3 -0
- fusion_bench-0.2.9.dist-info/top_level.txt +1 -0
- fusion_bench_config/README.md +12 -0
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +23 -0
- fusion_bench_config/dataset/image_classification/README.md +6 -0
- fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
- fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/test/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/test/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/train/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/train/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/val/dtd.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/eurosat.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/gtsrb.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/mnist.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/resisc45.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/stanford-cars.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/sun397.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/svhn.yaml +12 -0
- fusion_bench_config/dataset/image_classification/val/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/dataset/question_answering/search_qa.yaml +6 -0
- fusion_bench_config/dataset/question_answering/test/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/train/MetaMathQA.yaml +4 -0
- fusion_bench_config/dataset/question_answering/train/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/val/search_qa.yaml +7 -0
- fusion_bench_config/dataset/summarization/test/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/train/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/val/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/xsum.yaml +3 -0
- fusion_bench_config/dataset/text_generation/test/gsm-hard.yaml +4 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/dataset/text_generation/train/CodeAlpaca-20k.yaml +4 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/fabric/auto.yaml +16 -0
- fusion_bench_config/fabric/llama_ddp.yaml +18 -0
- fusion_bench_config/fabric/llama_fsdp.yaml +16 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_fsdp.yaml +8 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +20 -0
- fusion_bench_config/hydra/default.yaml +8 -0
- fusion_bench_config/hydra/help/fusion_bench_help.yaml +47 -0
- fusion_bench_config/hydra/job_logging/rich_logging.yaml +20 -0
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/llama_magnitude_pruning.yaml +16 -0
- fusion_bench_config/llama_model_fusion.yaml +17 -0
- fusion_bench_config/method/ada_svd/clip_vision.yaml +9 -0
- fusion_bench_config/method/adamerging/clip.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +23 -0
- fusion_bench_config/method/adamerging/llama_sft.yaml +33 -0
- fusion_bench_config/method/adamerging.yaml +23 -0
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +6 -0
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +6 -0
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +28 -0
- fusion_bench_config/method/classification/clip_finetune.yaml +26 -0
- fusion_bench_config/method/clip_finetune.yaml +26 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_arithmetic.yaml +25 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/dare/simple_average.yaml +5 -0
- fusion_bench_config/method/dare/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/dare/ties_merging.yaml +15 -0
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +32 -0
- fusion_bench_config/method/depth_upscaling.yaml +5 -0
- fusion_bench_config/method/dummy.yaml +1 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +2 -0
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +6 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +13 -0
- fusion_bench_config/method/fisher_merging/fisher_merging.yaml +9 -0
- fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml +12 -0
- fusion_bench_config/method/linear/expo.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +3 -0
- fusion_bench_config/method/linear/llama_expo.yaml +19 -0
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +19 -0
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +5 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +4 -0
- fusion_bench_config/method/linear/weighted_average.yaml +6 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +12 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +47 -0
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +63 -0
- fusion_bench_config/method/mixtral_moe_merging.yaml +4 -0
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +7 -0
- fusion_bench_config/method/model_recombination.yaml +4 -0
- fusion_bench_config/method/opcm/opcm.yaml +12 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +12 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +18 -0
- fusion_bench_config/method/opcm/weight_average.yaml +10 -0
- fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml +14 -0
- fusion_bench_config/method/pruning/llama_random_pruning.yaml +9 -0
- fusion_bench_config/method/pruning/llama_wanda_pruning.yaml +16 -0
- fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml +5 -0
- fusion_bench_config/method/pwe_moe_ls_for_clip.yaml +22 -0
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +11 -0
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +12 -0
- fusion_bench_config/method/regmean/regmean.yaml +4 -0
- fusion_bench_config/method/simple_average.yaml +1 -0
- fusion_bench_config/method/slerp/slerp.yaml +6 -0
- fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml +8 -0
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +10 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +14 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +19 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/method/task_arithmetic.yaml +2 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
- fusion_bench_config/method/ties_merging.yaml +8 -0
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +7 -0
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +39 -0
- fusion_bench_config/method/wemoe/weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/model/clip-vit/README.md +38 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +11 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
- fusion_bench_config/model/clip-vit/generate_vit_model_config.sh +23 -0
- fusion_bench_config/model/flan-t5/flan-t5-base.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/generate_flan-t5.sh +38 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +12 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +53 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +19 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +14 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +3 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp1.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp2.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +13 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_mtl.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_clean.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +17 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/_template.yaml +8 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +13 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +41 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +68 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +7 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +45 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/modelpool/automodelpool.yaml +12 -0
- fusion_bench_config/modelpool/gpt-2_glue.yaml +64 -0
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +14 -0
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +6 -0
- fusion_bench_config/modelpool/nyuv2_modelpool.yaml +26 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v1.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v2.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v3.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v4.yaml +13 -0
- fusion_bench_config/nyuv2_config.yaml +17 -0
- fusion_bench_config/nyuv2_mtl_train.yaml +32 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml +11 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_L14.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_val.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_with_control_task.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_clean.yaml +24 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +22 -0
- fusion_bench_config/taskpool/dummy.yaml +2 -0
- fusion_bench_config/taskpool/flan-t5_glue_text_generation.yaml +44 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +39 -0
- fusion_bench_config/taskpool/nyuv2_taskpool.yaml +9 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
fusion_bench/method/pruning/llama_wanda_prune.py
@@ -0,0 +1,359 @@
+import logging
+from abc import abstractmethod
+from typing import Dict, List, Literal, Optional, Tuple, cast  # noqa: F401
+
+import torch
+import torch.utils.hooks
+from torch import Tensor, nn
+from tqdm.auto import tqdm
+from transformers import LlamaForCausalLM
+
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.method.pruning.wanda_utils.data import get_loaders
+from fusion_bench.method.pruning.wanda_utils.prune import prepare_calibration_input
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.modelpool import CausalLMPool
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.cache_utils import cache_to_disk
+
+from .prune_utils import (
+    PruningType,
+    compute_sparsity,
+    find_linear_layers,
+    semistructured_magnitude_prune_,
+    unstructured_magnitude_prune_,
+)
+
+log = logging.getLogger(__name__)
+
+
+class BaseLoSparseHookFn:
+    """
+    Base class for low-sparsity hook functions.
+    """
+
+    def __init__(self, linear):
+        self.linear = linear
+
+    @abstractmethod
+    def compute(self) -> Tensor:
+        """
+        Compute the importance scores.
+        """
+        pass
+
+    @abstractmethod
+    def __call__(self, linear, inp: Tuple[Tensor], out: Tensor):
+        """
+        Hook function to be called during the forward pass.
+        """
+        pass
+
+
+class WandaHookFn(BaseLoSparseHookFn):
+    R"""
+    Here in this class, the `scalar_row` is the mean of the squared sum of the input to the linear layer along a specific input dimension.
+
+    $$\frac{\sum_{i=1}^{N L} X_{ij}^2}{N L}$$
+    """
+
+    def __init__(self, linear: nn.Linear):
+        super().__init__(linear)
+
+        self.scalar_row = torch.zeros(
+            (linear.weight.size(1),), device=linear.weight.device
+        )
+        self.nsamples = 0
+
+    def compute(self):
+        return torch.abs(self.linear.weight) * torch.sqrt(
+            self.scalar_row.reshape(1, -1)
+        )
+
+    def __call__(self, linear: nn.Linear, inps: Tuple[Tensor], out: Tensor):
+        assert len(inps) == 1
+        inp = inps[0]
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+
+        batch_size = inp.shape[0]
+        if len(inp.shape) == 3:
+            inp = inp.reshape((-1, inp.shape[-1]))
+        # (NxL, C) -> (C, NxL)
+        inp = inp.t()
+
+        self.scalar_row *= self.nsamples / (self.nsamples + batch_size)
+        self.nsamples += batch_size
+
+        inp = inp.type(torch.float32)
+        self.scalar_row += torch.norm(inp, p=2, dim=1) ** 2 / self.nsamples
+
+
+class WandaPruningForLlama(BaseAlgorithm, SimpleProfilerMixin):
+    """
+    Class for Wanda pruning for Llama models.
+    """
+
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "nsamples": "nsamples",
+        "seed": "seed",
+        "use_variant": "use_variant",
+        "prune_type": "prune_type",
+        "device": "device",
+        "dtype": "dtype",
+        "sparsity_ratio": "sparsity_ratio",
+        "n": "n",
+        "m": "m",
+    }
+
+    def __init__(
+        self,
+        *,
+        nsamples: int,
+        seed: int,
+        use_variant: bool,
+        prune_type: PruningType,
+        device: str,
+        dtype: str,
+        sparsity_ratio: float,
+        n: int,
+        m: int,
+        model_save_path: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Initialize the WandaPruningForLlama class.
+
+        Args:
+            nsamples (int): Number of samples for calibration.
+            seed (int): Random seed.
+            use_variant (bool): Whether to use a variant of the pruning method.
+            prune_type (PruningType): Type of pruning to perform.
+            device (str): Device to use for computation.
+            dtype (str): Data type to use for computation.
+            sparsity_ratio (float): Sparsity ratio for pruning.
+            n (int): Number of elements to keep in semi-structured pruning.
+            m (int): Number of elements in a group for semi-structured pruning.
+            model_save_path (Optional[str]): Path to save the pruned model.
+            **kwargs: Additional arguments.
+        """
+        super().__init__(**kwargs)
+        self.nsamples = nsamples
+        self.seed = seed
+        self.use_variant = use_variant
+        self.prune_type = prune_type
+        self.device = device
+        self.dtype = dtype
+        self.sparsity_ratio = sparsity_ratio
+        self.n = n
+        self.m = m
+        self.model_save_path = model_save_path
+
+    def run(self, modelpool: CausalLMPool):
+        """
+        Run the pruning algorithm on the model pool.
+
+        Args:
+            modelpool (CausalLMPool): Pool of causal language models.
+
+        Returns:
+            LlamaForCausalLM: Pruned model.
+        """
+
+        # load pre-trained model or the first model in the pool
+        with self.profile("load_model"):
+            model = modelpool.load_pretrained_or_first_model()
+            model.seqlen = model.config.max_position_embeddings
+            tokenizer = modelpool.load_tokenizer(use_fast=False)
+
+        if not isinstance(model, (LlamaForCausalLM,)):
+            log.warning(f"Model type {type(model)} may not be supported.")
+
+        inps, outs, attention_mask, position_ids = self.prepare_calibration_data(
+            model, tokenizer
+        )
+
+        self.prune_using_calibration_data_(
+            model,
+            inps=inps,
+            outs=outs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+        if self.model_save_path is not None:
+            with timeit_context(f"Saving pruned model to {self.model_save_path}"):
+                tokenizer.save_pretrained(self.model_save_path)
+                model.save_pretrained(self.model_save_path)
+        return model
+
+    def _prepare_calibration_data(self, model, tokenizer):
+        """
+        Prepare calibration data for pruning.
+
+        Args:
+            model (LlamaForCausalLM): Model to be pruned.
+            tokenizer: Tokenizer for the model.
+
+        Returns:
+            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
+        """
+        with timeit_context("loading calibration data"):
+            dataloader, _ = get_loaders(
+                "c4",
+                nsamples=self.nsamples,
+                seed=self.seed,
+                seqlen=model.seqlen,
+                tokenizer=tokenizer,
+            )
+
+        with torch.no_grad():
+            # collect input to the first layer
+            inps, outs, attention_mask, position_ids = prepare_calibration_input(
+                model, dataloader, self.device
+            )
+        return inps, outs, attention_mask, position_ids
+
+    def prepare_calibration_data(self, model: LlamaForCausalLM, tokenizer):
+        """
+        Prepare calibration data for pruning with caching.
+
+        Args:
+            model (LlamaForCausalLM): Model to be pruned.
+            tokenizer: Tokenizer for the model.
+
+        Returns:
+            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
+        """
+
+        @cache_to_disk(
+            f"outputs/cache/{model.config.name_or_path.split('/')[-1]}/calibration_data.pkl"
+        )
+        def _prepare_calibration_data(model, tokenizer):
+            return self._prepare_calibration_data(model, tokenizer)
+
+        return _prepare_calibration_data(model, tokenizer)
+
+    def prune_using_calibration_data_(
+        self,
+        model: LlamaForCausalLM,
+        *,
+        inps,
+        outs,
+        attention_mask,
+        position_ids,
+    ):
+        """
+        Prune the model using calibration data.
+
+        Args:
+            model (LlamaForCausalLM): Model to be pruned.
+            inps: Calibration inputs.
+            outs: Calibration outputs.
+            attention_mask: Attention mask for calibration data.
+            position_ids: Position IDs for calibration data.
+        """
+        layers = model.model.layers
+        for layer_idx, layer in tqdm(
+            enumerate(layers),
+            "Pruning Layers",
+            total=len(layers),
+            dynamic_ncols=True,
+        ):
+            if (
+                hasattr(model, "hf_device_map")
+                and f"model.layers.{layer_idx}" in model.hf_device_map
+            ):
+                # handle the case for llama-30B and llama-65B, when the device map has multiple GPUs;
+                dev = model.hf_device_map[f"model.layers.{layer_idx}"]
+                inps, outs, attention_mask, position_ids = (
+                    inps.to(dev),
+                    outs.to(dev),
+                    attention_mask.to(dev) if attention_mask is not None else None,
+                    position_ids.to(dev) if position_ids is not None else None,
+                )
+
+            # collect the importance scores
+            linear_layers = cast(
+                Dict[str, nn.Linear],
+                find_linear_layers(layer, layers=[nn.Linear]),
+            )
+
+            # register hooks to collect the importance scores
+            def get_hook_fn(linear: nn.Linear):
+                hook_fn = WandaHookFn(linear)
+                return hook_fn
+
+            hooks = {}
+            handles: List[torch.utils.hooks.RemovableHandle] = []
+            for name, linear in linear_layers.items():
+                hook_fn = get_hook_fn(linear)
+                hooks[name] = hook_fn
+                handles.append(linear.register_forward_hook(hook_fn))
+
+            with torch.no_grad():
+                for j in range(self.nsamples):
+                    outs[j] = layer(
+                        inps[j].unsqueeze(0),
+                        attention_mask=attention_mask,
+                        position_ids=position_ids,
+                    )[0]
+
+            # compute the importance scores and remove the hooks
+            metrics = {}
+            for name, hook in hooks.items():
+                metrics[name] = hook.compute()
+            for h in handles:
+                h.remove()
+
+            # prune the weights based on the importance scores
+            if self.prune_type == PruningType.UNSTRUCTURED:
+                for name, linear in linear_layers.items():
+                    log.info(f"Pruning {name}")
+                    unstructured_magnitude_prune_(
+                        linear.weight.data,
+                        metrics[name],
+                        sparsity_ratio=self.sparsity_ratio,
+                    )
+                    self.check_sparsity(linear.weight)
+            elif self.prune_type == PruningType.SEMISTRUCTURED:
+                for name, linear in linear_layers.items():
+                    log.info(f"Pruning {name}")
+                    semistructured_magnitude_prune_(
+                        linear.weight.data,
+                        metrics[name],
+                        n=self.n,
+                        m=self.m,
+                    )
+                    self.check_sparsity(linear.weight)
+            else:
+                raise ValueError(f"Invalid pruning type: {self.prune_type}")
+
+            # compute the input to the next layer
+            with torch.no_grad():
+                for j in range(self.nsamples):
|
|
335
|
+
outs[j] = layer(
|
|
336
|
+
inps[j].unsqueeze(0),
|
|
337
|
+
attention_mask=attention_mask,
|
|
338
|
+
position_ids=position_ids,
|
|
339
|
+
)[0]
|
|
340
|
+
inps, outs = outs, inps
|
|
341
|
+
|
|
342
|
+
@torch.no_grad()
|
|
343
|
+
def check_sparsity(self, weight: Tensor, tol: float = 0.01):
|
|
344
|
+
"""
|
|
345
|
+
Check the sparsity of the weight tensor.
|
|
346
|
+
|
|
347
|
+
Args:
|
|
348
|
+
weight (Tensor): Weight tensor.
|
|
349
|
+
tol (float): Tolerance for sparsity check.
|
|
350
|
+
|
|
351
|
+
Raises:
|
|
352
|
+
ValueError: If the pruning type is invalid.
|
|
353
|
+
"""
|
|
354
|
+
if self.prune_type == PruningType.UNSTRUCTURED:
|
|
355
|
+
assert (compute_sparsity(weight) - self.sparsity_ratio).abs() < tol
|
|
356
|
+
elif self.prune_type == PruningType.SEMISTRUCTURED:
|
|
357
|
+
assert (compute_sparsity(weight) - self.n / self.m).abs() < tol
|
|
358
|
+
else:
|
|
359
|
+
raise ValueError(f"Invalid pruning type: {self.prune_type}")
|
|
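The forward hooks registered above delegate score collection to `WandaHookFn`, which is defined elsewhere in the package and not shown in this hunk. For orientation, the Wanda metric scores each weight W_ij by |W_ij| times the L2 norm of input feature j over the calibration tokens. Below is a minimal, self-contained sketch of such a hook under that assumption; the class name `NormTrackingHook` and its internals are illustrative, not the package's actual implementation:

```python
import torch
from torch import nn


class NormTrackingHook:
    """Hypothetical stand-in for WandaHookFn: accumulates per-input-feature
    squared L2 norms over calibration batches (a sketch, not the real class)."""

    def __init__(self, linear: nn.Linear):
        self.linear = linear
        self.scaler_row = torch.zeros(linear.in_features, device=linear.weight.device)
        self.nsamples = 0

    def __call__(self, module, inputs, output):
        x = inputs[0].detach()              # (batch, seq, in_features)
        x = x.reshape(-1, x.shape[-1]).t()  # (in_features, tokens)
        tokens = x.shape[1]
        # running average of squared activation norms per input feature
        self.scaler_row *= self.nsamples / (self.nsamples + tokens)
        self.scaler_row += x.float().pow(2).sum(dim=1) / (self.nsamples + tokens)
        self.nsamples += tokens

    def compute(self) -> torch.Tensor:
        # Wanda importance: |W_ij| * sqrt(mean squared norm of input feature j)
        return self.linear.weight.abs() * self.scaler_row.sqrt().unsqueeze(0)


linear = nn.Linear(16, 4)
hook = NormTrackingHook(linear)
handle = linear.register_forward_hook(hook)
linear(torch.randn(2, 8, 16))  # calibration forward pass
scores = hook.compute()        # shape (4, 16); larger means more important
handle.remove()
```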
@@ -0,0 +1,180 @@
+import functools
+import logging
+import re
+from copy import deepcopy
+from typing import Dict, List, Literal, Optional, Union  # noqa: F401
+
+import torch
+from torch import Tensor, nn
+from tqdm.auto import tqdm
+
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.modelpool import BaseModelPool
+
+from .prune_utils import unstructured_magnitude_prune_
+
+log = logging.getLogger(__name__)
+
+
+def _is_name_matched(name: str, extract_names: List[str]):
+    """
+    Check if the parameter name matches any of the provided regular expressions.
+
+    Args:
+        name (str): The name of the parameter.
+        extract_names (List[str]): List of regular expressions to match the parameter names.
+
+    Returns:
+        bool: True if the name matches any of the regular expressions, False otherwise.
+    """
+    for extract_name in extract_names:
+        # extract_name is a regular expression
+        if re.match(extract_name, name):
+            return True
+    return False
+
+
+class MagnitudeDiffPruningAlgorithm(
+    BaseAlgorithm,
+    SimpleProfilerMixin,
+):
+    """
+    Implements magnitude-based pruning on the difference between pretrained and fine-tuned model parameters.
+
+    This class supports pruning the difference between the pretrained and fine-tuned model parameters
+    based on their magnitude. It allows specifying the ratio of weights to prune and the names of
+    parameters to extract for pruning.
+
+    Methods:
+        run(modelpool: BaseModelPool) -> nn.Module:
+            Executes the pruning process on the model pool and returns the pruned model.
+        magnitude_prune(pretrained_model: nn.Module, finetuned_model: nn.Module, in_place: bool = True) -> nn.Module:
+            Prunes the difference between the pretrained and fine-tuned model parameters.
+    """
+
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "prune_ratio": "prune_ratio",
+        "extract_names": "extract_names",
+    }
+
+    def __init__(
+        self,
+        prune_ratio: float,
+        rescale: Optional[Union[bool, float]] = None,
+        extract_names: Optional[List[str]] = None,
+        prune_type: Literal["minor", "major"] = "minor",
+        **kwargs,
+    ):
+        """
+        Initialize the MagnitudeDiffPruningAlgorithm with the given configuration.
+
+        Args:
+            prune_ratio (float): The ratio of weights to prune.
+            rescale (Union[bool, float], optional): Rescaling factor applied to the pruned difference.
+                If True, the difference is rescaled by 1 / prune_ratio. Defaults to None.
+            extract_names (List[str], optional): List of regular expressions to match the parameter names for pruning. Defaults to None.
+            prune_type (Literal["minor", "major"]): Whether to prune the smallest-magnitude ("minor")
+                or largest-magnitude ("major") entries of the difference. Defaults to "minor".
+            **kwargs: Additional keyword arguments.
+        """
+        self.prune_ratio = prune_ratio
+        self.rescale = rescale
+        self.extract_names = extract_names
+        self.prune_type = prune_type
+        super().__init__(**kwargs)
+
+    @torch.no_grad()
+    def run(self, modelpool: BaseModelPool):
+        """
+        Execute the pruning process on the model pool.
+
+        This method loads the pretrained and fine-tuned models from the model pool,
+        prunes the difference between their parameters, and returns the pruned model.
+
+        Args:
+            modelpool (BaseModelPool): The model pool containing the models to prune.
+
+        Returns:
+            nn.Module: The pruned model.
+        """
+        if not isinstance(modelpool, BaseModelPool):
+            modelpool = BaseModelPool(modelpool)
+
+        assert (
+            len(modelpool.model_names) == 1
+        ), "Only one fine-tuned model is allowed in the model pool."
+        with self.profile("load pretrained model"):
+            pretrained_model = modelpool.load_model("_pretrained_")
+        with self.profile("load fine-tuned model"):
+            finetuned_model = modelpool.load_model(modelpool.model_names[0])
+
+        with self.profile("prune model"):
+            model = self.magnitude_prune(pretrained_model, finetuned_model)
+
+        self.print_profile_summary()
+        return model
+
+    @torch.no_grad()
+    def magnitude_prune(
+        self,
+        pretrained_model: nn.Module,
+        finetuned_model: nn.Module,
+        in_place: bool = True,
+    ):
+        """
+        Prune the difference between the pretrained and fine-tuned model parameters.
+
+        This method calculates the difference between the pretrained and fine-tuned model parameters,
+        prunes the difference based on its magnitude, and updates the pretrained model parameters
+        with the pruned difference.
+
+        Args:
+            pretrained_model (nn.Module): The pretrained model.
+            finetuned_model (nn.Module): The fine-tuned model.
+            in_place (bool, optional): Whether to perform the pruning in place. Defaults to True.
+
+        Returns:
+            nn.Module: The pruned model.
+        """
+        if in_place:
+            model = pretrained_model
+        else:
+            model = deepcopy(pretrained_model)
+
+        if self.extract_names is not None:
+            # regular expressions for the names of the parameters
+            extract_names: List[str] = self.extract_names
+        else:
+            # extract the weight matrix of each linear layer
+            extract_names = []
+            for name, module in model.named_modules():
+                if isinstance(module, nn.Linear):
+                    extract_names.append(f"{name}.weight")
+
+        ft_state_dict = finetuned_model.state_dict()
+        for name, param in tqdm(
+            model.named_parameters(),
+            "Magnitude Pruning On Parameter Difference",
+            total=len(tuple(model.named_parameters())),
+        ):
+            if not param.requires_grad:
+                continue
+
+            # prune the diff parameter if its name matches
+            if _is_name_matched(name, extract_names):
+                w_diff = ft_state_dict[name] - param
+                w_diff = unstructured_magnitude_prune_(
+                    w_diff,
+                    (
+                        torch.abs
+                        if self.prune_type == "minor"
+                        else lambda x: -torch.abs(x)
+                    ),
+                    sparsity_ratio=self.prune_ratio,
+                )
+                if self.rescale is not None:
+                    rescale = (
+                        1 / self.prune_ratio if self.rescale is True else self.rescale
+                    )
+                    w_diff = w_diff * rescale
+                param.data = param + w_diff
+
+        return model
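To make the effect of `MagnitudeDiffPruningAlgorithm.magnitude_prune` concrete, here is a toy sketch on a one-layer model. The import path is commented out because the source file path is not visible in this diff view, and the sketch assumes `BaseAlgorithm`'s constructor needs no additional arguments:

```python
import torch
from torch import nn

# Hypothetical import path; the actual module path is not shown in this diff.
# from fusion_bench.method.pruning import MagnitudeDiffPruningAlgorithm

torch.manual_seed(0)
pretrained = nn.Sequential(nn.Linear(8, 8))
finetuned = nn.Sequential(nn.Linear(8, 8))

algo = MagnitudeDiffPruningAlgorithm(prune_ratio=0.75, prune_type="minor")
pruned = algo.magnitude_prune(pretrained, finetuned, in_place=False)

# With prune_type="minor", the 75% smallest-magnitude entries of the
# difference (finetuned - pretrained) are dropped, so `pruned` equals the
# pretrained weights plus only the 25% largest task-vector entries.
diff = pruned[0].weight - pretrained[0].weight
print((diff != 0).float().mean())  # tensor(0.2500)
```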
@@ -0,0 +1,165 @@
+from typing import Callable, Dict, Union  # noqa: F401
+
+import torch
+from torch import nn
+
+try:
+    # StrEnum is only available for Python >= 3.11;
+    # for older versions, load it from fusion_bench.utils.strenum
+    from enum import StrEnum
+except ImportError:
+    from fusion_bench.utils.strenum import StrEnum
+
+
+class PruningType(StrEnum):
+    """
+    Enum class for different types of pruning.
+    """
+
+    UNSTRUCTURED = "unstructured"
+    SEMISTRUCTURED = "semistructured"  # N:M structured
+    STRUCTURED = "structured"
+
+
+def find_linear_layers(module: nn.Module, layers=[nn.Linear], prefix=""):
+    """
+    Recursively find the layers of a certain type in a module.
+
+    Args:
+        module (nn.Module): PyTorch module.
+        layers (list): List of layer types to find.
+        prefix (str): A prefix to add to the layer names.
+
+    Returns:
+        dict: Dictionary of layers of the given type(s) within the module.
+    """
+    res = {}
+    for name, submodule in module.named_modules(prefix=prefix):
+        if isinstance(submodule, tuple(layers)):
+            res[name] = submodule
+    return res
+
+
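As a quick illustration of `find_linear_layers` (assuming the module above is importable, e.g. as `prune_utils`), it returns the matching submodules keyed by their qualified names:

```python
from torch import nn

# from fusion_bench.method.pruning.prune_utils import find_linear_layers  # assumed path

block = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
print(find_linear_layers(block, prefix="block"))
# {'block.0': Linear(in_features=4, out_features=4, bias=True),
#  'block.2': Linear(in_features=4, out_features=2, bias=True)}
```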
+def unstructured_magnitude_prune_(
+    weight: torch.Tensor,
+    metric_function_or_scores: Union[
+        Callable[[torch.Tensor], torch.Tensor], torch.Tensor
+    ],
+    sparsity_ratio: float,
+    dtype: torch.dtype = None,
+    device: torch.device = None,
+    return_pruned_weight: bool = False,
+):
+    """
+    Perform unstructured magnitude pruning on the given weight tensor, in place.
+
+    Args:
+        weight (torch.Tensor): The weight tensor to prune.
+        metric_function_or_scores (Union[Callable[[torch.Tensor], torch.Tensor], torch.Tensor]):
+            A function to compute the metric for pruning or a precomputed metric tensor.
+        sparsity_ratio (float): The ratio of weights to prune.
+        dtype (torch.dtype, optional): The data type to use for computations. Defaults to None.
+        device (torch.device, optional): The device to use for computations. Defaults to None.
+        return_pruned_weight (bool, optional): Whether to also return the pruned-away weights. Defaults to False.
+
+    Returns:
+        torch.Tensor: The weight tensor, pruned in place.
+        torch.Tensor (optional): A tensor containing only the pruned-away weights, returned if return_pruned_weight is True.
+    """
+    original_device = weight.device
+    if callable(metric_function_or_scores):
+        W_metric = metric_function_or_scores(weight.to(dtype=dtype, device=device))
+    elif isinstance(metric_function_or_scores, torch.Tensor):
+        W_metric = metric_function_or_scores.to(dtype=dtype, device=device)
+    else:
+        raise ValueError(
+            "metric_function_or_scores should be either a callable or a tensor"
+        )
+
+    # Create a mask for the weights to prune (those with the smallest metric values)
+    W_mask = torch.zeros_like(W_metric, dtype=torch.bool)
+    sort_res = torch.sort(W_metric, dim=-1, stable=True)
+    indices = sort_res[1][:, : int(W_metric.shape[1] * sparsity_ratio)]
+    W_mask.scatter_(1, indices, True)
+    W_mask = W_mask.to(device=original_device)
+
+    if not return_pruned_weight:
+        weight.masked_fill_(W_mask, 0)
+        return weight
+    else:
+        pruned_weight = weight.clone()
+        weight.masked_fill_(W_mask, 0)
+        pruned_weight.masked_fill_(~W_mask, 0)
+        return weight, pruned_weight
+
+
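A small worked example of `unstructured_magnitude_prune_` follows; the numbers are toy values, and the function is assumed to be imported from the module above:

```python
import torch

# from fusion_bench.method.pruning.prune_utils import unstructured_magnitude_prune_  # assumed path

w = torch.tensor([[1.0, -4.0, 2.0, -3.0]])
# use |w| as the importance score and prune half of each row
unstructured_magnitude_prune_(w, torch.abs, sparsity_ratio=0.5)
print(w)
# tensor([[ 0., -4.,  0., -3.]])
# the two smallest-|w| entries per row are zeroed, in place
```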
+def semistructured_magnitude_prune_(
+    weight: torch.Tensor,
+    metric_function_or_scores: Union[
+        Callable[[torch.Tensor], torch.Tensor], torch.Tensor
+    ],
+    n: int,
+    m: int,
+    dtype: torch.dtype = None,
+    device: torch.device = None,
+    return_pruned_weight: bool = False,
+):
+    """
+    Perform semi-structured (N:M structured) magnitude pruning on the given weight tensor, in place.
+
+    Args:
+        weight (torch.Tensor): The weight tensor to prune.
+        metric_function_or_scores (Union[Callable[[torch.Tensor], torch.Tensor], torch.Tensor]):
+            A function to compute the metric for pruning or a precomputed metric tensor.
+        n (int): The number of weights to prune in each group.
+        m (int): The size of each group.
+        dtype (torch.dtype, optional): The data type to use for computations. Defaults to None.
+        device (torch.device, optional): The device to use for computations. Defaults to None.
+        return_pruned_weight (bool, optional): Whether to also return the pruned-away weights. Defaults to False.
+
+    Returns:
+        torch.Tensor: The weight tensor, pruned in place.
+        torch.Tensor (optional): A tensor containing only the pruned-away weights, returned if return_pruned_weight is True.
+    """
+    original_device = weight.device
+    if callable(metric_function_or_scores):
+        W_metric = metric_function_or_scores(weight.to(dtype=dtype, device=device))
+    elif isinstance(metric_function_or_scores, torch.Tensor):
+        W_metric = metric_function_or_scores.to(dtype=dtype, device=device)
+    else:
+        raise ValueError(
+            "metric_function_or_scores should be either a callable or a tensor"
+        )
+
+    # Create a mask for the weights to prune (the n smallest of every m consecutive weights)
+    W_mask = torch.zeros_like(W_metric, dtype=torch.bool)
+    for col_idx in range(0, W_metric.shape[1], m):
+        tmp = W_metric[:, col_idx : (col_idx + m)].float()  # noqa: E203
+        W_mask.scatter_(
+            1,
+            col_idx + torch.topk(tmp, n, dim=1, largest=False)[1],
+            True,
+        )
+    W_mask = W_mask.to(device=original_device)
+
+    if not return_pruned_weight:
+        weight.masked_fill_(W_mask, 0)
+        return weight
+    else:
+        pruned_weight = weight.clone()
+        weight.masked_fill_(W_mask, 0)
+        pruned_weight.masked_fill_(~W_mask, 0)
+        return weight, pruned_weight
+
+
+def compute_sparsity(weight: torch.Tensor):
+    """
+    Compute the sparsity of the given weight tensor.
+
+    Args:
+        weight (torch.Tensor): The weight tensor.
+
+    Returns:
+        torch.Tensor: The fraction of zero-valued elements, as a scalar tensor.
+    """
+    return (weight == 0).sum() / weight.numel()
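And a matching sketch for `semistructured_magnitude_prune_` plus `compute_sparsity`. Note that `n` counts the weights pruned per group of `m`, so a 2:4 call yields 50% sparsity, which is exactly what `check_sparsity` asserts earlier in this diff:

```python
import torch

# from fusion_bench.method.pruning.prune_utils import (  # assumed path
#     compute_sparsity,
#     semistructured_magnitude_prune_,
# )

torch.manual_seed(0)
w = torch.randn(2, 8)
# prune n=2 of every m=4 consecutive weights, by magnitude
semistructured_magnitude_prune_(w, torch.abs, n=2, m=4)
# every group of 4 columns now holds exactly 2 zeros
assert compute_sparsity(w) == 0.5
```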