fusion-bench 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff reflects the changes between publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (77)
  1. fusion_bench/method/__init__.py +4 -0
  2. fusion_bench/method/fw_merging/__init__.py +2 -0
  3. fusion_bench/method/fw_merging/fw_hard.py +448 -0
  4. fusion_bench/method/fw_merging/fw_soft.py +519 -0
  5. fusion_bench/method/fw_merging/utils.py +331 -0
  6. fusion_bench/method/moe_pruner/__init__.py +7 -0
  7. fusion_bench/method/moe_pruner/hooks/__init__.py +6 -0
  8. fusion_bench/method/moe_pruner/hooks/deepseek_v2.py +85 -0
  9. fusion_bench/method/moe_pruner/hooks/hook.py +23 -0
  10. fusion_bench/method/moe_pruner/hooks/mixtral.py +93 -0
  11. fusion_bench/method/moe_pruner/moe_pruner.py +304 -0
  12. fusion_bench/method/moe_pruner/utils/__init__.py +1 -0
  13. fusion_bench/method/moe_pruner/utils/data.py +154 -0
  14. fusion_bench/method/moe_pruner/utils/layerwrapper.py +61 -0
  15. fusion_bench/method/moe_pruner/utils/prune.py +313 -0
  16. fusion_bench/method/moe_pruner/utils/score.py +41 -0
  17. fusion_bench/method/pruning/__init__.py +1 -0
  18. fusion_bench/method/pruning/llama_sparsegpt_prune.py +223 -0
  19. fusion_bench/method/pruning/sparsegpt_utils/__init__.py +1 -0
  20. fusion_bench/method/pruning/sparsegpt_utils/sparsegpt.py +128 -0
  21. fusion_bench/method/pruning/wanda_utils/data.py +33 -14
  22. fusion_bench/method/randes/__init__.py +15 -0
  23. fusion_bench/method/randes/base_algorithm.py +1013 -0
  24. fusion_bench/method/randes/modelsoup.py +126 -0
  25. fusion_bench/method/randes/task_arithmetic.py +318 -0
  26. fusion_bench/method/sparselo/sparselo.py +20 -2
  27. fusion_bench/method/tall_mask/__init__.py +1 -0
  28. fusion_bench/method/tall_mask/task_arithmetic.py +133 -0
  29. fusion_bench/modelpool/lazy_state_dict_pool.py +15 -0
  30. fusion_bench/models/modeling_deepseek_v2/__init__.py +15 -0
  31. fusion_bench/models/modeling_deepseek_v2/configuration_deepseek.py +208 -0
  32. fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +1922 -0
  33. fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +38 -0
  34. fusion_bench/programs/fabric_fusion_program.py +5 -0
  35. fusion_bench/taskpool/clip_vision/taskpool.py +8 -1
  36. fusion_bench/utils/__init__.py +1 -0
  37. fusion_bench/utils/data.py +1 -1
  38. fusion_bench/utils/lazy_state_dict.py +268 -0
  39. fusion_bench/utils/parameters.py +33 -0
  40. fusion_bench/utils/state_dict_arithmetic.py +74 -2
  41. fusion_bench/utils/type.py +1 -0
  42. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/METADATA +6 -2
  43. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/RECORD +77 -21
  44. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/WHEEL +1 -1
  45. fusion_bench_config/dataset/image_classification/test/TALL10.yaml +28 -0
  46. fusion_bench_config/dataset/image_classification/test/TALL12.yaml +28 -0
  47. fusion_bench_config/dataset/image_classification/test/TALL16.yaml +28 -0
  48. fusion_bench_config/dataset/image_classification/test/TALL18.yaml +28 -0
  49. fusion_bench_config/dataset/image_classification/train/TALL10.yaml +28 -0
  50. fusion_bench_config/dataset/image_classification/train/TALL12.yaml +28 -0
  51. fusion_bench_config/dataset/image_classification/train/TALL16.yaml +28 -0
  52. fusion_bench_config/dataset/image_classification/train/TALL18.yaml +28 -0
  53. fusion_bench_config/method/fw_merging/fw_hard.yaml +11 -0
  54. fusion_bench_config/method/fw_merging/fw_soft.yaml +12 -0
  55. fusion_bench_config/method/moe_pruner/moe_pruner.yaml +15 -0
  56. fusion_bench_config/method/pruning/llama_sparsegpt_pruning.yaml +16 -0
  57. fusion_bench_config/method/randes/superposed_model_soup.yaml +18 -0
  58. fusion_bench_config/method/randes/superposed_task_arithmetic.yaml +20 -0
  59. fusion_bench_config/method/randes/superposed_task_arithmetic_lora.yaml +20 -0
  60. fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +2 -1
  61. fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +1 -1
  62. fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +1 -1
  63. fusion_bench_config/method/tall_mask/task_arithmetic.yaml +4 -0
  64. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL10.yaml +29 -0
  65. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL12.yaml +29 -0
  66. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL16.yaml +29 -0
  67. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL18.yaml +29 -0
  68. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL10.yaml +8 -0
  69. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL12.yaml +8 -0
  70. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL16.yaml +8 -0
  71. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL18.yaml +8 -0
  72. fusion_bench_config/modelpool/CausalLMPool/deepseek-v2-lite.yaml +15 -0
  73. fusion_bench_config/modelpool/CausalLMPool/mixtral-8x7b.yaml +14 -0
  74. fusion_bench_config/modelpool/SeqenceClassificationModelPool/roberta-base_glue.yaml +69 -0
  75. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/entry_points.txt +0 -0
  76. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/licenses/LICENSE +0 -0
  77. {fusion_bench-0.2.15.dist-info → fusion_bench-0.2.16.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,313 @@
+import logging
+from typing import List, Tuple, cast
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from tqdm.auto import tqdm
+from transformers import LlamaForCausalLM, PreTrainedModel
+
+from fusion_bench import timeit_context
+
+from .data import get_loaders
+from .layerwrapper import WrappedGPT
+
+log = logging.getLogger(__name__)
+
+
+def find_layers(module, layers=[nn.Linear], name=""):
+    """
+    Recursively find the layers of a certain type in a module.
+
+    Args:
+        module (nn.Module): PyTorch module.
+        layers (list): List of layer types to find.
+        name (str): Name of the module.
+
+    Returns:
+        dict: Dictionary of layers of the given type(s) within the module.
+    """
+    if type(module) in layers:
+        return {name: module}
+    res = {}
+    for name1, child in module.named_children():
+        res.update(
+            find_layers(
+                child, layers=layers, name=name + "." + name1 if name != "" else name1
+            )
+        )
+    return res
+
+
+def check_sparsity(model):
+    """
+    Check the sparsity of the model by counting the number of zero weights.
+
+    Args:
+        model (PreTrainedModel): The model to check sparsity for.
+
+    Returns:
+        float: The sparsity ratio of the model.
+    """
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+
+    layers = model.model.layers
+    count = 0
+    total_params = 0
+    for i in range(len(layers)):
+        layer = layers[i]
+        subset = find_layers(layer)
+
+        sub_count = 0
+        sub_params = 0
+        for name in subset:
+            W = subset[name].weight.data
+            count += (W == 0).sum().item()
+            total_params += W.numel()
+
+            sub_count += (W == 0).sum().item()
+            sub_params += W.numel()
+
+        print(f"layer {i} sparsity {float(sub_count)/sub_params:.6f}")
+
+    model.config.use_cache = use_cache
+    return float(count) / total_params
+
+
+def prepare_calibration_input(
+    model: PreTrainedModel,
+    dataloader: List[Tuple[Tensor, Tensor]],
+    device: torch.device,
+):
+    """
+    Prepare the calibration input for the model by collecting input to the first layer.
+
+    Args:
+        model (PreTrainedModel): The model to prepare calibration input for.
+        dataloader (List[Tuple[Tensor, Tensor]]): The dataloader to use for calibration.
+        device (torch.device): The device to use for calibration.
+
+    Returns:
+        Tuple[Tensor, Tensor, Tensor, Tensor]: The prepared input, output, attention mask, and position IDs.
+    """
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.layers
+
+    # dev = model.hf_device_map["model.embed_tokens"]
+    if hasattr(model, "hf_device_map") and "model.embed_tokens" in model.hf_device_map:
+        device = model.hf_device_map["model.embed_tokens"]
+
+    dtype = next(iter(model.parameters())).dtype
+    # ? what if n_samples > 128
+    inps = torch.zeros(
+        (128, model.seqlen, model.config.hidden_size),
+        dtype=dtype,
+        device=device,
+        requires_grad=False,
+    )
+    cache = {"i": 0, "attention_mask": None, "position_ids": None, 'position_embeddings': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            # collect attention_mask and position_ids
+            cache["attention_mask"] = kwargs["attention_mask"]
+            cache["position_ids"] = kwargs["position_ids"]
+            if "position_embeddings" in kwargs:
+                cache["position_embeddings"] = kwargs["position_embeddings"]
+            else:
+                cache["position_embeddings"] = None
+            raise ValueError  # stop the forward pass
+
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(device))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    position_ids = cache["position_ids"]
+    position_embeddings = cache["position_embeddings"]
+    model.config.use_cache = use_cache
+
+    return inps, outs, attention_mask, position_ids, position_embeddings
+
+
+def return_given_alpha(alpha, sort_res, W_metric, tmp_metric, sum_before):
+    """
+    Return the mask and current sparsity given an alpha value.
+
+    Args:
+        alpha (float): The alpha value.
+        sort_res (Tensor): The sorted results.
+        W_metric (Tensor): The weight metric.
+        tmp_metric (Tensor): The temporary metric.
+        sum_before (Tensor): The sum before the alpha value.
+
+    Returns:
+        Tuple[Tensor, float]: The mask and current sparsity.
+    """
+    thres_cumsum = sum_before * alpha
+    sort_mask = tmp_metric <= thres_cumsum.reshape((-1, 1))
+    thres = torch.gather(
+        sort_res[0], dim=1, index=sort_mask.sum(dim=1, keepdims=True) - 1
+    )
+    W_mask = W_metric <= thres
+    cur_sparsity = (W_mask == True).sum() / W_mask.numel()
+    return W_mask, cur_sparsity
+
+
+def llama_prune_wanda_(
+    args,
+    model: LlamaForCausalLM,
+    tokenizer,
+    device=torch.device("cuda:0"),
+    prune_n=0,
+    prune_m=0,
+):
+    """
+    Perform Wanda pruning on a Llama model.
+
+    Args:
+        args: The arguments for pruning.
+        model (LlamaForCausalLM): The model to prune.
+        tokenizer: The tokenizer to use for calibration.
+        device (torch.device, optional): The device to use for pruning. Defaults to torch.device("cuda:0").
+        prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+        prune_m (int, optional): The size of each block. Defaults to 0.
+    """
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+
+    with timeit_context("loading calibdation data"):
+        dataloader, _ = get_loaders(
+            "c4",
+            nsamples=args.nsamples,
+            seed=args.seed,
+            seqlen=model.seqlen,
+            tokenizer=tokenizer,
+        )
+
+    with torch.no_grad():
+        # collect input to the first layer
+        inps, outs, attention_mask, position_ids = prepare_calibration_input(
+            model, dataloader, device
+        )
+
+    layers = model.model.layers
+    for i in range(len(layers)):
+        layer = layers[i]
+        subset = find_layers(layer)
+
+        if (
+            hasattr(model, "hf_device_map")
+            and f"model.layers.{i}" in model.hf_device_map
+        ):  ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs;
+            dev = model.hf_device_map[f"model.layers.{i}"]
+            inps, outs, attention_mask, position_ids = (
+                inps.to(dev),
+                outs.to(dev),
+                attention_mask.to(dev) if attention_mask is not None else None,
+                position_ids.to(dev) if position_ids is not None else None,
+            )
+
+        wrapped_layers = {}
+        for name in subset:
+            wrapped_layers[name] = WrappedGPT(subset[name])
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                cast(WrappedGPT, wrapped_layers[name]).add_batch(inp[0].data, out.data)
+
+            return tmp
+
+        handles = []
+        for name in wrapped_layers:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):
+            with torch.no_grad():
+                outs[j] = layer(
+                    inps[j].unsqueeze(0),
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                )[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(f"pruning layer {i} name {name}")
+            W_metric = torch.abs(subset[name].weight.data) * torch.sqrt(
+                wrapped_layers[name].scaler_row.reshape((1, -1))
+            )
+
+            W_mask = (
+                torch.zeros_like(W_metric) == 1
+            )  ## initialize a mask to be all False
+            if prune_n != 0:
+                # structured n:m sparsity
+                for ii in range(W_metric.shape[1]):
+                    if ii % prune_m == 0:
+                        tmp = W_metric[:, ii : (ii + prune_m)].float()
+                        W_mask.scatter_(
+                            1,
+                            ii + torch.topk(tmp, prune_n, dim=1, largest=False)[1],
+                            True,
+                        )
+            else:
+                sort_res = torch.sort(W_metric, dim=-1, stable=True)
+
+                if args.use_variant:
+                    # wanda variant
+                    tmp_metric = torch.cumsum(sort_res[0], dim=1)
+                    sum_before = W_metric.sum(dim=1)
+
+                    alpha = 0.4
+                    alpha_hist = [0.0, 0.8]
+                    W_mask, cur_sparsity = return_given_alpha(
+                        alpha, sort_res, W_metric, tmp_metric, sum_before
+                    )
+                    while (torch.abs(cur_sparsity - args.sparsity_ratio) > 0.001) and (
+                        alpha_hist[1] - alpha_hist[0] >= 0.001
+                    ):
+                        if cur_sparsity > args.sparsity_ratio:
+                            alpha_new = (alpha + alpha_hist[0]) / 2.0
+                            alpha_hist[1] = alpha
+                        else:
+                            alpha_new = (alpha + alpha_hist[1]) / 2.0
+                            alpha_hist[0] = alpha
+
+                        alpha = alpha_new
+                        W_mask, cur_sparsity = return_given_alpha(
+                            alpha, sort_res, W_metric, tmp_metric, sum_before
+                        )
+                    print(f"alpha found {alpha} sparsity {cur_sparsity:.6f}")
+                else:
+                    # unstructured pruning
+                    indices = sort_res[1][
+                        :, : int(W_metric.shape[1] * args.sparsity_ratio)
+                    ]
+                    W_mask.scatter_(1, indices, True)
+
+            subset[name].weight.data[W_mask] = 0  ## set weights to zero
+
+        for j in range(args.nsamples):
+            with torch.no_grad():
+                outs[j] = layer(
+                    inps[j].unsqueeze(0),
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                )[0]
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+    torch.cuda.empty_cache()
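
The Wanda step in the hunk above boils down to scoring each weight by |W| times the calibration activation norm of its input column, then zeroing the lowest-scoring fraction of each row. A minimal, self-contained sketch of that scoring and masking (using placeholder `weight` and `act_norm` tensors, not this module's `WrappedGPT` statistics):

import torch

torch.manual_seed(0)
weight = torch.randn(8, 16)   # stands in for subset[name].weight.data
act_norm = torch.rand(16)     # stands in for wrapped_layers[name].scaler_row
sparsity_ratio = 0.5

# Wanda importance: |W| scaled by the per-input-column activation norm
W_metric = weight.abs() * torch.sqrt(act_norm.reshape(1, -1))

# unstructured pruning: mask the lowest-scoring fraction of each row
sort_res = torch.sort(W_metric, dim=-1, stable=True)
indices = sort_res[1][:, : int(W_metric.shape[1] * sparsity_ratio)]
W_mask = torch.zeros_like(W_metric, dtype=torch.bool)
W_mask.scatter_(1, indices, True)
weight[W_mask] = 0

print(f"achieved sparsity: {(weight == 0).float().mean().item():.2f}")
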
@@ -0,0 +1,41 @@
+from typing import List, Union
+
+import numpy as np
+
+
+def layer_load_balance_score(
+    number_of_tokens_dispatched: Union[List[int], np.ndarray],
+    number_of_experts: int,
+) -> float:
+    """
+    Calculate the load balance score for one layer of the MoE model.
+
+    Args:
+        number_of_tokens_dispatched: List[int]
+        number_of_experts: int
+
+    Returns:
+        float: The load balance score
+    """
+    if len(number_of_tokens_dispatched) != number_of_experts:
+        raise ValueError(
+            f"The number of tokens dispatched ({len(number_of_tokens_dispatched)}) must match the number of experts ({number_of_experts})"
+        )
+
+    number_of_tokens_dispatched = np.array(number_of_tokens_dispatched)
+    mu = number_of_tokens_dispatched.mean()
+    sigma = np.sqrt(((number_of_tokens_dispatched - mu) ** 2).mean())
+    return sigma / mu
+
+
+def model_load_balance_score(layer_load_balance_scores: List[float]) -> float:
+    """
+    Calculate the load balance score for the whole model.
+
+    Args:
+        layer_load_balance_scores: List[float]
+
+    Returns:
+        float: The load balance score
+    """
+    return np.array(layer_load_balance_scores).mean()
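
The per-layer score above is the coefficient of variation (sigma / mu) of the number of tokens dispatched to each expert, and the model-level score is its mean over layers. A toy check, assuming these functions land in `fusion_bench.method.moe_pruner.utils.score` (the import path is inferred from the files-changed list and is not confirmed by this hunk):

from fusion_bench.method.moe_pruner.utils.score import (
    layer_load_balance_score,
    model_load_balance_score,
)

# 0.0 means perfectly balanced routing; larger values mean more skew.
balanced = layer_load_balance_score([100, 100, 100, 100], number_of_experts=4)
skewed = layer_load_balance_score([370, 10, 10, 10], number_of_experts=4)
print(balanced, skewed)  # 0.0, ~1.56

# the model-level score is simply the mean of the per-layer scores
print(model_load_balance_score([balanced, skewed]))
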
@@ -1,5 +1,6 @@
 # flake8: noqa F401
 from .llama_magnitude_prune import MagnitudePruningForLlama
 from .llama_random_prune import RandomPruningForLlama
+from .llama_sparsegpt_prune import SparseGPTPruningForLlama
 from .llama_wanda_prune import WandaPruningForLlama
 from .magnitude_diff_pruning import MagnitudeDiffPruningAlgorithm
@@ -0,0 +1,223 @@
+import logging
+from typing import Dict, Optional
+
+import torch
+from torch import Tensor, nn
+from tqdm.auto import tqdm
+from transformers import LlamaForCausalLM
+
+from fusion_bench import BaseAlgorithm
+from fusion_bench.method.pruning.prune_utils import (
+    PruningType,
+    compute_sparsity,
+    find_linear_layers,
+    semistructured_magnitude_prune_,
+    unstructured_magnitude_prune_,
+)
+from fusion_bench.method.pruning.sparsegpt_utils import SparseGPT
+from fusion_bench.method.pruning.wanda_utils.data import get_loaders
+from fusion_bench.method.pruning.wanda_utils.prune import prepare_calibration_input
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.modelpool import CausalLMPool
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.cache_utils import cache_to_disk
+
+log = logging.getLogger(__name__)
+
+
+class SparseGPTPruningForLlama(BaseAlgorithm, SimpleProfilerMixin):
+    def __init__(
+        self,
+        *,
+        nsamples: int,
+        seed: int,
+        use_variant: bool,
+        prune_type: PruningType,
+        device: str,
+        dtype: str,
+        sparsity_ratio: float,
+        n: int,
+        m: int,
+        model_save_path: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Initialize the SparseGPTPruningForLlama class.
+
+        Args:
+            nsamples (int): Number of samples for calibration.
+            seed (int): Random seed.
+            use_variant (bool): Whether to use a variant of the pruning method.
+            prune_type (PruningType): Type of pruning to perform.
+            device (str): Device to use for computation.
+            dtype (str): Data type to use for computation.
+            sparsity_ratio (float): Sparsity ratio for pruning.
+            n (int): Number of elements to keep in semi-structured pruning.
+            m (int): Number of elements in a group for semi-structured pruning.
+            model_save_path (Optional[str]): Path to save the pruned model.
+            **kwargs: Additional arguments.
+        """
+        super().__init__(**kwargs)
+        self.nsamples = nsamples
+        self.seed = seed
+        self.use_variant = use_variant
+        self.prune_type = prune_type
+        self.device = device
+        self.dtype = dtype
+        self.sparsity_ratio = sparsity_ratio
+        self.n = n
+        self.m = m
+        self.model_save_path = model_save_path
+
+    def run(self, modelpool: CausalLMPool):
+        # load pre-trained model or the first model in the pool
+        with self.profile("load_model"):
+            model = modelpool.load_pretrained_or_first_model()
+            model.seqlen = model.config.max_position_embeddings
+            tokenizer = modelpool.load_tokenizer(use_fast=False)
+
+        if not isinstance(model, (LlamaForCausalLM,)):
+            log.warning(f"Model type {type(model)} may not supported.")
+
+        inps, outs, attention_mask, position_ids = self.prepare_calibration_data(
+            model, tokenizer
+        )
+
+        self.prune_using_calibration_data_(
+            model,
+            inps=inps,
+            outs=outs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+        if self.model_save_path is not None:
+            with timeit_context(f"Saving pruned model to {self.model_save_path}"):
+                tokenizer.save_pretrained(self.model_save_path)
+                model.save_pretrained(self.model_save_path)
+        return model
+
+    def _prepare_calibration_data(self, model, tokenizer):
+        """
+        Prepare calibration data for pruning.
+
+        Args:
+            model (LlamaForCausalLM): Model to be pruned.
+            tokenizer: Tokenizer for the model.
+
+        Returns:
+            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
+        """
+        with timeit_context("loading calibration data"):
+            dataloader, _ = get_loaders(
+                "c4",
+                nsamples=self.nsamples,
+                seed=self.seed,
+                seqlen=model.seqlen,
+                tokenizer=tokenizer,
+            )
+
+        with torch.no_grad():
+            # collect input to the first layer
+            inps, outs, attention_mask, position_ids = prepare_calibration_input(
+                model, dataloader, self.device
+            )
+        return inps, outs, attention_mask, position_ids
+
+    def prepare_calibration_data(self, model: LlamaForCausalLM, tokenizer):
+        """
+        Prepare calibration data for pruning with caching.
+
+        Args:
+            model (LlamaForCausalLM): Model to be pruned.
+            tokenizer: Tokenizer for the model.
+
+        Returns:
+            Tuple: Calibration data (inputs, outputs, attention mask, position IDs).
+        """
+
+        @cache_to_disk(
+            f"outputs/cache/{model.config.name_or_path.split('/')[-1]}/calibration_data.pkl"
+        )
+        def _prepare_calibration_data(model, tokenizer):
+            return self._prepare_calibration_data(model, tokenizer)
+
+        return _prepare_calibration_data(model, tokenizer)
+
+    @torch.no_grad()
+    def prune_using_calibration_data_(
+        self,
+        model: LlamaForCausalLM,
+        *,
+        inps,
+        outs,
+        attention_mask,
+        position_ids,
+    ):
+        layers = model.model.layers
+        for layer_indx, layer in tqdm(
+            enumerate(layers),
+            "Pruning Layers",
+            total=len(layers),
+            dynamic_ncols=True,
+        ):
+            layer = layers[layer_indx]
+            if f"model.layers.{layer_indx}" in model.hf_device_map:
+                dev = model.hf_device_map[f"model.layers.{layer_indx}"]
+                print(f"layer {layer_indx} device {dev}")
+                inps, outs, attention_mask, position_ids = (
+                    inps.to(dev),
+                    outs.to(dev),
+                    attention_mask.to(dev),
+                    position_ids.to(dev),
+                )
+
+            subset = find_linear_layers(layer, layers=[nn.Linear])
+
+            gpts: Dict[str, SparseGPT] = {}
+            for name in subset:
+                gpts[name] = SparseGPT(subset[name])
+
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gpts[name].add_batch(inp[0].data, out.data)
+
+                return tmp
+
+            handles = []
+            for name in gpts:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
+
+            for j in range(self.nsamples):
+                outs[j] = layer(
+                    inps[j].unsqueeze(0),
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                )[0]
+            for h in handles:
+                h.remove()
+
+            for name in gpts:
+                print(layer_indx, name)
+                print("Pruning ...")
+
+                gpts[name].fasterprune(
+                    self.sparsity_ratio,
+                    prune_n=self.n,
+                    prune_m=self.m,
+                    percdamp=0.01,
+                    blocksize=128,
+                )
+                gpts[name].free()
+
+            for j in range(self.nsamples):
+                outs[j] = layer(
+                    inps[j].unsqueeze(0),
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                )[0]
+
+            layers[layer_indx] = layer
+            torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
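
A construction sketch for the new `SparseGPTPruningForLlama` algorithm. The import is grounded in the updated `fusion_bench/method/pruning/__init__.py` above; the argument values are illustrative placeholders, and the model pool wiring is assumed to come from the shipped `CausalLMPool` configs rather than being shown in this diff:

from fusion_bench.method.pruning import SparseGPTPruningForLlama

# illustrative values only; defaults ship in
# fusion_bench_config/method/pruning/llama_sparsegpt_pruning.yaml (not shown here)
algorithm = SparseGPTPruningForLlama(
    nsamples=128,
    seed=0,
    use_variant=False,
    prune_type="unstructured",  # a PruningType value; the enum members are not visible in this diff
    device="cuda",
    dtype="bfloat16",
    sparsity_ratio=0.5,
    n=0,
    m=0,
    model_save_path=None,
)

# `modelpool` is expected to be a CausalLMPool (e.g. built from one of the new
# modelpool/CausalLMPool/*.yaml configs); run() loads the model, collects C4
# calibration activations, prunes each layer with SparseGPT, and returns the
# pruned model.
# pruned_model = algorithm.run(modelpool)
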
@@ -0,0 +1 @@
+from .sparsegpt import SparseGPT