PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/pruning/wanda_utils/prune_opt.py ADDED Viewed

@@ -0,0 +1,539 @@
+import torch
+import torch.nn as nn
+from .ablate import AblateGPT
+from .data import get_loaders
+from .layerwrapper import WrappedGPT
+from .sparsegpt import SparseGPT
+def find_layers(module, layers=(nn.Linear,), name=""):
+    """
+    Recursively collect the sub-modules of ``module`` whose type is listed in ``layers``.
+    Args:
+        module (nn.Module): PyTorch module to search.
+        layers (Iterable[type]): Layer types to collect. Matching uses the exact
+            type (``type(module) in layers``), so subclasses of a listed type are
+            not matched. Defaults to ``(nn.Linear,)``; the default is an immutable
+            tuple so it cannot be mutated between calls.
+        name (str): Dotted name of ``module`` within its parent ("" for the root).
+    Returns:
+        dict: Mapping from dotted sub-module name to each matching module. If
+        ``module`` itself matches, the result is ``{name: module}`` and its
+        children are not searched.
+    """
+    if type(module) in layers:
+        return {name: module}
+    res = {}
+    for child_name, child in module.named_children():
+        # Build the dotted path; the root level has no prefix to join onto.
+        child_path = f"{name}.{child_name}" if name != "" else child_name
+        res.update(find_layers(child, layers=layers, name=child_path))
+    return res
+def check_sparsity(model):
+    """
+    Report the fraction of exactly-zero weights in the decoder's linear layers.
+    Walks ``model.model.decoder.layers``, prints one "layer i sparsity" line per
+    decoder layer, and returns the overall ratio. ``model.config.use_cache`` is
+    switched off for the duration and restored afterwards, even on error.
+    Args:
+        model (nn.Module): The model to check sparsity for. Must expose
+            ``config.use_cache`` and ``model.decoder.layers``.
+    Returns:
+        float: Zero weights divided by total weights over all linear sub-layers
+        found by ``find_layers``; ``0.0`` if no such weights exist.
+    """
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    try:
+        count = 0
+        total_params = 0
+        for i, layer in enumerate(model.model.decoder.layers):
+            sub_count = 0
+            sub_params = 0
+            for linear in find_layers(layer).values():
+                W = linear.weight.data
+                # Count the zeros once and reuse the value for both tallies.
+                sub_count += (W == 0).sum().item()
+                sub_params += W.numel()
+            count += sub_count
+            total_params += sub_params
+            # A layer without linear weights is reported as 0.0 rather than dividing by zero.
+            layer_sparsity = float(sub_count) / sub_params if sub_params else 0.0
+            print(f"layer {i} sparsity {layer_sparsity:.6f}")
+    finally:
+        model.config.use_cache = use_cache
+    return float(count) / total_params if total_params else 0.0
+def prepare_calibration_input(model, dataloader, device, nsamples=128):
+    """
+    Capture the inputs that reach the first decoder layer for each calibration batch.
+    The first entry of ``model.model.decoder.layers`` is temporarily replaced by a
+    ``Catcher`` module that stores its input and aborts the forward pass by raising
+    ``ValueError``. The original layer and ``model.config.use_cache`` are restored
+    afterwards, even if a forward pass fails with another exception.
+    Args:
+        model (nn.Module): The model to prepare calibration input for. Must expose
+            ``config``, ``seqlen``, ``hf_device_map`` and ``model.decoder.layers``.
+        dataloader (Iterable): Calibration batches; ``batch[0]`` is fed to the model.
+        device (torch.device): Device for the capture buffer; overridden by
+            ``model.hf_device_map["model.embed_tokens"]`` when that key exists.
+        nsamples (int, optional): Number of rows in the capture buffer. Defaults to
+            128, the previously hard-coded size. The dataloader must not yield more
+            batches than this, or the write into the buffer raises ``IndexError``.
+    Returns:
+        tuple: ``(inps, outs, attention_mask)`` where ``inps`` has shape
+        ``(nsamples, model.seqlen, model.config.hidden_size)``, ``outs`` is a
+        zero tensor like ``inps``, and ``attention_mask`` is the mask passed to
+        the first layer on the last captured batch (``None`` if nothing ran).
+    """
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    if "model.embed_tokens" in model.hf_device_map:
+        device = model.hf_device_map["model.embed_tokens"]
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=device
+    )
+    inps.requires_grad = False
+    cache = {"i": 0, "attention_mask": None, "position_ids": None}
+    class Catcher(nn.Module):
+        # Stand-in for the first decoder layer: records its input, then stops the forward pass.
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            # assumes inp fits a (seqlen, hidden_size) row of the buffer — TODO confirm
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+    first_layer = layers[0]
+    layers[0] = Catcher(first_layer)
+    try:
+        for batch in dataloader:
+            try:
+                model(batch[0].to(device))
+            except ValueError:
+                # Expected: raised by Catcher once the input has been stored.
+                # NOTE(review): a genuine ValueError raised before the first layer
+                # would be swallowed here as well.
+                pass
+    finally:
+        # Always undo the monkey-patch and the config change.
+        layers[0] = first_layer
+        model.config.use_cache = use_cache
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    return inps, outs, attention_mask
+def return_given_alpha(alpha, sort_res, W_metric, tmp_metric, sum_before):
+    """
+    Build a pruning mask from a per-row threshold selected by ``alpha``.
+    Args:
+        alpha (float): Multiplier applied to ``sum_before`` to get each row's budget.
+        sort_res (tuple): Sort result whose first element holds the sorted values
+            (one row per row of ``W_metric``).
+        W_metric (torch.Tensor): The weight metric tensor to threshold.
+        tmp_metric (torch.Tensor): Values compared against the row budget;
+            presumably the row-wise cumulative sum of the sorted metric — confirm
+            against the caller.
+        sum_before (torch.Tensor): One value per row; presumably the row total of
+            the metric — confirm against the caller.
+    Returns:
+        tuple: ``(W_mask, cur_sparsity)`` — a boolean mask that is True where
+        ``W_metric`` is at or below its row threshold, and the fraction of True
+        entries in that mask.
+    """
+    # One budget per row, shaped as a column so it broadcasts across the row.
+    row_budget = (sum_before * alpha).reshape((-1, 1))
+    within_budget = tmp_metric <= row_budget
+    # Index of the last sorted entry still inside the budget, per row.
+    last_index = within_budget.sum(dim=1, keepdims=True) - 1
+    sorted_values = sort_res[0]
+    thres = torch.gather(sorted_values, dim=1, index=last_index)
+    W_mask = W_metric <= thres
+    selected = W_mask.sum()
+    cur_sparsity = selected / W_mask.numel()
+    return W_mask, cur_sparsity
+def prune_magnitude(
+    args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0, prune_m=0
+):
+    """
+    Zero out the smallest-magnitude weights of every linear layer in the decoder.
+    Weights are modified in place through ``weight.data``.
+    Args:
+        args: Pruning arguments; ``args.sparsity_ratio`` is read for unstructured pruning.
+        model (nn.Module): The model to prune (must expose ``model.decoder.layers``).
+        tokenizer: Unused here; kept so the signature matches the other pruning functions.
+        device (torch.device, optional): Device on which the global sort for the
+            unstructured threshold is performed. Defaults to torch.device("cuda:0").
+        prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+        prune_m (int, optional): The size of each block. Defaults to 0.
+    """
+    layers = model.model.decoder.layers
+    for layer in layers:
+        subset = find_layers(layer)
+        for name in subset:
+            W = subset[name].weight.data
+            W_metric = torch.abs(W)
+            if prune_n != 0:
+                # Structured n:m sparsity: in every block of prune_m consecutive
+                # columns, mark the prune_n smallest entries of each row.
+                W_mask = torch.zeros_like(W) == 1
+                for ii in range(0, W_metric.shape[1], prune_m):
+                    tmp = W_metric[:, ii : (ii + prune_m)].float()
+                    W_mask.scatter_(
+                        1,
+                        ii + torch.topk(tmp, prune_n, dim=1, largest=False)[1],
+                        True,
+                    )
+            else:
+                # Unstructured: one magnitude threshold for the whole matrix.
+                # Sort on the requested device instead of a hard-coded .cuda().
+                sorted_metric = torch.sort(W_metric.flatten().to(device))[0]
+                # Clamp so sparsity_ratio == 1.0 does not index past the end.
+                k = min(
+                    int(W.numel() * args.sparsity_ratio), sorted_metric.numel() - 1
+                )
+                thresh = sorted_metric[k].to(W_metric.device)
+                W_mask = W_metric <= thresh
+            W[W_mask] = 0
+def prune_wanda(
+    args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0, prune_m=0
+):
+    """
+    Prune every linear sub-layer of the decoder using the Wanda metric.
+    For each decoder layer, the calibration samples are run through the layer while
+    forward hooks collect per-layer statistics in ``WrappedGPT`` objects. Each weight
+    is scored as ``|W| * sqrt(scaler_row)`` and the lowest-scoring weights are zeroed
+    in place. The layer is then re-run so the next layer sees post-pruning outputs.
+    Args:
+        args: Pruning arguments; ``nsamples``, ``seed`` and ``sparsity_ratio`` are read.
+        model (nn.Module): The model to prune.
+        tokenizer: Tokenizer handed to ``get_loaders`` for the "c4" calibration set.
+        device (torch.device, optional): Device passed to ``prepare_calibration_input``.
+            Defaults to torch.device("cuda:0").
+        prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+        prune_m (int, optional): The size of each block. Defaults to 0.
+    """
+    # NOTE(review): prepare_calibration_input is called without a sample count, so the
+    # capture buffers hold 128 rows; args.nsamples larger than that would index past them.
+    # NOTE(review): layers are read from model.model.decoder.layers, but the device-map
+    # lookup uses the key "model.layers.{i}" — confirm the key matches this architecture.
+    saved_use_cache = model.config.use_cache
+    model.config.use_cache = False
+    print("loading calibration data")
+    dataloader, _ = get_loaders(
+        "c4",
+        nsamples=args.nsamples,
+        seed=args.seed,
+        seqlen=model.seqlen,
+        tokenizer=tokenizer,
+    )
+    print("dataset loading complete")
+    with torch.no_grad():
+        inps, outs, attention_mask = prepare_calibration_input(
+            model, dataloader, device
+        )
+    def make_hook(stats):
+        # Forward hook feeding one linear layer's input and output into its wrapper.
+        def hook(_, inp, out):
+            stats.add_batch(inp[0].data, out.data)
+        return hook
+    def forward_samples(block, src, dst, mask):
+        # Run `block` on each calibration sample one at a time, writing results into `dst`.
+        with torch.no_grad():
+            for j in range(args.nsamples):
+                dst[j] = block(src[j].unsqueeze(0), attention_mask=mask)[0]
+    for i, layer in enumerate(model.model.decoder.layers):
+        subset = find_layers(layer)
+        layer_key = f"model.layers.{i}"
+        if layer_key in model.hf_device_map:
+            # Multi-GPU device maps: move the calibration tensors to this layer's device.
+            dev = model.hf_device_map[layer_key]
+            inps = inps.to(dev)
+            outs = outs.to(dev)
+            attention_mask = attention_mask.to(dev)
+        wrapped_layers = {name: WrappedGPT(linear) for name, linear in subset.items()}
+        handles = [
+            subset[name].register_forward_hook(make_hook(wrapped_layers[name]))
+            for name in wrapped_layers
+        ]
+        # First pass: only gathers statistics through the hooks.
+        forward_samples(layer, inps, outs, attention_mask)
+        for handle in handles:
+            handle.remove()
+        for name, linear in subset.items():
+            print(f"pruning layer {i} name {name}")
+            # presumably scaler_row holds one activation statistic per input feature — confirm in layerwrapper
+            feature_scale = wrapped_layers[name].scaler_row.reshape((1, -1))
+            W_metric = torch.abs(linear.weight.data) * torch.sqrt(feature_scale)
+            # Start from an all-False mask; True marks weights to be zeroed.
+            W_mask = torch.zeros_like(W_metric, dtype=torch.bool)
+            if prune_n != 0:
+                # Structured n:m sparsity: per block of prune_m columns, mark the
+                # prune_n lowest-scoring entries in every row.
+                block_starts = (
+                    col for col in range(W_metric.shape[1]) if col % prune_m == 0
+                )
+                for start in block_starts:
+                    block = W_metric[:, start : (start + prune_m)].float()
+                    lowest = torch.topk(block, prune_n, dim=1, largest=False)[1]
+                    W_mask.scatter_(1, start + lowest, True)
+            else:
+                # Unstructured: per output row, mark the lowest-scoring fraction.
+                n_pruned = int(W_metric.shape[1] * args.sparsity_ratio)
+                order = torch.sort(W_metric, dim=-1, stable=True)[1]
+                W_mask.scatter_(1, order[:, :n_pruned], True)
+            linear.weight.data[W_mask] = 0
+        # Second pass: recompute outputs with the pruned weights for the next layer.
+        forward_samples(layer, inps, outs, attention_mask)
+        inps, outs = outs, inps
+    model.config.use_cache = saved_use_cache
+    torch.cuda.empty_cache()
+@torch.no_grad()
+def prune_sparsegpt(args, model, tokenizer, dev, prune_n=0, prune_m=0):
+    """
+    Prune every linear sub-layer of the decoder with SparseGPT.
+    The inputs to the first decoder layer are captured for the calibration set, then
+    each decoder layer is processed in turn: a forward pass feeds a ``SparseGPT``
+    object per linear sub-layer through forward hooks, ``fasterprune`` rewrites the
+    weights, and a second forward pass produces the inputs for the next layer.
+    Args:
+        args: Pruning arguments; ``nsamples``, ``seed`` and ``sparsity_ratio`` are read.
+        model (nn.Module): The model to prune.
+        tokenizer: Tokenizer handed to ``get_loaders`` for the "c4" calibration set.
+        dev (torch.device): Device for the calibration buffers; overridden by entries
+            of ``model.hf_device_map`` when present.
+        prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+        prune_m (int, optional): The size of each block. Defaults to 0.
+    """
+    ## SparseGPT code available at: https://github.com/IST-DASLab/sparsegpt/tree/f5c25005a61f96a0933ca2f95705a963585aafaa
+    print("Starting ...")
+    dataloader, _ = get_loaders(
+        "c4",
+        nsamples=args.nsamples,
+        seed=args.seed,
+        seqlen=model.seqlen,
+        tokenizer=tokenizer,
+    )
+    saved_use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    if "model.embed_tokens" in model.hf_device_map:
+        dev = model.hf_device_map["model.embed_tokens"]
+    param_dtype = next(iter(model.parameters())).dtype
+    buffer_shape = (args.nsamples, model.seqlen, model.config.hidden_size)
+    inps = torch.zeros(buffer_shape, dtype=param_dtype, device=dev)
+    cache = {"i": 0, "attention_mask": None, "position_ids": None}
+    class Catcher(nn.Module):
+        # Stand-in for the first decoder layer: records its input, then stops the forward pass.
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            slot = cache["i"]
+            inps[slot] = inp
+            cache["i"] = slot + 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            # Expected: raised by Catcher once the input has been stored.
+            continue
+    layers[0] = layers[0].module
+    torch.cuda.empty_cache()
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    print("Ready.")
+    def make_hook(collector):
+        # Forward hook feeding one linear layer's input and output into its SparseGPT object.
+        def hook(_, inp, out):
+            collector.add_batch(inp[0].data, out.data)
+        return hook
+    for i in range(len(layers)):
+        layer = layers[i]
+        layer_key = f"model.layers.{i}"
+        if layer_key in model.hf_device_map:
+            # Multi-device maps: follow the layer to its device.
+            dev = model.hf_device_map[layer_key]
+            print(f"layer {i} device {dev}")
+            inps = inps.to(dev)
+            outs = outs.to(dev)
+            attention_mask = attention_mask.to(dev)
+        subset = find_layers(layer)
+        gpts = {name: SparseGPT(linear) for name, linear in subset.items()}
+        handles = [
+            subset[name].register_forward_hook(make_hook(gpts[name])) for name in gpts
+        ]
+        # First pass: only gathers statistics through the hooks.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for handle in handles:
+            handle.remove()
+        for name, gpt in gpts.items():
+            print(i, name)
+            print("Pruning ...")
+            gpt.fasterprune(
+                args.sparsity_ratio,
+                prune_n=prune_n,
+                prune_m=prune_m,
+                percdamp=0.01,
+                blocksize=128,
+            )
+            gpt.free()
+        # Second pass: recompute outputs with the pruned weights for the next layer.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+    model.config.use_cache = saved_use_cache
+    torch.cuda.empty_cache()
+@torch.no_grad()
+def prune_ablate(args, model, tokenizer, dev, prune_n=0, prune_m=0):
+    """
+    Perform ablation pruning on the model.
+    Follows the same layer-by-layer procedure as SparseGPT pruning, but uses
+    ``AblateGPT`` objects and, depending on ``args.prune_method``, a precomputed
+    mask from ``get_wanda_mask`` / ``get_mag_mask`` or no mask at all.
+    Args:
+        args: Pruning arguments; ``prune_method``, ``nsamples``, ``seed`` and
+            ``sparsity_ratio`` are read, and ``args`` itself is passed to ``fasterprune``.
+        model (nn.Module): The model to prune.
+        tokenizer: Tokenizer handed to ``get_loaders`` for the "c4" calibration set.
+        dev (torch.device): Device for the calibration buffers; overridden by entries
+            of ``model.hf_device_map`` when present.
+        prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+        prune_m (int, optional): The size of each block. Defaults to 0.
+    Raises:
+        ValueError: If ``args.prune_method`` is not "ablate_wanda_seq",
+            "ablate_mag_seq", or a name containing "iter".
+    """
+    ## SparseGPT code available at: https://github.com/IST-DASLab/sparsegpt/tree/f5c25005a61f96a0933ca2f95705a963585aafaa
+    # Fail fast: an unrecognised method previously left `prune_mask` unbound and
+    # crashed with UnboundLocalError only after the calibration pass had run.
+    prune_method = args.prune_method
+    if (
+        prune_method not in ("ablate_wanda_seq", "ablate_mag_seq")
+        and "iter" not in prune_method
+    ):
+        raise ValueError(
+            f"Unsupported prune_method for prune_ablate: {prune_method!r}"
+        )
+    print("Starting ...")
+    dataloader, _ = get_loaders(
+        "c4",
+        nsamples=args.nsamples,
+        seed=args.seed,
+        seqlen=model.seqlen,
+        tokenizer=tokenizer,
+    )
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    if "model.embed_tokens" in model.hf_device_map:
+        dev = model.hf_device_map["model.embed_tokens"]
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {"i": 0, "attention_mask": None, "position_ids": None}
+    class Catcher(nn.Module):
+        # Stand-in for the first decoder layer: records its input, then stops the forward pass.
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+    first_layer = layers[0]
+    layers[0] = Catcher(first_layer)
+    try:
+        for batch in dataloader:
+            try:
+                model(batch[0].to(dev))
+            except ValueError:
+                # Expected: raised by Catcher once the input has been stored.
+                pass
+    finally:
+        # Undo the monkey-patch even if a forward pass fails unexpectedly.
+        layers[0] = first_layer
+    torch.cuda.empty_cache()
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    print("Ready.")
+    for i in range(len(layers)):
+        layer = layers[i]
+        if f"model.layers.{i}" in model.hf_device_map:
+            # Multi-device maps: follow the layer to its device.
+            dev = model.hf_device_map[f"model.layers.{i}"]
+            print(f"layer {i} device {dev}")
+            inps, outs, attention_mask = (
+                inps.to(dev),
+                outs.to(dev),
+                attention_mask.to(dev),
+            )
+        subset = find_layers(layer)
+        gpts = {}
+        for name in subset:
+            gpts[name] = AblateGPT(subset[name])
+        def add_batch(name):
+            # Forward hook feeding one linear layer's input and output into its AblateGPT object.
+            def tmp(_, inp, out):
+                gpts[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in gpts:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        # First pass: only gathers statistics through the hooks.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+        for name in gpts:
+            print(i, name)
+            print("Pruning ...")
+            if prune_method == "ablate_wanda_seq":
+                prune_mask = gpts[name].get_wanda_mask(
+                    args.sparsity_ratio, prune_n, prune_m
+                )
+            elif prune_method == "ablate_mag_seq":
+                prune_mask = gpts[name].get_mag_mask(
+                    args.sparsity_ratio, prune_n, prune_m
+                )
+            else:
+                # "iter" methods (validated above): no precomputed mask is supplied.
+                prune_mask = None
+            gpts[name].fasterprune(
+                args,
+                args.sparsity_ratio,
+                mask=prune_mask,
+                prune_n=prune_n,
+                prune_m=prune_m,
+                percdamp=0.01,
+                blocksize=128,
+            )
+            gpts[name].free()
+        # Second pass: recompute outputs with the pruned weights for the next layer.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+    model.config.use_cache = use_cache
+    torch.cuda.empty_cache()

fusion_bench/method/pruning/wanda_utils/sparsegpt.py ADDED Viewed

@@ -0,0 +1,165 @@
+import math
+import time
+import torch
+import torch.nn as nn
+import transformers
+# Module-level side effect: importing this file turns TF32 off globally for CUDA
+# matmuls and cuDNN. Presumably this keeps the Hessian accumulation and Cholesky
+# steps below in full float32 precision — confirm against the upstream repository.
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+## SparseGPT: https://github.com/IST-DASLab/sparsegpt/tree/f5c25005a61f96a0933ca2f95705a963585aafaa
+class SparseGPT:
+    """
+    Per-layer SparseGPT pruner: accumulates input statistics for one layer and
+    then prunes that layer's weight matrix in place.
+    Typical use, as visible in this class: construct with the layer, call
+    ``add_batch`` for each calibration forward pass, call ``fasterprune`` once,
+    then ``free``.
+    Attributes:
+        layer (nn.Module): The layer to be pruned.
+        dev (torch.device): The device on which the layer's weight is located.
+        rows (int): The number of rows in the (flattened / transposed) weight matrix.
+        columns (int): The number of columns in the (flattened / transposed) weight matrix.
+        H (torch.Tensor): ``columns x columns`` accumulator of input outer products
+            (the Hessian estimate). Deleted by ``fasterprune``.
+        nsamples (int): The number of samples processed so far.
+    """
+    def __init__(self, layer):
+        """
+        Initialize the SparseGPT class.
+        Args:
+            layer (nn.Module): The layer to be pruned. Must have a ``weight``.
+        """
+        self.layer = layer
+        self.dev = self.layer.weight.device
+        # The clone is only used to read the matrix shape below.
+        W = layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            # Treat each output channel as one row.
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            # Conv1D weights are transposed so rows/columns match the Linear layout.
+            W = W.t()
+        self.rows = W.shape[0]
+        self.columns = W.shape[1]
+        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
+        self.nsamples = 0
+    def add_batch(self, inp, out):
+        """
+        Fold one batch of layer inputs into the running Hessian estimate ``H``.
+        After the call, ``H`` equals the previous ``H`` rescaled by
+        ``old_nsamples / new_nsamples`` plus ``(2 / new_nsamples) * X @ X.T``,
+        where ``X`` is the batch input laid out as ``(features, tokens)``.
+        Args:
+            inp (torch.Tensor): The input tensor. A 2-D input is treated as a
+                single sample.
+            out (torch.Tensor): The output tensor. Not used by this method.
+        """
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+        # Number of samples in this batch (size of the leading dimension).
+        tmp = inp.shape[0]
+        if isinstance(self.layer, nn.Linear) or isinstance(
+            self.layer, transformers.Conv1D
+        ):
+            if len(inp.shape) == 3:
+                # Merge the leading dimensions so every position becomes one row.
+                inp = inp.reshape((-1, inp.shape[-1]))
+            inp = inp.t()
+        # Rescale the existing accumulator so H stays an average over all samples seen.
+        self.H *= self.nsamples / (self.nsamples + tmp)
+        self.nsamples += tmp
+        inp = math.sqrt(2 / self.nsamples) * inp.float()
+        self.H += inp.matmul(inp.t())
+    def fasterprune(self, sparsity, prune_n=0, prune_m=0, blocksize=128, percdamp=0.01):
+        """
+        Prune the layer's weights block by block and write the result back in place.
+        Columns are processed in blocks of ``blocksize``. Within a block each column
+        is pruned according to a mask, and the resulting error is propagated to the
+        remaining columns using the upper Cholesky factor of the inverse Hessian.
+        This method deletes ``self.H``, so ``add_batch`` statistics must be
+        re-accumulated before it could be called again.
+        Args:
+            sparsity (float): The sparsity ratio used for the per-block threshold
+                when ``prune_n == 0``.
+            prune_n (int, optional): The number of elements to prune in each block. Defaults to 0.
+            prune_m (int, optional): The size of each block. Defaults to 0.
+            blocksize (int, optional): The size of each block for pruning. Defaults to 128.
+            percdamp (float, optional): Damping added to the Hessian diagonal, as a
+                fraction of the mean diagonal value. Defaults to 0.01.
+        """
+        W = self.layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        W = W.float()
+        # NOTE(review): `tick` is never read in this method.
+        tick = time.time()
+        H = self.H
+        del self.H
+        # Input features whose diagonal entry is zero: give them a unit diagonal so
+        # the factorization below is defined, and zero the corresponding weights.
+        dead = torch.diag(H) == 0
+        H[dead, dead] = 1
+        W[:, dead] = 0
+        # NOTE(review): `Losses` is accumulated below but never returned or stored.
+        Losses = torch.zeros(self.rows, device=self.dev)
+        damp = percdamp * torch.mean(torch.diag(H))
+        diag = torch.arange(self.columns, device=self.dev)
+        H[diag, diag] += damp
+        # Cholesky -> inverse -> upper Cholesky factor of the inverse.
+        H = torch.linalg.cholesky(H)
+        H = torch.cholesky_inverse(H)
+        H = torch.linalg.cholesky(H, upper=True)
+        Hinv = H
+        # NOTE(review): `mask` is never reassigned, so the `mask is not None` branch
+        # below is not taken in this implementation.
+        mask = None
+        for i1 in range(0, self.columns, blocksize):
+            i2 = min(i1 + blocksize, self.columns)
+            count = i2 - i1
+            W1 = W[:, i1:i2].clone()
+            Q1 = torch.zeros_like(W1)
+            Err1 = torch.zeros_like(W1)
+            Losses1 = torch.zeros_like(W1)
+            Hinv1 = Hinv[i1:i2, i1:i2]
+            if prune_n == 0:
+                if mask is not None:
+                    mask1 = mask[:, i1:i2]
+                else:
+                    # Unstructured: score every weight in the block and prune those
+                    # at or below the `sparsity` quantile of the block's scores.
+                    tmp = W1**2 / (torch.diag(Hinv1).reshape((1, -1))) ** 2
+                    # NOTE(review): sparsity == 1.0 would make this index equal to
+                    # tmp.numel(), one past the end — confirm callers never pass it.
+                    thresh = torch.sort(tmp.flatten())[0][int(tmp.numel() * sparsity)]
+                    mask1 = tmp <= thresh
+            else:
+                # Structured n:m: start with an all-False mask, filled in per group below.
+                mask1 = torch.zeros_like(W1) == 1
+            for i in range(count):
+                w = W1[:, i]
+                d = Hinv1[i, i]
+                if prune_n != 0 and i % prune_m == 0:
+                    # At the start of each group of prune_m columns, mark the prune_n
+                    # lowest-scoring weights of every row within the group.
+                    tmp = (
+                        W1[:, i : (i + prune_m)] ** 2
+                        / (torch.diag(Hinv1)[i : (i + prune_m)].reshape((1, -1))) ** 2
+                    )
+                    mask1.scatter_(
+                        1, i + torch.topk(tmp, prune_n, dim=1, largest=False)[1], True
+                    )
+                q = w.clone()
+                q[mask1[:, i]] = 0
+                Q1[:, i] = q
+                Losses1[:, i] = (w - q) ** 2 / d**2
+                err1 = (w - q) / d
+                # Propagate this column's error to the columns at and after it in the block.
+                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
+                Err1[:, i] = err1
+            W[:, i1:i2] = Q1
+            Losses += torch.sum(Losses1, 1) / 2
+            # Propagate the block's accumulated error to all columns after the block.
+            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
+        # NOTE(review): called unconditionally; presumably this requires a
+        # CUDA-enabled build even when the layer lives on CPU — confirm.
+        torch.cuda.synchronize()
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        # Write back in the layer's original shape and dtype.
+        self.layer.weight.data = W.reshape(self.layer.weight.shape).to(
+            self.layer.weight.data.dtype
+        )
+    def free(self):
+        """
+        Drop the reference to the Hessian matrix and release cached CUDA memory.
+        """
+        self.H = None
+        torch.cuda.empty_cache()

fusion_bench/method/pwe_moe/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# flake8: noqa F401
+from .clip_pwe_moe import (
+    PWEMoELinearScalarizationForCLIP,
+    PWEMoExactParetoOptimalForCLIP,
+)