fusion-bench 0.2.17__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/method/__init__.py +11 -0
- fusion_bench/method/expert_sparsity/__init__.py +10 -0
- fusion_bench/method/expert_sparsity/mixtral/__init__.py +23 -0
- fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +175 -0
- fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +159 -0
- fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +173 -0
- fusion_bench/method/expert_sparsity/utils/calibration_data.py +153 -0
- fusion_bench/method/knots/__init__.py +0 -0
- fusion_bench/method/knots/knots_utils.py +23 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +1 -0
- fusion_bench/method/task_singular_vector/utils/task_singular_interference.py +41 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +8 -5
- fusion_bench/models/__init__.py +1 -0
- fusion_bench/models/expert_sparsity/__init__.py +0 -0
- fusion_bench/models/expert_sparsity/mixtral/__init__.py +15 -0
- fusion_bench/models/expert_sparsity/mixtral/dataset.py +40 -0
- fusion_bench/models/expert_sparsity/mixtral/modeling_mixtral.py +207 -0
- fusion_bench/models/expert_sparsity/mixtral/wrapper.py +268 -0
- fusion_bench/programs/fabric_fusion_program.py +12 -8
- fusion_bench/utils/__init__.py +3 -2
- fusion_bench/utils/dtype.py +2 -1
- fusion_bench/utils/fabric.py +11 -4
- fusion_bench/utils/lazy_state_dict.py +80 -10
- fusion_bench/utils/pylogger.py +2 -0
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/RECORD +33 -16
- fusion_bench_config/fabric/loggers/mlflow_logger.yaml +2 -0
- fusion_bench_config/method/expert_sparsity/README.md +6 -0
- fusion_bench_config/method/expert_sparsity/mixtral.yaml +17 -0
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.18.dist-info}/top_level.txt +0 -0
fusion_bench/models/expert_sparsity/mixtral/wrapper.py
ADDED

```diff
@@ -0,0 +1,268 @@
+import itertools as I
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.models.mixtral.modeling_mixtral import (
+    MixtralBlockSparseTop2MLP,
+    MixtralForCausalLM,
+    MixtralSparseMoeBlock,
+)
+
+from .dataset import CacheDataset
+
+logger = logging.getLogger(__name__)
+
+
+class PrunableMixtralSparseMoeBlockWrapper(torch.nn.Module):
+    """
+    Wrapper of `MixtralSparseMoeBlock` that supports expert pruning.
+    """
+
+    def __init__(
+        self,
+        model: MixtralSparseMoeBlock,
+        r: Optional[int] = None,
+    ):
+        """
+        Args:
+            model: The model to be wrapped.
+            r: The number of experts to keep.
+        """
+        super().__init__()
+        if isinstance(model, MixtralSparseMoeBlock):
+            self.model = model
+        else:
+            self.model = model.model
+        self.r = r
+
+        self.experts_to_drop = None
+        self.cache_space = CacheDataset()
+        self.cache_logits = False
+        self.cache_X = False
+        self.cache_Z = False
+
+    # Forward uses topk
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """ """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.model.gate(hidden_states)
+
+        if self.experts_to_drop is not None:
+            for e in self.experts_to_drop:
+                router_logits[:, e] = -float("inf")
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.model.top_k, dim=-1
+        )
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.model.num_experts
+        ).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.model.num_experts):
+            expert_layer = self.model.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+
+            if top_x.shape[0] == 0:
+                continue
+
+            # in torch it is faster to index using lists than torch tensors
+            top_x_list = top_x.tolist()
+            idx_list = idx.tolist()
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
+            current_hidden_states = (
+                expert_layer(current_state)
+                * routing_weights[top_x_list, idx_list, None]
+            )
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(
+                0, top_x, current_hidden_states.to(hidden_states.dtype)
+            )
+
+        if self.experts_to_drop is not None and (
+            self.cache_logits or self.cache_X or self.cache_Z
+        ):
+            logger.warn(
+                f"Already dropped {self.experts_to_drop} but still storing activations."
+            )
+        self.cache_space.append(
+            alpha=(router_logits if self.cache_logits else None),
+            X=(hidden_states if self.cache_X else None),
+            Z=(final_hidden_states if self.cache_Z else None),
+        )
+
+        final_hidden_states = final_hidden_states.reshape(
+            batch_size, sequence_length, hidden_dim
+        )
+
+        return final_hidden_states, router_logits
+
+    @torch.no_grad()
+    def enumerate(self):
+        # disable caching
+        self.cache_logits = False
+        self.cache_X = False
+        self.cache_Z = False
+        loss_history = dict()
+
+        with torch.inference_mode():
+            for dropped in I.combinations(
+                range(self.model.num_experts), self.model.num_experts - self.r
+            ):
+                self.experts_to_drop = dropped
+                loss = 0
+
+                for hidden_states, final_hidden_states in zip(
+                    self.cache_space.Xs, self.cache_space.Zs
+                ):
+                    hidden_states = hidden_states.to(
+                        device=self.model.gate.weight.data.device, non_blocking=True
+                    )
+                    final_hidden_states = final_hidden_states.to(
+                        dtype=torch.float64,
+                        device=self.model.gate.weight.data.device,
+                        non_blocking=True,
+                    )
+                    final_hidden_states_e, _ = self.forward(hidden_states.unsqueeze(0))
+                    # compute the |Z - Z_e|_2 L2 loss
+                    loss += torch.norm(
+                        final_hidden_states
+                        - final_hidden_states_e.squeeze(0).to(torch.float64)
+                    ).item()
+                loss_history[dropped] = loss
+
+        self.experts_to_drop = min(loss_history, key=loss_history.get)
+        return loss_history
+
+    @torch.no_grad()
+    def prune(self):
+        assert self.experts_to_drop is not None
+        assert len(self.experts_to_drop) == self.model.num_experts - self.r
+        del self.cache_space
+        self.cache_X = False
+        self.cache_Z = False
+
+        experts_to_reserve = sorted(
+            set(range(self.model.num_experts)) - set(self.experts_to_drop)
+        )
+
+        # create a new gate with the experts to reserve
+        gate_new = torch.nn.Linear(
+            in_features=self.model.gate.in_features,
+            out_features=self.r,
+            bias=False,
+            device=self.model.gate.weight.data.device,
+            dtype=torch.bfloat16,
+        )
+        gate_new.weight.data = self.model.gate.weight.data[list(experts_to_reserve)]
+        self.model.gate = gate_new
+
+        self.model.experts = torch.nn.ModuleList(
+            [self.model.experts[i] for i in experts_to_reserve]
+        )
+        self.model.num_experts = self.r
+
+
+class DynamicSkippingMixtralSparseMoeBlockWrapper(nn.Module):
+    def __init__(self, model: MixtralSparseMoeBlock, beta: float):
+        super().__init__()
+        assert isinstance(model, MixtralSparseMoeBlock)
+        assert model.top_k == 2
+        self.hidden_dim = model.hidden_dim
+        self.ffn_dim = model.ffn_dim
+        self.num_experts = model.num_experts
+        self.top_k = model.top_k
+        self.gate = model.gate
+        self.experts = model.experts
+
+        self.beta = beta
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """ """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.top_k, dim=-1
+        )
+
+        # (batch * sequence_length)
+        mask_top1 = routing_weights[:, 1] < self.beta * routing_weights[:, 0]
+        routing_weights[mask_top1, 1] = 0
+
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        # (batch * sequence_length, self.top_k, n_experts)
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.num_experts
+        )
+
+        expert_mask[mask_top1, 1, :] = 0
+        expert_mask = expert_mask.permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            top_x, indices = torch.where(expert_mask[expert_idx])
+
+            if indices.shape[0] == 0:
+                continue
+
+            # in torch it is faster to index using lists than torch tensors
+            indices_list = indices.tolist()
+            top_x_list = top_x.tolist()
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, indices_list].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(
+                current_state, routing_weights[indices_list, top_x_list, None]
+            )
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(
+                0, indices, current_hidden_states.to(hidden_states.dtype)
+            )
+        final_hidden_states = final_hidden_states.reshape(
+            batch_size, sequence_length, hidden_dim
+        )
+        return final_hidden_states, router_logits
```
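These two wrappers back the expert-sparsity methods registered under `fusion_bench/method/expert_sparsity/mixtral/`. A minimal usage sketch of the pruning path, assuming a loaded `MixtralForCausalLM` named `model` and an iterable of tokenized calibration batches named `calib_batches` (both names are illustrative, not part of this diff):

```python
import torch

from fusion_bench.models.expert_sparsity.mixtral.wrapper import (
    PrunableMixtralSparseMoeBlockWrapper,
)

r = 6  # number of experts to keep per MoE layer (out of 8 for Mixtral-8x7B)
for layer in model.model.layers:
    # wrap each MoE block and record its inputs (X) and outputs (Z)
    layer.block_sparse_moe = PrunableMixtralSparseMoeBlockWrapper(
        layer.block_sparse_moe, r=r
    )
    layer.block_sparse_moe.cache_X = True
    layer.block_sparse_moe.cache_Z = True

with torch.inference_mode():
    for batch in calib_batches:  # fills each wrapper's CacheDataset
        model(**batch)

for layer in model.model.layers:
    moe = layer.block_sparse_moe
    moe.enumerate()  # score every way of dropping (num_experts - r) experts
    moe.prune()      # keep the best r experts and shrink the router gate
```

`enumerate` selects the dropped subset that minimizes the L2 distance between the cached outputs and the outputs recomputed with those experts masked out. `DynamicSkippingMixtralSparseMoeBlockWrapper` takes the complementary, inference-time route: all experts are kept, but a token's second expert is skipped whenever its routing weight falls below `beta` times the top weight. With `beta = 0.7` and top-2 weights `(0.8, 0.5)`, for instance, the second expert is skipped because `0.5 < 0.7 * 0.8 = 0.56`.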
fusion_bench/programs/fabric_fusion_program.py
CHANGED

```diff
@@ -296,13 +296,17 @@ class FabricModelFusionProgram(
         if hydra_output_dir is not None:
             os.makedirs(self.log_dir, exist_ok=True)
             try:
-                os.symlink(
-                    hydra_output_dir,
-                    os.path.join(
-                        self.log_dir,
-                        "hydra_output_" + os.path.basename(hydra_output_dir),
-                    ),
-                    target_is_directory=True,
-                )
+                # if the system is windows, use the `mklink` command in "CMD" to create the symlink
+                if os.name == "nt":
+                    os.system(f"mklink /J {os.path.abspath(os.path.join(self.log_dir, 'hydra_output_' + os.path.basename(hydra_output_dir)))} {os.path.abspath(hydra_output_dir)}")
+                else:
+                    os.symlink(
+                        hydra_output_dir,
+                        os.path.join(
+                            self.log_dir,
+                            "hydra_output_" + os.path.basename(hydra_output_dir),
+                        ),
+                        target_is_directory=True,
+                    )
             except OSError as e:
                 log.warning(f"Failed to create symbolic link: {e}")
```
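The Windows branch exists because `os.symlink` on Windows generally requires administrator privileges or Developer Mode, whereas `mklink /J` creates a directory junction that ordinary users can create.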
fusion_bench/utils/__init__.py
CHANGED

```diff
@@ -2,14 +2,15 @@
 import importlib
 from typing import Iterable

-from . import data, functools, path
+from . import data, functools, path, pylogger
 from .cache_utils import *
 from .devices import *
 from .dtype import parse_dtype
 from .fabric import seed_everything_by_time
 from .instantiate_utils import instantiate, is_instantiable
+from .json import load_from_json, save_to_json
+from .lazy_state_dict import LazyStateDict
 from .misc import *
 from .packages import import_object
 from .parameters import *
 from .timer import timeit_context
-from .lazy_state_dict import LazyStateDict
```
fusion_bench/utils/dtype.py
CHANGED

```diff
@@ -13,6 +13,7 @@ from transformers.utils import (
 PRECISION_STR_TO_DTYPE: Dict[str, torch.dtype] = {
     "fp16": torch.float16,
     "float16": torch.float16,
+    "half": torch.float16,
     "bf16": torch.bfloat16,
     "bfloat16": torch.bfloat16,
     "float": torch.float32,
@@ -50,7 +51,7 @@ def parse_dtype(dtype: Optional[str]):

     dtype = dtype.strip('"')
     if dtype not in PRECISION_STR_TO_DTYPE:
-        raise ValueError(f"Unsupported dtype: {dtype}")
+        raise ValueError(f"Unsupported dtype string: {dtype}")

     dtype = PRECISION_STR_TO_DTYPE[dtype]
     return dtype
```
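The net effect of these two hunks: `parse_dtype` now accepts `"half"` as an alias for `torch.float16` and names the offending string when it fails. An illustrative check:

```python
import torch

from fusion_bench.utils.dtype import parse_dtype

assert parse_dtype("half") is torch.float16
assert parse_dtype("bf16") is torch.bfloat16
parse_dtype("int4")  # raises ValueError: Unsupported dtype string: int4
```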
fusion_bench/utils/fabric.py
CHANGED

```diff
@@ -1,17 +1,24 @@
 import time
+from typing import Optional

 import lightning as L

+from fusion_bench.utils.pylogger import getRankZeroLogger

-def seed_everything_by_time(fabric: L.Fabric):
+log = getRankZeroLogger(__name__)
+
+
+def seed_everything_by_time(fabric: Optional[L.Fabric] = None):
     """
     Set seed for all processes by time.
     """
     # set seed for all processes
-    if fabric.is_global_zero:
+    if fabric is None or fabric.is_global_zero:
         seed = int(time.time())
     else:
         seed = None
-    fabric.barrier()
-    seed = fabric.broadcast(seed, src=0)
+    if fabric is not None:
+        log.debug(f"Broadcasting seed `{seed}` to all processes")
+        fabric.barrier()
+        seed = fabric.broadcast(seed, src=0)
     L.seed_everything(seed)
```
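`seed_everything_by_time` now also works without a `Fabric` instance, and the distributed path synchronizes at a barrier before broadcasting the rank-0 seed, so every process ends up with the same wall-clock seed. A usage sketch (the two-device CPU `Fabric` setup is illustrative):

```python
import lightning as L

from fusion_bench.utils.fabric import seed_everything_by_time

# single process: seeds directly from time.time()
seed_everything_by_time()

# distributed: rank 0 picks the seed and broadcasts it to all ranks
fabric = L.Fabric(accelerator="cpu", devices=2)
fabric.launch()
seed_everything_by_time(fabric)
```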
fusion_bench/utils/lazy_state_dict.py
CHANGED

```diff
@@ -1,13 +1,16 @@
 import json
 import logging
 import os
-from typing import Dict, Iterator, Optional, Tuple
+from copy import deepcopy
+from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Tuple, Type

 import torch
+from accelerate import init_empty_weights
 from accelerate.utils.constants import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
 from huggingface_hub import snapshot_download
 from safetensors import safe_open
 from safetensors.torch import load_file
+from torch import nn
 from transformers import AutoConfig

 from fusion_bench.utils.dtype import parse_dtype
```
```diff
@@ -59,6 +62,8 @@ class LazyStateDict:
     def __init__(
         self,
         checkpoint: str,
+        meta_module_class: Optional[Type[nn.Module]] = None,
+        meta_module: Optional[nn.Module] = None,
         cache_state_dict: bool = False,
         torch_dtype: Optional[torch.dtype] = None,
         device: str = "cpu",
```
```diff
@@ -66,6 +71,22 @@ class LazyStateDict:
         hf_cache_dir: Optional[str] = None,
         hf_proxies: Optional[Dict] = None,
     ):
+        self.meta_module_class = meta_module_class
+        self.meta_module = meta_module
+        if self.meta_module_class is not None:
+            if self.meta_module is not None:
+                raise ValueError(
+                    "Cannot provide both meta_module_class and meta_module, please provide only one."
+                )
+            with init_empty_weights():
+                self.meta_module = self.meta_module_class.from_pretrained(
+                    checkpoint,
+                    torch_dtype=torch_dtype,
+                    revision=hf_revision,
+                    cache_dir=hf_cache_dir,
+                    proxies=hf_proxies,
+                )
+
         self._checkpoint = checkpoint
         self._local_path = resolve_checkpoint_path(
             checkpoint,
```
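A sketch of the new meta-module path (the checkpoint id is illustrative): under `init_empty_weights` the module skeleton lives on the meta device, so no weight tensors are allocated, and its `state_dict()` keys are later used to filter what the lazy index exposes.

```python
import torch
from transformers import AutoModelForCausalLM

from fusion_bench.utils import LazyStateDict

state_dict = LazyStateDict(
    "mistralai/Mistral-7B-v0.1",  # any sharded HF checkpoint works here
    meta_module_class=AutoModelForCausalLM,
    torch_dtype=torch.bfloat16,
)
```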
```diff
@@ -78,10 +99,32 @@ class LazyStateDict:
             self._resolve_checkpoint_files(self._local_path)
         )

-        if cache_state_dict:
-            self._state_dict_cache = {}
+        if self._index is not None:
+            # if meta_module is provided, remove the keys that are not in the meta_module
+            if self.meta_module is not None:
+                meta_module_state_dict = self.meta_module.state_dict()
+                for key in tuple(self._index.keys()):
+                    if key not in meta_module_state_dict:
+                        self._index.pop(key)
+            if cache_state_dict:
+                self._state_dict_cache = {}
+            else:
+                self._state_dict_cache = None
+        elif len(self._checkpoint_files) == 1 and self._checkpoint_files[0].endswith(
+            WEIGHTS_NAME
+        ):
+            log.info(f"Loading full state dict from {WEIGHTS_NAME}")
+            self._state_dict_cache = torch.load(self._checkpoint_files[0])
+            # if meta_module is provided, remove the keys that are not in the meta_module
+            if self.meta_module is not None:
+                meta_module_state_dict = self.meta_module.state_dict()
+                for key in tuple(self._state_dict_cache.keys()):
+                    if key not in meta_module_state_dict:
+                        self._state_dict_cache.pop(key)
         else:
-            self._state_dict_cache = None
+            raise ValueError(
+                f"Cannot determine the type of checkpoint, please provide a checkpoint path to a file containing a whole state dict with file name {WEIGHTS_NAME} or {SAFE_WEIGHTS_NAME}, or the index of a sharded checkpoint ending with `.index.json`."
+            )

         self._torch_dtype = parse_dtype(torch_dtype)
         self._device = device
```
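Taken together, the constructor now dispatches on checkpoint type: a sharded checkpoint keeps its (optionally filtered) index for lazy per-tensor loading, a single `pytorch_model.bin` is loaded eagerly into the cache, and anything else fails fast with an explicit error instead of breaking later.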
```diff
@@ -152,6 +195,8 @@ class LazyStateDict:
             checkpoint_files = [
                 os.path.join(checkpoint_folder, f) for f in checkpoint_files
             ]
+        else:
+            index = None
         return index, index_filename, checkpoint_files

     def _load_tensor_from_checkpoint_file(
```
```diff
@@ -248,16 +293,24 @@ class LazyStateDict:
     def __iter__(self) -> Iterator[str]:
         if self._index is not None:
             return iter(self._index)
-        return iter(self._state_dict_cache)
+        elif self._state_dict_cache is not None:
+            return iter(self._state_dict_cache)
+        else:
+            raise RuntimeError(
+                "Unexpected error: cannot determine the keys in the state dict."
+            )

-    def keys(self) -> …
-        …
+    def keys(self) -> Iterator[str]:
+        for key in self:
+            yield key

-    def values(self) -> …
-        …
+    def values(self) -> Iterator[torch.Tensor]:
+        for key in self:
+            yield self[key]

     def items(self) -> Iterator[Tuple[str, torch.Tensor]]:
-        …
+        for key in self:
+            yield key, self[key]

     def __repr__(self) -> str:
         if self._index is not None:
```

(The removed lines marked `…` were elided by the diff viewer and could not be recovered.)
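With these changes `keys`, `values`, and `items` are generators driven by `__iter__` and `__getitem__`, so tensors are read from their shards one at a time rather than all at once. Reusing `state_dict` from the sketch above:

```python
# each tensor is loaded from its checkpoint shard only when yielded
for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape))
```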
```diff
@@ -266,3 +319,20 @@ class LazyStateDict:
         return (
             f"{self.__class__.__name__}(checkpoint_files={self._checkpoint_files})"
         )
+
+    def get_parameter(self, target: str) -> torch.Tensor:
+        return self[target]
+
+    def get_submodule(self, target: str) -> nn.Module:
+        if self.meta_module is not None:
+            module: nn.Module = deepcopy(self.meta_module.get_submodule(target))
+            module.to_empty(device=self._device)
+            state_dict = {}
+            for name, _ in module.named_parameters():
+                state_dict[name] = self[f"{target}.{name}"]
+            module.load_state_dict(state_dict)
+            return module
+        else:
+            raise RuntimeError(
+                "Cannot get submodule because meta_module is not provided."
+            )
```
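Finally, `get_parameter` and `get_submodule` mirror the `nn.Module` API, so a `LazyStateDict` can stand in for a model when only a slice of it is needed. Continuing the sketch above (the target paths are illustrative):

```python
# materialize one attention block: copy the meta skeleton, allocate real
# storage with to_empty(), then fill it tensor by tensor from the shards
attn = state_dict.get_submodule("model.layers.0.self_attn")
w = state_dict.get_parameter("model.layers.0.self_attn.q_proj.weight")
```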
fusion_bench/utils/pylogger.py
CHANGED