PyPI - fusion-bench - Versions diffs - 0.2.21__tar.gz → 0.2.22__tar.gz - Mend

fusion-bench 0.2.21__tar.gz → 0.2.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (940) hide show

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/PKG-INFO RENAMED Viewed

@@ -1,30 +1,8 @@
 Metadata-Version: 2.4
 Name: fusion_bench
-Version: 0.2.21
+Version: 0.2.22
 Summary: A Comprehensive Benchmark of Deep Model Fusion
 Author-email: Anke Tang <tang.anke@foxmail.com>
-License: MIT License
-        Copyright (c) 2024 Anke Tang
-        Permission is hereby granted, free of charge, to any person obtaining a copy
-        of this software and associated documentation files (the "Software"), to deal
-        in the Software without restriction, including without limitation the rights
-        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-        copies of the Software, and to permit persons to whom the Software is
-        furnished to do so, subject to the following conditions:
-        The above copyright notice and this permission notice shall be included in all
-        copies or substantial portions of the Software.
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-        SOFTWARE.
 Project-URL: Repository, https://github.com/tanganke/fusion_bench
 Project-URL: Homepage, https://github.com/tanganke/fusion_bench
 Project-URL: Issues, https://github.com/tanganke/fusion_bench/issues

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/__init__.py RENAMED Viewed

@@ -19,9 +19,28 @@ from . import (
     tasks,
     utils,
 )
+from .constants import RuntimeConstants
 from .method import BaseAlgorithm, BaseModelFusionAlgorithm
 from .mixins import auto_register_config
 from .modelpool import BaseModelPool
-from .models import separate_io
+from .models import (
+    create_default_model_card,
+    load_model_card_template,
+    save_pretrained_with_remote_code,
+    separate_io,
+)
+from .programs import BaseHydraProgram
 from .taskpool import BaseTaskPool
-from .utils import parse_dtype, print_parameters, timeit_context
+from .utils import (
+    cache_with_joblib,
+    get_rankzero_logger,
+    import_object,
+    instantiate,
+    parse_dtype,
+    print_parameters,
+    seed_everything_by_time,
+    set_default_cache_dir,
+    set_print_function_call,
+    set_print_function_call_permeanent,
+    timeit_context,
+)

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/constants/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@
 import importlib.metadata
 from .paths import *
+from .runtime import RuntimeConstants
 # fusionbench version
 FUSION_BENCH_VERSION = importlib.metadata.version("fusion-bench")

fusion_bench-0.2.22/fusion_bench/constants/runtime.py ADDED Viewed

@@ -0,0 +1,57 @@
+import threading
+from pathlib import Path
+from typing import Optional, Union
+class RuntimeConstants:
+    """
+    This class holds constants related to the runtime environment of the Fusion Bench framework.
+    It includes default values for cache directories and other runtime configurations.
+    Implemented as a thread-safe singleton to ensure consistent runtime configuration
+    across the entire application.
+    """
+    _instance: Optional["RuntimeConstants"] = None
+    _lock = threading.Lock()
+    def __new__(cls) -> "RuntimeConstants":
+        """Create a new instance using singleton pattern with thread safety."""
+        with cls._lock:
+            # Double-check locking pattern
+            if cls._instance is None:
+                cls._instance = super(RuntimeConstants, cls).__new__(cls)
+                cls._instance._initialized = False
+            return cls._instance
+    def __init__(self):
+        """Initialize the singleton instance only once."""
+        if not self._initialized:
+            # Add your runtime constants here
+            self._initialized = True
+    debug = False
+    @property
+    def cache_dir(self) -> Path:
+        from fusion_bench.utils.cache_utils import DEFAULT_CACHE_DIR
+        return DEFAULT_CACHE_DIR
+    @cache_dir.setter
+    def cache_dir(self, path: Union[str, Path]) -> None:
+        from fusion_bench.utils.cache_utils import set_default_cache_dir
+        set_default_cache_dir(path)
+    @property
+    def print_function_call(self) -> bool:
+        from fusion_bench.utils.instantiate_utils import PRINT_FUNCTION_CALL
+        return PRINT_FUNCTION_CALL
+    @print_function_call.setter
+    def print_function_call(self, enable: bool) -> None:
+        from fusion_bench.utils.instantiate_utils import set_print_function_call
+        set_print_function_call(enable)

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/__init__.py RENAMED Viewed

@@ -90,7 +90,10 @@ _import_structure = {
         "MixtralForCausalLMMergingAlgorithm",
     ],
     "dawe": ["DataAdaptiveWeightEnsemblingForCLIP"],
-    "we_moe": ["CLIPWeightEnsemblingMoEAlgorithm"],
+    "we_moe": [
+        "CLIPWeightEnsemblingMoEAlgorithm",
+        "FlanT5WeightEnsemblingMoEAlgorithm",
+    ],
     "rankone_moe": ["CLIPRankOneMoEAlgorithm", "RankOneMoEAlgorithm"],
     "sparse_we_moe": [
         "SparseWeightEnsemblingMoEAlgorithm",
@@ -228,7 +231,10 @@ if TYPE_CHECKING:
     from .task_arithmetic import TaskArithmeticAlgorithm
     from .task_singular_vector import TaskSingularVectorMerging
     from .ties_merging import TiesMergingAlgorithm
-    from .we_moe import CLIPWeightEnsemblingMoEAlgorithm
+    from .we_moe import (
+        CLIPWeightEnsemblingMoEAlgorithm,
+        FlanT5WeightEnsemblingMoEAlgorithm,
+    )
     from .weighted_average import WeightedAverageAlgorithm, WeightedAverageForLLama
 else:

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/bitdelta/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """
 Adapted from https://github.com/FasterDecoding/BitDelta
 """
 from .bitdelta import BitDeltaAlgorithm

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/classification/clip_finetune.py RENAMED Viewed

@@ -393,7 +393,7 @@ def convert_l_lora_state_dict_to_hf(
     base_model_name: Optional[str] = None,
 ):
     """
-    Convert a linearized Lora model's checkpoint to Hugggingface's format.
+    Convert a linearized Lora model's checkpoint to huggingface's format.
     Args:
         pretrained_path (str): The path to the pretrained model.

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/fisher_merging/clip_fisher_merging.py RENAMED Viewed

@@ -32,7 +32,6 @@ class FisherMergingForCLIPVisionModel(
     zeroshot_weights = {}
     _config_mapping = FisherMergingAlgorithm._config_mapping | {
-        "zeroshot_weights_cache_dir": "zeroshot_weights_cache_dir",
         "_dataloader_kwargs": "dataloader_kwargs",
     }
@@ -44,7 +43,6 @@ class FisherMergingForCLIPVisionModel(
         minimal_fisher_weight,
         num_fisher_examples,
         dataloader_kwargs: DictConfig,
-        zeroshot_weights_cache_dir=None,
         **kwargs,
     ):
         """
@@ -56,7 +54,6 @@ class FisherMergingForCLIPVisionModel(
             minimal_fisher_weight (float): Minimal value for Fisher weights to avoid numerical issues.
             num_fisher_examples (int): Number of examples to compute Fisher weights.
             dataloader_kwargs (DictConfig): Configuration for the dataloader.
-            zeroshot_weights_cache_dir (str, optional): Directory to cache zero-shot weights. Defaults to None.
             **kwargs: Additional keyword arguments.
         """
         super().__init__(
@@ -66,7 +63,6 @@ class FisherMergingForCLIPVisionModel(
             num_fisher_examples=num_fisher_examples,
         )
         self.dataloader_kwargs = dataloader_kwargs
-        self.zeroshot_weights_cache_dir = zeroshot_weights_cache_dir
         for key, value in kwargs.items():
             log.warning(f"Unused argument: {key}={value}")
             setattr(self, key, value)

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/fisher_merging/gpt2_fisher_merging.py RENAMED Viewed

@@ -15,10 +15,10 @@ from transformers import GPT2ForSequenceClassification, GPT2Model
 from transformers.data import default_data_collator
 from transformers.models.gpt2.modeling_gpt2 import Conv1D
-from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.mixins import LightningFabricMixin, auto_register_config
 from fusion_bench.modelpool import GPT2ForSequenceClassificationPool
 from fusion_bench.utils import timeit_context
-from fusion_bench.mixins import auto_register_config
 from .fisher_merging import FisherMergingAlgorithm, get_param_squared_gradients

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/linear/simple_average_for_llama.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import os
 from copy import deepcopy
 from typing import TYPE_CHECKING, Optional
@@ -7,13 +8,16 @@ from typing_extensions import override
 from fusion_bench import timeit_context
 from fusion_bench.method.base_algorithm import BaseAlgorithm
 from fusion_bench.method.simple_average import SimpleAverageAlgorithm
+from fusion_bench.mixins import auto_register_config
 from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+from fusion_bench.models.hf_utils import create_default_model_card
 from fusion_bench.utils import instantiate
-from fusion_bench.utils.pylogger import getRankZeroLogger
+from fusion_bench.utils.pylogger import get_rankzero_logger
-log = getRankZeroLogger(__name__)
+log = get_rankzero_logger(__name__)
+@auto_register_config
 class SimpleAverageForLlama(BaseAlgorithm):
     R"""
     A simple averaging algorithm for LLama models. If `merge_backbone` is set to `True`, the backbone of the model will be averaged and the rest of the model will be loaded from the pre-trained model.
@@ -29,21 +33,14 @@ class SimpleAverageForLlama(BaseAlgorithm):
         ```
     """
-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "merge_backbone": "merge_backbone",
-        "show_pbar": "show_pbar",
-    }
     def __init__(
         self,
         merge_backbone: bool,
         model_save_path: Optional[str] = None,
         show_pbar: bool = False,
+        **kwargs,
     ):
-        super().__init__()
-        self.merge_backbone = merge_backbone
-        self.model_save_path = model_save_path
-        self.show_pbar = show_pbar
+        super().__init__(**kwargs)
     @override
     def run(self, modelpool: CausalLMPool):
@@ -75,4 +72,12 @@ class SimpleAverageForLlama(BaseAlgorithm):
             with timeit_context(f"Saving the model to {self.model_save_path}"):
                 tokenizer.save_pretrained(self.model_save_path)
                 model.save_pretrained(self.model_save_path)
+                model_card_str = create_default_model_card(
+                    models=[modelpool.get_model_path(m) for m in modelpool.model_names],
+                    description="Merged model using simple averaging.",
+                    algorithm_config=self.config,
+                    modelpool_config=modelpool.config,
+                )
+                with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
+                    f.write(model_card_str)
         return model

{fusion_bench-0.2.21 → fusion_bench-0.2.22}/fusion_bench/method/simple_average.py RENAMED Viewed

@@ -61,8 +61,8 @@ def simple_average(
 @auto_register_config
 class SimpleAverageAlgorithm(
-    BaseAlgorithm,
     SimpleProfilerMixin,
+    BaseAlgorithm,
 ):
     def __init__(self, show_pbar: bool = False, **kwargs):
         """
@@ -120,13 +120,13 @@ class SimpleAverageAlgorithm(
         if isinstance(forward_model, LazyStateDict):
             # if the model is a LazyStateDict, convert it to an empty module
             forward_model = forward_model.meta_module.to_empty(
-                device=(
-                    "cpu"
-                    if forward_model._torch_dtype is None
-                    else forward_model._torch_dtype
-                )
+                device=forward_model._device
             )
-        forward_model.load_state_dict(sd)
+        result = forward_model.load_state_dict(sd, strict=False)
+        if result.unexpected_keys:
+            raise ValueError(f"Unexpected keys in state dict: {result.unexpected_keys}")
+        if result.missing_keys:
+            log.warning(f"Missing keys in state dict: {result.missing_keys}")
         # print profile report and log the merged models
         self.print_profile_summary()
         log.info(f"merged {len(merged_model_names)} models:")

fusion_bench-0.2.22/fusion_bench/method/smile_upscaling/causal_lm_upscaling.py ADDED Viewed

@@ -0,0 +1,371 @@
+import logging
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union
+import torch
+from accelerate import init_empty_weights
+from tqdm.auto import tqdm
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    LlamaForCausalLM,
+    MistralForCausalLM,
+    PretrainedConfig,
+    PreTrainedModel,
+    Qwen2ForCausalLM,
+)
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
+from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.compat.modelpool import to_modelpool
+from fusion_bench.constants import RuntimeConstants
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
+from fusion_bench.modelpool import CausalLMPool
+from fusion_bench.models.hf_utils import (
+    create_default_model_card,
+    save_pretrained_with_remote_code,
+)
+from fusion_bench.models.modeling_smile_llama import (
+    SmileLlamaConfig,
+    SmileLlamaForCausalLM,
+    SmileLlamaModel,
+)
+from fusion_bench.models.modeling_smile_llama.modeling_smile_llama import (
+    SmileLlamaDecoderLayer,
+)
+from fusion_bench.models.modeling_smile_mistral import (
+    SmileMistralConfig,
+    SmileMistralForCausalLM,
+    SmileMistralModel,
+)
+from fusion_bench.models.modeling_smile_mistral.modeling_smile_mistral import (
+    SmileMistralDecoderLayer,
+)
+# Import all SMILE configurations and models
+from fusion_bench.models.modeling_smile_qwen2 import (
+    SmileQwen2Config,
+    SmileQwen2ForCausalLM,
+    SmileQwen2Model,
+)
+from fusion_bench.models.modeling_smile_qwen2.modeling_smile_qwen2 import (
+    SmileQwen2DecoderLayer,
+)
+from fusion_bench.models.smile_moe.linear_from_hf_config import (
+    ExpertNotTrainedError,
+    upscale_to_smile_linear,
+)
+from fusion_bench.utils.dtype import parse_dtype
+from fusion_bench.utils.parameters import print_parameters
+log = logging.getLogger(__name__)
+# Model type mappings
+MODEL_TYPE_MAPPINGS = {
+    "qwen2": {
+        "base_model_cls": Qwen2ForCausalLM,
+        "base_decoder_layer_cls": Qwen2DecoderLayer,
+        "smile_config_cls": SmileQwen2Config,
+        "smile_model_cls": SmileQwen2ForCausalLM,
+        "smile_base_model_cls": SmileQwen2Model,
+        "smile_decoder_layer_cls": SmileQwen2DecoderLayer,
+        "description": "Qwen2",
+    },
+    "llama": {
+        "base_model_cls": LlamaForCausalLM,
+        "base_decoder_layer_cls": LlamaDecoderLayer,
+        "smile_config_cls": SmileLlamaConfig,
+        "smile_model_cls": SmileLlamaForCausalLM,
+        "smile_base_model_cls": SmileLlamaModel,
+        "smile_decoder_layer_cls": SmileLlamaDecoderLayer,
+        "description": "Llama",
+    },
+    "mistral": {
+        "base_model_cls": MistralForCausalLM,
+        "base_decoder_layer_cls": MistralDecoderLayer,
+        "smile_config_cls": SmileMistralConfig,
+        "smile_model_cls": SmileMistralForCausalLM,
+        "smile_base_model_cls": SmileMistralModel,
+        "smile_decoder_layer_cls": SmileMistralDecoderLayer,
+        "description": "Mistral",
+    },
+}
+def detect_model_type(
+    model_or_config: Union[PreTrainedModel, PretrainedConfig, str],
+) -> str:
+    """
+    Detect the model type from a model, config, or model name/path.
+    Args:
+        model_or_config: Model, config, or model name/path to detect type from
+    Returns:
+        str: The detected model type ("qwen2", "llama", "mistral")
+    Raises:
+        ValueError: If model type cannot be detected or is not supported
+    """
+    if isinstance(model_or_config, str):
+        # Load config from path/name
+        config = AutoConfig.from_pretrained(model_or_config)
+    elif isinstance(model_or_config, PreTrainedModel):
+        config = model_or_config.config
+    elif isinstance(model_or_config, PretrainedConfig):
+        config = model_or_config
+    else:
+        raise ValueError(
+            f"Unsupported type for model type detection: {type(model_or_config)}"
+        )
+    model_type = getattr(config, "model_type", "").lower()
+    # Handle various model type variations
+    if model_type in MODEL_TYPE_MAPPINGS:
+        return model_type
+    else:
+        raise ValueError(
+            f"Unsupported model type: {model_type}. Supported types: {list(MODEL_TYPE_MAPPINGS.keys())}"
+        )
+@auto_register_config
+class SmileCausalLMUpscalingAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
+    R"""
+    SmileCausalLMUpscalingAlgorithm is a generic model fusion algorithm designed to upscale
+    a pretrained CausalLM model using a set of fine-tuned expert models. The algorithm
+    supports Qwen2, Llama, and Mistral model architectures and leverages Singular Value
+    Decomposition (SVD) to merge the weights of the pretrained model and the expert models
+    into a new upscaled model.
+    The algorithm automatically detects the model type and uses the appropriate SMILE
+    configuration and model classes.
+    Methods:
+        run(modelpool: BaseModelPool) -> Union[SmileQwen2ForCausalLM, SmileLlamaForCausalLM, SmileMistralForCausalLM]:
+            Executes the upscaling process and returns the upscaled model.
+        merge(pretrained_model: PreTrainedModel, finetuned_models: List[PreTrainedModel]) -> PreTrainedModel:
+            Merges the pretrained model with the fine-tuned models to create an upscaled model.
+    """
+    modelpool: CausalLMPool
+    def __init__(
+        self,
+        device,
+        accelerator,
+        model_save_path,
+        model_dtype,
+        num_experts_per_tok,
+        rank_of_router,
+        rank_of_expert,
+        save_with_remote_code: bool = True,
+        model_type: str = None,  # Optional: explicitly specify model type
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.model_mappings = None  # Will be set during run()
+        if not torch.cuda.is_available():
+            if "cuda" in self.device:
+                self.device = "cpu"
+            if "cuda" in self.accelerator:
+                self.accelerator = "cpu"
+    @torch.no_grad()
+    def run(self, modelpool) -> PreTrainedModel:
+        """
+        Executes the upscaling process.
+        Args:
+            modelpool (ModelPool): The pool of models to be used for upscaling.
+        Returns:
+            PreTrainedModel: The upscaled model (specific type depends on detected model architecture).
+        """
+        self.modelpool = modelpool = to_modelpool(modelpool)
+        config = self.config
+        # Auto-detect model type if not specified
+        if self.model_type is None:
+            self.model_type = detect_model_type(
+                modelpool.get_model_path("_pretrained_")
+            )
+            log.info(f"Auto-detected model type: {self.model_type}")
+        # Get the appropriate model mappings
+        if self.model_type not in MODEL_TYPE_MAPPINGS:
+            raise ValueError(
+                f"Unsupported model type: {self.model_type}. Supported: {list(MODEL_TYPE_MAPPINGS.keys())}"
+            )
+        self.model_mappings = MODEL_TYPE_MAPPINGS[self.model_type]
+        log.info(f"Using {self.model_mappings['description']} model architecture")
+        with self.profile("load pretrained model"):
+            pretrained_model = modelpool.load_pretrained_model()
+        with self.profile("load fine-tuned model"):
+            finetuned_models = [
+                m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
+            ]
+        if self.device == "cuda" and torch.cuda.is_available():
+            pretrained_model = pretrained_model.cuda()
+            print("parameter count of pretrained model:")
+            print_parameters(pretrained_model)
+            finetuned_models = [m.cuda() for m in finetuned_models]
+        with self.profile("merge model"):
+            model = self.merge(pretrained_model, finetuned_models)
+        self.print_profile_summary()
+        print("parameter count of upscaled MoE model:")
+        print_parameters(model)
+        print(model)
+        if self.model_dtype is not None:
+            model.to(dtype=parse_dtype(self.model_dtype))
+        if self.model_save_path is not None:
+            if os.path.dirname(self.model_save_path):
+                os.makedirs(os.path.dirname(self.model_save_path), exist_ok=True)
+            log.info(f"Saving model to {self.model_save_path}")
+            tokenizer = self.modelpool.load_tokenizer()
+            tokenizer.save_pretrained(self.model_save_path)
+            if not self.save_with_remote_code:
+                model.save_pretrained(self.model_save_path)
+            else:
+                # Use the appropriate auto_map for the detected model type
+                auto_map = {
+                    "AutoConfig": self.model_mappings["smile_config_cls"],
+                    "AutoModel": self.model_mappings["smile_base_model_cls"],
+                    "AutoModelForCausalLM": self.model_mappings["smile_model_cls"],
+                }
+                save_pretrained_with_remote_code(
+                    model,
+                    auto_map=auto_map,
+                    save_directory=self.model_save_path,
+                )
+            # save readme
+            model_card_str = create_default_model_card(
+                models=[modelpool.get_model_path(m) for m in modelpool.all_model_names],
+                description=f"Merged {self.model_mappings['description']} model using SMILE Upscaling",
+                algorithm_config=self.config,
+                modelpool_config=modelpool.config,
+            )
+            with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
+                f.write(model_card_str)
+        return model
+    def merge(
+        self,
+        pretrained_model: PreTrainedModel,
+        finetuned_models: List[PreTrainedModel],
+    ) -> PreTrainedModel:
+        """
+        Merges the pretrained model with the fine-tuned models to create an upscaled model.
+        Args:
+            pretrained_model (PreTrainedModel): The pretrained model.
+            finetuned_models (List[PreTrainedModel]): A list of fine-tuned models.
+        Returns:
+            PreTrainedModel: The upscaled model (specific type depends on model architecture).
+        """
+        with init_empty_weights():
+            pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
+            if isinstance(pretrained_model_config, str):
+                pretrained_path = pretrained_model_config
+            else:
+                pretrained_path = pretrained_model_config.get(
+                    "path", pretrained_model_config["pretrained_model_name_or_path"]
+                )
+            base_config = AutoConfig.from_pretrained(pretrained_path)
+            # Create the appropriate SMILE config for the detected model type
+            SmileConfigClass = self.model_mappings["smile_config_cls"]
+            model_config = SmileConfigClass(
+                num_experts_per_tok=self.num_experts_per_tok,
+                rank_of_router=self.rank_of_router,
+                rank_of_expert=self.rank_of_expert,
+                num_local_experts=len(finetuned_models),
+                **base_config.to_dict(),
+            )
+            # Create the appropriate SMILE model for the detected model type
+            SmileModelClass = self.model_mappings["smile_model_cls"]
+            model = SmileModelClass(model_config)
+        model.to(dtype=pretrained_model.dtype).to_empty(device="cpu")
+        # copy pretrained model weights
+        state_dict = model.state_dict()
+        pretrained_state_dict = pretrained_model.state_dict()
+        for key in list(pretrained_state_dict.keys()):
+            if key not in state_dict:
+                pretrained_state_dict.pop(key)
+        model.load_state_dict(pretrained_state_dict, strict=False)
+        # upscale model
+        BaseDecoderLayerClass = self.model_mappings["base_decoder_layer_cls"]
+        SmileDecoderLayerClass = self.model_mappings["smile_decoder_layer_cls"]
+        for layer_idx in tqdm(
+            range(len(pretrained_model.model.layers)),
+            "Upscaling Modules (layer)",
+            dynamic_ncols=True,
+        ):
+            if RuntimeConstants.debug and layer_idx > 0:
+                log.info(
+                    "Debug mode enabled: processing only the first layer, skipping remaining layers"
+                )
+                break
+            pretrained_layer = pretrained_model.model.layers[layer_idx]
+            finetuned_layers = [m.model.layers[layer_idx] for m in finetuned_models]
+            target_layer = model.model.layers[layer_idx]
+            for n in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+                try:
+                    upscale_to_smile_linear(
+                        base=getattr(pretrained_layer.self_attn, n),
+                        experts=[getattr(m.self_attn, n) for m in finetuned_layers],
+                        target=getattr(target_layer.self_attn, n),
+                        accelerator=self.accelerator,
+                    )
+                except ExpertNotTrainedError:
+                    setattr(
+                        target_layer.self_attn,
+                        n,
+                        getattr(pretrained_layer.self_attn, n),
+                    )
+            for n in ["gate_proj", "up_proj", "down_proj"]:
+                try:
+                    upscale_to_smile_linear(
+                        base=getattr(pretrained_layer.mlp, n),
+                        experts=[getattr(m.mlp, n) for m in finetuned_layers],
+                        target=getattr(target_layer.mlp, n),
+                        accelerator=self.accelerator,
+                    )
+                except ExpertNotTrainedError:
+                    setattr(
+                        target_layer.mlp,
+                        n,
+                        getattr(pretrained_layer.mlp, n),
+                    )
+        return model

fusion-bench 0.2.21__tar.gz → 0.2.22__tar.gz

fusion-bench 0.2.21__tar.gz → 0.2.22__tar.gz