PyPI - fusion-bench - Versions diffs - 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl - Mend

fusion-bench 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

fusion_bench/constants/runtime.py +4 -1
fusion_bench/method/__init__.py +9 -1
fusion_bench/method/base_algorithm.py +29 -19
fusion_bench/method/classification/image_classification_finetune.py +1 -0
fusion_bench/method/concrete_subspace/clip_concrete_tsvm.py +285 -0
fusion_bench/method/task_singular_vector/TSVM.py +7 -6
fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +0 -1
fusion_bench/metrics/model_kinship/__init__.py +2 -0
fusion_bench/metrics/model_kinship/calculate.py +77 -0
fusion_bench/metrics/model_kinship/calculate_split.py +171 -0
fusion_bench/metrics/model_kinship/utility.py +184 -0
fusion_bench/mixins/lightning_fabric.py +2 -8
fusion_bench/mixins/openclip_classification.py +155 -1
fusion_bench/modelpool/base_pool.py +1 -0
fusion_bench/modelpool/openclip_vision/modelpool.py +12 -3
fusion_bench/models/masks/mask_model.py +8 -2
fusion_bench/models/open_clip/modeling.py +68 -5
fusion_bench/models/open_clip/utils.py +13 -2
fusion_bench/models/wrappers/layer_wise_fusion.py +41 -3
fusion_bench/models/wrappers/task_wise_fusion.py +14 -3
fusion_bench/py.typed +1 -0
fusion_bench/scripts/cli.py +21 -16
fusion_bench/scripts/imgui.py +2 -2
fusion_bench/scripts/webui.py +2 -2
fusion_bench/utils/__init__.py +2 -0
fusion_bench/utils/devices.py +3 -1
fusion_bench/utils/hydra_utils.py +75 -0
fusion_bench/utils/instantiate_utils.py +29 -18
fusion_bench/utils/misc.py +16 -0
fusion_bench/utils/parameters.py +33 -0
fusion_bench/utils/rich_utils.py +165 -25
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/METADATA +7 -7
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/RECORD +41 -34
fusion_bench_config/README.md +9 -0
fusion_bench_config/fabric/auto.yaml +1 -0
fusion_bench_config/hydra/default.yaml +3 -1
fusion_bench_config/method/concrete_subspace/clip_concrete_tsvm.yaml +38 -0
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/WHEEL +0 -0
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/entry_points.txt +0 -0
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/licenses/LICENSE +0 -0
{fusion_bench-0.2.29.dist-info → fusion_bench-0.2.31.dist-info}/top_level.txt +0 -0

fusion_bench/constants/runtime.py CHANGED Viewed

@@ -89,7 +89,10 @@ class RuntimeConstants:
             self._initialized = True
     debug = False
-    """Global debug flag for enabling verbose logging and debugging features."""
+    """
+    Global debug flag for enabling verbose logging and debugging features.
+    Use `RuntimeConstants().debug` instead of `RuntimeConstants.debug`
+    """
     @property
     def cache_dir(self) -> Path:

fusion_bench/method/__init__.py CHANGED Viewed

@@ -144,7 +144,15 @@ _extra_objects = {
 if TYPE_CHECKING:
     from .ada_svd import AdaSVDMergingForCLIPVisionModel
-    from .adamerging import *
+    from .adamerging import (
+        CLIPLayerWiseAdaMergingAlgorithm,
+        CLIPTaskWiseAdaMergingAlgorithm,
+        FlanT5LayerWiseAdaMergingAlgorithm,
+        GPT2LayerWiseAdaMergingAlgorithm,
+        LayerWiseAdaMergingForLlamaSFT,
+        ResNetLayerWiseAdamerging,
+        ResNetTaskWiseAdamerging,
+    )
     from .analysis import TaskVectorCosSimilarity, TaskVectorViolinPlot
     from .base_algorithm import BaseAlgorithm, BaseModelFusionAlgorithm
     from .bitdelta import BitDeltaAlgorithm

fusion_bench/method/base_algorithm.py CHANGED Viewed

@@ -40,6 +40,7 @@ from typing import Optional  # noqa: F401
 from fusion_bench.mixins import BaseYAMLSerializable
 from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.utils.misc import DeprecationWarningMeta
 __all__ = ["BaseAlgorithm", "BaseModelFusionAlgorithm"]
@@ -202,27 +203,36 @@ class BaseAlgorithm(BaseYAMLSerializable):
         pass
-BaseModelFusionAlgorithm = BaseAlgorithm
-"""
-Alias for BaseAlgorithm class.
+# Create a deprecated wrapper class that inherits from BaseAlgorithm
+class BaseModelFusionAlgorithm(BaseAlgorithm, metaclass=DeprecationWarningMeta):
+    """
+    Alias for BaseAlgorithm class.
-This alias is provided for backward compatibility and semantic clarity.
-Some users may prefer the more explicit name 'BaseModelFusionAlgorithm'
-to emphasize that this class is specifically designed for model fusion
-tasks, while others may prefer the shorter 'BaseAlgorithm' name.
+    .. deprecated::
+        BaseModelFusionAlgorithm is deprecated and will be removed in a future version.
+        Use :class:`BaseAlgorithm` instead.
-Both names refer to the exact same class and can be used interchangeably.
+    This alias was provided for backward compatibility and semantic clarity.
+    Both names refer to the same base class and can be used interchangeably,
+    but BaseAlgorithm is now the preferred name for all implementations.
-Examples:
-    Using the original name:
-    >>> class MyAlgorithm(BaseAlgorithm):
-    ...     def run(self, modelpool): pass
+    Examples:
+        Preferred (using BaseAlgorithm):
-    Using the alias:
-    >>> class MyAlgorithm(BaseModelFusionAlgorithm):
-    ...     def run(self, modelpool): pass
+        >>> class MyAlgorithm(BaseAlgorithm):
+        ...     def run(self, modelpool): pass
-Note:
-    The alias is maintained for compatibility but BaseAlgorithm is the
-    preferred name for new implementations.
-"""
+        Deprecated (using BaseModelFusionAlgorithm):
+        >>> class MyAlgorithm(BaseModelFusionAlgorithm):  # Will trigger deprecation warning
+        ...     def run(self, modelpool): pass
+    Note:
+        New implementations should use :class:`BaseAlgorithm` exclusively.
+        The BaseModelFusionAlgorithm alias will be removed in a future release.
+    Warning:
+        Using BaseModelFusionAlgorithm will trigger a DeprecationWarning.
+    """
+    pass

fusion_bench/method/classification/image_classification_finetune.py CHANGED Viewed

@@ -173,6 +173,7 @@ class ImageClassificationFineTuning(BaseAlgorithm):
                 ),
             },
         )
+        lit_module.train()
         log_dir = (
             self._program.path.log_dir

fusion_bench/method/concrete_subspace/clip_concrete_tsvm.py ADDED Viewed

@@ -0,0 +1,285 @@
+import logging
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional
+import torch
+from omegaconf import DictConfig
+from tqdm import tqdm
+from fusion_bench import (
+    BaseAlgorithm,
+    OpenCLIPClassificationMixin,
+    OpenCLIPVisionModelPool,
+    SimpleProfilerMixin,
+    StateDictType,
+    auto_register_config,
+    get_rankzero_logger,
+    instantiate,
+)
+from fusion_bench.method.adamerging.entropy_loss import entropy_loss
+from fusion_bench.method.task_singular_vector import TaskSingularVectorMerging
+from fusion_bench.method.task_singular_vector.utils import (
+    TSVM_utils,
+    check_parameterNamesMatch,
+    check_state_dicts_equal,
+    state_dict_to_vector,
+    vector_to_state_dict,
+)
+from fusion_bench.models.masks import MaskModel, mask_sparsity
+from fusion_bench.models.open_clip import (
+    ClassificationHead,
+    ImageClassifier,
+    ImageEncoder,
+)
+from fusion_bench.models.wrappers.task_wise_fusion import (
+    TaskWiseMergedModel,
+    get_task_wise_weights,
+)
+from fusion_bench.utils.devices import clear_cuda_cache
+from fusion_bench.utils.dtype import parse_dtype
+from fusion_bench.utils.parameters import print_parameters, print_trainable_parameters
+from fusion_bench.utils.rich_utils import print_config_yaml
+from fusion_bench.utils.state_dict_arithmetic import (
+    _validate_state_dict_same_keys,
+    state_dict_add,
+    state_dict_hadamard_product,
+    state_dict_mul,
+    state_dict_sub,
+)
+log = get_rankzero_logger(__name__)
+@auto_register_config
+class ConcreteTSVMForOpenCLIP(
+    OpenCLIPClassificationMixin,
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
+    def __init__(
+        self,
+        dataloader_kwargs: DictConfig,
+        optimizer: DictConfig,
+        lr_scheduler: DictConfig,
+        max_steps: int,
+        save_interval: int,
+        initial_logits: float,
+        temperature: float,
+        eval_mask_type: Literal["continuous", "discrete"],
+        mask_checkpoint: Optional[str],
+        merge_dtype: str,
+        clamp_weights: bool,
+        tie_weights: bool,
+        strict: bool,
+        skip_training: bool,
+        # === TSVM parameters ===
+        exclude_keys: Optional[List[str]],
+        alpha: float,
+        return_single_task_models: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if not return_single_task_models:
+            log.warning("return_single_task_models is forced to be True here.")
+            self.return_single_task_models = True
+    @torch.no_grad()
+    def setup_models(self):
+        """
+        load the pre-trained model, task vectors, and construct the mask model.
+        """
+        merge_dtype = parse_dtype(self.merge_dtype)
+        modelpool = self.modelpool
+        # load the pre-trained model
+        pretrained_model = modelpool.load_pretrained_model()
+        self.set_clip_processor(stage="test", processor=pretrained_model.val_preprocess)
+        # constrcute mask model
+        mask_model = MaskModel(
+            pretrained_model, ignore_untrained_params=True, parameter_type="logits"
+        )
+        if merge_dtype is not None:
+            mask_model.to(merge_dtype)
+        mask_model.fill_(self.initial_logits)
+        if self.fabric.is_global_zero:
+            print("summary of mask model:")
+            print_parameters(mask_model)
+        if self.fabric.is_global_zero:
+            tsvm_algo = TaskSingularVectorMerging(
+                alpha=self.alpha,
+                exclude_keys=self.exclude_keys,
+                return_single_task_models=self.return_single_task_models,
+            )
+            tsvm_algo._fabric_instance = self.fabric
+            models = tsvm_algo.run(modelpool)
+            finetuned_models = [models[name] for name in modelpool.model_names]
+            task_wise_weight = get_task_wise_weights(
+                num_models=len(modelpool.model_names),
+                init_values=self.alpha,
+            )
+            # create a wrapped model
+            module = TaskWiseMergedModel(
+                task_wise_weight=task_wise_weight,
+                pretrained_model=pretrained_model,
+                finetuned_models=finetuned_models,
+                clamp_weights=self.clamp_weights,
+                tie_weights=self.tie_weights,
+                strict=self.strict,
+                task_vector_dtype=merge_dtype,
+            )
+            module = module.to(dtype=merge_dtype)
+            print("trainable parameter summary of merged model (TaskWiseMergedModel):")
+            print_trainable_parameters(module)
+        else:
+            module = None
+        with torch.no_grad():
+            self.fabric.barrier()
+            module = self.fabric.broadcast(module, src=0)
+        return module, mask_model
+    def train_mask(self, module: TaskWiseMergedModel, mask_model: MaskModel):
+        """
+        Train the mask model using the provided module.
+        This method configures the optimizer, sets up the mask model, and performs test-time adaptation to train the mask model.
+        Args:
+            module (TaskWiseMergedModel): The wrapped model with task-wise weights.
+            mask_model (MaskModel): The mask model to be trained.
+        """
+        config = self.config
+        merge_dtype = parse_dtype(self.merge_dtype)
+        log.info(f"Using merge dtype: {merge_dtype}")
+        optimizer: "torch.optim.Optimizer" = instantiate(
+            self.optimizer,
+            params=filter(lambda p: p.requires_grad, mask_model.parameters()),
+        )
+        print(f"{optimizer=}")
+        if self.lr_scheduler is not None:
+            lr_scheduler = instantiate(
+                self.lr_scheduler,
+                optimizer=optimizer,
+            )
+            print(f"{lr_scheduler=}")
+        else:
+            lr_scheduler = None
+        log.info("Setup models and optimizer with Fabric.")
+        mask_model, optimizer = self.fabric.setup(mask_model, optimizer)
+        log.info("Move the merged module to the correct device and disable gradients.")
+        module.requires_grad_(False)
+        module.to(mask_model.device)
+        mask_model.train()
+        optimizer.zero_grad()
+        for step_idx in (
+            pbar := tqdm(
+                range(self.config.max_steps if not self.is_debug_mode else 5),
+                ("[DEBUG MODE] " if self.is_debug_mode else "")
+                + "Concrete TSVM Test-Time Adaptation",
+                dynamic_ncols=True,
+                disable=not self.fabric.is_global_zero,
+            )
+        ):
+            metrics = {}
+            # sample a shared mask and merge weights
+            with self.profile("sample mask"):
+                mask = mask_model.sample_mask(
+                    mask_type="continuous", temperature=config.temperature
+                )
+                metrics["train/sparsity"] = mask_sparsity(mask)
+            with self.profile("merge weights"):
+                # rescale mask
+                for name, m in mask.items():
+                    mask[name] = m / torch.mean(m)
+                module.merge_weights(task_vector_mask=mask)
+            # ------ inner optimization goes here ------
+            # NOTE:
+            #   Because the algorithmic parameters of TSVM are assumed to be chosen on a validation test
+            #   set, we do not need to perform inner optimization here. So here we skip the inner optimization step.
+            # ------------------------------------------
+            total_loss = None
+            for task in self.modelpool.model_names:
+                with self.profile("data loading"):
+                    batch = next(self.get_shuffled_test_loader_iter(task))
+                    # NOTE: The labels are not allowed to be used during test-time adaptation
+                    images = batch[0].to(dtype=merge_dtype)
+                with self.profile("forward pass"):
+                    logits = self.compute_logits(module, images, task)
+                    loss = entropy_loss(logits)
+                    total_loss = loss if total_loss is None else total_loss + loss
+            with self.profile("compute grad"):
+                self.fabric.backward(total_loss)
+            with self.profile("optimizer step"):
+                optimizer.step()
+                optimizer.zero_grad()
+                if lr_scheduler is not None:
+                    lr_scheduler.step()
+            metrics.update({"train/loss": loss.item()})
+            self.fabric.log_dict(metrics, step=step_idx)
+            pbar.set_postfix(metrics)
+            if (step_idx + 1) % self.config.save_interval == 0:
+                with self.profiler.profile("save checkpoint"):
+                    save_dir = os.path.join(self.fabric.logger.log_dir, "checkpoints")
+                    if not os.path.exists(save_dir):
+                        os.makedirs(save_dir, exist_ok=True)
+                    save_path = os.path.join(save_dir, f"mask_steps_{step_idx}.pt")
+                    print(f"saving checkpoint to {save_path}")
+                    state = {"model": mask_model}
+                    self.fabric.save(save_path, state)
+                    # Create or update a symbolic link to the latest checkpoint
+                    if self.fabric.is_global_zero:
+                        symlink_path = os.path.join(save_dir, "latest_checkpoint.pt")
+                        if os.path.exists(symlink_path):
+                            os.remove(symlink_path)
+                        os.link(os.path.abspath(save_path), symlink_path)
+                self.print_profile_summary()
+    def run(self, modelpool: OpenCLIPVisionModelPool):
+        self.modelpool = modelpool
+        merge_dtype = parse_dtype(self.merge_dtype)
+        with self.profile("setup models"):
+            module, mask_model = self.setup_models()
+            self.setup_zero_shot_classification_head(freeze=True, dtype=merge_dtype)
+        if self.mask_checkpoint is None:
+            if not self.skip_training:
+                clear_cuda_cache()
+                self.train_mask(module, mask_model=mask_model)
+        else:
+            if self.fabric.is_global_zero:
+                print("loading mask from checkpoint", self.mask_checkpoint)
+            self.fabric.load(self.mask_checkpoint, {"model": mask_model})
+        with torch.no_grad():
+            clear_cuda_cache()
+            mask = mask_model.sample_mask(
+                mask_type=self.eval_mask_type, temperature=self.temperature
+            )
+            # rescale mask
+            for name, m in mask.items():
+                mask[name] = m / torch.mean(m)
+            model = module.merge_and_unload(mask)
+        return model.to(dtype=torch.float32)

fusion_bench/method/task_singular_vector/TSVM.py CHANGED Viewed

@@ -249,12 +249,13 @@ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
         # - SVD finds the principal components (most important directions)
         # - Task vectors are reconstructed using only the most significant components
         # - The reconstructed vectors are merged (summed) to create a unified task vector
-        new_merged_tv = TSVM_utils.compute_and_sum_svd_mem_reduction(
-            task_vectors,
-            exclude_keys=self.exclude_keys,  # Skip certain parameters from SVD
-            accelerator=accelerator,  # Use GPU if available
-            return_single_task_models=self.return_single_task_models,
-        )
+        with torch.no_grad():
+            new_merged_tv = TSVM_utils.compute_and_sum_svd_mem_reduction(
+                task_vectors,
+                exclude_keys=self.exclude_keys,  # Skip certain parameters from SVD
+                accelerator=accelerator,  # Use GPU if available
+                return_single_task_models=self.return_single_task_models,
+            )
         # Handle the case where individual transformed task vectors are also returned
         if self.return_single_task_models:

fusion_bench/method/task_singular_vector/utils/TSVM_utils.py CHANGED Viewed

@@ -311,7 +311,6 @@ def compute_and_sum_svd_mem_reduction_lossless_eigen(
 ###############
 #### TSV Merge Orthogonalization
-@torch.no_grad()
 def compute_and_sum_svd_mem_reduction(
     task_vectors: List[StateDictType],
     exclude_keys: Optional[List[str]] = None,

fusion_bench/metrics/model_kinship/__init__.py ADDED Viewed

@@ -0,0 +1,2 @@
+# Exploring Model Kinship for Merging LLMs
+# The implementation of this module is borrowed from: https://github.com/zjunlp/ModelKinship/

fusion_bench/metrics/model_kinship/calculate.py ADDED Viewed

@@ -0,0 +1,77 @@
+import logging
+from typing import List
+import numpy
+import torch
+from .utility import Metric
+def cosine_similarity(a, b):
+    similarity = numpy.sqrt(numpy.dot(a, b) ** 2 / (numpy.dot(a, a) * numpy.dot(b, b)))
+    return similarity
+def calculate_model_kinship(
+    delta1: numpy.ndarray, delta2: numpy.ndarray, metrics: List[str]
+) -> dict:
+    """
+    Calculate model kinship using specified metrics.
+    Args:
+        delta1: Delta parameters for first model
+        delta2: Delta parameters for second model
+        metrics: List of metrics to calculate
+    Returns:
+        dict: Dictionary of metric names and their calculated values
+    """
+    results = {}
+    for metric in metrics:
+        try:
+            if metric not in Metric.list():
+                raise ValueError(f"Unsupported metric: {metric}")
+            results[metric] = calculate_metric(delta1, delta2, metric)
+        except Exception as e:
+            results[metric] = f"Error calculating {metric}: {str(e)}"
+    return results
+def calculate_metric(
+    d_vector_1: torch.Tensor, d_vector_2: torch.Tensor, metric: str
+) -> str:
+    """
+    Calculate the specified metric between two delta vectors.
+    Args:
+        d_vector_1 (torch.Tensor): Delta parameters for model 1.
+        d_vector_2 (torch.Tensor): Delta parameters for model 2.
+        metric (str): The metric to calculate ('pcc', 'ed', 'cs').
+    Returns:
+        str: A formatted string with the result of the chosen metric.
+    """
+    logging.info(f"Starting calculation of {metric.upper()} metric...")
+    # Pearson Correlation Coefficient (PCC)
+    if metric == "pcc":
+        # Stack the two vectors and calculate the Pearson correlation coefficient
+        stack = torch.stack((d_vector_1, d_vector_2), dim=0)
+        pcc = torch.corrcoef(stack)[0, 1].item()
+        return f"Model Kinship based on Pearson Correlation Coefficient: {pcc}"
+    # Euclidean Distance (ED)
+    elif metric == "ed":
+        # Compute the Euclidean distance between the vectors
+        distance = torch.dist(d_vector_1, d_vector_2).item()
+        return f"Model Kinship based on Euclidean Distance: {distance}"
+    # Cosine Similarity (CS)
+    elif metric == "cs":
+        # Compute cosine similarity
+        cs = cosine_similarity(d_vector_1, d_vector_2)
+        return f"Model Kinship based on Cosine Similarity: {cs}"
+    # If metric is not recognized
+    else:
+        return "Invalid metric specified."

fusion_bench/metrics/model_kinship/calculate_split.py ADDED Viewed

@@ -0,0 +1,171 @@
+import logging
+from typing import Dict, List
+import numpy
+import torch
+from tqdm import tqdm
+from .utility import Metric, load_model_state_dict, quantize_8bit
+def cosine_similarity(a, b):
+    similarity = numpy.sqrt(numpy.dot(a, b) ** 2 / (numpy.dot(a, a) * numpy.dot(b, b)))
+    return similarity
+def calculate_model_kinship_split(
+    model_1_name: str,
+    model_2_name: str,
+    model_base_name: str,
+    low_precision: bool,
+    metrics: List[str],
+    device: str = "cuda" if torch.cuda.is_available() else "cpu",
+) -> dict:
+    # Extract state dictionaries from models
+    state_dict_1 = load_model_state_dict(model_1_name, device)
+    state_dict_2 = load_model_state_dict(model_2_name, device)
+    state_dict_base = load_model_state_dict(model_base_name, device)
+    results = {}
+    # Validate metrics before processing
+    valid_metrics = Metric.list()
+    for metric in metrics:
+        try:
+            if metric not in valid_metrics:
+                raise ValueError(
+                    f"Unsupported metric: {metric}. Valid metrics are: {', '.join(valid_metrics)}"
+                )
+            results[metric] = calculate_metrics_by_split(
+                state_dict_1, state_dict_2, state_dict_base, low_precision, metric
+            )
+        except Exception as e:
+            logging.error(f"Error calculating {metric}: {str(e)}")
+            results[metric] = f"Error calculating {metric}: {str(e)}"
+    return results
+def calculate_metrics_by_split(
+    state_dict_1: dict,
+    state_dict_2: dict,
+    state_dict_base: dict,
+    low_precision: bool,
+    metric: str,
+) -> str:
+    """
+    Calculate metrics for each key and integrate results.
+    Args:
+        state_dict_1 (dict): State dictionary of first model
+        state_dict_2 (dict): State dictionary of second model
+        state_dict_base (dict): State dictionary of base model
+        low_precision (bool): Whether to use 8-bit quantization
+        metric (str): Metric to calculate ('pcc', 'ed', 'cs')
+    Returns:
+        str: Integrated metric result as formatted string
+    """
+    total_similarity = 0.0
+    total_weight = 0.0
+    split_results = {}
+    # Determine the number of layers
+    num_layers = state_dict_base["lm_head.weight"].shape[0]
+    # Check architectures
+    if (
+        state_dict_1["lm_head.weight"].shape[0]
+        != state_dict_2["lm_head.weight"].shape[0]
+    ):
+        shape_1 = state_dict_1["lm_head.weight"].shape
+        shape_2 = state_dict_2["lm_head.weight"].shape
+        logging.warning(
+            f"Warning: Model architectures do not match. "
+            f"Using sub weight space instead.\n"
+            f"Vocab sizes in model 1: {shape_1[0]}, "
+            f"Vocab sizes in model 2: {shape_2[0]}"
+        )
+    # Process each key
+    for key, base_params in tqdm(
+        state_dict_base.items(), desc=f"Processing {metric.upper()} by key"
+    ):
+        try:
+            if key not in state_dict_1 or key not in state_dict_2:
+                logging.warning(f"Key {key} not found in one of the models")
+                continue
+            # Get parameters and calculate deltas
+            params_1 = state_dict_1[key][:num_layers]
+            params_2 = state_dict_2[key][:num_layers]
+            delta_1 = (params_1 - base_params).view(-1)
+            delta_2 = (params_2 - base_params).view(-1)
+            if low_precision:
+                delta_1 = quantize_8bit(delta_1)
+                delta_2 = quantize_8bit(delta_2)
+            # Calculate weight based on parameter count
+            weight = delta_1.numel()
+            # Calculate metric for current key
+            if metric == "pcc":
+                stack = torch.stack((delta_1, delta_2), dim=0)
+                split_similarity = torch.corrcoef(stack)[0, 1].item()
+            elif metric == "ed":
+                split_similarity = torch.dist(delta_1, delta_2).item()
+            elif metric == "cs":
+                split_similarity = cosine_similarity(delta_1, delta_2)
+            else:
+                raise ValueError(f"Unsupported metric: {metric}")
+            # Skip NaN values
+            if torch.isnan(torch.tensor(split_similarity)):
+                logging.warning(f"Skipping key {key} due to NaN result")
+                continue
+            # Store valid result
+            split_results[key] = split_similarity
+            # Update weighted average only for valid results
+            weight = delta_1.numel()
+            total_similarity += split_similarity * weight
+            total_weight += weight
+            # Log progress for large layers
+            if weight > 1000000:
+                logging.info(
+                    f"Layer {key}: {metric.upper()} = {split_similarity:.4f}, parameters = {weight}"
+                )
+            # Free memory
+            del delta_1, delta_2
+        except Exception as e:
+            logging.error(f"Error processing key {key}: {str(e)}")
+            continue
+    # Calculate final weighted average
+    if total_weight > 0:
+        final_result = total_similarity / total_weight
+        # Log summary statistics
+        logging.info(f"\nSummary for {metric.upper()}:")
+        logging.info(f"Total parameters: {total_weight}")
+        # Log detailed results for valid splits
+        logging.info(f"\nDetailed {metric.upper()} results by key:")
+        for key, value in split_results.items():
+            logging.info(f"{key}: {value:.4f}")
+        metric_names = {
+            "pcc": "Pearson Correlation Coefficient",
+            "ed": "Euclidean Distance",
+            "cs": "Cosine Similarity",
+        }
+        return f"Model Kinship based on {metric_names[metric]} (weighted average): {final_result:.4f}"
+    else:
+        return f"Error: No valid parameters found for {metric.upper()} calculation"

fusion-bench 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl

fusion-bench 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl