fusion-bench 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. fusion_bench/__init__.py +22 -2
  2. fusion_bench/_get_started/__init__.py +3 -0
  3. fusion_bench/_get_started/greeting_program.py +49 -0
  4. fusion_bench/compat/method/base_algorithm.py +14 -0
  5. fusion_bench/constants/__init__.py +6 -0
  6. fusion_bench/constants/clip_vision.py +26 -2
  7. fusion_bench/constants/paths.py +4 -0
  8. fusion_bench/constants/runtime.py +57 -0
  9. fusion_bench/dataset/clip_dataset.py +2 -1
  10. fusion_bench/dataset/gpt2_glue.py +9 -9
  11. fusion_bench/dataset/image_corruption/__init__.py +0 -0
  12. fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
  13. fusion_bench/dataset/image_dataset.py +1 -1
  14. fusion_bench/dataset/nyuv2.py +2 -2
  15. fusion_bench/method/__init__.py +24 -5
  16. fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
  17. fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
  18. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
  19. fusion_bench/method/base_algorithm.py +195 -12
  20. fusion_bench/method/bitdelta/__init__.py +5 -0
  21. fusion_bench/method/bitdelta/bitdelta.py +156 -0
  22. fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
  23. fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
  24. fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
  25. fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
  26. fusion_bench/method/classification/clip_finetune.py +1 -1
  27. fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
  28. fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
  29. fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
  30. fusion_bench/method/doge_ta/doge_ta.py +1 -1
  31. fusion_bench/method/ensemble.py +12 -12
  32. fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
  33. fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -6
  34. fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
  35. fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
  36. fusion_bench/method/fw_merging/fw_hard.py +1 -1
  37. fusion_bench/method/fw_merging/fw_soft.py +1 -1
  38. fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
  39. fusion_bench/method/linear/expo.py +2 -1
  40. fusion_bench/method/linear/linear_interpolation.py +6 -4
  41. fusion_bench/method/linear/simple_average_for_llama.py +17 -13
  42. fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
  43. fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
  44. fusion_bench/method/model_recombination.py +2 -5
  45. fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
  46. fusion_bench/method/moe_pruner/utils/data.py +2 -1
  47. fusion_bench/method/moe_pruner/utils/prune.py +6 -1
  48. fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
  49. fusion_bench/method/pruning/wanda_utils/data.py +1 -2
  50. fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
  51. fusion_bench/method/randes/modelsoup.py +1 -3
  52. fusion_bench/method/regmean/clip_regmean.py +2 -2
  53. fusion_bench/method/regmean/gpt2_regmean.py +3 -10
  54. fusion_bench/method/regmean/regmean.py +2 -11
  55. fusion_bench/method/regmean_plusplus/__init__.py +1 -1
  56. fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +24 -17
  57. fusion_bench/method/regmean_plusplus/regmean_plusplus.py +56 -38
  58. fusion_bench/method/simple_average.py +12 -16
  59. fusion_bench/method/slerp/slerp.py +5 -2
  60. fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
  61. fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
  62. fusion_bench/method/smile_upscaling/projected_energy.py +144 -0
  63. fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
  64. fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +71 -51
  65. fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
  66. fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
  67. fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
  68. fusion_bench/method/ties_merging/ties_merging.py +13 -26
  69. fusion_bench/method/we_moe/__init__.py +1 -0
  70. fusion_bench/method/we_moe/clip_we_moe.py +5 -4
  71. fusion_bench/method/we_moe/entropy_loss.py +25 -0
  72. fusion_bench/method/we_moe/flan_t5_we_moe.py +331 -0
  73. fusion_bench/method/we_moe/utils.py +15 -0
  74. fusion_bench/method/we_moe/we_moe.py +6 -6
  75. fusion_bench/method/weighted_average/llama.py +4 -16
  76. fusion_bench/metrics/continual_learning/__init__.py +1 -0
  77. fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
  78. fusion_bench/metrics/nyuv2/__init__.py +2 -2
  79. fusion_bench/metrics/nyuv2/segmentation.py +1 -1
  80. fusion_bench/mixins/__init__.py +10 -2
  81. fusion_bench/mixins/clip_classification.py +15 -45
  82. fusion_bench/mixins/hydra_config.py +105 -7
  83. fusion_bench/mixins/lightning_fabric.py +2 -0
  84. fusion_bench/mixins/serialization.py +275 -48
  85. fusion_bench/modelpool/__init__.py +2 -2
  86. fusion_bench/modelpool/base_pool.py +29 -9
  87. fusion_bench/modelpool/causal_lm/causal_lm.py +41 -33
  88. fusion_bench/modelpool/clip_vision/modelpool.py +1 -3
  89. fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
  90. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
  91. fusion_bench/models/__init__.py +7 -1
  92. fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
  93. fusion_bench/models/hf_utils.py +160 -0
  94. fusion_bench/models/linearized/linearized_model_utils.py +4 -4
  95. fusion_bench/models/linearized/vision_model.py +1 -1
  96. fusion_bench/models/model_card_templates/default.md +46 -0
  97. fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
  98. fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
  99. fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
  100. fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
  101. fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
  102. fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
  103. fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
  104. fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
  105. fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
  106. fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +698 -0
  107. fusion_bench/models/modeling_smile_llama/register.py +8 -0
  108. fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
  109. fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
  110. fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +7 -12
  111. fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
  112. fusion_bench/models/parameter_dict.py +1 -1
  113. fusion_bench/models/sparse_we_moe.py +1 -53
  114. fusion_bench/models/utils.py +26 -0
  115. fusion_bench/models/we_moe.py +1 -53
  116. fusion_bench/models/wrappers/ensemble.py +6 -4
  117. fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
  118. fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
  119. fusion_bench/programs/base_program.py +81 -2
  120. fusion_bench/programs/fabric_fusion_program.py +46 -61
  121. fusion_bench/scripts/cli.py +38 -5
  122. fusion_bench/taskpool/base_pool.py +4 -3
  123. fusion_bench/taskpool/clip_vision/taskpool.py +43 -22
  124. fusion_bench/taskpool/dummy.py +1 -1
  125. fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
  126. fusion_bench/tasks/clip_classification/__init__.py +6 -4
  127. fusion_bench/utils/__init__.py +7 -1
  128. fusion_bench/utils/cache_utils.py +101 -1
  129. fusion_bench/utils/devices.py +14 -4
  130. fusion_bench/utils/fabric.py +2 -2
  131. fusion_bench/utils/instantiate_utils.py +3 -1
  132. fusion_bench/utils/lazy_imports.py +23 -0
  133. fusion_bench/utils/lazy_state_dict.py +38 -3
  134. fusion_bench/utils/modelscope.py +127 -8
  135. fusion_bench/utils/parameters.py +2 -2
  136. fusion_bench/utils/path.py +56 -0
  137. fusion_bench/utils/pylogger.py +1 -1
  138. fusion_bench/utils/rich_utils.py +3 -0
  139. fusion_bench/utils/state_dict_arithmetic.py +25 -23
  140. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/METADATA +24 -47
  141. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/RECORD +184 -145
  142. fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
  143. fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
  144. fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
  145. fusion_bench_config/_get_started/greeting_program.yaml +4 -0
  146. fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
  147. fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
  148. fusion_bench_config/fabric_model_fusion.yaml +45 -17
  149. fusion_bench_config/hydra/default.yaml +6 -2
  150. fusion_bench_config/llama_full_finetune.yaml +1 -0
  151. fusion_bench_config/method/adamerging/clip.yaml +1 -1
  152. fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
  153. fusion_bench_config/method/depth_upscaling.yaml +4 -1
  154. fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
  155. fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
  156. fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
  157. fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
  158. fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
  159. fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +2 -1
  160. fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
  161. fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
  162. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
  163. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +1 -1
  164. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -6
  165. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +1 -1
  166. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +1 -1
  167. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +3 -3
  168. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
  169. fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
  170. fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
  171. fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
  172. fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
  173. fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
  174. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
  175. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
  176. fusion_bench_config/nyuv2_config.yaml +3 -1
  177. fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
  178. fusion_bench_config/path/default.yaml +28 -0
  179. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
  180. fusion_bench_config/method/adamerging.yaml +0 -23
  181. fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
  182. fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
  183. fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
  184. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/WHEEL +0 -0
  185. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/entry_points.txt +0 -0
  186. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/licenses/LICENSE +0 -0
  187. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/top_level.txt +0 -0
  188. /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
@@ -22,6 +22,7 @@ import torch
  from torch import Tensor, nn
  from torch.func import functional_call

+ from fusion_bench.models.utils import StateDictType, del_attr, get_attr, set_attr
  from fusion_bench.utils.type import StateDictType, TorchModelType

  log = logging.getLogger(__name__)
@@ -29,77 +30,7 @@ log = logging.getLogger(__name__)
  __all__ = ["get_task_wise_weights", "fuse_weights", "TaskWiseMergedModel"]


- def del_attr(obj, names: List[str]):
-     """
-     Deletes an attribute from an object recursively.
-
-     Args:
-         obj (object): Object to delete attribute from.
-         names (list): List of attribute names to delete recursively.
-     """
-     if len(names) == 1:
-         delattr(obj, names[0])
-     else:
-         del_attr(getattr(obj, names[0]), names[1:])
-
-
- def set_attr(obj, names: List[str], val):
-     """
-     Sets an attribute of an object recursively.
-
-     Args:
-         obj (object): Object to set attribute of.
-         names (list): List of attribute names to set recursively.
-         val (object): Value to set the attribute to.
-     """
-     if len(names) == 1:
-         setattr(obj, names[0], val)
-     else:
-         set_attr(getattr(obj, names[0]), names[1:], val)
-
-
- def get_attr(obj, names: List[str]):
-     """
-     Gets an attribute of an object recursively.
-
-     Args:
-         obj (object): Object to get attribute of.
-         names (list): List of attribute names to get recursively.
-
-     Returns:
-         object: The attribute of the object.
-     """
-     if len(names) == 1:
-         return getattr(obj, names[0])
-     else:
-         return get_attr(getattr(obj, names[0]), names[1:])
-
-
- def check_parameterNamesMatch(checkpoints: List[StateDictType]) -> None:
-     """
-     Checks that the parameter names of the given checkpoints match.
-
-     Args:
-         checkpoints (List[Dict[str, float]]): A list of checkpoints, where each checkpoint is a dictionary of parameter names and their corresponding values.
-
-     Raises:
-         ValueError: If the number of checkpoints is less than 2 or if the parameter names of any two checkpoints differ.
-
-     """
-     parameter_names = set(checkpoints[0].keys())
-
-     if len(checkpoints) >= 2:
-         # raise ValueError("Number of models is less than 2.")
-         for checkpoint in checkpoints[1:]:
-             current_parameterNames = set(checkpoint.keys())
-             if current_parameterNames != parameter_names:
-                 raise ValueError(
-                     "Differing parameter names in models. "
-                     f"The different parameters are {parameter_names.symmetric_difference(current_parameterNames)}"
-                 )
-
-
- def get_task_wise_weights(num_models: int, init_values: float = None):
+ def get_task_wise_weights(num_models: int, init_values: float = None) -> Tensor:
      """
      This function generates a tensor of weights for each model.

@@ -116,7 +47,7 @@ def get_task_wise_weights(num_models: int, init_values: float = None):
      return torch.full((num_models,), init_values, dtype=torch.float32)


- def _fuse_weights(task_wise_weight: Tensor, tensors: List[Tensor]):
+ def _fuse_weights(task_wise_weight: Tensor, tensors: List[Tensor]) -> Tensor:
      """
      This function fuses the weights of the models.

@@ -158,6 +89,100 @@ def fuse_weights(
158
89
 
159
90
 
160
91
  class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):
92
+ """
93
+ A PyTorch module that dynamically merges multiple fine-tuned models using learnable task-wise weights.
94
+
95
+ This class implements a sophisticated model fusion approach where multiple task-specific models
96
+ are combined with a pretrained base model using learnable weights. The fusion is performed
97
+ using task vectors (differences between fine-tuned and pretrained models) that are weighted
98
+ and added to the base model's parameters.
99
+
100
+ The key innovation is that the merging weights are learnable parameters that can be optimized
101
+ during training, allowing the model to automatically learn the optimal combination of different
102
+ task-specific knowledge.
103
+
104
+ Architecture:
105
+ - Base pretrained model (frozen)
106
+ - Multiple task vectors (differences from pretrained model, frozen)
107
+ - Learnable task-wise weights (trainable parameters)
108
+ - Dynamic merging during forward pass
109
+
110
+ Args:
111
+ task_wise_weight (Tensor): Initial weights for each task model. Shape: (num_models,).
112
+ These become learnable parameters that control the contribution of each task vector.
113
+ pretrained_model (TorchModelType): The base pretrained model that serves as the foundation.
114
+ This model is frozen and used as the starting point for merging.
115
+ finetuned_models (List[TorchModelType]): List of fine-tuned models for different tasks.
116
+ These are converted to task vectors (differences from pretrained model) and frozen.
117
+ clamp_weights (bool, optional): Whether to clamp merge weights to [0, 1] range.
118
+ Defaults to True. When True, ensures weights are non-negative and bounded.
119
+ tie_weights (bool, optional): Whether to tie weights during functional call.
120
+ Defaults to False. Used in the underlying PyTorch functional_call.
121
+ strict (bool, optional): Whether to enforce strict parameter matching.
122
+ Defaults to True. Used in the underlying PyTorch functional_call.
123
+ task_vector_dtype (Optional[torch.dtype], optional): Data type for task vectors.
124
+ Defaults to None. Can be used to save memory (e.g., torch.float16).
125
+
126
+ Attributes:
127
+ merge_weight (nn.Parameter): Learnable weights for merging task vectors.
128
+ pretrained_model (TorchModelType): The frozen base model.
129
+ task_vectors (nn.ModuleList): List of frozen task vector models.
130
+ _merged_state_dict (StateDictType): Cached merged state dictionary.
131
+
132
+ Example:
133
+ ```python
134
+ import torch
135
+ import torch.nn as nn
136
+
137
+ # Create example models
138
+ pretrained_model = nn.Linear(10, 5)
139
+ finetuned_model1 = nn.Linear(10, 5) # Fine-tuned on task 1
140
+ finetuned_model2 = nn.Linear(10, 5) # Fine-tuned on task 2
141
+
142
+ # Initialize task-wise weights
143
+ task_weights = torch.tensor([0.3, 0.7]) # Initial weights for 2 tasks
144
+
145
+ # Create merged model
146
+ merged_model = TaskWiseMergedModel(
147
+ task_wise_weight=task_weights,
148
+ pretrained_model=pretrained_model,
149
+ finetuned_models=[finetuned_model1, finetuned_model2],
150
+ clamp_weights=True
151
+ )
152
+
153
+ # Use like a regular PyTorch model
154
+ x = torch.randn(32, 10)
155
+ output = merged_model(x)
156
+
157
+ # Train the merge weights
158
+ optimizer = torch.optim.Adam(merged_model.parameters())
159
+ loss = some_loss_function(output, targets)
160
+ loss.backward()
161
+ optimizer.step()
162
+
163
+ # Get the final merged model
164
+ final_model = merged_model.merge_and_unload()
165
+ ```
166
+
167
+ Training Workflow:
168
+ 1. **Initialization**: Task vectors are computed as differences from pretrained model
169
+ 2. **Forward Pass**: Weights are dynamically merged based on current merge_weight values
170
+ 3. **Loss Computation**: Standard loss computation on model outputs
171
+ 4. **Backpropagation**: Gradients flow through merge_weight parameters
172
+ 5. **Optimization**: merge_weight parameters are updated to improve performance
173
+
174
+ Memory Efficiency:
175
+ - Task vectors can use lower precision (task_vector_dtype)
176
+ - Base model and task vectors are frozen (no gradient computation)
177
+ - Only merge weights require gradients
178
+
179
+ Note:
180
+ - The pretrained model and task vectors are frozen during training
181
+ - Only the merge weights (task_wise_weight) are trainable parameters
182
+ - Task vectors represent the difference between fine-tuned and pretrained models
183
+ - The merged state dict is cached and recomputed when merge weights change
184
+ """
185
+
161
186
  _merged_state_dict: StateDictType = None
162
187
 
163
188
  def __init__(
@@ -170,6 +195,32 @@ class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):
          strict: bool = True,
          task_vector_dtype: Optional[torch.dtype] = None,
      ):
+         """
+         Initialize the TaskWiseMergedModel.
+
+         This constructor sets up the model by:
+         1. Converting fine-tuned models to task vectors (differences from pretrained)
+         2. Freezing the pretrained model and task vectors
+         3. Setting up learnable merge weights as parameters
+         4. Configuring merging behavior options
+
+         Args:
+             task_wise_weight (Tensor): Initial weights for each task model. Shape: (num_models,).
+                 These values become the starting point for learnable parameters.
+             pretrained_model (TorchModelType): The base pretrained model.
+                 Will be frozen and used as the foundation for merging.
+             finetuned_models (List[TorchModelType]): List of fine-tuned models.
+                 Must have the same architecture as pretrained_model.
+             clamp_weights (bool, optional): Whether to clamp weights to [0, 1]. Defaults to True.
+             tie_weights (bool, optional): Whether to tie weights in functional_call. Defaults to False.
+             strict (bool, optional): Whether to use strict parameter matching. Defaults to True.
+             task_vector_dtype (Optional[torch.dtype], optional): Data type for task vectors.
+                 Defaults to None (same as original models).
+
+         Raises:
+             ValueError: If the number of task_wise_weights doesn't match the number of fine-tuned models.
+             RuntimeError: If models have incompatible architectures.
+         """
          super().__init__()
          self.clamp_weights = clamp_weights
          self.tie_weights = tie_weights
@@ -196,6 +247,24 @@ class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):

      @property
      def forward_model(self):
+         """
+         Get a functional model with merged parameters.
+
+         Returns a partial function that applies the pretrained model with the current
+         merged state dictionary. This allows for efficient forward passes without
+         modifying the original model's parameters.
+
+         Returns:
+             Callable: A partial function that can be called with (args, kwargs) to
+                 perform forward pass with merged parameters.
+
+         Example:
+             ```python
+             # Internal usage during forward pass
+             forward_fn = merged_model.forward_model
+             output = forward_fn(args=(x,), kwargs={})
+             ```
+         """
          return functools.partial(
              functional_call,
              self.pretrained_model,
@@ -205,6 +274,43 @@ class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):
          )

      def merge_weights(self, task_vector_mask: Optional[Dict[str, Tensor]] = None):
+         """
+         Merge task vectors with the pretrained model using current merge weights.
+
+         This method computes the merged model parameters by combining the pretrained
+         model with weighted task vectors. The resulting state dictionary represents
+         a model that incorporates knowledge from all task-specific models.
+
+         The merging formula for each parameter is:
+             merged_param = pretrained_param + Σ(weight_i * task_vector_i * mask_i)
+
+         Args:
+             task_vector_mask (Optional[Dict[str, Tensor]], optional): Optional masks
+                 to selectively apply task vectors to specific parameters. Keys should
+                 match parameter names, values should be tensors with the same shape
+                 as the corresponding parameters. Defaults to None (no masking).
+
+         Returns:
+             StateDictType: The merged state dictionary containing combined parameters.
+
+         Example:
+             ```python
+             # Basic merging
+             merged_state = model.merge_weights()
+
+             # Merging with parameter-specific masks
+             masks = {
+                 'layer1.weight': torch.ones_like(model.pretrained_model.layer1.weight),
+                 'layer2.weight': torch.zeros_like(model.pretrained_model.layer2.weight),
+             }
+             masked_state = model.merge_weights(task_vector_mask=masks)
+             ```
+
+         Note:
+             - If clamp_weights is True, merge weights are clamped to [0, 1] range
+             - The merged state dict is cached in _merged_state_dict
+             - Task vector masks allow fine-grained control over which parameters are affected
+         """
          if self.clamp_weights:
              merge_weight = self.merge_weight.clamp(0, 1)
          else:
@@ -222,11 +328,83 @@ class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):
          return state_dict

      def merge_and_unload(self, task_vector_mask: Optional[Dict[str, Tensor]] = None):
+         """
+         Merge models and return the final merged model.
+
+         This method performs the merging operation and then loads the merged parameters
+         into the pretrained model, returning a standard PyTorch model that can be used
+         independently of the TaskWiseMergedModel wrapper.
+
+         Args:
+             task_vector_mask (Optional[Dict[str, Tensor]], optional): Optional masks
+                 for selective parameter merging. Defaults to None.
+
+         Returns:
+             TorchModelType: The pretrained model with merged parameters loaded.
+                 This is a standalone model that can be used without the wrapper.
+
+         Example:
+             ```python
+             # Train the merged model
+             for epoch in range(num_epochs):
+                 # ... training loop ...
+                 pass
+
+             # Get the final merged model
+             final_model = merged_model.merge_and_unload()
+
+             # Save or use the final model
+             torch.save(final_model.state_dict(), 'merged_model.pth')
+             output = final_model(new_input)
+             ```
+
+         Warning:
+             This method modifies the pretrained_model's parameters in-place.
+             The original pretrained model parameters will be lost.
+         """
          self.merge_weights(task_vector_mask=task_vector_mask)
          self.pretrained_model.load_state_dict(self._merged_state_dict)
          return self.pretrained_model

      def forward(self, *args, **kwargs):
+         """
+         Forward pass through the dynamically merged model.
+
+         This method performs the forward pass by first ensuring the model parameters
+         are merged according to the current merge weights, then applying the merged
+         model to the input data.
+
+         The forward pass involves:
+         1. Check if merged state dict is current (recompute if needed)
+         2. Apply the merged model to inputs using functional_call
+         3. Return the model outputs
+
+         Args:
+             *args: Positional arguments to pass to the underlying model.
+             **kwargs: Keyword arguments to pass to the underlying model.
+
+         Returns:
+             Any: The output of the merged model, typically torch.Tensor or tuple of tensors.
+
+         Example:
+             ```python
+             # Single input
+             x = torch.randn(32, 784)
+             output = merged_model(x)
+
+             # Multiple inputs
+             x1, x2 = torch.randn(32, 784), torch.randn(32, 100)
+             output = merged_model(x1, x2)
+
+             # With keyword arguments
+             output = merged_model(input_ids=input_ids, attention_mask=attention_mask)
+             ```
+
+         Note:
+             - The merged state dict is recomputed if merge weights have changed
+             - This allows for dynamic behavior during training as weights are updated
+             - The computation is efficient as merging only happens when needed
+         """
          if self._merged_state_dict is None:
              self.merge_weights()
          return self.forward_model(args=args, kwargs=kwargs)
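
Note (illustrative, not part of the diff): the new docstrings above document the merging rule `merged_param = pretrained_param + Σ(weight_i * task_vector_i)`. The sketch below shows that rule applied to plain state dicts, assuming only PyTorch; the helper name `merge_task_wise` is invented for this example and does not exist in fusion-bench.

```python
from typing import Dict, List

import torch
from torch import Tensor, nn


def merge_task_wise(
    pretrained: Dict[str, Tensor],
    finetuned: List[Dict[str, Tensor]],
    weights: Tensor,  # shape: (num_models,)
) -> Dict[str, Tensor]:
    """merged = pretrained + sum_i weight_i * (finetuned_i - pretrained)."""
    merged = {}
    for name, base in pretrained.items():
        # Each task vector is the difference from the pretrained parameter.
        merged[name] = base + sum(
            w * (sd[name] - base) for w, sd in zip(weights, finetuned)
        )
    return merged


# Usage with two fine-tuned copies of a small linear layer:
base = nn.Linear(10, 5)
ft1, ft2 = nn.Linear(10, 5), nn.Linear(10, 5)
merged_sd = merge_task_wise(
    base.state_dict(),
    [ft1.state_dict(), ft2.state_dict()],
    torch.tensor([0.3, 0.7]),
)
base.load_state_dict(merged_sd)  # same end state as merge_and_unload()
```

Loading the result back into the base module mirrors what `merge_and_unload()` does inside the wrapper, minus the learnable-weight machinery.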
@@ -1,9 +1,88 @@
+ """
+ Base Program Classes for FusionBench.
+
+ This module defines the foundational abstract base classes for FusionBench programs.
+ These programs serve as the main execution units that orchestrate model fusion
+ workflows, from loading configurations to executing fusion algorithms and
+ evaluating results.
+
+ The base classes provide a consistent interface for all FusionBench programs
+ while allowing for flexible implementations of different fusion workflows.
+ """
+
  from abc import abstractmethod

- from fusion_bench.mixins import BaseYAMLSerializableModel
+ from fusion_bench.mixins import BaseYAMLSerializable
+
+
+ class BaseHydraProgram(BaseYAMLSerializable):
+     """
+     Abstract base class for all FusionBench programs that use Hydra configuration.
+
+     This class serves as the foundation for all FusionBench execution programs,
+     providing a standardized interface for configuration-driven model fusion
+     workflows. It combines the serialization capabilities of BaseYAMLSerializable
+     with the requirement for a main execution method.
+
+     The class is designed to work seamlessly with Hydra's configuration management
+     system, allowing programs to be instantiated and configured through YAML files.
+     This enables flexible, reproducible experiments with different fusion algorithms,
+     model pools, and evaluation tasks.
+
+     Key Features:
+
+     - Configuration-driven execution through Hydra integration
+     - YAML serialization support for experiment reproducibility
+     - Abstract interface ensuring consistent program structure
+     - Integration with FusionBench's modular architecture

+     Typical Usage:
+         Subclasses should implement the `run()` method to define their specific
+         fusion workflow. The program can then be executed through the FusionBench
+         CLI or instantiated directly from configuration files.
+
+     Example:
+         ```python
+         class MyFusionProgram(BaseHydraProgram):
+             def __init__(self, method_config, modelpool_config, taskpool_config):
+                 self.method_config = method_config
+                 self.modelpool_config = modelpool_config
+                 self.taskpool_config = taskpool_config
+
+             def run(self):
+                 # Load components
+                 algorithm = load_algorithm(self.method_config)
+                 modelpool = load_modelpool(self.modelpool_config)
+                 taskpool = load_taskpool(self.taskpool_config)
+
+                 # Execute fusion
+                 merged_model = algorithm.run(modelpool)
+
+                 # Evaluate results
+                 report = taskpool.evaluate(merged_model)
+                 return report
+         ```
+
+     Note:
+         This is an abstract base class and cannot be instantiated directly.
+         Subclasses must implement the `run()` method to provide concrete
+         functionality.
+
+     See Also:
+
+     - [FabricModelFusionProgram][fusion_bench.programs.FabricModelFusionProgram]: Lightning Fabric-based implementation
+     - [BaseYAMLSerializable][fusion_bench.mixins.BaseYAMLSerializable]: Parent class providing serialization
+     - FusionBench CLI documentation for program execution details
+     """

- class BaseHydraProgram(BaseYAMLSerializableModel):
      @abstractmethod
      def run(self):
+         """
+         Execute the main program workflow.
+
+         This abstract method defines the primary entry point for program execution.
+         Subclasses must implement this method to define their specific fusion
+         workflow, including model loading, fusion algorithm execution, and
+         result evaluation.
+         """
          pass
@@ -1,7 +1,8 @@
  import json
  import logging
  import os
- from typing import Callable, Dict, Iterable, Optional, Union  # noqa: F401
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union  # noqa: F401

  import lightning as L
  from lightning.fabric.utilities.rank_zero import rank_zero_only
@@ -9,19 +10,24 @@ from omegaconf import DictConfig, OmegaConf
  from torch import nn
  from tqdm.auto import tqdm

- import fusion_bench.utils.instantiate_utils
- from fusion_bench.method import BaseAlgorithm
+ import fusion_bench
+ from fusion_bench import (
+     BaseAlgorithm,
+     BaseHydraProgram,
+     BaseModelPool,
+     BaseTaskPool,
+     RuntimeConstants,
+     import_object,
+     instantiate,
+     timeit_context,
+ )
  from fusion_bench.mixins import LightningFabricMixin
- from fusion_bench.modelpool import BaseModelPool
- from fusion_bench.programs import BaseHydraProgram
- from fusion_bench.taskpool import BaseTaskPool
- from fusion_bench.utils import import_object, instantiate, timeit_context
  from fusion_bench.utils.hydra_utils import get_hydra_output_dir
  from fusion_bench.utils.json import print_json
+ from fusion_bench.utils.path import create_symlink
  from fusion_bench.utils.rich_utils import print_bordered, print_config_tree
- from fusion_bench.utils.pylogger import getRankZeroLogger

- log = getRankZeroLogger(__name__)
+ log = fusion_bench.get_rankzero_logger(__name__)


  class FabricModelFusionProgram(
@@ -39,6 +45,7 @@ class FabricModelFusionProgram(
          "_fabric": "fabric",
          "fast_dev_run": "fast_dev_run",
          "seed": "seed",
+         "path": "path",
      }

      def __init__(
@@ -56,8 +63,10 @@ class FabricModelFusionProgram(
          fast_dev_run: bool = False,
          seed: Optional[int] = None,
          print_function_call: bool = True,
+         path: DictConfig = None,
          **kwargs,
      ):
+         super().__init__(**kwargs)
          self._method = method
          self._modelpool = modelpool
          self._taskpool = taskpool
@@ -67,8 +76,11 @@ class FabricModelFusionProgram(
          self.merged_model_save_kwargs = merged_model_save_kwargs
          self.fast_dev_run = fast_dev_run
          self.seed = seed
-         fusion_bench.utils.instantiate_utils.PRINT_FUNCTION_CALL = print_function_call
-         super().__init__(**kwargs)
+         self.path = path
+         RuntimeConstants.debug = fast_dev_run
+         RuntimeConstants.print_function_call = print_function_call
+         if path is not None:
+             RuntimeConstants.cache_dir = path.get("cache_dir", None)

          if print_config:
              print_config_tree(
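
Note (illustrative, not part of the diff): `RuntimeConstants` comes from the new `fusion_bench/constants/runtime.py` (entry 8, +57 lines), whose body is not shown here. A minimal sketch, with every field inferred from the assignments in the hunk above; the real class may carry more fields and helpers.

```python
from typing import Optional


class RuntimeConstants:
    """Process-wide runtime flags, set once by the entry-point program (sketch only)."""

    debug: bool = False                # set from `fast_dev_run`
    print_function_call: bool = True   # replaces instantiate_utils.PRINT_FUNCTION_CALL
    cache_dir: Optional[str] = None    # set from the `path` config group
```

Centralizing these flags on one class replaces the previous pattern of mutating a module-level global (`fusion_bench.utils.instantiate_utils.PRINT_FUNCTION_CALL`).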
@@ -164,9 +176,9 @@ class FabricModelFusionProgram(
          self,
          taskpool: BaseTaskPool,
          merged_model: Union[nn.Module, Dict, Iterable],
-         *args,
-         **kwargs,
-     ):
+         *args: Any,
+         **kwargs: Any,
+     ) -> Union[Dict, List, Any]:
          """
          Evaluates the merged model using the provided task pool.

@@ -221,8 +233,16 @@ class FabricModelFusionProgram(
          fabric = self.fabric
          if self.seed is not None:
              L.seed_everything(self.seed)
-         if fabric.global_rank == 0:
-             self._link_hydra_output()
+
+         # create a symbolic link to the hydra output directory
+         if (
+             self.fabric.is_global_zero
+             and self.log_dir is not None
+             and os.path.abspath(self.log_dir) != os.path.abspath(get_hydra_output_dir())
+         ):
+             create_symlink(
+                 get_hydra_output_dir(), self.log_dir, link_name="hydra_output"
+             )

          log.info("Running the model fusion program.")
          # setup the modelpool, method, and taskpool
@@ -243,7 +263,10 @@ class FabricModelFusionProgram(
              compat_load_fn="fusion_bench.compat.taskpool.load_taskpool_from_config",
          )

+         self.method.on_run_start()
          merged_model = self.method.run(self.modelpool)
+         self.method.on_run_end()
+
          if merged_model is None:
              log.info(
                  "No merged model returned by the method. Skipping saving and evaluation."
@@ -261,52 +284,14 @@ class FabricModelFusionProgram(
              if self.report_save_path is not None:
                  # save report (Dict) to a file
                  # if the directory of `save_report` does not exist, create it
-                 if "{log_dir}" in self.report_save_path and self.log_dir is not None:
-                     self.report_save_path = self.report_save_path.format(log_dir=self.log_dir)
+                 if (
+                     "{log_dir}" in self.report_save_path
+                     and self.log_dir is not None
+                 ):
+                     self.report_save_path = self.report_save_path.format(
+                         log_dir=self.log_dir
+                     )
                  os.makedirs(os.path.dirname(self.report_save_path), exist_ok=True)
                  json.dump(report, open(self.report_save_path, "w"))
          else:
              log.info("No task pool specified. Skipping evaluation.")
-
-     @rank_zero_only
-     def _link_hydra_output(self):
-         """
-         Creates a symbolic link to the Hydra output directory within the specified log directory.
-
-         If `self.log_dir` is not None, this method will:
-         1. Retrieve the Hydra output directory using `get_hydra_output_dir()`.
-         2. Create the log directory if it does not already exist.
-         3. Create a symbolic link named "hydra_output_<basename_of_hydra_output_dir>"
-            within the log directory, pointing to the Hydra output directory.
-
-         Note:
-             - The symbolic link is created only if the Hydra output directory is not None.
-             - The `target_is_directory` parameter is set to True to indicate that the target is a directory.
-
-         Raises:
-             OSError: If the symbolic link creation fails.
-         """
-         if self.log_dir is not None:
-             # make symlink to the hydra output directory
-             try:
-                 hydra_output_dir = get_hydra_output_dir()
-             except Exception as e:
-                 hydra_output_dir = None
-
-             if hydra_output_dir is not None:
-                 os.makedirs(self.log_dir, exist_ok=True)
-                 try:
-                     # if the system is windows, use the `mklink` command in "CMD" to create the symlink
-                     if os.name == "nt":
-                         os.system(f"mklink /J {os.path.abspath(os.path.join(self.log_dir, 'hydra_output_' + os.path.basename(hydra_output_dir)))} {os.path.abspath(hydra_output_dir)}")
-                     else:
-                         os.symlink(
-                             hydra_output_dir,
-                             os.path.join(
-                                 self.log_dir,
-                                 "hydra_output_" + os.path.basename(hydra_output_dir),
-                             ),
-                             target_is_directory=True,
-                         )
-                 except OSError as e:
-                     log.warning(f"Failed to create symbolic link: {e}")
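
Note (illustrative, not part of the diff): the removed `_link_hydra_output` method is superseded by `create_symlink` from the new `fusion_bench/utils/path.py` (entry 136, +56 lines), whose implementation is not included in this diff. A minimal sketch consistent with the call site `create_symlink(get_hydra_output_dir(), self.log_dir, link_name="hydra_output")` and with the Windows `mklink /J` fallback the removed code used; the actual helper may differ.

```python
import os


def create_symlink(target: str, directory: str, link_name: str) -> None:
    """Create `directory/link_name` pointing at the `target` directory (sketch only)."""
    os.makedirs(directory, exist_ok=True)
    link_path = os.path.join(directory, link_name)
    if os.path.lexists(link_path):
        # Don't fail when re-running in the same log directory.
        return
    if os.name == "nt":
        # NTFS junctions, unlike symlinks, don't require elevated privileges.
        os.system(
            f'mklink /J "{os.path.abspath(link_path)}" "{os.path.abspath(target)}"'
        )
    else:
        os.symlink(os.path.abspath(target), link_path, target_is_directory=True)
```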