fusion-bench 0.2.22__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. fusion_bench/__init__.py +4 -0
  2. fusion_bench/compat/method/__init__.py +5 -2
  3. fusion_bench/compat/method/base_algorithm.py +3 -2
  4. fusion_bench/compat/modelpool/base_pool.py +3 -3
  5. fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
  6. fusion_bench/dataset/gpt2_glue.py +1 -1
  7. fusion_bench/method/__init__.py +12 -2
  8. fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
  9. fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
  10. fusion_bench/method/bitdelta/bitdelta.py +7 -23
  11. fusion_bench/method/ensemble.py +17 -2
  12. fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
  13. fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
  14. fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
  15. fusion_bench/method/linear/__init__.py +6 -2
  16. fusion_bench/method/linear/{simple_average_for_llama.py → simple_average_for_causallm.py} +8 -4
  17. fusion_bench/method/linear/{task_arithmetic_for_llama.py → task_arithmetic_for_causallm.py} +22 -12
  18. fusion_bench/method/linear/ties_merging_for_causallm.py +70 -0
  19. fusion_bench/method/model_stock/__init__.py +1 -0
  20. fusion_bench/method/model_stock/model_stock.py +309 -0
  21. fusion_bench/method/regmean/clip_regmean.py +3 -6
  22. fusion_bench/method/regmean/regmean.py +27 -56
  23. fusion_bench/method/regmean/utils.py +56 -0
  24. fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
  25. fusion_bench/method/simple_average.py +2 -2
  26. fusion_bench/method/slerp/__init__.py +1 -1
  27. fusion_bench/method/slerp/slerp.py +110 -14
  28. fusion_bench/method/task_arithmetic/task_arithmetic.py +35 -10
  29. fusion_bench/method/ties_merging/ties_merging.py +22 -6
  30. fusion_bench/method/we_moe/flan_t5_we_moe.py +9 -20
  31. fusion_bench/method/wudi/__init__.py +1 -0
  32. fusion_bench/method/wudi/wudi.py +105 -0
  33. fusion_bench/mixins/clip_classification.py +26 -6
  34. fusion_bench/mixins/lightning_fabric.py +4 -0
  35. fusion_bench/mixins/serialization.py +40 -83
  36. fusion_bench/modelpool/base_pool.py +1 -1
  37. fusion_bench/modelpool/causal_lm/causal_lm.py +285 -44
  38. fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
  39. fusion_bench/models/hf_clip.py +4 -0
  40. fusion_bench/models/hf_utils.py +10 -4
  41. fusion_bench/models/linearized/vision_model.py +6 -6
  42. fusion_bench/models/model_card_templates/default.md +8 -1
  43. fusion_bench/models/modeling_smile_mistral/__init__.py +1 -0
  44. fusion_bench/models/we_moe.py +8 -8
  45. fusion_bench/models/wrappers/ensemble.py +136 -7
  46. fusion_bench/scripts/cli.py +2 -2
  47. fusion_bench/taskpool/base_pool.py +99 -17
  48. fusion_bench/taskpool/clip_vision/taskpool.py +12 -5
  49. fusion_bench/taskpool/dummy.py +101 -13
  50. fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
  51. fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
  52. fusion_bench/utils/__init__.py +1 -0
  53. fusion_bench/utils/data.py +6 -4
  54. fusion_bench/utils/devices.py +36 -11
  55. fusion_bench/utils/dtype.py +3 -2
  56. fusion_bench/utils/lazy_state_dict.py +85 -19
  57. fusion_bench/utils/packages.py +3 -3
  58. fusion_bench/utils/parameters.py +0 -2
  59. fusion_bench/utils/rich_utils.py +7 -3
  60. fusion_bench/utils/timer.py +92 -10
  61. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/METADATA +10 -3
  62. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/RECORD +77 -64
  63. fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
  64. fusion_bench_config/method/ensemble/simple_ensemble.yaml +1 -0
  65. fusion_bench_config/method/linear/{simple_average_for_llama.yaml → simple_average_for_causallm.yaml} +1 -1
  66. fusion_bench_config/method/linear/task_arithmetic_for_causallm.yaml +4 -0
  67. fusion_bench_config/method/linear/ties_merging_for_causallm.yaml +13 -0
  68. fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
  69. fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
  70. fusion_bench_config/method/wudi/wudi.yaml +4 -0
  71. fusion_bench_config/modelpool/CausalLMPool/{Qwen2.5-1.5B_math_and_coder.yaml → Qwen2.5-1.5B_math_and_code.yaml} +1 -2
  72. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_three_models.yaml +11 -0
  73. fusion_bench_config/modelpool/CausalLMPool/llama-7b_3-models_v1.yaml +11 -0
  74. fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +0 -4
  75. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/WHEEL +0 -0
  76. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/entry_points.txt +0 -0
  77. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/licenses/LICENSE +0 -0
  78. {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/top_level.txt +0 -0
fusion_bench/models/linearized/vision_model.py
@@ -45,21 +45,21 @@ def linearize_lora_model_(model):
 
 
 def load_fft_vision_model_hf(
-    model_name: str, return_vison_model=True
+    model_name: str, return_vision_model=True
 ) -> Union[CLIPVisionTransformer, CLIPVisionModel]:
     """
     Load a CLIP vision model from Hugging Face.
 
     Args:
         model_name (str): The name of the CLIP vision model to load from Hugging Face.
-        return_vison_model (bool, optional): If False, the full CLIPVisionModel is returned. If True, only the vision model (`CLIPVisionTransformer`) is returned. Defaults to True.
+        return_vision_model (bool, optional): If False, the full CLIPVisionModel is returned. If True, only the vision model (`CLIPVisionTransformer`) is returned. Defaults to True.
 
     Returns:
         Union[CLIPVisionTransformer, CLIPVisionModel]: The vision model.
     """
     model = CLIPVisionModel.from_pretrained(model_name)
 
-    if return_vison_model:
+    if return_vision_model:
         return CLIPVisionModel.from_pretrained(model_name).vision_model
     else:
         return model
@@ -69,7 +69,7 @@ def load_lora_vision_model_hf(
     base_model_name: str,
     peft_name: str,
     merge_and_unload: bool = False,
-    return_vison_model=True,
+    return_vision_model=True,
 ) -> PeftModel:
     """
     Load a LoRA (Low-Rank Adaptation) vision model from Hugging Face.
@@ -80,7 +80,7 @@ def load_lora_vision_model_hf(
         base_model_name (str): The name of the base vision model to load from Hugging Face.
         peft_name (str): The name of the LoRA adaptation to apply to the base model.
         merge_and_unload (bool, optional): If True, the LoRA adaptation is merged into the base model and the LoRA layers are removed. Defaults to False.
-        return_vison_model (bool, optional): If False, the full CLIPVisionModel is returned. If True, only the vision model (`CLIPVisionTransformer`) is returned. Defaults to True.
+        return_vision_model (bool, optional): If False, the full CLIPVisionModel is returned. If True, only the vision model (`CLIPVisionTransformer`) is returned. Defaults to True.
 
     Returns:
         PeftModel: The adapted vision model, optionally merged and unloaded.
@@ -97,7 +97,7 @@ def load_lora_vision_model_hf(
         vision_model = peft_model
 
     # Return the vision model
-    if return_vison_model:
+    if return_vision_model:
         return vision_model
     else:
         model.vision_model = vision_model
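
The only behavioral change in this file is the spelling fix `return_vison_model` → `return_vision_model`, so any call site that passed the old keyword will now raise a `TypeError`. A minimal usage sketch of the corrected keyword; the checkpoint name is illustrative:

```python
from fusion_bench.models.linearized.vision_model import load_fft_vision_model_hf

# Default (True): returns only the inner CLIPVisionTransformer.
vision_model = load_fft_vision_model_hf(
    "openai/clip-vit-base-patch32", return_vision_model=True
)

# The old misspelled keyword no longer exists:
# load_fft_vision_model_hf(..., return_vison_model=True)  # TypeError
```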
fusion_bench/models/model_card_templates/default.md
@@ -1,5 +1,8 @@
 ---
 base_model:
+{%- if base_model is not none %}
+- {{ base_model }}
+{%- endif %}
 {%- for model in models %}
 - {{ model }}
 {%- endfor %}
@@ -18,7 +21,11 @@ tags:
 This is a merged model created using [fusion-bench](https://github.com/tanganke/fusion_bench).
 
 The following models were included in the merge:
-{% for model in models %}
+
+{% if base_model is not none %}
+- base model: {{ base_model }}
+{%- endif %}
+{%- for model in models %}
 - {{ model }}
 {%- endfor %}
 
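This template change makes the base model an optional entry in the generated model card. A sketch of how the updated front-matter block renders, using jinja2 directly; the template string is abbreviated from the hunk above and the model names are illustrative:

```python
from jinja2 import Template

template = Template(
    "base_model:\n"
    "{%- if base_model is not none %}\n"
    "- {{ base_model }}\n"
    "{%- endif %}\n"
    "{%- for model in models %}\n"
    "- {{ model }}\n"
    "{%- endfor %}"
)
print(template.render(base_model="org/base-7b", models=["org/expert-a", "org/expert-b"]))
# base_model:
# - org/base-7b
# - org/expert-a
# - org/expert-b
```

When `base_model` is None, the `if` block renders nothing and the card lists only the merged models, matching the old behavior.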
fusion_bench/models/modeling_smile_mistral/__init__.py
@@ -1,6 +1,7 @@
 from . import register
 from .configuration_smile_mistral import SmileMistralConfig
 from .modeling_smile_mistral import (
+    SmileMistralDecoderLayer,
     SmileMistralForCausalLM,
     SmileMistralModel,
 )
fusion_bench/models/we_moe.py
@@ -1,6 +1,6 @@
 import functools
 import logging
-from typing import List
+from typing import Generic, List
 
 import torch
 import torch.func
@@ -9,7 +9,7 @@ from torch.func import functional_call
 from torch.nn import functional as F
 
 from fusion_bench.models.utils import del_attr, get_attr, set_attr
-from fusion_bench.utils.type import StateDictType
+from fusion_bench.utils.type import StateDictType, TorchModelType
 
 log = logging.getLogger(__name__)
 
@@ -76,15 +76,15 @@ def construct_weight_ensembling_gate(
     return gate
 
 
-class WeightEnsemblingMoE(nn.Module):
+class WeightEnsemblingMoE(nn.Module, Generic[TorchModelType]):
     # variable to store the merged state dict temporarily
     _merged_state_dict: StateDictType = None
 
     def __init__(
         self,
         hidden_size: int,
-        base_model: nn.Module,
-        expert_models: List[nn.Module],
+        base_model: TorchModelType,
+        expert_models: List[TorchModelType],
         init_lambda: float = 0.2,
         batch_first: bool = False,
         router_hidden_layers: int = 2,
@@ -101,8 +101,8 @@ class WeightEnsemblingMoE(nn.Module):
         Args:
 
             hidden_size (int): The size of the hidden layer in the models.
-            base_model (nn.Module): The base model that will be used as a reference for the expert models.
-            expert_models (List[nn.Module]): A list of expert models that will be combined.
+            base_model (TorchModelType): The base model that will be used as a reference for the expert models.
+            expert_models (List[TorchModelType]): A list of expert models that will be combined.
             init_lambda (float, optional): The initial lambda value for the weight ensembling gate. Defaults to 0.2.
             batch_first (bool, optional): If True, the input tensors are expected to have the batch size as the first dimension. Defaults to False.
             router_hidden_layers (int, optional): The number of hidden layers in the router. Defaults to 2.
@@ -145,7 +145,7 @@ class WeightEnsemblingMoE(nn.Module):
             self._merged_state_dict,
         )
 
-    def merge_weights(self, expert_weights):
+    def merge_weights(self, expert_weights) -> StateDictType:
         state_dict = self.base_model.state_dict(keep_vars=True)
         for weight, task_vector in zip(expert_weights, self.task_vectors):
            for name, param in task_vector.named_parameters():
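
These edits are type-annotation changes only: `WeightEnsemblingMoE` becomes generic over the wrapped model class, and `merge_weights` gains a return annotation. A minimal sketch of what the `Generic[TorchModelType]` base buys, assuming `TorchModelType` is a `TypeVar` bound to `nn.Module` (its use here implies something of that shape):

```python
from typing import Generic, TypeVar

from torch import nn

# Assumed definition; fusion_bench.utils.type presumably declares it similarly.
TorchModelType = TypeVar("TorchModelType", bound=nn.Module)


class Wrapper(nn.Module, Generic[TorchModelType]):
    """Toy stand-in for WeightEnsemblingMoE's new generic base."""

    def __init__(self, base_model: TorchModelType):
        super().__init__()
        self.base_model = base_model


# A type checker now infers w.base_model as nn.Linear rather than bare nn.Module.
w: Wrapper[nn.Linear] = Wrapper(nn.Linear(8, 8))
```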
fusion_bench/models/wrappers/ensemble.py
@@ -1,10 +1,17 @@
-from typing import Any, Callable, Dict, List, Union, cast
+import logging
+from typing import Any, Callable, Dict, Generic, List, Union, cast
 
 import numpy as np
 import torch
+import torch.futures
 from omegaconf import ListConfig
 from torch import Tensor, nn
 
+from fusion_bench.utils.devices import to_device
+from fusion_bench.utils.type import TorchModelType
+
+log = logging.getLogger(__name__)
+
 
 def aggregate_tensors(
     outputs: List[Any], aggregate_fn: Callable
@@ -58,12 +65,16 @@ def aggregate_tensors(
         raise ValueError("Unsupported type for outputs")
 
 
-class EnsembleModule(nn.Module):
+class EnsembleModule(nn.Module, Generic[TorchModelType]):
     """
     Ensemble module that averages the outputs of multiple models.
     """
 
-    def __init__(self, models: List[nn.Module]):
+    def __init__(
+        self,
+        models: List[TorchModelType],
+        device_map: Dict[int, Union[int, str]] | None = None,
+    ):
         """
         Initializes the EnsembleModule with a list of models.
 
@@ -73,6 +84,16 @@ class EnsembleModule(nn.Module):
         super().__init__()
         # TODO: distribute models to devices
        self.model_list = nn.ModuleList(models)
+        self.device_map = device_map
+        if self.device_map is not None:
+            self._move_models_to_devices()
+
+    def _move_models_to_devices(self):
+        for model_idx, device_id in self.device_map.items():
+            log.info(f"Moving model {model_idx} to device {device_id}")
+            self.model_list[model_idx] = self.model_list[model_idx].to(
+                device_id, non_blocking=True
+            )
 
     def _aggregate_tensors(self, outputs: List[Tensor]) -> Tensor:
         """
@@ -86,6 +107,49 @@ class EnsembleModule(nn.Module):
         """
         return torch.stack(outputs).mean(dim=0)
 
+    def _parallel_forward_with_device_map(self, *args: Any, **kwargs: Any) -> List[Any]:
+        """
+        Performs parallel forward pass using device mapping with futures.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+
+        Returns:
+            List[Any]: List of outputs from all models, all moved to the same device.
+        """
+        futures = []
+
+        device_data_cache = {}
+        for i, model in enumerate(self.model_list):
+            device_id = self.device_map.get(i, "cpu")
+
+            if device_id not in device_data_cache:
+                # Move inputs to the same device as the model
+                device_args = to_device(
+                    args, device_id, copy_on_move=True, non_blocking=True
+                )
+                device_kwargs = to_device(
+                    kwargs, device_id, copy_on_move=True, non_blocking=True
+                )
+                device_data_cache[device_id] = (device_args, device_kwargs)
+            else:
+                device_args, device_kwargs = device_data_cache[device_id]
+
+            # Create a future for asynchronous execution
+            future = torch.jit.fork(model, *device_args, **device_kwargs)
+            futures.append(future)
+
+        # Wait for all futures to complete and collect results
+        outputs = [torch.jit.wait(future) for future in futures]
+
+        # Move all outputs to the same device (use the device of the first model or cpu as fallback)
+        target_device = self.device_map.get(0, "cpu") if self.device_map else "cpu"
+        outputs = [
+            to_device(output, target_device, non_blocking=True) for output in outputs
+        ]
+        return outputs
+
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         """
         Performs a forward pass by averaging the outputs of the models.
@@ -97,20 +161,25 @@ class EnsembleModule(nn.Module):
         Returns:
             Aggregated output from the ensemble of models.
         """
-        outputs = [model(*args, **kwargs) for model in self.model_list]
+        if self.device_map is None:
+            outputs = [model(*args, **kwargs) for model in self.model_list]
+        else:
+            # Parallel execution with device mapping
+            outputs = self._parallel_forward_with_device_map(*args, **kwargs)
         return aggregate_tensors(outputs, self._aggregate_tensors)
 
 
-class WeightedEnsembleModule(nn.Module):
+class WeightedEnsembleModule(nn.Module, Generic[TorchModelType]):
     """
     Ensemble module that computes a weighted average of the outputs from multiple models.
     """
 
     def __init__(
         self,
-        models: List[nn.Module],
+        models: List[TorchModelType],
         weights: List[float] | Tensor | np.ndarray,
         normalize: bool = True,
+        device_map: Dict[int, Union[int, str]] | None = None,
     ):
         """
         Initializes the WeightedEnsembleModule with models and their corresponding weights.
@@ -119,9 +188,12 @@ class WeightedEnsembleModule(nn.Module):
             models (List[nn.Module]): List of models to ensemble.
             weights (List[float] | Tensor | np.ndarray): Weights for each model.
             normalize (bool, optional): If True, normalizes the weights. Defaults to True.
+            device_map (Dict[int, Union[int, str]] | None, optional): Device mapping for parallel execution. Defaults to None.
         """
         super().__init__()
         self.model_list = nn.ModuleList(models)
+        self.device_map = device_map
+
         if isinstance(weights, (list, tuple, ListConfig)):
             weights = torch.tensor(weights)
         elif isinstance(weights, Tensor):
@@ -139,6 +211,17 @@ class WeightedEnsembleModule(nn.Module):
             weights = weights / weights.sum()
         self.register_buffer("weights", weights)
 
+        if self.device_map is not None:
+            self._move_models_to_devices()
+
+    def _move_models_to_devices(self):
+        """Move models to their assigned devices according to device_map."""
+        for model_idx, device_id in self.device_map.items():
+            log.info(f"Moving model {model_idx} to device {device_id}")
+            self.model_list[model_idx] = self.model_list[model_idx].to(
+                device_id, non_blocking=True
+            )
+
     def _aggregate_tensors(self, outputs: List[Tensor]) -> Tensor:
         """
         Aggregates a list of tensors using the provided weights.
@@ -152,6 +235,48 @@ class WeightedEnsembleModule(nn.Module):
         weights = cast(Tensor, self.weights).view(-1, *([1] * outputs[0].dim()))
         return (torch.stack(outputs) * weights).sum(dim=0)
 
+    def _parallel_forward_with_device_map(self, *args: Any, **kwargs: Any) -> List[Any]:
+        """
+        Performs parallel forward pass using device mapping with futures.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+
+        Returns:
+            List[Any]: List of outputs from all models, all moved to the same device.
+        """
+        futures = []
+
+        device_data_cache = {}
+        for i, model in enumerate(self.model_list):
+            device_id = self.device_map.get(i, "cpu")
+
+            if device_id not in device_data_cache:
+                # Move inputs to the same device as the model
+                device_args = to_device(
+                    args, device_id, copy_on_move=True, non_blocking=True
+                )
+                device_kwargs = to_device(
+                    kwargs, device_id, copy_on_move=True, non_blocking=True
+                )
+                device_data_cache[device_id] = (device_args, device_kwargs)
+            else:
+                device_args, device_kwargs = device_data_cache[device_id]
+
+            # Create a future for asynchronous execution
+            future = torch.jit.fork(model, *device_args, **device_kwargs)
+            futures.append(future)
+
+        # Wait for all futures to complete and collect results
+        outputs = [torch.jit.wait(future) for future in futures]
+
+        # Move all outputs to the same device (use the device of the first model or cpu as fallback)
+        target_device = self.device_map.get(0, "cpu") if self.device_map else "cpu"
+        outputs = [to_device(output, target_device) for output in outputs]
+
+        return outputs
+
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         """
         Performs a forward pass by computing the weighted average of the models' outputs.
@@ -163,7 +288,11 @@ class WeightedEnsembleModule(nn.Module):
         Returns:
             Weighted aggregated output from the ensemble of models.
         """
-        outputs = [model(*args, **kwargs) for model in self.model_list]
+        if self.device_map is None:
+            outputs = [model(*args, **kwargs) for model in self.model_list]
+        else:
+            # Parallel execution with device mapping
+            outputs = self._parallel_forward_with_device_map(*args, **kwargs)
         return aggregate_tensors(outputs, self._aggregate_tensors)
 
 
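The new `device_map` argument assigns each ensemble member to its own device and dispatches the member forwards concurrently via `torch.jit.fork`, caching the moved inputs per device. A usage sketch, assuming the classes are importable from the file path above and that two CUDA devices are available; device ids are illustrative:

```python
import torch
from torch import nn

from fusion_bench.models.wrappers.ensemble import EnsembleModule

# Two small models, one per GPU.
models = [nn.Linear(16, 4), nn.Linear(16, 4)]
ensemble = EnsembleModule(models, device_map={0: "cuda:0", 1: "cuda:1"})

x = torch.randn(8, 16)
# Inputs are copied to each model's device, the two forwards run
# concurrently via torch.jit.fork, and the averaged result lands on
# model 0's device ("cuda:0").
y = ensemble(x)
```

Without a `device_map`, both wrappers fall back to the old sequential loop over `self.model_list`.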
fusion_bench/scripts/cli.py
@@ -20,8 +20,8 @@ log = logging.getLogger(__name__)
 
 
 def _get_default_config_path():
-    for config_dir in ["fusion_bench_config", "config"]:
-        for config_path_root in [os.getcwd(), PROJECT_ROOT_PATH]:
+    for config_path_root in [os.getcwd(), PROJECT_ROOT_PATH]:
+        for config_dir in ["config", "fusion_bench_config"]:
             config_path = os.path.join(config_path_root, config_dir)
             if os.path.exists(config_path) and os.path.isdir(config_path):
                 return os.path.abspath(config_path)
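
Swapping the loop nesting changes the search priority: the working directory now wins over the package root, and within each root a plain `config/` directory is preferred over `fusion_bench_config/`. A sketch of the resulting candidate order; `PROJECT_ROOT_PATH` is a stand-in for the package constant used above:

```python
import os

PROJECT_ROOT_PATH = "/path/to/fusion_bench"  # illustrative stand-in

# Candidate order after the swap; the first existing directory wins:
#   <cwd>/config, <cwd>/fusion_bench_config,
#   <root>/config, <root>/fusion_bench_config
candidates = [
    os.path.join(root, config_dir)
    for root in (os.getcwd(), PROJECT_ROOT_PATH)
    for config_dir in ("config", "fusion_bench_config")
]
```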
fusion_bench/taskpool/base_pool.py
@@ -5,33 +5,115 @@ from fusion_bench.mixins import BaseYAMLSerializable
 
 
 class BaseTaskPool(BaseYAMLSerializable):
+    """Abstract base class for task pools in the FusionBench framework.
+
+    A task pool represents a collection of evaluation tasks that can be used to
+    assess model performance across multiple benchmarks or datasets. This base
+    class defines the common interface that all task pool implementations must
+    follow, ensuring consistency across different task types and evaluation
+    scenarios.
+
+    Task pools are designed to be configurable through YAML files and can be
+    used in various model fusion and evaluation workflows. They provide a
+    standardized way to evaluate models on multiple tasks and aggregate results.
+
+    The class inherits from BaseYAMLSerializable to support configuration
+    management and serialization capabilities.
+
+    Attributes:
+        _program: Optional program reference for execution context.
+        _config_key: Configuration key used for YAML configuration ("taskpool").
+
+    Abstract Methods:
+        evaluate: Must be implemented by subclasses to define task-specific
+            evaluation logic.
+
+    Example:
+        Implementing a custom task pool:
+
+        ```python
+        class MyTaskPool(BaseTaskPool):
+
+            def evaluate(self, model, **kwargs):
+                results = {}
+                for task_name in self.tasks:
+                    # Implement task-specific evaluation
+                    results[task_name] = self._evaluate_task(model, task_name)
+                return results
+        ```
+    """
+
     _program = None
     _config_key = "taskpool"
 
     @abstractmethod
     def evaluate(self, model: Any, *args: Any, **kwargs: Any) -> Dict[str, Any]:
-        """
-        Evaluate the model on all tasks in the task pool, and return a report.
+        """Evaluate a model on all tasks in the task pool and return aggregated results.
 
-        Take image classification as an example, the report will look like:
+        This abstract method defines the core evaluation interface that all task pool
+        implementations must provide. It should evaluate the given model on all tasks
+        managed by the pool and return a structured report of the results.
 
-        ```python
-        {
-            "mnist": {
-                "accuracy": 0.8,
-                "loss": 0.2,
-            },
-            <task_name>: {
-                <metric_name>: <metric_value>,
-                ...
-            },
-        }
-        ```
+        The evaluation process typically involves:
+        1. Iterating through all tasks in the pool
+        2. Running model inference on each task's dataset
+        3. Computing task-specific metrics
+        4. Aggregating results into a standardized report format
 
         Args:
-            model: The model to evaluate.
+            model: The model to evaluate. Can be any model type (PyTorch model,
+                Hugging Face model, etc.) that is compatible with the specific
+                task pool implementation.
+            *args: Additional positional arguments that may be needed for
+                task-specific evaluation procedures.
+            **kwargs: Additional keyword arguments for evaluation configuration,
+                such as batch_size, device, evaluation metrics, etc.
 
         Returns:
-            report (dict): A dictionary containing the results of the evaluation for each task.
+            Dict[str, Any]: A dictionary containing evaluation results for each task.
+                The structure follows the pattern:
+
+                ```python
+                {
+                    "task_name_1": {
+                        "metric_1": value,
+                        "metric_2": value,
+                        ...
+                    },
+                    "task_name_2": {
+                        "metric_1": value,
+                        "metric_2": value,
+                        ...
+                    },
+                    ...
+                }
+                ```
+
+        Example:
+            For an image classification task pool:
+
+            ```python
+            results = task_pool.evaluate(model)
+            # Returns:
+            # {
+            #     "mnist": {
+            #         "accuracy": 0.95,
+            #         "loss": 0.15,
+            #     },
+            #     "cifar10": {
+            #         "accuracy": 0.87,
+            #         "loss": 0.42,
+            #     }
+            # }
+            ```
+
+        Raises:
+            NotImplementedError: This method must be implemented by subclasses.
+
+        Note:
+            Implementations should ensure that the returned dictionary structure
+            is consistent and that metric names are standardized across similar
+            task types to enable meaningful comparison and aggregation.
         """
         pass
fusion_bench/taskpool/clip_vision/taskpool.py
@@ -27,7 +27,7 @@ from tqdm.autonotebook import tqdm
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 from transformers.models.clip.modeling_clip import CLIPVisionTransformer
 
-from fusion_bench import RuntimeConstants
+from fusion_bench import RuntimeConstants, auto_register_config
 from fusion_bench.dataset import CLIPDataset
 from fusion_bench.mixins import HydraConfigMixin, LightningFabricMixin
 from fusion_bench.models.hf_clip import HFCLIPClassifier
@@ -86,6 +86,7 @@ class LayerWiseFeatureSaver:
         torch.save(features, self.save_path)
 
 
+@auto_register_config
 class CLIPVisionModelTaskPool(
     HydraConfigMixin,
     LightningFabricMixin,
@@ -134,11 +135,13 @@ class CLIPVisionModelTaskPool(
         layer_wise_feature_first_token_only: bool = True,
         layer_wise_feature_max_num: Optional[int] = None,
         fast_dev_run: Optional[bool] = None,
+        move_to_device: bool = True,
         **kwargs,
     ):
         """
         Initialize the CLIPVisionModelTaskPool.
         """
+        super().__init__(**kwargs)
         self._test_datasets = test_datasets
         self._processor = processor
         self._data_processor = data_processor
@@ -159,7 +162,6 @@ class CLIPVisionModelTaskPool(
             self.fast_dev_run = RuntimeConstants().debug
         else:
             self.fast_dev_run = fast_dev_run
-        super().__init__(**kwargs)
 
     def setup(self):
         """
@@ -220,7 +222,9 @@ class CLIPVisionModelTaskPool(
             for name, dataset in self.test_datasets.items()
         }
         self.test_dataloaders = {
-            name: self.fabric.setup_dataloaders(dataloader)
+            name: self.fabric.setup_dataloaders(
+                dataloader, move_to_device=self.move_to_device
+            )
             for name, dataloader in self.test_dataloaders.items()
         }
 
@@ -273,6 +277,8 @@ class CLIPVisionModelTaskPool(
                 task_name=task_name,
             )
             logits: Tensor = outputs["logits"]
+            if logits.device != targets.device:
+                targets = targets.to(logits.device)
 
             loss = F.cross_entropy(logits, targets)
             loss_metric.update(loss.detach().cpu())
@@ -309,7 +315,7 @@ class CLIPVisionModelTaskPool(
         self.setup()
 
         report = {}
-        # CLIPVisionModel works the same with CLIPVisonTransformer, so we can use it directly
+        # CLIPVisionModel works the same with CLIPVisionTransformer, so we can use it directly
         if hasattr(model, "is_surgery_model") and model.is_surgery_model:
             log.info("running evaluation on a surgery model.")
             model: "SurgeryModelWrapper" = model
@@ -321,7 +327,8 @@ class CLIPVisionModelTaskPool(
                 self.clip_model,
                 processor=self.processor,
            )
-            classifier = cast(HFCLIPClassifier, self.fabric.to_device(classifier))
+            if self.move_to_device:
+                classifier = cast(HFCLIPClassifier, self.fabric.to_device(classifier))
             # collect basic model information
             training_params, all_params = count_parameters(model)
             report["model_info"] = {
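
The new `move_to_device` flag lets the evaluation loop leave batches wherever the dataloader produced them, which is what a device-mapped model (such as the ensemble wrappers above) wants; the added `targets.to(logits.device)` guard then keeps the loss computation on one device. A self-contained sketch of that guard:

```python
import torch
import torch.nn.functional as F

# The device-alignment fix in miniature: with move_to_device=False the
# batch can stay on the CPU while the model produces logits on another
# device, so the targets are moved to the logits' device before the loss.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
logits = torch.randn(4, 10, device=device)
targets = torch.randint(0, 10, (4,))  # still on the CPU
if logits.device != targets.device:
    targets = targets.to(logits.device)
loss = F.cross_entropy(logits, targets)
```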