fusion-bench 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. fusion_bench/method/ada_svd/clip_vision.py +4 -1
  2. fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +46 -145
  3. fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +229 -0
  4. fusion_bench/method/smile_upscaling/smile_upscaling.py +6 -336
  5. fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +2 -203
  6. fusion_bench/models/modeling_smile_qwen2/__init__.py +8 -0
  7. fusion_bench/models/modeling_smile_qwen2/configuration_smile_qwen2.py +21 -0
  8. fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +922 -0
  9. fusion_bench/models/modeling_smile_qwen2/register.py +11 -0
  10. fusion_bench/models/rankone_moe.py +2 -88
  11. fusion_bench/models/smile_moe/linear_from_hf_config.py +373 -0
  12. fusion_bench/models/smile_moe/{linear.py → linear_from_module.py} +103 -33
  13. fusion_bench/models/smile_moe/utils/__init__.py +24 -0
  14. fusion_bench/models/smile_moe/utils/svd_utils.py +46 -0
  15. fusion_bench/taskpool/__init__.py +2 -0
  16. fusion_bench/taskpool/lm_eval_harness/__init__.py +3 -0
  17. fusion_bench/taskpool/lm_eval_harness/taskpool.py +87 -0
  18. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/METADATA +22 -2
  19. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/RECORD +27 -14
  20. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/WHEEL +1 -1
  21. fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +5 -2
  22. fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +13 -0
  23. fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +17 -0
  24. fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +12 -0
  25. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/entry_points.txt +0 -0
  26. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/licenses/LICENSE +0 -0
  27. {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/top_level.txt +0 -0
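
Most of the churn below follows two moves: fusion_bench/models/smile_moe/linear.py is renamed to linear_from_module.py (item 12), and a new linear_from_hf_config.py (item 11) takes over the upscaling helpers that previously lived inline in smile_mistral_upscaling.py. Downstream imports change accordingly; a minimal before/after sketch mirroring the first hunk below (illustrative only):

# fusion-bench 0.2.13
from fusion_bench.models.smile_moe.linear import ExpertNotTrainedError, SmileMoELinear

# fusion-bench 0.2.14: same symbols, new module name
from fusion_bench.models.smile_moe.linear_from_module import (
    ExpertNotTrainedError,
    SmileMoELinear,
)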
fusion_bench/method/ada_svd/clip_vision.py

@@ -31,7 +31,10 @@ from fusion_bench.method import WeightedAverageAlgorithm
  from fusion_bench.method.simple_average import simple_average
  from fusion_bench.mixins import SimpleProfilerMixin
  from fusion_bench.modelpool import CLIPVisionModelPool
- from fusion_bench.models.smile_moe.linear import ExpertNotTrainedError, SmileMoELinear
+ from fusion_bench.models.smile_moe.linear_from_module import (
+     ExpertNotTrainedError,
+     SmileMoELinear,
+ )
  from fusion_bench.models.utils import find_layers_with_type, get_attr, set_attr
  from fusion_bench.utils.devices import get_device
 
fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py

@@ -9,11 +9,16 @@ import torch.nn.functional as F
  from accelerate import init_empty_weights
  from torch import Tensor, nn
  from tqdm.auto import tqdm
- from transformers import AutoConfig, AutoTokenizer, MistralForCausalLM
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     MistralForCausalLM,
+ )
  from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
 
- from fusion_bench.compat.method import ModelFusionAlgorithm
  from fusion_bench.compat.modelpool import to_modelpool
+ from fusion_bench.method import BaseAlgorithm
  from fusion_bench.method.simple_average import simple_average
  from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
  from fusion_bench.modelpool import BaseModelPool
@@ -25,158 +30,23 @@ from fusion_bench.models.modeling_smile_mistral.modeling_smile_mistral import (
      SmileLinear,
      SmileMistralDecoderLayer,
  )
- from fusion_bench.models.utils import get_attr, set_attr
+ from fusion_bench.models.smile_moe.linear_from_hf_config import (
+     ExpertNotTrainedError,
+     upscale_to_smile_linear,
+ )
  from fusion_bench.utils.dtype import parse_dtype
  from fusion_bench.utils.parameters import print_parameters
- from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
 
  log = logging.getLogger(__name__)
 
 
- class ExpertNotTrainedError(Exception):
-     pass
-
-
- def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
-     """
-     Check if a tensor or a list of tensors are all zeros.
-
-     Args:
-         tensor (Tensor | List[Tensor]): The tensor or list of tensors to check.
-
-     Returns:
-         bool: True if all elements are zeros, False otherwise.
-     """
-     if isinstance(tensor, Tensor):
-         return torch.allclose(tensor, torch.zeros_like(tensor))
-     else:
-         return all(_is_all_zeros(t) for t in tensor)
-
-
- def _svd(w: Tensor, full_matrices=False) -> Tuple[Tensor, Tensor, Tensor]:
-     """
-     Perform Singular Value Decomposition (SVD) on a tensor.
-
-     Args:
-         w (Tensor): The input tensor.
-         full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to False.
-
-     Returns:
-         Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
-     """
-     device = w.device
-     if w.device != torch.float32 or w.device != torch.float64:
-         w = w.float()
-
-     u, s, vh = torch.linalg.svd(
-         w,
-         full_matrices=full_matrices,
-         # driver="gesvd" if w.is_cuda else None
-     )
-     v = vh.T
-
-     u = u.to(device)
-     s = s.to(device)
-     v = v.to(device)
-     return u, s, v
-
-
- def svd(
-     w: Tensor, full_matrices=True, accelerator=None
- ) -> Tuple[Tensor, Tensor, Tensor]:
-     """
-     Perform SVD on a tensor with optional acceleration.
-
-     Args:
-         w (Tensor): The input tensor.
-         full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to True.
-         accelerator (optional): The device to perform the computation on. Defaults to None.
-
-     Returns:
-         Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
-     """
-     if accelerator is None:
-         return _svd(w, full_matrices=full_matrices)
-     original_device = w.device
-     w = w.to(accelerator)
-     u, s, v = _svd(w)
-     return u, s, v
-
-
- @torch.no_grad()
- def upscale_to_smile_linear(
-     base: nn.Linear, experts: List[nn.Linear], target: SmileLinear, accelerator=None
- ):
-     """
-     Upscale a base linear layer to a SmileLinear layer using expert models.
-
-     Args:
-         base (nn.Linear): The base linear layer.
-         experts (List[nn.Linear]): A list of expert linear layers.
-         target (SmileLinear): The target SmileLinear layer.
-         accelerator (optional): The device to perform the computation on. Defaults to None.
-
-     Returns:
-         SmileLinear: The upscaled SmileLinear layer.
-     """
-     w = base.weight
-     w_ft_list = [e.weight for e in experts]
-     dw_list = [w_ft - w for w_ft in w_ft_list]
-
-     if _is_all_zeros(dw_list):
-         raise ExpertNotTrainedError("Expert models are not trained")
-
-     rank_of_router = target.rank_of_router
-     rank_of_expert = target.rank_of_expert
-     num_local_experts = target.num_local_experts
-     svd_list = [svd(dw, accelerator=accelerator) for dw in dw_list]
-
-     # gate
-     gate_weight = []
-     for u, s, v in svd_list:
-         gate_weight.append(v[:, :rank_of_router].T)
-     gate_weight = (
-         torch.stack(gate_weight, dim=0)
-         .reshape(num_local_experts * rank_of_router, -1)
-         .contiguous()
-     )
-
-     target.gate.load_state_dict({"weight": gate_weight})
-
-     # shared linear
-     target.shared_linear.load_state_dict(base.state_dict())
-
-     # experts
-     if rank_of_expert > 0:
-         for expert_idx, target_expert in enumerate(target.experts):
-             u, s, v = svd_list[expert_idx]
-             u = u[:, :rank_of_expert]
-             s = s[:rank_of_expert]
-             v = v[:, :rank_of_expert]
-             state_dict = {"u": u, "svh": (s * v).T}
-             if experts[expert_idx].bias is not None:
-                 state_dict["bias"] = experts[expert_idx].bias.data
-             target_expert.load_state_dict(state_dict)
-     else:
-         for expert_idx, target_expert in enumerate(target.experts):
-             target_expert.load_state_dict(
-                 state_dict_sub(experts[expert_idx].state_dict(), base.state_dict())
-             )
-
-     return target
-
-
- class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
+ class SmileMistralUpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
      R"""
      SmileMistralUpscalingAlgorithm is a model fusion algorithm designed to upscale
      a pretrained Mistral model using a set of fine-tuned expert models. The algorithm
      leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
      model and the expert models into a new upscaled model.
 
-     Attributes:
-         modelpool (BaseModelPool): The pool of models to be used for upscaling.
-         config (dict): Configuration parameters for the upscaling process.
-
      Methods:
          run(modelpool: BaseModelPool) -> SmileMistralForCausalLM:
              Executes the upscaling process and returns the upscaled model.
@@ -185,6 +55,37 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
              Merges the pretrained model with the fine-tuned models to create an upscaled model.
      """
 
+     _config_mapping = BaseAlgorithm._config_mapping | {
+         "device": "device",
+         "accelerator": "accelerator",
+         "model_path": "model_path",
+         "model_dtype": "model_dtype",
+         "num_experts_per_tok": "num_experts_per_tok",
+         "rank_of_router": "rank_of_router",
+         "rank_of_expert": "rank_of_expert",
+     }
+
+     def __init__(
+         self,
+         device,
+         accelerator,
+         model_path,
+         model_dtype,
+         num_experts_per_tok,
+         rank_of_router,
+         rank_of_expert,
+         **kwargs,
+     ):
+         self.device = device
+         self.accelerator = accelerator
+         self.model_path = model_path
+         self.model_dtype = model_dtype
+         # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
+         self.num_experts_per_tok = num_experts_per_tok
+         self.rank_of_router = rank_of_router
+         self.rank_of_expert = rank_of_expert
+         super().__init__(**kwargs)
+
      @torch.no_grad()
      def run(self, modelpool: BaseModelPool) -> SmileMistralForCausalLM:
          """
@@ -199,15 +100,15 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
          self.modelpool = modelpool = to_modelpool(modelpool)
          config = self.config
 
-         print(config)
+         # load model from path if provided and return directly
          if config.model_path is not None and os.path.exists(config.model_path):
              log.info(f"Loading model from {config.model_path}")
-             model = torch.load(config.model_path)
+             model = AutoModelForCausalLM.from_pretrained(config.model_path)
              print_parameters(model)
              return model
 
          with self.profile("load pretrained model"):
-             pretrained_model = modelpool.load_model("_pretrained_")
+             pretrained_model = modelpool.load_pretrained_model()
          with self.profile("load fine-tuned model"):
              finetuned_models = [
                  m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
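
Note that the helpers deleted above (ExpertNotTrainedError, _is_all_zeros, svd, upscale_to_smile_linear) are not gone: the file now imports ExpertNotTrainedError and upscale_to_smile_linear from the new fusion_bench/models/smile_moe/linear_from_hf_config.py (item 11), with SVD utilities also appearing in the new smile_moe/utils/svd_utils.py (item 14), so the Mistral and Qwen2 upscalers can share one implementation. For readers new to SMILE upscaling, the removed code shows the core construction: the router weight for each expert is built from the top rank_of_router right-singular vectors of that expert's weight delta dw = w_ft - w. A minimal standalone sketch of just that step, using plain torch on toy shapes (names here are illustrative, not the library API):

import torch

def router_weight_from_deltas(dw_list, rank_of_router):
    # dw_list: one (out_features, in_features) weight delta per expert,
    # i.e. expert.weight - base.weight
    gate_rows = []
    for dw in dw_list:
        # right-singular vectors of the delta; v has shape (in_features, k)
        _, _, vh = torch.linalg.svd(dw.float(), full_matrices=False)
        v = vh.T
        gate_rows.append(v[:, :rank_of_router].T)  # (rank_of_router, in_features)
    # stack experts and flatten to (num_experts * rank_of_router, in_features),
    # matching the shape loaded into the SmileLinear gate above
    return (
        torch.stack(gate_rows, dim=0)
        .reshape(len(dw_list) * rank_of_router, -1)
        .contiguous()
    )

# toy usage: two "experts" over a 16 -> 8 linear layer, router rank 4
base = torch.randn(8, 16)
experts = [base + 0.01 * torch.randn(8, 16) for _ in range(2)]
gate = router_weight_from_deltas([e - base for e in experts], rank_of_router=4)
print(gate.shape)  # torch.Size([8, 16])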
fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py (new file)

@@ -0,0 +1,229 @@
+ import logging
+ import os
+ from copy import deepcopy
+ from typing import TYPE_CHECKING, Dict, List, Tuple
+
+ import torch
+ from accelerate import init_empty_weights
+ from tqdm.auto import tqdm
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     Qwen2ForCausalLM,
+ )
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
+
+ from fusion_bench import BaseAlgorithm, BaseModelPool
+ from fusion_bench.compat.modelpool import to_modelpool
+ from fusion_bench.mixins import SimpleProfilerMixin
+ from fusion_bench.models.modeling_smile_qwen2 import (
+     SmileQwen2Config,
+     SmileQwen2ForCausalLM,
+ )
+ from fusion_bench.models.modeling_smile_qwen2.modeling_smile_qwen2 import (
+     SmileQwen2DecoderLayer,
+ )
+ from fusion_bench.models.smile_moe.linear_from_hf_config import (
+     ExpertNotTrainedError,
+     upscale_to_smile_linear,
+ )
+ from fusion_bench.utils.dtype import parse_dtype
+ from fusion_bench.utils.parameters import print_parameters
+
+ log = logging.getLogger(__name__)
+
+
+ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
+     R"""
+     SmileQwen2UpscalingAlgorithm is a model fusion algorithm designed to upscale
+     a pretrained Qwen2 model using a set of fine-tuned expert models. The algorithm
+     leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
+     model and the expert models into a new upscaled model.
+
+     Methods:
+         run(modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
+             Executes the upscaling process and returns the upscaled model.
+
+         merge(pretrained_model: Qwen2ForCausalLM, finetuned_models: List[Qwen2ForCausalLM]) -> SmileQwen2ForCausalLM:
+             Merges the pretrained model with the fine-tuned models to create an upscaled model.
+     """
+
+     _config_mapping = BaseAlgorithm._config_mapping | {
+         "device": "device",
+         "accelerator": "accelerator",
+         "model_path": "model_path",
+         "model_dtype": "model_dtype",
+         "num_experts_per_tok": "num_experts_per_tok",
+         "rank_of_router": "rank_of_router",
+         "rank_of_expert": "rank_of_expert",
+     }
+
+     def __init__(
+         self,
+         device,
+         accelerator,
+         model_path,
+         model_dtype,
+         num_experts_per_tok,
+         rank_of_router,
+         rank_of_expert,
+         **kwargs,
+     ):
+         self.device = device
+         self.accelerator = accelerator
+         self.model_path = model_path
+         self.model_dtype = model_dtype
+         # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
+         self.num_experts_per_tok = num_experts_per_tok
+         self.rank_of_router = rank_of_router
+         self.rank_of_expert = rank_of_expert
+         super().__init__(**kwargs)
+
+     @torch.no_grad()
+     def run(self, modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
+         """
+         Executes the upscaling process.
+
+         Args:
+             modelpool (ModelPool): The pool of models to be used for upscaling.
+
+         Returns:
+             SmileQwen2ForCausalLM: The upscaled model.
+         """
+         self.modelpool = modelpool = to_modelpool(modelpool)
+         config = self.config
+
+         # load model from path if provided and return directly
+         if config.model_path is not None and os.path.exists(config.model_path):
+             log.info(f"Loading model from {config.model_path}")
+             model = AutoModelForCausalLM.from_pretrained(config.model_path)
+             print_parameters(model)
+             return model
+
+         with self.profile("load pretrained model"):
+             pretrained_model = modelpool.load_pretrained_model()
+         with self.profile("load fine-tuned model"):
+             finetuned_models = [
+                 m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
+             ]
+
+         if config.device == "cuda" and torch.cuda.is_available():
+             pretrained_model = pretrained_model.cuda()
+             print("parameter count of pretrained model:")
+             print_parameters(pretrained_model)
+             finetuned_models = [m.cuda() for m in finetuned_models]
+
+         with self.profile("merge model"):
+             model = self.merge(pretrained_model, finetuned_models)
+
+         self.print_profile_summary()
+         print("parameter count of upscaled MoE model:")
+         print_parameters(model)
+         print(model)
+
+         if config.model_dtype is not None:
+             model.to(dtype=parse_dtype(config.model_dtype))
+
+         if config.model_path is not None:
+             if os.path.dirname(config.model_path):
+                 os.makedirs(os.path.dirname(config.model_path), exist_ok=True)
+             log.info(f"Saving model to {config.model_path}")
+             pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
+             pretrained_path = pretrained_model_config.get(
+                 "path", pretrained_model_config["pretrained_model_name_or_path"]
+             )
+             tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
+             tokenizer.save_pretrained(config.model_path)
+             model.save_pretrained(config.model_path)
+
+         return model
+
+     def merge(
+         self,
+         pretrained_model: Qwen2ForCausalLM,
+         finetuned_models: List[Qwen2ForCausalLM],
+     ):
+         """
+         Merges the pretrained model with the fine-tuned models to create an upscaled model.
+
+         Args:
+             pretrained_model (Qwen2ForCausalLM): The pretrained model.
+             finetuned_models (List[Qwen2ForCausalLM]): A list of fine-tuned models.
+
+         Returns:
+             SmileQwen2ForCausalLM: The upscaled model.
+         """
+         config = self.config
+
+         with init_empty_weights():
+             pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
+             pretrained_path = pretrained_model_config.get(
+                 "path", pretrained_model_config["pretrained_model_name_or_path"]
+             )
+             base_config = AutoConfig.from_pretrained(pretrained_path)
+             model_config = SmileQwen2Config(
+                 num_experts_per_tok=config.num_experts_per_tok,
+                 rank_of_router=config.rank_of_router,
+                 rank_of_expert=config.rank_of_expert,
+                 num_local_experts=len(finetuned_models),
+                 **base_config.to_dict(),
+             )
+             model = SmileQwen2ForCausalLM(model_config)
+
+         model.to(dtype=pretrained_model.dtype).to_empty(device="cpu")
+
+         # copy pretrained model weights
+         state_dict = model.state_dict()
+         pretrained_state_dict = dict(pretrained_model.state_dict())
+         for key in list(pretrained_state_dict.keys()):
+             if key not in state_dict:
+                 pretrained_state_dict.pop(key)
+         model.load_state_dict(pretrained_state_dict, strict=False)
+
+         # upscale model
+         for layer_idx in tqdm(
+             range(len(pretrained_model.model.layers)),
+             "Upscaling Modules (layer)",
+             dynamic_ncols=True,
+         ):
+             pretrained_layer: Qwen2DecoderLayer = pretrained_model.model.layers[
+                 layer_idx
+             ]
+             finetuned_layers: List[Qwen2DecoderLayer] = [
+                 m.model.layers[layer_idx] for m in finetuned_models
+             ]
+
+             target_layer: SmileQwen2DecoderLayer = model.model.layers[layer_idx]
+
+             for n in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+                 try:
+                     upscale_to_smile_linear(
+                         base=getattr(pretrained_layer.self_attn, n),
+                         experts=[getattr(m.self_attn, n) for m in finetuned_layers],
+                         target=getattr(target_layer.self_attn, n),
+                         accelerator=config.accelerator,
+                     )
+                 except ExpertNotTrainedError:
+                     setattr(
+                         target_layer.self_attn,
+                         n,
+                         getattr(pretrained_layer.self_attn, n),
+                     )
+
+             for n in ["gate_proj", "up_proj", "down_proj"]:
+                 try:
+                     upscale_to_smile_linear(
+                         base=getattr(pretrained_layer.mlp, n),
+                         experts=[getattr(m.mlp, n) for m in finetuned_layers],
+                         target=getattr(target_layer.mlp, n),
+                         accelerator=config.accelerator,
+                     )
+                 except ExpertNotTrainedError:
+                     setattr(
+                         target_layer.mlp,
+                         n,
+                         getattr(pretrained_layer.mlp, n),
+                     )
+
+         return model
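
End to end, the new algorithm follows the same driver pattern as the Mistral version: instantiate it with the SmileMoE hyperparameters listed in __init__ above and pass a model pool whose "_pretrained_" entry is the base Qwen2 model and whose remaining entries are the fine-tuned experts (e.g., the pool described by the new qwen2_math_1.5B_and_R1.yaml config; the shipped smile_qwen2_upscaling.yaml presumably wires the same fields through the config system). A hedged sketch, assuming such a pool object named modelpool has already been constructed elsewhere and using illustrative hyperparameter values rather than the shipped defaults:

from fusion_bench.method.smile_upscaling.smile_qwen2_upscaling import (
    SmileQwen2UpscalingAlgorithm,
)

# constructor arguments mirror the __init__ defined in the new file above;
# the concrete values here are illustrative assumptions
algorithm = SmileQwen2UpscalingAlgorithm(
    device="cuda",
    accelerator="cuda",
    model_path=None,        # set a path to also save the merged model and tokenizer
    model_dtype="bfloat16",
    num_experts_per_tok=1,
    rank_of_router=8,
    rank_of_expert=512,
)
upscaled_model = algorithm.run(modelpool)  # returns a SmileQwen2ForCausalLM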