fusion-bench 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/method/ada_svd/clip_vision.py +4 -1
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +46 -145
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +229 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +6 -336
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +2 -203
- fusion_bench/models/modeling_smile_qwen2/__init__.py +8 -0
- fusion_bench/models/modeling_smile_qwen2/configuration_smile_qwen2.py +21 -0
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +922 -0
- fusion_bench/models/modeling_smile_qwen2/register.py +11 -0
- fusion_bench/models/rankone_moe.py +2 -88
- fusion_bench/models/smile_moe/linear_from_hf_config.py +373 -0
- fusion_bench/models/smile_moe/{linear.py → linear_from_module.py} +103 -33
- fusion_bench/models/smile_moe/utils/__init__.py +24 -0
- fusion_bench/models/smile_moe/utils/svd_utils.py +46 -0
- fusion_bench/taskpool/__init__.py +2 -0
- fusion_bench/taskpool/lm_eval_harness/__init__.py +3 -0
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +87 -0
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/METADATA +22 -2
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/RECORD +27 -14
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/WHEEL +1 -1
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +5 -2
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +13 -0
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +17 -0
- fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +12 -0
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.13.dist-info → fusion_bench-0.2.14.dist-info}/top_level.txt +0 -0
|
@@ -31,7 +31,10 @@ from fusion_bench.method import WeightedAverageAlgorithm
|
|
|
31
31
|
from fusion_bench.method.simple_average import simple_average
|
|
32
32
|
from fusion_bench.mixins import SimpleProfilerMixin
|
|
33
33
|
from fusion_bench.modelpool import CLIPVisionModelPool
|
|
34
|
-
from fusion_bench.models.smile_moe.
|
|
34
|
+
from fusion_bench.models.smile_moe.linear_from_module import (
|
|
35
|
+
ExpertNotTrainedError,
|
|
36
|
+
SmileMoELinear,
|
|
37
|
+
)
|
|
35
38
|
from fusion_bench.models.utils import find_layers_with_type, get_attr, set_attr
|
|
36
39
|
from fusion_bench.utils.devices import get_device
|
|
37
40
|
|
|
@@ -9,11 +9,16 @@ import torch.nn.functional as F
|
|
|
9
9
|
from accelerate import init_empty_weights
|
|
10
10
|
from torch import Tensor, nn
|
|
11
11
|
from tqdm.auto import tqdm
|
|
12
|
-
from transformers import
|
|
12
|
+
from transformers import (
|
|
13
|
+
AutoConfig,
|
|
14
|
+
AutoModelForCausalLM,
|
|
15
|
+
AutoTokenizer,
|
|
16
|
+
MistralForCausalLM,
|
|
17
|
+
)
|
|
13
18
|
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
|
|
14
19
|
|
|
15
|
-
from fusion_bench.compat.method import ModelFusionAlgorithm
|
|
16
20
|
from fusion_bench.compat.modelpool import to_modelpool
|
|
21
|
+
from fusion_bench.method import BaseAlgorithm
|
|
17
22
|
from fusion_bench.method.simple_average import simple_average
|
|
18
23
|
from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
|
|
19
24
|
from fusion_bench.modelpool import BaseModelPool
|
|
@@ -25,158 +30,23 @@ from fusion_bench.models.modeling_smile_mistral.modeling_smile_mistral import (
|
|
|
25
30
|
SmileLinear,
|
|
26
31
|
SmileMistralDecoderLayer,
|
|
27
32
|
)
|
|
28
|
-
from fusion_bench.models.
|
|
33
|
+
from fusion_bench.models.smile_moe.linear_from_hf_config import (
|
|
34
|
+
ExpertNotTrainedError,
|
|
35
|
+
upscale_to_smile_linear,
|
|
36
|
+
)
|
|
29
37
|
from fusion_bench.utils.dtype import parse_dtype
|
|
30
38
|
from fusion_bench.utils.parameters import print_parameters
|
|
31
|
-
from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
|
|
32
39
|
|
|
33
40
|
log = logging.getLogger(__name__)
|
|
34
41
|
|
|
35
42
|
|
|
36
|
-
class ExpertNotTrainedError(Exception):
    """
    Raised when the expert layers carry no fine-tuning signal.

    `upscale_to_smile_linear` raises this when the weight difference between
    every expert and the base layer is all zeros, so there is nothing to
    decompose with SVD.
    """
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
|
|
41
|
-
"""
|
|
42
|
-
Check if a tensor or a list of tensors are all zeros.
|
|
43
|
-
|
|
44
|
-
Args:
|
|
45
|
-
tensor (Tensor | List[Tensor]): The tensor or list of tensors to check.
|
|
46
|
-
|
|
47
|
-
Returns:
|
|
48
|
-
bool: True if all elements are zeros, False otherwise.
|
|
49
|
-
"""
|
|
50
|
-
if isinstance(tensor, Tensor):
|
|
51
|
-
return torch.allclose(tensor, torch.zeros_like(tensor))
|
|
52
|
-
else:
|
|
53
|
-
return all(_is_all_zeros(t) for t in tensor)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _svd(w: Tensor, full_matrices=False) -> Tuple[Tensor, Tensor, Tensor]:
|
|
57
|
-
"""
|
|
58
|
-
Perform Singular Value Decomposition (SVD) on a tensor.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
w (Tensor): The input tensor.
|
|
62
|
-
full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to False.
|
|
63
|
-
|
|
64
|
-
Returns:
|
|
65
|
-
Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
|
|
66
|
-
"""
|
|
67
|
-
device = w.device
|
|
68
|
-
if w.device != torch.float32 or w.device != torch.float64:
|
|
69
|
-
w = w.float()
|
|
70
|
-
|
|
71
|
-
u, s, vh = torch.linalg.svd(
|
|
72
|
-
w,
|
|
73
|
-
full_matrices=full_matrices,
|
|
74
|
-
# driver="gesvd" if w.is_cuda else None
|
|
75
|
-
)
|
|
76
|
-
v = vh.T
|
|
77
|
-
|
|
78
|
-
u = u.to(device)
|
|
79
|
-
s = s.to(device)
|
|
80
|
-
v = v.to(device)
|
|
81
|
-
return u, s, v
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def svd(
    w: Tensor, full_matrices=True, accelerator=None
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Perform SVD on a tensor with optional acceleration.

    Args:
        w (Tensor): The input tensor.
        full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to True.
            Honoured on both the plain and the accelerated path.
        accelerator (optional): The device to perform the computation on. Defaults to None,
            in which case the decomposition runs on the device ``w`` is already on.

    Returns:
        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD,
            located on the device ``w`` was on when passed in.
    """
    if accelerator is None:
        return _svd(w, full_matrices=full_matrices)

    original_device = w.device
    # Previously `full_matrices` was dropped on this path (the helper's default
    # was used instead) and the factors stayed on the accelerator even though
    # the original device had been recorded. Forward the flag and move back.
    u, s, v = _svd(w.to(accelerator), full_matrices=full_matrices)
    return u.to(original_device), s.to(original_device), v.to(original_device)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
@torch.no_grad()
def upscale_to_smile_linear(
    base: nn.Linear, experts: List[nn.Linear], target: SmileLinear, accelerator=None
):
    """
    Upscale a base linear layer to a SmileLinear layer using expert models.

    The weight difference between each expert and the base layer is decomposed
    with SVD. The leading right-singular vectors of every expert form the
    router (gate) weight, the base layer is copied into the shared linear, and
    each target expert receives either a rank-truncated factorisation of its
    difference or, when ``target.rank_of_expert`` is not positive, the result
    of ``state_dict_sub(expert, base)``.

    Args:
        base (nn.Linear): The base linear layer.
        experts (List[nn.Linear]): A list of expert linear layers.
        target (SmileLinear): The target SmileLinear layer. Its ``rank_of_router``,
            ``rank_of_expert`` and ``num_local_experts`` attributes drive the shapes
            loaded into ``gate``, ``shared_linear`` and ``experts``.
        accelerator (optional): The device to perform the computation on. Defaults to None.

    Returns:
        SmileLinear: The upscaled SmileLinear layer (``target``, modified in place).

    Raises:
        ExpertNotTrainedError: If every expert weight equals the base weight.
    """
    base_weight = base.weight
    dw_list = [expert.weight - base_weight for expert in experts]

    if _is_all_zeros(dw_list):
        raise ExpertNotTrainedError("Expert models are not trained")

    rank_of_router = target.rank_of_router
    rank_of_expert = target.rank_of_expert
    num_local_experts = target.num_local_experts
    svd_list = [svd(dw, accelerator=accelerator) for dw in dw_list]

    # gate: stack the first `rank_of_router` right-singular vectors of each
    # expert (one row per vector) and flatten the expert axis into the rows.
    gate_weight = (
        torch.stack([v[:, :rank_of_router].T for _, _, v in svd_list], dim=0)
        .reshape(num_local_experts * rank_of_router, -1)
        .contiguous()
    )
    target.gate.load_state_dict({"weight": gate_weight})

    # shared linear: an exact copy of the base layer
    target.shared_linear.load_state_dict(base.state_dict())

    # experts
    if rank_of_expert > 0:
        for expert_idx, target_expert in enumerate(target.experts):
            u, s, v = svd_list[expert_idx]
            u = u[:, :rank_of_expert]
            s = s[:rank_of_expert]
            v = v[:, :rank_of_expert]
            # `s * v` scales each kept column of V by its singular value.
            state_dict = {"u": u, "svh": (s * v).T}
            # NOTE(review): this branch stores the expert's full bias, whereas
            # the branch below goes through state_dict_sub (presumably a
            # difference to the base) — confirm which one SmileLinear expects.
            if experts[expert_idx].bias is not None:
                state_dict["bias"] = experts[expert_idx].bias.data
            target_expert.load_state_dict(state_dict)
    else:
        for expert_idx, target_expert in enumerate(target.experts):
            target_expert.load_state_dict(
                state_dict_sub(experts[expert_idx].state_dict(), base.state_dict())
            )

    return target
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
|
|
43
|
+
class SmileMistralUpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
|
|
170
44
|
R"""
|
|
171
45
|
SmileMistralUpscalingAlgorithm is a model fusion algorithm designed to upscale
|
|
172
46
|
a pretrained Mistral model using a set of fine-tuned expert models. The algorithm
|
|
173
47
|
leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
|
|
174
48
|
model and the expert models into a new upscaled model.
|
|
175
49
|
|
|
176
|
-
Attributes:
|
|
177
|
-
modelpool (BaseModelPool): The pool of models to be used for upscaling.
|
|
178
|
-
config (dict): Configuration parameters for the upscaling process.
|
|
179
|
-
|
|
180
50
|
Methods:
|
|
181
51
|
run(modelpool: BaseModelPool) -> SmileMistralForCausalLM:
|
|
182
52
|
Executes the upscaling process and returns the upscaled model.
|
|
@@ -185,6 +55,37 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
|
|
|
185
55
|
Merges the pretrained model with the fine-tuned models to create an upscaled model.
|
|
186
56
|
"""
|
|
187
57
|
|
|
58
|
+
_config_mapping = BaseAlgorithm._config_mapping | {
|
|
59
|
+
"device": "device",
|
|
60
|
+
"accelerator": "accelerator",
|
|
61
|
+
"model_path": "model_path",
|
|
62
|
+
"model_dtype": "model_dtype",
|
|
63
|
+
"num_experts_per_tok": "num_experts_per_tok",
|
|
64
|
+
"rank_of_router": "rank_of_router",
|
|
65
|
+
"rank_of_expert": "rank_of_expert",
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
device,
|
|
71
|
+
accelerator,
|
|
72
|
+
model_path,
|
|
73
|
+
model_dtype,
|
|
74
|
+
num_experts_per_tok,
|
|
75
|
+
rank_of_router,
|
|
76
|
+
rank_of_expert,
|
|
77
|
+
**kwargs,
|
|
78
|
+
):
|
|
79
|
+
self.device = device
|
|
80
|
+
self.accelerator = accelerator
|
|
81
|
+
self.model_path = model_path
|
|
82
|
+
self.model_dtype = model_dtype
|
|
83
|
+
# SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
|
|
84
|
+
self.num_experts_per_tok = num_experts_per_tok
|
|
85
|
+
self.rank_of_router = rank_of_router
|
|
86
|
+
self.rank_of_expert = rank_of_expert
|
|
87
|
+
super().__init__(**kwargs)
|
|
88
|
+
|
|
188
89
|
@torch.no_grad()
|
|
189
90
|
def run(self, modelpool: BaseModelPool) -> SmileMistralForCausalLM:
|
|
190
91
|
"""
|
|
@@ -199,15 +100,15 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
|
|
|
199
100
|
self.modelpool = modelpool = to_modelpool(modelpool)
|
|
200
101
|
config = self.config
|
|
201
102
|
|
|
202
|
-
|
|
103
|
+
# load model from path if provided and return directly
|
|
203
104
|
if config.model_path is not None and os.path.exists(config.model_path):
|
|
204
105
|
log.info(f"Loading model from {config.model_path}")
|
|
205
|
-
model =
|
|
106
|
+
model = AutoModelForCausalLM.from_pretrained(config.model_path)
|
|
206
107
|
print_parameters(model)
|
|
207
108
|
return model
|
|
208
109
|
|
|
209
110
|
with self.profile("load pretrained model"):
|
|
210
|
-
pretrained_model = modelpool.
|
|
111
|
+
pretrained_model = modelpool.load_pretrained_model()
|
|
211
112
|
with self.profile("load fine-tuned model"):
|
|
212
113
|
finetuned_models = [
|
|
213
114
|
m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from typing import TYPE_CHECKING, Dict, List, Tuple
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
from accelerate import init_empty_weights
|
|
8
|
+
from tqdm.auto import tqdm
|
|
9
|
+
from transformers import (
|
|
10
|
+
AutoConfig,
|
|
11
|
+
AutoModelForCausalLM,
|
|
12
|
+
AutoTokenizer,
|
|
13
|
+
Qwen2ForCausalLM,
|
|
14
|
+
)
|
|
15
|
+
from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
|
|
16
|
+
|
|
17
|
+
from fusion_bench import BaseAlgorithm, BaseModelPool
|
|
18
|
+
from fusion_bench.compat.modelpool import to_modelpool
|
|
19
|
+
from fusion_bench.mixins import SimpleProfilerMixin
|
|
20
|
+
from fusion_bench.models.modeling_smile_qwen2 import (
|
|
21
|
+
SmileQwen2Config,
|
|
22
|
+
SmileQwen2ForCausalLM,
|
|
23
|
+
)
|
|
24
|
+
from fusion_bench.models.modeling_smile_qwen2.modeling_smile_qwen2 import (
|
|
25
|
+
SmileQwen2DecoderLayer,
|
|
26
|
+
)
|
|
27
|
+
from fusion_bench.models.smile_moe.linear_from_hf_config import (
|
|
28
|
+
ExpertNotTrainedError,
|
|
29
|
+
upscale_to_smile_linear,
|
|
30
|
+
)
|
|
31
|
+
from fusion_bench.utils.dtype import parse_dtype
|
|
32
|
+
from fusion_bench.utils.parameters import print_parameters
|
|
33
|
+
|
|
34
|
+
log = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
    R"""
    SmileQwen2UpscalingAlgorithm is a model fusion algorithm designed to upscale
    a pretrained Qwen2 model using a set of fine-tuned expert models. The algorithm
    leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
    model and the expert models into a new upscaled model.

    Methods:
        run(modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
            Executes the upscaling process and returns the upscaled model.

        merge(pretrained_model: Qwen2ForCausalLM, finetuned_models: List[Qwen2ForCausalLM]) -> SmileQwen2ForCausalLM:
            Merges the pretrained model with the fine-tuned models to create an upscaled model.
    """

    _config_mapping = BaseAlgorithm._config_mapping | {
        "device": "device",
        "accelerator": "accelerator",
        "model_path": "model_path",
        "model_dtype": "model_dtype",
        "num_experts_per_tok": "num_experts_per_tok",
        "rank_of_router": "rank_of_router",
        "rank_of_expert": "rank_of_expert",
    }

    def __init__(
        self,
        device,
        accelerator,
        model_path,
        model_dtype,
        num_experts_per_tok,
        rank_of_router,
        rank_of_expert,
        **kwargs,
    ):
        """
        Store the upscaling options.

        Args:
            device: When equal to ``"cuda"`` (and CUDA is available) the loaded
                models are moved to the GPU before merging.
            accelerator: Forwarded to ``upscale_to_smile_linear`` for every
                upscaled projection.
            model_path: If not None and the path exists, ``run`` loads the model
                from it and returns; otherwise the upscaled model and tokenizer
                are saved there.
            model_dtype: If not None, parsed with ``parse_dtype`` and applied to
                the upscaled model.
            num_experts_per_tok: Forwarded to ``SmileQwen2Config``.
            rank_of_router: Forwarded to ``SmileQwen2Config``.
            rank_of_expert: Forwarded to ``SmileQwen2Config``.
            **kwargs: Passed on to the parent initialiser.
        """
        self.device = device
        self.accelerator = accelerator
        self.model_path = model_path
        self.model_dtype = model_dtype
        # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
        self.num_experts_per_tok = num_experts_per_tok
        self.rank_of_router = rank_of_router
        self.rank_of_expert = rank_of_expert
        super().__init__(**kwargs)

    @torch.no_grad()
    def run(self, modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
        """
        Executes the upscaling process.

        Args:
            modelpool (ModelPool): The pool of models to be used for upscaling.

        Returns:
            SmileQwen2ForCausalLM: The upscaled model.
        """
        self.modelpool = modelpool = to_modelpool(modelpool)
        config = self.config

        # load model from path if provided and return directly
        if config.model_path is not None and os.path.exists(config.model_path):
            log.info(f"Loading model from {config.model_path}")
            model = AutoModelForCausalLM.from_pretrained(config.model_path)
            print_parameters(model)
            return model

        with self.profile("load pretrained model"):
            pretrained_model = modelpool.load_pretrained_model()
        with self.profile("load fine-tuned model"):
            finetuned_models = list(
                tqdm(modelpool.models(), total=len(modelpool.model_names))
            )

        if config.device == "cuda" and torch.cuda.is_available():
            pretrained_model = pretrained_model.cuda()
            print("parameter count of pretrained model:")
            print_parameters(pretrained_model)
            finetuned_models = [m.cuda() for m in finetuned_models]

        with self.profile("merge model"):
            model = self.merge(pretrained_model, finetuned_models)

        self.print_profile_summary()
        print("parameter count of upscaled MoE model:")
        print_parameters(model)
        print(model)

        if config.model_dtype is not None:
            model.to(dtype=parse_dtype(config.model_dtype))

        if config.model_path is not None:
            # Only create a parent directory when the path actually has one.
            parent_dir = os.path.dirname(config.model_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            log.info(f"Saving model to {config.model_path}")
            # The tokenizer of the pretrained model is saved next to the weights.
            tokenizer = AutoTokenizer.from_pretrained(self._pretrained_model_path())
            tokenizer.save_pretrained(config.model_path)
            model.save_pretrained(config.model_path)

        return model

    def merge(
        self,
        pretrained_model: Qwen2ForCausalLM,
        finetuned_models: List[Qwen2ForCausalLM],
    ) -> SmileQwen2ForCausalLM:
        """
        Merges the pretrained model with the fine-tuned models to create an upscaled model.

        The target model is built from the pretrained model's Hugging Face
        config plus the SmileMoE options, filled with every pretrained weight
        whose key also exists in the target, and then each attention and MLP
        projection of every decoder layer is upscaled. A projection whose
        experts are all identical to the pretrained one is replaced by the
        pretrained layer's own module object (shared, not copied).

        Args:
            pretrained_model (Qwen2ForCausalLM): The pretrained model.
            finetuned_models (List[Qwen2ForCausalLM]): A list of fine-tuned models.

        Returns:
            SmileQwen2ForCausalLM: The upscaled model.
        """
        config = self.config

        base_config = AutoConfig.from_pretrained(self._pretrained_model_path())
        model_config = SmileQwen2Config(
            num_experts_per_tok=config.num_experts_per_tok,
            rank_of_router=config.rank_of_router,
            rank_of_expert=config.rank_of_expert,
            num_local_experts=len(finetuned_models),
            **base_config.to_dict(),
        )
        # Build the target without allocating real storage for its parameters.
        with init_empty_weights():
            model = SmileQwen2ForCausalLM(model_config)

        model.to(dtype=pretrained_model.dtype).to_empty(device="cpu")

        # copy pretrained model weights (only keys the upscaled model also has)
        target_state_dict = model.state_dict()
        shared_state_dict = {
            key: value
            for key, value in pretrained_model.state_dict().items()
            if key in target_state_dict
        }
        model.load_state_dict(shared_state_dict, strict=False)

        # upscale model
        accelerator = config.accelerator
        for layer_idx in tqdm(
            range(len(pretrained_model.model.layers)),
            "Upscaling Modules (layer)",
            dynamic_ncols=True,
        ):
            pretrained_layer: Qwen2DecoderLayer = pretrained_model.model.layers[
                layer_idx
            ]
            finetuned_layers: List[Qwen2DecoderLayer] = [
                m.model.layers[layer_idx] for m in finetuned_models
            ]
            target_layer: SmileQwen2DecoderLayer = model.model.layers[layer_idx]

            self._upscale_projections(
                ["q_proj", "k_proj", "v_proj", "o_proj"],
                base_module=pretrained_layer.self_attn,
                expert_modules=[layer.self_attn for layer in finetuned_layers],
                target_module=target_layer.self_attn,
                accelerator=accelerator,
            )
            self._upscale_projections(
                ["gate_proj", "up_proj", "down_proj"],
                base_module=pretrained_layer.mlp,
                expert_modules=[layer.mlp for layer in finetuned_layers],
                target_module=target_layer.mlp,
                accelerator=accelerator,
            )

        return model

    def _pretrained_model_path(self):
        """Return the location of the pretrained model from the pool's `_pretrained_` config."""
        pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
        # Look the fallback key up lazily: passing it as the default of `.get`
        # evaluated it eagerly and raised KeyError for configs that only
        # define "path".
        if "path" in pretrained_model_config:
            return pretrained_model_config["path"]
        return pretrained_model_config["pretrained_model_name_or_path"]

    @staticmethod
    def _upscale_projections(
        names, base_module, expert_modules, target_module, accelerator
    ):
        """Upscale each named linear of `target_module`, falling back to the pretrained linear when the experts are untrained."""
        for name in names:
            base_linear = getattr(base_module, name)
            expert_linears = [getattr(m, name) for m in expert_modules]
            target_linear = getattr(target_module, name)
            try:
                upscale_to_smile_linear(
                    base=base_linear,
                    experts=expert_linears,
                    target=target_linear,
                    accelerator=accelerator,
                )
            except ExpertNotTrainedError:
                # No expert differs from the pretrained weights: keep the
                # plain pretrained linear instead of a SmileMoE layer.
                setattr(target_module, name, base_linear)
|