fusion-bench 0.2.25__py3-none-any.whl → 0.2.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/method/__init__.py +2 -0
- fusion_bench/method/classification/clip_finetune.py +6 -4
- fusion_bench/method/dop/__init__.py +1 -0
- fusion_bench/method/dop/dop.py +366 -0
- fusion_bench/method/dop/min_norm_solvers.py +227 -0
- fusion_bench/method/dop/utils.py +73 -0
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/METADATA +8 -2
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/RECORD +34 -29
- fusion_bench_config/method/bitdelta/bitdelta.yaml +3 -0
- fusion_bench_config/method/depth_upscaling.yaml +9 -0
- fusion_bench_config/method/dop/dop.yaml +30 -0
- fusion_bench_config/method/dummy.yaml +6 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +6 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +8 -1
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +8 -0
- fusion_bench_config/method/linear/weighted_average.yaml +3 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +1 -1
- fusion_bench_config/method/model_recombination.yaml +8 -0
- fusion_bench_config/method/model_stock/model_stock.yaml +4 -1
- fusion_bench_config/method/opcm/opcm.yaml +5 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +5 -0
- fusion_bench_config/method/opcm/weight_average.yaml +5 -0
- fusion_bench_config/method/simple_average.yaml +9 -0
- fusion_bench_config/method/slerp/slerp.yaml +9 -0
- fusion_bench_config/method/slerp/slerp_lm.yaml +5 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +3 -0
- fusion_bench_config/method/task_arithmetic.yaml +9 -0
- fusion_bench_config/method/ties_merging.yaml +3 -0
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/top_level.txt +0 -0
fusion_bench/method/__init__.py
CHANGED
@@ -70,6 +70,7 @@ _import_structure = {
         "IsotropicMergingInCommonSubspace",
     ],
     "opcm": ["OPCMForCLIP"],
+    "dop": ["ContinualDOPForCLIP"],
     "gossip": [
         "CLIPLayerWiseGossipAlgorithm",
         "CLIPTaskWiseGossipAlgorithm",
@@ -212,6 +213,7 @@ if TYPE_CHECKING:
     from .model_recombination import ModelRecombinationAlgorithm
     from .model_stock import ModelStock
     from .opcm import OPCMForCLIP
+    from .dop import ContinualDOPForCLIP
     from .pruning import (
         MagnitudeDiffPruningAlgorithm,
         MagnitudePruningForLlama,
fusion_bench/method/classification/clip_finetune.py
CHANGED
@@ -5,8 +5,8 @@ Fine-tune CLIP-ViT-B/32:
 
 ```bash
 fusion_bench \
-    method=clip_finetune \
-    modelpool=clip-vit-base-patch32_mtl \
+    method=classification/clip_finetune \
+    modelpool=CLIPVisionModelPool/clip-vit-base-patch32_mtl \
     taskpool=dummy
 ```
 
@@ -15,12 +15,14 @@ Fine-tune CLIP-ViT-L/14 on eight GPUs with a per-device per-task batch size of 2
 ```bash
 fusion_bench \
     fabric.devices=8 \
-    method=clip_finetune \
+    method=classification/clip_finetune \
     method.batch_size=2 \
-    modelpool=clip-vit-base-patch32_mtl \
+    modelpool=CLIPVisionModelPool/clip-vit-base-patch32_mtl \
     modelpool.models.0.path=openai/clip-vit-large-patch14 \
     taskpool=dummy
 ```
+
+See `examples/clip_finetune` for more details.
 """
 
 import os
fusion_bench/method/dop/__init__.py
ADDED
@@ -0,0 +1 @@
+from .dop import ContinualDOPForCLIP
fusion_bench/method/dop/dop.py
ADDED
@@ -0,0 +1,366 @@
+"""
+Continual Model Merging without Data: Dual Projections for Balancing Stability and Plasticity. NeurIPS, 2025.
+
+
+Example:
+
+    fusion_bench \
+        method=dop/dop \
+        modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only \
+        taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TA8
+"""
+
+import logging
+import os
+import random
+from copy import deepcopy
+from pathlib import Path
+from typing import Dict, List, Literal, Optional, Tuple, cast
+
+import lightning as L
+import numpy as np
+import torch
+from omegaconf import DictConfig
+from torch import Tensor, nn
+from torch.autograd import Variable
+from tqdm.auto import tqdm
+from transformers import CLIPVisionModel
+
+from fusion_bench import BaseAlgorithm, BaseModelPool, auto_register_config
+from fusion_bench.method.simple_average import simple_average
+from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.taskpool import CLIPVisionModelTaskPool
+from fusion_bench.utils import seed_everything_by_time
+from fusion_bench.utils.json import save_to_json
+
+from .min_norm_solvers import MinNormSolver, gradient_normalizers
+from .utils import is_leaf_module, svd
+
+log = logging.getLogger(__name__)
+
+
+@auto_register_config
+class ContinualDOPForCLIP(BaseAlgorithm, LightningFabricMixin):
+
+    def __init__(
+        self,
+        seed: Optional[int] = None,
+        shuffle_order: bool = False,
+        save_on_every_step: bool = True,
+        evaluate_on_every_step: bool = False,
+        lr: float = 1e-4,
+        num_steps: int = 200,
+        mgda: bool = True,
+        ema: bool = True,
+        ema_beta: float = 0.99,
+        alpha: float = None,
+        svd_epsilon: float = 1.0,
+        svd_proj_space: str = "uv",
+        **kwargs,
+    ):
+        self.lr = lr
+        self.num_steps = num_steps
+        self.mgda = mgda
+        self.ema = ema
+        self.ema_beta = ema_beta
+        self.alpha = alpha
+        self.svd_epsilon = svd_epsilon
+        self.svd_proj_space = svd_proj_space
+        self.seed = seed
+        self.shuffle_order = shuffle_order
+        self.save_on_every_step = save_on_every_step
+        self.evaluate_on_every_step = evaluate_on_every_step
+
+        assert (
+            self.svd_epsilon >= 0 and self.svd_epsilon <= 1
+        ), "The svd_epsilon should be in the range of [0, 1]"
+        assert (
+            self.alpha >= 0 and self.alpha <= 1
+        ), "The alpha should be in the range of [0, 1]"
+        super().__init__(**kwargs)
+
+    def print_params(self, pretrained_model):
+        total_params = 0
+        linear_params = 0
+        linear_weight_params = 0
+        for module_name, module in pretrained_model.named_modules():
+            if not is_leaf_module(module):
+                continue
+            if isinstance(module, nn.Linear):
+                linear_params += sum(p.numel() for n, p in module.named_parameters())
+                linear_weight_params += sum(
+                    p.numel() for n, p in module.named_parameters() if "weight" in n
+                )
+            total_params += sum(p.numel() for p in module.parameters())
+
+        linear_ratio = linear_params / total_params * 100
+        linear_weight_ratio = linear_weight_params / total_params * 100
+        print(f"Total Parameters: {total_params}")
+        print(f"Linear Parameters: {linear_params}")
+        print(f"Linear Weight Parameters: {linear_weight_params}")
+        print(f"Linear Ratio: {linear_ratio:.2f}%")
+        print(f"Linear Weight Ratio: {linear_weight_ratio:.2f}%")
+
+    def run(self, modelpool: BaseModelPool):
+        if self.seed is not None:
+            L.seed_everything(self.seed)
+        else:
+            seed_everything_by_time(self.fabric)
+
+        # get the model names, shuffle if needed
+        # the model names will be saved to the log directory as `model_names.json`
+        model_names = modelpool.model_names
+        if self.shuffle_order:
+            random.shuffle(model_names)
+        if self.log_dir is not None:
+            save_to_json(model_names, os.path.join(self.log_dir, "model_names.json"))
+
+        if self.evaluate_on_every_step:
+            """Configuration for the test datasets"""
+            self.taskpool = cast(CLIPVisionModelTaskPool, self._program.taskpool)
+            self._test_datasets = deepcopy(self.taskpool._test_datasets)
+
+        pretrained_model = modelpool.load_pretrained_model()
+
+        merged_model = None
+        for model_idx, model_name in enumerate(model_names):
+            print(
+                f"--------- Optimizing {model_idx + 1}/{len(model_names)}-th with {model_name} ---------"
+            )
+            if model_idx == 0:
+                merged_model = modelpool.load_model(model_names[0])
+            else:
+                merged_model = self._layer_wise_optimize(
+                    model_names=["merged", model_name],
+                    pretrained_model=deepcopy(pretrained_model),
+                    finetuned_models={
+                        "merged": merged_model,
+                        model_name: modelpool.load_model(model_name),
+                    },
+                    model_idx=model_idx,
+                )
+
+            if self.save_on_every_step:
+                self.save_merged_model(merged_model, model_idx)
+
+            if self.evaluate_on_every_step:
+                self.taskpool._is_setup = False
+                self.taskpool._test_datasets = DictConfig(
+                    {n: self._test_datasets[n] for n in model_names[: model_idx + 1]}
+                )
+                report = self.taskpool.evaluate(deepcopy(merged_model))
+                save_to_json(report, Path(self.log_dir) / f"report_{model_idx}.json")
+
+        return merged_model
+
+    def _layer_wise_optimize(
+        self,
+        model_names: List[str],
+        pretrained_model: nn.Module,
+        finetuned_models: Dict[str, nn.Module],
+        model_idx: int,
+    ):
+        time_cost = []
+        for module_name, module in pretrained_model.named_modules():
+            if not is_leaf_module(module):
+                continue
+
+            if isinstance(module, nn.Linear):
+                if module.weight.requires_grad:
+                    import time
+
+                    start_time = time.time()
+                    merged_weight = self._optimize_weight(
+                        module.weight,
+                        {
+                            model_name: finetuned_models[model_name]
+                            .get_submodule(module_name)
+                            .weight
+                            for model_name in model_names
+                        },
+                        module_name,
+                        model_idx,
+                    )
+                    end_time = time.time()
+                    time_cost.append(end_time - start_time)
+                    module.weight.data = merged_weight.data
+                else:
+                    module.weight.data = simple_average(
+                        [
+                            finetuned_models[model_name]
+                            .get_submodule(module_name)
+                            .weight
+                            for model_name in model_names
+                        ]
+                    )
+                if module.bias is not None:
+                    module.bias.data = simple_average(
+                        [
+                            finetuned_models[model_name].get_submodule(module_name).bias
+                            for model_name in model_names
+                        ]
+                    )
+            else:
+                simple_average(
+                    [
+                        finetuned_models[model_name].get_submodule(module_name)
+                        for model_name in model_names
+                    ],
+                    base_module=module,
+                )
+
+        return pretrained_model
+
+    def _optimize_weight(
+        self,
+        pretrained_weight: Tensor,
+        finetuned_weights: Dict[str, Tensor],
+        module_name: str,
+        model_idx: int,
+    ):
+        assert (
+            self.fabric.world_size == 1
+        ), "This algorithm is not currently supported in distributed training"
+
+        pretrained_weight = self.fabric.to_device(pretrained_weight.detach())
+        finetuned_weights = {
+            model_name: self.fabric.to_device(finetuned_weight.detach())
+            for model_name, finetuned_weight in finetuned_weights.items()
+        }
+
+        merged_weight = self.fabric.to_device(
+            nn.Parameter(
+                simple_average(
+                    [
+                        finetuned_weight.detach()
+                        for finetuned_weight in finetuned_weights.values()
+                    ]
+                ),
+                requires_grad=True,
+            )
+        )
+
+        # Compute SVD of the difference between the finetuned and pretrained weights
+        proj_u_dict = {}
+        proj_v_dict = {}
+        proj_s_dict = {}
+        for i, finetuned_weight in enumerate(finetuned_weights.values()):
+            finetuned_tv = finetuned_weight - pretrained_weight
+            u, s, v = svd(finetuned_tv, full_matrices=True)
+            epsilon = 1.0 if self.svd_epsilon > 1.0 else self.svd_epsilon
+            cumsum_ratio = s.cumsum(dim=0) / s.sum()
+            split_rank = torch.searchsorted(cumsum_ratio, epsilon).item()
+            u_main = u[:, :split_rank]
+            v_main = v[:, :split_rank]
+            s_main = s[:split_rank]
+            proj_u_dict[i] = u_main
+            proj_v_dict[i] = v_main
+            proj_s_dict[i] = s_main
+
+        if self.mgda:
+            if self.ema:
+                ema_sol = [self.alpha, 1 - self.alpha]
+            # This is multiple-gradient descent algorithm (MGDA) optimization
+            optimizer = torch.optim.Adam([merged_weight], lr=self.lr)
+            all_losses = [[], []]
+            all_alphas = [[], []]
+            for step_idx in tqdm(
+                range(self.num_steps), desc=f"Optimizing {module_name} weight"
+            ):
+                # Scaling the loss functions based on the algorithm choice
+                loss_data = {}
+                grads = {}
+                for i, finetuned_weight in enumerate(finetuned_weights.values()):
+                    proj_u = proj_u_dict[i]
+                    proj_v = proj_v_dict[i]
+                    proj_s = proj_s_dict[i]
+                    delta_tv = merged_weight - finetuned_weight
+                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
+                    loss_data[i] = float(loss_i.data)
+
+                    all_losses[i].append(float(loss_i.data))
+
+                    optimizer.zero_grad()
+                    loss_i.backward()
+                    grads[i] = Variable(
+                        merged_weight.grad.data.clone(), requires_grad=False
+                    )
+
+                # Normalize all gradients
+                gn = gradient_normalizers(
+                    grads=grads, losses=loss_data, normalization_type="loss"
+                )
+                for i, _ in enumerate(finetuned_weights.values()):
+                    grads[i] = grads[i] / float(gn[i])
+
+                # Frank-Wolfe iteration to compute scales.
+                sol, min_norm = MinNormSolver.find_min_norm_element(
+                    [[grads[i]] for i in range(len(finetuned_weights.values()))]
+                )
+
+                if self.ema:
+                    ema_sol = [
+                        self.ema_beta * ema_sol[i] + (1 - self.ema_beta) * float(sol[i])
+                        for i in range(len(sol))
+                    ]
+                    sol = ema_sol
+                all_alphas[0].append(ema_sol[0])
+                all_alphas[1].append(ema_sol[1])
+
+                # Scaled back-propagation
+                loss = 0
+                for i, finetuned_weight in enumerate(finetuned_weights.values()):
+                    # Compute gradients of each loss function wrt parameters
+                    proj_u = proj_u_dict[i]
+                    proj_v = proj_v_dict[i]
+                    proj_s = proj_s_dict[i]
+                    delta_tv = merged_weight - finetuned_weight
+                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
+                    loss += float(sol[i]) * loss_i
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+        else:
+            # This is a naive weighted optimization
+            optimizer = torch.optim.Adam([merged_weight], lr=self.lr)
+            for step_idx in tqdm(
+                range(self.num_steps), desc=f"Optimizing {module_name} weight"
+            ):
+                loss = 0
+                for i, finetuned_weight in enumerate(finetuned_weights.values()):
+                    proj_u = proj_u_dict[i]
+                    proj_v = proj_v_dict[i]
+                    proj_s = proj_s_dict[i]
+                    delta_tv = merged_weight - finetuned_weight
+                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
+                    loss += self.alpha * loss_i if i == 0 else (1 - self.alpha) * loss_i
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+        return merged_weight.detach().cpu()
+
+    def cal_loss_i(self, delta_tv, proj_s, proj_u, proj_v):
+        proj_delta_1 = torch.diag(proj_s) @ proj_u.T @ delta_tv
+        proj_delta_2 = delta_tv @ proj_v @ torch.diag(proj_s)
+        loss_i_u = torch.linalg.matrix_norm(proj_delta_1, ord="fro") ** 2
+        loss_i_v = torch.linalg.matrix_norm(proj_delta_2, ord="fro") ** 2
+        if self.svd_proj_space == "uv":
+            loss_i = loss_i_u + loss_i_v
+        elif self.svd_proj_space == "u":
+            loss_i = loss_i_u
+        elif self.svd_proj_space == "v":
+            loss_i = loss_i_v
+        else:
+            raise ValueError("Invalid svd_proj_space")
+
+        return loss_i
+
+    def save_merged_model(self, merged_model: CLIPVisionModel, step: int):
+        os.makedirs(Path(self.log_dir) / "checkpoints", exist_ok=True)
+        merged_model.save_pretrained(
+            Path(self.log_dir) / "checkpoints" / f"merged_model_{step}"
+        )
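
The `svd_epsilon` threshold in `_optimize_weight` above controls how many singular directions of each task vector are retained for projection. A minimal standalone sketch of that truncation rule (the tensor `delta` is a hypothetical stand-in for a task vector):

```python
import torch

# Hypothetical task vector: fine-tuned weight minus pretrained weight.
delta = torch.randn(768, 768)

u, s, vh = torch.linalg.svd(delta, full_matrices=True)

# Keep the smallest leading set of singular values whose cumulative share
# of the total reaches the threshold (mirrors _optimize_weight above).
svd_epsilon = 0.99999
cumsum_ratio = s.cumsum(dim=0) / s.sum()
split_rank = torch.searchsorted(cumsum_ratio, svd_epsilon).item()

u_main, s_main, v_main = u[:, :split_rank], s[:split_rank], vh.T[:, :split_rank]
print(f"kept {split_rank} of {s.numel()} singular directions")
```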
fusion_bench/method/dop/min_norm_solvers.py
ADDED
@@ -0,0 +1,227 @@
+# This code is from
+# Multi-Task Learning as Multi-Objective Optimization
+# Ozan Sener, Vladlen Koltun
+# Neural Information Processing Systems (NeurIPS) 2018
+# https://github.com/intel-isl/MultiObjectiveOptimization
+from typing import Union
+
+import numpy as np
+import torch
+
+
+def np_sum(x: Union[torch.Tensor, np.ndarray]) -> float:
+    if isinstance(x, torch.Tensor):
+        return x.sum().item()
+    return np.sum(x)
+
+
+def to_numpy(x: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu().numpy()
+    return x
+
+
+class MinNormSolver:
+    MAX_ITER = 250
+    STOP_CRIT = 1e-5
+
+    def _min_norm_element_from2(v1v1, v1v2, v2v2):
+        """
+        Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2
+        d is the distance (objective) optimized
+        v1v1 = <x1,x1>
+        v1v2 = <x1,x2>
+        v2v2 = <x2,x2>
+        """
+        if v1v2 >= v1v1:
+            # Case: Fig 1, third column
+            gamma = 0.999
+            cost = v1v1
+            return gamma, cost
+        if v1v2 >= v2v2:
+            # Case: Fig 1, first column
+            gamma = 0.001
+            cost = v2v2
+            return gamma, cost
+        # Case: Fig 1, second column
+        gamma = -1.0 * ((v1v2 - v2v2) / (v1v1 + v2v2 - 2 * v1v2))
+        cost = v2v2 + gamma * (v1v2 - v2v2)
+        return gamma, cost
+
+    def _min_norm_2d(vecs, dps):
+        R"""
+        Find the minimum norm solution as combination of two points
+        This is correct only in 2D
+        ie. min_c |\sum c_i x_i|_2^2 st. \sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j
+        """
+        dmin = 1e8
+        for i in range(len(vecs)):
+            for j in range(i + 1, len(vecs)):
+                if (i, j) not in dps:
+                    dps[(i, j)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(i, j)] += (
+                            torch.mul(vecs[i][k], vecs[j][k]).sum().data.cpu()
+                        )
+                    dps[(j, i)] = dps[(i, j)]
+                if (i, i) not in dps:
+                    dps[(i, i)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(i, i)] += (
+                            torch.mul(vecs[i][k], vecs[i][k]).sum().data.cpu()
+                        )
+                if (j, j) not in dps:
+                    dps[(j, j)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(j, j)] += (
+                            torch.mul(vecs[j][k], vecs[j][k]).sum().data.cpu()
+                        )
+                c, d = MinNormSolver._min_norm_element_from2(
+                    dps[(i, i)], dps[(i, j)], dps[(j, j)]
+                )
+                if d < dmin:
+                    dmin = d
+                    sol = [(i, j), c, d]
+        return sol, dps
+
+    def _projection2simplex(y):
+        R"""
+        Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i
+        """
+        m = len(y)
+        sorted_y = np.flip(np.sort(y), axis=0)
+        tmpsum = 0.0
+        tmax_f = (np.sum(y) - 1.0) / m
+        for i in range(m - 1):
+            tmpsum += sorted_y[i]
+            tmax = (tmpsum - 1) / (i + 1.0)
+            if tmax > sorted_y[i + 1]:
+                tmax_f = tmax
+                break
+        return np.maximum(y - tmax_f, np.zeros(y.shape))
+
+    def _next_point(cur_val, grad, n):
+        proj_grad = grad - (np.sum(grad) / n)
+        tm1 = -1.0 * cur_val[proj_grad < 0] / proj_grad[proj_grad < 0]
+        tm2 = (1.0 - cur_val[proj_grad > 0]) / (proj_grad[proj_grad > 0])
+
+        skippers = np_sum(tm1 < 1e-7) + np_sum(tm2 < 1e-7)
+        t = 1
+        if len(tm1[tm1 > 1e-7]) > 0:
+            t = np.min(to_numpy(tm1[tm1 > 1e-7]))
+        if len(tm2[tm2 > 1e-7]) > 0:
+            t = min(t, np.min(to_numpy(tm2[tm2 > 1e-7])))
+
+        next_point = proj_grad * t + to_numpy(cur_val)
+        next_point = MinNormSolver._projection2simplex(next_point)
+        return next_point
+
+    def find_min_norm_element(vecs):
+        R"""
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)
+
+        n = len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec, init_sol[2]
+
+        iter_count = 0
+
+        grad_mat = np.zeros((n, n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i, j] = dps[(i, j)]
+
+        while iter_count < MinNormSolver.MAX_ITER:
+            grad_dir = -1.0 * np.dot(grad_mat, sol_vec)
+            new_point = MinNormSolver._next_point(sol_vec, grad_dir, n)
+            # Re-compute the inner products for line search
+            v1v1 = 0.0
+            v1v2 = 0.0
+            v2v2 = 0.0
+            for i in range(n):
+                for j in range(n):
+                    v1v1 += sol_vec[i] * sol_vec[j] * dps[(i, j)]
+                    v1v2 += sol_vec[i] * new_point[j] * dps[(i, j)]
+                    v2v2 += new_point[i] * new_point[j] * dps[(i, j)]
+            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc * sol_vec + (1 - nc) * new_point
+            change = new_sol_vec - sol_vec
+            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+
+    def find_min_norm_element_FW(vecs):
+        R"""
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)
+
+        n = len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec, init_sol[2]
+
+        iter_count = 0
+
+        grad_mat = np.zeros((n, n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i, j] = dps[(i, j)]
+
+        while iter_count < MinNormSolver.MAX_ITER:
+            t_iter = np.argmin(np.dot(grad_mat, sol_vec))
+
+            v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec))
+            v1v2 = np.dot(sol_vec, grad_mat[:, t_iter])
+            v2v2 = grad_mat[t_iter, t_iter]
+
+            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc * sol_vec
+            new_sol_vec[t_iter] += 1 - nc
+
+            change = new_sol_vec - sol_vec
+            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+
+
+def gradient_normalizers(grads, losses, normalization_type):
+    gn = {}
+    if normalization_type == "l2":
+        for t in grads:
+            gn[t] = np.sqrt(np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]]))
+    elif normalization_type == "loss":
+        for t in grads:
+            gn[t] = losses[t]
+    elif normalization_type == "loss+":
+        for t in grads:
+            gn[t] = losses[t] * np.sqrt(
+                np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]])
+            )
+    elif normalization_type == "none":
+        for t in grads:
+            gn[t] = 1.0
+    else:
+        print("ERROR: Invalid Normalization Type")
+    return gn
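
A minimal usage sketch of the solver above: for two tasks, `find_min_norm_element` reduces to the closed-form two-point solution and returns the convex weights that minimize the norm of the combined gradient.

```python
import torch
from fusion_bench.method.dop.min_norm_solvers import MinNormSolver

# Two hypothetical per-task gradients, each passed as a list of tensors.
grad_a = [torch.randn(16, 16)]
grad_b = [torch.randn(16, 16)]

# sol holds the convex combination weights (summing to 1); min_norm is the
# objective value at the returned combination.
sol, min_norm = MinNormSolver.find_min_norm_element([grad_a, grad_b])
print(sol, min_norm)
```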
fusion_bench/method/dop/utils.py
ADDED
@@ -0,0 +1,73 @@
+from typing import Tuple
+
+import torch
+from torch import Tensor, nn
+
+from fusion_bench.utils.parameters import state_dict_to_vector
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
+
+
+def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform Singular Value Decomposition (SVD) on a tensor.
+
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool): Whether to compute the full-sized U and V matrices.
+
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    u, s, vh = torch.linalg.svd(
+        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
+    )
+    v = vh.T
+    return u, s, v
+
+
+def svd(
+    w: Tensor, full_matrices=True, accelerator=None
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform SVD on a tensor, optionally using a specified accelerator.
+
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool): Whether to compute the full-sized U and V matrices.
+        accelerator (str): The device to perform the computation on.
+
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    if accelerator is None:
+        return _svd(w, full_matrices=full_matrices)
+    original_device = w.device
+    w = w.to(accelerator)
+    u, s, v = _svd(w)
+    return u.to(original_device), s.to(original_device), v.to(original_device)
+
+
+def frobenius_inner_product(w1: Tensor, w2: Tensor) -> Tensor:
+    return torch.trace(w1.T @ w2)
+
+
+def is_leaf_module(module: nn.Module) -> bool:
+    return len(list(module.children())) == 0
+
+
+def get_task_vector_norm(model: nn.Module, pretrained_model: nn.Module) -> Tensor:
+    """
+    Get the vector norm of the task model.
+
+    Args:
+        model (nn.Module): The task model.
+        pretrained_model (nn.Module): The pretrained model.
+
+    Returns:
+        Tensor: The vector norm of the task model.
+    """
+    return torch.linalg.norm(
+        state_dict_to_vector(
+            state_dict_sub(model.state_dict(), pretrained_model.state_dict())
+        )
+    )
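
A brief usage sketch of the `svd` helper above, offloading the decomposition to an accelerator when one is available and receiving the factors back on the input's original device:

```python
import torch
from fusion_bench.method.dop.utils import svd

w = torch.randn(512, 512)  # hypothetical weight matrix

# Decompose on CUDA if present; factors come back on w's device.
accelerator = "cuda" if torch.cuda.is_available() else None
u, s, v = svd(w, full_matrices=True, accelerator=accelerator)

# v holds right singular vectors as columns, so w is recovered as u @ diag(s) @ v.T.
assert torch.allclose(w, u @ torch.diag(s) @ v.T, atol=1e-3)
```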
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fusion_bench
-Version: 0.2.25
+Version: 0.2.26
 Summary: A Comprehensive Benchmark of Deep Model Fusion
 Author-email: Anke Tang <tang.anke@foxmail.com>
 Project-URL: Repository, https://github.com/tanganke/fusion_bench
@@ -82,7 +82,13 @@ Model merging has emerged as a promising approach for multi-task learning (MTL),
 </details>
 
 <details>
-<summary>
+<summary>Enneng Yang, et al. Continual Model Merging without Data: Dual Projections for Balancing Stability and Plasticity. NeurIPS 2025. https://github.com/EnnengYang/DOP</summary>
+
+Model merging integrates multiple expert models with diverse capabilities into a unified framework, facilitating collaborative learning. However, most existing methods assume simultaneous access to all models, which is often impractical in real-world scenarios where models are received sequentially. While some studies have investigated continual model merging (CMM)--which involves sequentially merging multiple models--the challenge of balancing prior knowledge (stability) and incorporating new tasks (plasticity) remains unresolved. This paper, for the first time, formally defines the stability and plasticity of CMM from the perspective of orthogonal projection. Subsequently, we analyze the relationships among the spaces spanned by task data, historical gradients, and accumulated gradients. Building on this, we propose a data-free Dual Orthogonal Projection (DOP) method, which eliminates data dependence and mitigates interference between the merged model and models for old and new tasks by projecting their parameter differences onto their respective approximate data spaces. Finally, to solve potential conflicts between stability and plasticity, we reformulate DOP as a multi-objective optimization problem and employ a multi-gradient descent algorithm to obtain a Pareto-optimal solution. Extensive experiments across multiple architectures and task configurations validate that our approach significantly outperforms state-of-the-art CMM methods.
+</details>
+
+<details>
+<summary>Anke Tang, et al. Merging Models on the Fly Without Retraining: A Sequential Approach to Scalable Continual Model Merging. NeurIPS 2025. Jan 2025. https://arxiv.org/pdf/2501.09522</summary>
 
 Deep model merging represents an emerging research direction that combines multiple fine-tuned models to harness their specialized capabilities across different tasks and domains. Current model merging techniques focus on merging all available models simultaneously, with weight interpolation-based methods being the predominant approaches. However, these conventional approaches are not well-suited for scenarios where models become available sequentially, and they often suffer from high memory requirements and potential interference between tasks. In this study, we propose a training-free projection-based continual merging method that processes models sequentially through orthogonal projections of weight matrices and adaptive scaling mechanisms. Our method operates by projecting new parameter updates onto subspaces orthogonal to existing merged parameter updates while using an adaptive scaling mechanism to maintain stable parameter distances, enabling efficient sequential integration of task-specific knowledge. Our approach maintains constant memory complexity to the number of models, minimizes interference between tasks through orthogonal projections, and retains the performance of previously merged models through adaptive task vector scaling. Extensive experiments on CLIP-ViT models demonstrate that our method achieves a 5-8% average accuracy improvement while maintaining robust performance in different task orderings.
 </details>
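
The dual projection described in the DOP abstract is implemented per linear layer in `fusion_bench/method/dop/dop.py` above (`cal_loss_i`). A minimal sketch of that per-task loss, where `W` is the merged weight being optimized and `(U_i, S_i, V_i)` is the truncated SVD of task i's task vector:

```python
import torch

def dop_loss_i(W, W_i, U_i, S_i, V_i):
    """Penalize the merged weight's deviation from model i within the
    subspaces spanned by its task vector's leading singular directions
    (mirrors ContinualDOPForCLIP.cal_loss_i with svd_proj_space="uv")."""
    delta = W - W_i
    left = torch.diag(S_i) @ U_i.T @ delta  # projection onto left singular space
    right = delta @ V_i @ torch.diag(S_i)   # projection onto right singular space
    return (
        torch.linalg.matrix_norm(left, ord="fro") ** 2
        + torch.linalg.matrix_norm(right, ord="fro") ** 2
    )
```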
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/RECORD
CHANGED
@@ -48,7 +48,7 @@ fusion_bench/dataset/llama/stanford_shp.py,sha256=6ueXKnFXIBBobacU1h5WxGLZrSOtBk
 fusion_bench/dataset/llama/ultrachat.py,sha256=Go7WvrDAYnm184fdazHGRYLbSY6Xd7jrESyQeUJtOww,1736
 fusion_bench/dataset/llama/wikitext.py,sha256=9ZHR-nMfXRumd3o-PIj3n7B83YlVeqpGkZ2zJs2B-9Y,2883
 fusion_bench/dataset/llama/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fusion_bench/method/__init__.py,sha256
+fusion_bench/method/__init__.py,sha256=-r9Sz5haSc5X4eNvxGvHwowQfS2sLfZ21orUC5ae7ws,9454
 fusion_bench/method/base_algorithm.py,sha256=OnKSNPQ_nIdIWxryyblW_sko7uoEBN4lGh-eLkJ4kh4,9004
 fusion_bench/method/dummy.py,sha256=hb1y6LR_geRZ5eRgGwt5zJUcHYorCeIbs5i76CvurUc,1031
 fusion_bench/method/ensemble.py,sha256=Bjzqxt-tUp5cawT1jIhqKswN5QH3bkYbmuI4LS4uTG0,3619
@@ -77,7 +77,7 @@ fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py,sha256=zC0w5cw
 fusion_bench/method/bitdelta/bitdelta_utils/data.py,sha256=LGEgv8o8glyyLLYh6Ur5h_sulxPFmy6i-xi-Ap1G-Wc,1052
 fusion_bench/method/bitdelta/bitdelta_utils/diff.py,sha256=o3ib5sgGDYLgnL8YTfX0YDc4Md6W9_gb03jzftTn5s4,4075
 fusion_bench/method/classification/__init__.py,sha256=byVJ574JQ_DUvsDv8S6ZM6BKAv4ZZ964Ej4btm0aC7k,867
-fusion_bench/method/classification/clip_finetune.py,sha256=
+fusion_bench/method/classification/clip_finetune.py,sha256=5q5Sr3eVVh8DfYdeSoGjwaKDksC8F2dY2r8Dl-wRaDg,15844
 fusion_bench/method/classification/continual_clip_finetune.py,sha256=OLhZKS-6aCnafevZkZYcNMKTWDDj3DATB27eZl_i8EY,11530
 fusion_bench/method/classification/image_classification_finetune.py,sha256=CPMpZvaULWaim01EvJJHlU4C6HQ16OCqZGoMvPBEWtY,8157
 fusion_bench/method/concrete_subspace/__init__.py,sha256=jJoFcjnQe-jvccsm9DuCXna378m9XBT9vV1fEZbdfR0,464
@@ -101,6 +101,10 @@ fusion_bench/method/doge_ta/__init__.py,sha256=dixO0i5fmhgC_W2_DAQ4PzYnkMCZX5D8t
 fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py,sha256=4WPG2fhFw-u6oSoT-fBrP2K9YpX-MH-AotBL1DknfpA,1304
 fusion_bench/method/doge_ta/doge_ta.py,sha256=jrJF52JUBdrB3EGWaXJMFZE-v8syzZGr4smG6rEO74c,13790
 fusion_bench/method/doge_ta/layer_wise_adamerging.py,sha256=rLk3Nep5d6wMUNCp6q7pC7L0pfBvUwGBIuiGM7CQOf4,9780
+fusion_bench/method/dop/__init__.py,sha256=MD8c44ovLLJX_-v9t2SdLrvKLxVf8PijzFFNjJfvhpE,37
+fusion_bench/method/dop/dop.py,sha256=_wNjN1DSK27aKEyWVay61fqc7prwJ1uiv_3618_bQ20,14160
+fusion_bench/method/dop/min_norm_solvers.py,sha256=a7n2X0BE_YajlaUygyHV0yqW6-x5dTyZ5V0mt_Q69qE,8291
+fusion_bench/method/dop/utils.py,sha256=_q7yy3ENNFUh1qUd5J5DThRL4J1tIxEcknCO2AKmeYM,2102
 fusion_bench/method/expert_sparsity/__init__.py,sha256=nt7k5cKqA2Bax1aM93ODwsEuibZ_hdFgQsUos_8h2v8,271
 fusion_bench/method/expert_sparsity/mixtral/__init__.py,sha256=FyKDZIyYUnqvGIdJ5BS639UpzSBj11g28ATHs1Yczdk,545
 fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py,sha256=zZa4IAKimFZMoxoQ_Oi7z2R9o5H6kxV2QTb0e-t9kDY,5665
@@ -480,7 +484,7 @@ fusion_bench/utils/plot/token_notebook.py,sha256=bsntXf46Zz_RavTxNiB9c3-KvHw7LFw
 fusion_bench/utils/strenum/__init__.py,sha256=id9ORi1uXrDxhbmVxitJ1KDwLS4H3AAwFpaK5h1cQzw,8531
 fusion_bench/utils/strenum/_name_mangler.py,sha256=o11M5-bURW2RBvRTYXFQIPNeqLzburdoWLIqk8X3ydw,3397
 fusion_bench/utils/strenum/_version.py,sha256=6JQRo9LcvODbCOeVFYQb9HNJ_J9XiG_Zbn8ws2A3BV8,18466
-fusion_bench-0.2.
+fusion_bench-0.2.26.dist-info/licenses/LICENSE,sha256=nhnOJlw4CPuPVE0qvkGmxfFgHmKi-6nzXvTu8t0NUdg,1066
 fusion_bench_config/README.md,sha256=Lc8YSBJ5oxf9KV5kKDivJ9LRyGuraGQPmBbgbdVA-j4,703
 fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml,sha256=7IxLQoLRz-sRWyV8Vqc5kQcmYE_9YQz2_77pmvAkum8,1207
 fusion_bench_config/fabric_model_fusion.yaml,sha256=U8BxsaOvsg9bsEZcIpBE-feo9n9G7Y1kQDHqPVxUYAg,2601
@@ -599,14 +603,14 @@ fusion_bench_config/hydra/default.yaml,sha256=Fpi3pV1hqPoPk5QdBncse6NlNOAl2YHzD4
 fusion_bench_config/hydra/help/fusion_bench_help.yaml,sha256=v8s891Cr5wyxBXGDn_VBBwwRmb0JXOL874Sl-zNoCWA,1880
 fusion_bench_config/hydra/job_logging/rich_logging.yaml,sha256=_dYGeFTCqaPrRowLXBNMXwzYhw8ns1TkQFfALwK1aCw,441
 fusion_bench_config/method/clip_finetune.yaml,sha256=yWjcdKYaKvy53sGaygg2ElAjb9-YFCyCGE1s9aB_dPM,677
-fusion_bench_config/method/depth_upscaling.yaml,sha256=
-fusion_bench_config/method/dummy.yaml,sha256=
+fusion_bench_config/method/depth_upscaling.yaml,sha256=86YqczaMzZftymLy_k2cb-GMy4C42yTxxP4c4htZTBs,1230
+fusion_bench_config/method/dummy.yaml,sha256=Pw2w6WQiw3K4_KH0THPs4NSM7lZoZLsNbB72iPSVsl8,427
 fusion_bench_config/method/mixtral_moe_merging.yaml,sha256=AdVhXD6Crw-B3QyNpP4ToHRSg-EeSCIGtazA7lQvPOU,148
 fusion_bench_config/method/mixtral_moe_upscaling.yaml,sha256=wYDRnWOpZ6SgvL2Fm9wIDomrN2x5Jaq5vg1hjh3druk,210
-fusion_bench_config/method/model_recombination.yaml,sha256=
-fusion_bench_config/method/simple_average.yaml,sha256=
-fusion_bench_config/method/task_arithmetic.yaml,sha256=
-fusion_bench_config/method/ties_merging.yaml,sha256=
+fusion_bench_config/method/model_recombination.yaml,sha256=DeyVPdDCL-eyJDlPZXLAIWfKi3p8nN0dLFRx5ydsERc,740
+fusion_bench_config/method/simple_average.yaml,sha256=uB51mNlFKb9S3Go1p6SLGgr3PWJFZs97Ccn1zZZkEug,577
+fusion_bench_config/method/task_arithmetic.yaml,sha256=zQmNpnQrZTHiRv_KmYnHPMScKf8MUMLbQYh9254_1Jg,580
+fusion_bench_config/method/ties_merging.yaml,sha256=c3BjnFo-ZU5hmCrfi-1VQPhd_EYGtftxxYDHTVCMy6s,501
 fusion_bench_config/method/ada_svd/clip_vision.yaml,sha256=3l0VKCL66rZNx020UKhf_UzXScZ5XZYOUeNm8mqo0So,183
 fusion_bench_config/method/adamerging/clip.yaml,sha256=NBJaK0a4RxV3D2LRciUeWmTqabRwu6OxZnT7u7iz6ug,753
 fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml,sha256=DxkZhcuu_-ErIUqBUmWKN5UXYYWKoKPX6IgjV-Txwv0,541
@@ -614,7 +618,7 @@ fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml,sha256=bLz6zc5CofeUO2
 fusion_bench_config/method/adamerging/llama_sft.yaml,sha256=khKzfhvQ5oxBMH0d-YvyjN-qIgQNeevDodXngS5g9KY,1022
 fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml,sha256=hxVA4deUr1go1RZl12qD8PekwydWJ9SBQowSqmo3A8I,139
 fusion_bench_config/method/analysis/task_vector_violin_plot.yaml,sha256=FmBGj0Ib2xYd-49x_xZSeVbExwL-A9-tHhHTMBrT_Fg,134
-fusion_bench_config/method/bitdelta/bitdelta.yaml,sha256=
+fusion_bench_config/method/bitdelta/bitdelta.yaml,sha256=uuR5x1IVTWyZjTSd5i1JXd_D8tG7tWBfOpgMBDCBgR0,436
 fusion_bench_config/method/classification/clip_continual_finetune.yaml,sha256=Ls63kdLb1bLwUEqzfyTtJcpFOdv3HmwzBML0V2JnnAs,791
 fusion_bench_config/method/classification/clip_finetune.yaml,sha256=yWjcdKYaKvy53sGaygg2ElAjb9-YFCyCGE1s9aB_dPM,677
 fusion_bench_config/method/classification/image_classification_finetune.yaml,sha256=fl60RFCYwmrwwu3QlaJTFiBLmSmnjHxl-xyq4Gb80iU,401
@@ -631,9 +635,10 @@ fusion_bench_config/method/dare/task_arithmetic.yaml,sha256=cUAweNJ6p2aOv__0dvUL
 fusion_bench_config/method/dare/ties_merging.yaml,sha256=7gDW4XpezrsccsbJGqqKrbX26JnqAc85A-MY66DGvuE,416
 fusion_bench_config/method/dawe/dawe_for_clip.yaml,sha256=99P5xpp1YGvIwXGxDcxRtJMLE2FhvEFmFBQjOMEcGoc,1023
 fusion_bench_config/method/doge_ta/doge_ta.yaml,sha256=CtZI3YPMJNDy225yhOJbSiMKlsc-X5nCFzmVh0dvr-w,78
-fusion_bench_config/method/
-fusion_bench_config/method/ensemble/
-fusion_bench_config/method/ensemble/
+fusion_bench_config/method/dop/dop.yaml,sha256=ZgdjuVfTj83kAvrS4RrPgGX7d_QQ7d1lIMlzhjiVeUc,954
+fusion_bench_config/method/ensemble/max_model_predictor.yaml,sha256=ugO9FbEYqQk3RkX7wUDE9UOg-4D0F4Rezv0O-7hTeRg,476
+fusion_bench_config/method/ensemble/simple_ensemble.yaml,sha256=kfPAaPVQIet9dYThKNsEBfe9gHdeCREnsM-snSOPahM,546
+fusion_bench_config/method/ensemble/weighted_ensemble.yaml,sha256=LhlxU2P_inxR8MB0Z62phHWj5S4qxD7ITG4Ly-GUcQo,770
 fusion_bench_config/method/expert_sparsity/README.md,sha256=CLE0-XblXDWCUTHPaTNtBH-YquXn-uawwTJiYrgjMaA,239
 fusion_bench_config/method/expert_sparsity/mixtral.yaml,sha256=maFL3LM0zfnQ1eXoNXUslSjgZmpOdUJgl_a31dYUBbc,605
 fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml,sha256=-m5uDA9hfBg_8vF3s0MnUp0JTl3MqpB4-rlPEg9CHD4,569
@@ -646,23 +651,23 @@ fusion_bench_config/method/gossip/layer_wise_flan_t5.yaml,sha256=2yBqbhwz2vq65wT
 fusion_bench_config/method/isotropic_merging/iso_c.yaml,sha256=mn_5nyc7s_a7QH1MkEj9ZncjNHtZa0mzfXcUGRJOiAw,81
 fusion_bench_config/method/isotropic_merging/iso_cts.yaml,sha256=70BODJt69pZ_9xH7S_Z2Yzb299LFIGkXy1bQiHQad6A,110
 fusion_bench_config/method/linear/expo.yaml,sha256=St3NW6cKVRV3vCn8y0gxQ8k66VTdtsLTEWQTbO9wQ0Y,420
-fusion_bench_config/method/linear/linear_interpolation.yaml,sha256=
+fusion_bench_config/method/linear/linear_interpolation.yaml,sha256=cAL_ekEIJhJD4cfAbKilV0k_lNNPoJqY4sABVEKcM7E,523
 fusion_bench_config/method/linear/llama_expo.yaml,sha256=SvqamjT06BMObQ58sks5x7Wv6kGpp3-Nlw3ihbD_kSA,621
 fusion_bench_config/method/linear/llama_expo_with_dare.yaml,sha256=Pp8s2xmEg5XSvaGKtwTYx_PzcGvwRh2gPpZ6u9as4_E,383
 fusion_bench_config/method/linear/simple_average_for_causallm.yaml,sha256=qqeIr61PJEcfZclZ5vV64GCzyt-8b1zB0FDZu8DsbXQ,322
 fusion_bench_config/method/linear/task_arithmetic_for_causallm.yaml,sha256=tJA0n0_XVvll4rZYVHQVqFCz8W3Bey6NjPKMIH3-P0U,142
 fusion_bench_config/method/linear/ties_merging_for_causallm.yaml,sha256=1oEIdxV0OqWjDQ9V_lmXEPUayp4KbKHE2SvpCLmiKOU,489
-fusion_bench_config/method/linear/weighted_average.yaml,sha256=
-fusion_bench_config/method/linear/weighted_average_for_llama.yaml,sha256=
+fusion_bench_config/method/linear/weighted_average.yaml,sha256=OjE4EdfDHPYx8PlBJ6xIpCz4ITu_65VsRyefioRXGQ8,408
+fusion_bench_config/method/linear/weighted_average_for_llama.yaml,sha256=886ZKr81gyN7DISqtbrM5WnjSXd_6AlakQyOJQagoYY,518
 fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml,sha256=QHsRfJK9K4KajsX3LBHG8cDt7ZLJWxOBnJjpHRQSB_s,1348
 fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml,sha256=c0rFqj2GV11X9RMraHXJtJ9OiMUzZtvDVsTn4tgAeco,1337
 fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml,sha256=LjGwfTiiC5iQKr62i22XopQTfSKbx9UbsDvEW-byneQ,1622
-fusion_bench_config/method/model_stock/model_stock.yaml,sha256=
+fusion_bench_config/method/model_stock/model_stock.yaml,sha256=4KHAFCjL4AQ5dxkv7IGkUTxE8g-GCoxDkA3BbnlzQC0,530
 fusion_bench_config/method/moe_pruner/moe_pruner.yaml,sha256=OYMYLKvLlNEht7BK9phaTEvAE1ySaVi-pvjYiT-OTGw,442
-fusion_bench_config/method/opcm/opcm.yaml,sha256=
-fusion_bench_config/method/opcm/task_arithmetic.yaml,sha256=
-fusion_bench_config/method/opcm/ties_merging.yaml,sha256=
-fusion_bench_config/method/opcm/weight_average.yaml,sha256=
+fusion_bench_config/method/opcm/opcm.yaml,sha256=7NBOGo6W1FDbqdkT8gfM5PI2kHfqB8ofMfgcxVI1suM,686
+fusion_bench_config/method/opcm/task_arithmetic.yaml,sha256=WL_nVXhZWV9fe_ttChShkjYZVJnOCzvZ3i7NBppYsxk,743
+fusion_bench_config/method/opcm/ties_merging.yaml,sha256=1-xR0dVEEFJue9r-oBk1ZfGmGM9vCu4cJBG5aZnJ3C8,917
+fusion_bench_config/method/opcm/weight_average.yaml,sha256=n-eyxVkpRanlRJdFWFK3kppiO_W1S99WNjyjdBLDnw0,668
 fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml,sha256=Px8LU_UtDz-YHDFfqQ7scEPOproiFOaudKVshrhCTgc,483
 fusion_bench_config/method/pruning/llama_random_pruning.yaml,sha256=0RiZS8d42PXZzwncPG8zcbnyYJ9vtfr2sOSqS8oDyT4,325
 fusion_bench_config/method/pruning/llama_sparsegpt_pruning.yaml,sha256=gC6Ss0n2tKSb4gyVfx45BvsFbVBGN-om4-2S1sKS-_w,505
@@ -679,15 +684,15 @@ fusion_bench_config/method/regmean/clip_regmean.yaml,sha256=QfkCHCLK9wbyB1Tq1S7Y
 fusion_bench_config/method/regmean/gpt2_regmean.yaml,sha256=n94aTboDdwSA7Tki8l_o8tYQkhXxPV8lRf-dRNPIsOs,422
 fusion_bench_config/method/regmean/regmean.yaml,sha256=ZgVVLx-lHwVgjtjTl4VZUlthh8yyua87QvoJfmNHud4,101
 fusion_bench_config/method/regmean_plusplus/clip_regmean_plusplus.yaml,sha256=A034ryEwvosqyQzA3KWs7kdp-3CUnoJtCujVywV-uzA,434
-fusion_bench_config/method/slerp/slerp.yaml,sha256=
-fusion_bench_config/method/slerp/slerp_lm.yaml,sha256=
+fusion_bench_config/method/slerp/slerp.yaml,sha256=XR3z6iqyHirkoFSdLAeV2bP1yyI25MoWG-LqdE-ypjA,719
+fusion_bench_config/method/slerp/slerp_lm.yaml,sha256=hO07n6elZg_FrqEfSfbdR-tb1hqwT7vaLgAZKdF8O1o,479
 fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml,sha256=skLwgu_VHShm4m0oEOkqKzcBS5Cz7J29xEj7pTaSm0k,916
 fusion_bench_config/method/smile_upscaling/error_accumulation.yaml,sha256=6Gui-OuQ3P_4TwO_syh9SWJCNeHiAQzS55aO-ByYKbQ,154
 fusion_bench_config/method/smile_upscaling/projected_energy.yaml,sha256=M_EBOC3B_pxaBO3tD6mnbXpvy6-EaegSsE-jdJs-HY0,114
 fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml,sha256=ZMn_ImRjjc2uozf7ocQIzbgvFDpBV7S-34KptbBXVGo,200
 fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml,sha256=VFMrkbO69d0wCjTQCuKysYGVe6hEwNu792g1QkhU5Mk,383
 fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml,sha256=MfZ1u1HIJoy_csWiLzR4GLz-eiaVxo2gmNYre224yqo,433
-fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml,sha256=
+fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml,sha256=38DGdOjpDo-dOMpfy807p3x-eAvibjED-BGtFGnaycA,689
 fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml,sha256=L-WgNhFjcp_2tocDxZi6STVTtoaSd1v9UOQaKO_QvHM,669
 fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml,sha256=prTEFH0eu7R_CVNQ0GPWL9QsOLFcT1uM12zZdi3qcFo,636
 fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml,sha256=Cmg8N4l--3C0qeSHG-HLOgjJZ954eWHoDNgRnx0pLK0,614
@@ -948,8 +953,8 @@ fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml,sha256=3q-KMuFaM
 fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-16_TA8.yaml,sha256=GjpiiRownrBCpl-TNwWRW2PYePbF-Cl99jlLNPrK5T4,1017
 fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-32_TA8.yaml,sha256=WwiYMQKehtJixDPnu5o3vcWe4yJksXTWRqOzm3uVWXQ,1017
 fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-L-14_TA8.yaml,sha256=xGRt0J9joXTzWUew6DvoYprAWlPXhaVFw5AX4im5VQw,1017
-fusion_bench-0.2.
-fusion_bench-0.2.
-fusion_bench-0.2.
-fusion_bench-0.2.
-fusion_bench-0.2.
+fusion_bench-0.2.26.dist-info/METADATA,sha256=BOHkLorLs0w_fgAtRz7tpYVExKxFiClGISLlsnW3BG8,24307
+fusion_bench-0.2.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+fusion_bench-0.2.26.dist-info/entry_points.txt,sha256=iUQ8MCJvda7HP4vYh2n1Teoapb4G9PBVYZkAfcc5SHU,116
+fusion_bench-0.2.26.dist-info/top_level.txt,sha256=BuO4TL6iHL_2yPBUX9-LlIrHRczA_BNMIFwweK0PQEI,13
+fusion_bench-0.2.26.dist-info/RECORD,,
fusion_bench_config/method/bitdelta/bitdelta.yaml
CHANGED
@@ -1,3 +1,6 @@
+# =============================================================================
+# FusionBench Method Configuration: BitDelta
+# =============================================================================
 _target_: fusion_bench.method.bitdelta.BitDeltaAlgorithm
 save_dir: null
 save_full_model: false
fusion_bench_config/method/depth_upscaling.yaml
CHANGED
@@ -1,3 +1,12 @@
+# =============================================================================
+# FusionBench Method Configuration: Depth Upscaling
+# =============================================================================
+# Constructs a deeper model by stacking/selecting layers from existing models.
+#
+# - layer_indices: list[int | str] specifying which layers to use. Strings are Python
+#   expressions evaluated to lists, e.g., "range(6,12)".
+# - Example: [0, 2, 4, "range(6,12)"] selects 1st, 3rd, 5th, and 7th-12th layers.
+# =============================================================================
 _target_: DepthUpscalingAlgorithm
 # this should be a list of integers or string, indicating the sequence of layers.
 # If the entry is an integer, it will use the n-th layer of the model.
fusion_bench_config/method/dop/dop.yaml
ADDED
@@ -0,0 +1,30 @@
+_target_: fusion_bench.method.dop.dop.ContinualDOPForCLIP
+
+# the random seed to use
+seed: null
+# shuffle the order of the models
+shuffle_order: true
+# save the merged model on every step
+save_on_every_step: false
+# evaluate the merged model on every step
+evaluate_on_every_step: true
+
+# optimizer (learning rate)
+lr: 1e-4
+# optimizer (num_steps)
+num_steps: 200
+
+# weighted loss
+# if mgda is true, use mgda to optimize the loss weights
+mgda: true
+# if mgda is false, this is the weight for the loss of the first task
+alpha: 0.8
+# if mgda is true and ema is true, using exponential moving average (ema), alpha is the initial value
+ema: true
+# if mgda is true and ema is true, using exponential moving average (ema), beta is the decay rate
+ema_beta: 0.999
+
+# epsilon for svd (the proportion of energy retained)
+svd_epsilon: 0.99999
+# the space to project the delta w (left singular vectors, right singular vectors, or both)
+svd_proj_space: uv # u or v or uv
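
The values above mirror the keyword arguments of `ContinualDOPForCLIP`. A minimal sketch of programmatic construction, under the assumption that direct instantiation outside the Hydra-driven `fusion_bench` CLI behaves the same (normally Hydra instantiates the class from this file via `_target_`):

```python
from fusion_bench.method.dop.dop import ContinualDOPForCLIP

algorithm = ContinualDOPForCLIP(
    seed=None,
    shuffle_order=True,
    save_on_every_step=False,
    evaluate_on_every_step=False,  # True requires an attached taskpool
    lr=1e-4,
    num_steps=200,
    mgda=True,
    alpha=0.8,
    ema=True,
    ema_beta=0.999,
    svd_epsilon=0.99999,
    svd_proj_space="uv",
)
# merged_model = algorithm.run(modelpool)  # modelpool: a BaseModelPool of CLIP vision models
```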
fusion_bench_config/method/dummy.yaml
CHANGED
@@ -1 +1,7 @@
+# =============================================================================
+# FusionBench Method Configuration: Dummy
+# =============================================================================
+# No-op method for testing pipelines and wiring.
+# Instantiates and exits without modifying models.
+# =============================================================================
 _target_: fusion_bench.method.DummyAlgorithm
fusion_bench_config/method/ensemble/max_model_predictor.yaml
CHANGED
@@ -1 +1,7 @@
+# =============================================================================
+# FusionBench Method Configuration: Max Model Predictor
+# =============================================================================
+# Selects the model with maximum confidence or performance per example/task.
+# No additional hyperparameters are required.
+# =============================================================================
 _target_: fusion_bench.method.MaxModelPredictorAlgorithm
fusion_bench_config/method/ensemble/simple_ensemble.yaml
CHANGED
@@ -1,2 +1,9 @@
+# =============================================================================
+# FusionBench Method Configuration: Simple Ensemble
+# =============================================================================
+# Averages model predictions uniformly.
+#
+# device_map: leave null for single device or provide a mapping for multi-device setups.
+# =============================================================================
 _target_: fusion_bench.method.SimpleEnsembleAlgorithm
-device_map: null
+device_map: null # Set to null for single device, or specify mapping
fusion_bench_config/method/ensemble/weighted_ensemble.yaml
CHANGED
@@ -1,3 +1,11 @@
+# =============================================================================
+# FusionBench Method Configuration: Weighted Ensemble
+# =============================================================================
+# Ensembles model predictions using specified per-model weights.
+#
+# - Set normalize=true to rescale weights to sum to 1.
+# - weights: one float per model in the pool (order-sensitive). If null, uses equal weights.
+# =============================================================================
 _target_: fusion_bench.method.WeightedEnsembleAlgorithm
 normalize: true
 # this should be a list of floats, one for each model in the ensemble
fusion_bench_config/method/linear/linear_interpolation.yaml
CHANGED
@@ -1,2 +1,10 @@
+# =============================================================================
+# FusionBench Method Configuration: Linear Interpolation
+# =============================================================================
+# Interpolates between two models: (1 - t) * model0 + t * model1
+#
+# - t in [0,1]: 0 returns model0; 1 returns model1.
+# - Only meaningful for two-model pools.
+# =============================================================================
 _target_: fusion_bench.method.LinearInterpolationAlgorithm
 t: 0.5
fusion_bench_config/method/linear/weighted_average.yaml
CHANGED
@@ -1,3 +1,6 @@
+# =============================================================================
+# FusionBench Method Configuration: Weighted Average (Linear)
+# =============================================================================
 _target_: fusion_bench.method.WeightedAverageAlgorithm
 normalize: true # if true, the weights will be normalized before merging
 weights: # List of weights for each model
fusion_bench_config/method/model_recombination.yaml
CHANGED
@@ -1,3 +1,11 @@
+# =============================================================================
+# FusionBench Method Configuration: Model Recombination
+# =============================================================================
+# Recombines submodules/layers from multiple models to form a new model.
+#
+# - return_modelpool: override run() argument to return model pool instead of merged model.
+#   Set to null to respect runtime argument; set to true/false to force behavior.
+# =============================================================================
 _target_: fusion_bench.method.ModelRecombinationAlgorithm
 # if `return_model_pool` is not null, the argument `return_modelpool` passed to the `run` method will be ignored.
 return_modelpool: null
fusion_bench_config/method/model_stock/model_stock.yaml
CHANGED
@@ -1,3 +1,6 @@
+# =============================================================================
+# FusionBench Method Configuration: Model Stock
+# =============================================================================
 _target_: fusion_bench.method.model_stock.ModelStock
 ignore_keys:
   [
@@ -9,4 +12,4 @@ ignore_keys:
     "model.ln_final.bias",
   ]
 model_save_path: ${path.log_dir}/checkpoint
-model_save_kwargs: null
+model_save_kwargs: null
fusion_bench_config/method/opcm/opcm.yaml
CHANGED
@@ -1,3 +1,8 @@
+# =============================================================================
+# FusionBench Method Configuration: OPCM
+# =============================================================================
+# Incrementally merges models via SVD projection and evaluation per step.
+# =============================================================================
 _target_: fusion_bench.method.opcm.opcm.OPCMForCLIP
 # shuffle the order of the models
 shuffle_order: true
fusion_bench_config/method/opcm/task_arithmetic.yaml
CHANGED
@@ -1,3 +1,9 @@
+# =============================================================================
+# FusionBench Method Configuration: Continual Task Arithmetic
+# =============================================================================
+# Applies task arithmetic incrementally across a stream of models.
+# Maintains per-step save/eval similar to OPCM.
+# =============================================================================
 _target_: fusion_bench.method.opcm.task_arithmetic.ContinualTaskArithmeticForCLIP
 scaling_factor: 0.3
 # shuffle the order of the models
fusion_bench_config/method/opcm/ties_merging.yaml
CHANGED
@@ -1,3 +1,8 @@
+# =============================================================================
+# FusionBench Method Configuration: Continual TIES Merging
+# =============================================================================
+# Continual variant of TIES merging with per-step save/eval instrumentation.
+# =============================================================================
 _target_: fusion_bench.method.opcm.ties_merging.ContinualTiesMergingForCLIP
 # Scaling factor $\lambda$
 scaling_factor: 0.5
fusion_bench_config/method/opcm/weight_average.yaml
CHANGED
@@ -1,3 +1,8 @@
+# =============================================================================
+# FusionBench Method Configuration: Continual Weighted Average
+# =============================================================================
+# Incrementally averages model weights as new models arrive.
+# =============================================================================
 _target_: fusion_bench.method.opcm.weight_average.ContinualWeightAverageForCLIP
 # shuffle the order of the models
 shuffle_order: true
fusion_bench_config/method/simple_average.yaml
CHANGED
@@ -1 +1,10 @@
+# =============================================================================
+# FusionBench Method Configuration: Simple Average
+# =============================================================================
+# Equally averages parameters of all models in the model pool.
+#
+# Usage notes
+# - No hyperparameters required; behavior is deterministic given model order.
+# - Ensure models are architecture-compatible (same shapes) before merging.
+# =============================================================================
 _target_: fusion_bench.method.SimpleAverageAlgorithm
fusion_bench_config/method/slerp/slerp.yaml
CHANGED
@@ -1,3 +1,12 @@
+# =============================================================================
+# FusionBench Method Configuration: Spherical Linear Interpolation (SLERP)
+# =============================================================================
+# Interpolates between two parameter vectors on a hypersphere.
+#
+# - t in [0,1]: interpolation factor; 0 returns model0; 1 returns model1.
+# - DOT_THRESHOLD: threshold to switch to linear interpolation when vectors are near-aligned.
+# - epsilon: small constant to avoid division by zero.
+# =============================================================================
 _target_: fusion_bench.method.SlerpMergeAlgorithm
 t: 0.5 # interpolation factor
 DOT_THRESHOLD: 0.9995
fusion_bench_config/method/slerp/slerp_lm.yaml
CHANGED
@@ -1,3 +1,8 @@
+# =============================================================================
+# FusionBench Method Configuration: SLERP for Causal LM
+# =============================================================================
+# Spherical linear interpolation between two causal language models.
+# =============================================================================
 _target_: fusion_bench.method.SlerpForCausalLM
 t: 0.5
 model_save_path: ${path.log_dir}/checkpoint
fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml
CHANGED
@@ -1,3 +1,6 @@
+# =============================================================================
+# FusionBench Method Configuration: SMILE Upscaling
+# =============================================================================
 _target_: fusion_bench.method.SmileUpscalingAlgorithm
 # merge device on cuda can accelerate the SVD computation
 device: cpu
fusion_bench_config/method/task_arithmetic.yaml
CHANGED
@@ -1,2 +1,11 @@
+# =============================================================================
+# FusionBench Method Configuration: Task Arithmetic
+# =============================================================================
+# Performs task vector arithmetic: base + lambda * \sum_i (task_i - base).
+#
+# Notes
+# - scaling_factor controls the contribution of the task delta.
+# - Model compatibility is required (matching parameter shapes).
+# =============================================================================
 _target_: fusion_bench.method.TaskArithmeticAlgorithm
 scaling_factor: 0.3
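
A minimal sketch of the update this configuration drives, shown on a single parameter tensor (the algorithm itself applies it across the full state dict):

```python
import torch

base = torch.randn(4, 4)                                    # pretrained weight
tasks = [base + 0.1 * torch.randn(4, 4) for _ in range(3)]  # fine-tuned weights

scaling_factor = 0.3  # the lambda above
merged = base + scaling_factor * sum(t - base for t in tasks)
```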
fusion_bench_config/method/ties_merging.yaml
CHANGED
@@ -1,3 +1,6 @@
+# =============================================================================
+# FusionBench Method Configuration: Ties Merging
+# =============================================================================
 _target_: fusion_bench.method.TiesMergingAlgorithm
 # Scaling factor $\lambda$
 scaling_factor: 0.3
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/WHEEL
File without changes
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/entry_points.txt
File without changes
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/licenses/LICENSE
File without changes
{fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/top_level.txt
File without changes