fusion-bench 0.2.25__py3-none-any.whl → 0.2.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
  Files changed (34)
  1. fusion_bench/method/__init__.py +2 -0
  2. fusion_bench/method/classification/clip_finetune.py +6 -4
  3. fusion_bench/method/dop/__init__.py +1 -0
  4. fusion_bench/method/dop/dop.py +366 -0
  5. fusion_bench/method/dop/min_norm_solvers.py +227 -0
  6. fusion_bench/method/dop/utils.py +73 -0
  7. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/METADATA +8 -2
  8. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/RECORD +34 -29
  9. fusion_bench_config/method/bitdelta/bitdelta.yaml +3 -0
  10. fusion_bench_config/method/depth_upscaling.yaml +9 -0
  11. fusion_bench_config/method/dop/dop.yaml +30 -0
  12. fusion_bench_config/method/dummy.yaml +6 -0
  13. fusion_bench_config/method/ensemble/max_model_predictor.yaml +6 -0
  14. fusion_bench_config/method/ensemble/simple_ensemble.yaml +8 -1
  15. fusion_bench_config/method/ensemble/weighted_ensemble.yaml +8 -0
  16. fusion_bench_config/method/linear/linear_interpolation.yaml +8 -0
  17. fusion_bench_config/method/linear/weighted_average.yaml +3 -0
  18. fusion_bench_config/method/linear/weighted_average_for_llama.yaml +1 -1
  19. fusion_bench_config/method/model_recombination.yaml +8 -0
  20. fusion_bench_config/method/model_stock/model_stock.yaml +4 -1
  21. fusion_bench_config/method/opcm/opcm.yaml +5 -0
  22. fusion_bench_config/method/opcm/task_arithmetic.yaml +6 -0
  23. fusion_bench_config/method/opcm/ties_merging.yaml +5 -0
  24. fusion_bench_config/method/opcm/weight_average.yaml +5 -0
  25. fusion_bench_config/method/simple_average.yaml +9 -0
  26. fusion_bench_config/method/slerp/slerp.yaml +9 -0
  27. fusion_bench_config/method/slerp/slerp_lm.yaml +5 -0
  28. fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +3 -0
  29. fusion_bench_config/method/task_arithmetic.yaml +9 -0
  30. fusion_bench_config/method/ties_merging.yaml +3 -0
  31. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/WHEEL +0 -0
  32. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/entry_points.txt +0 -0
  33. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/licenses/LICENSE +0 -0
  34. {fusion_bench-0.2.25.dist-info → fusion_bench-0.2.26.dist-info}/top_level.txt +0 -0
@@ -70,6 +70,7 @@ _import_structure = {
70
70
  "IsotropicMergingInCommonSubspace",
71
71
  ],
72
72
  "opcm": ["OPCMForCLIP"],
73
+ "dop": ["ContinualDOPForCLIP"],
73
74
  "gossip": [
74
75
  "CLIPLayerWiseGossipAlgorithm",
75
76
  "CLIPTaskWiseGossipAlgorithm",
@@ -212,6 +213,7 @@ if TYPE_CHECKING:
212
213
  from .model_recombination import ModelRecombinationAlgorithm
213
214
  from .model_stock import ModelStock
214
215
  from .opcm import OPCMForCLIP
216
+ from .dop import ContinualDOPForCLIP
215
217
  from .pruning import (
216
218
  MagnitudeDiffPruningAlgorithm,
217
219
  MagnitudePruningForLlama,
@@ -5,8 +5,8 @@ Fine-tune CLIP-ViT-B/32:
5
5
 
6
6
  ```bash
7
7
  fusion_bench \
8
- method=clip_finetune \
9
- modelpool=clip-vit-base-patch32_mtl \
8
+ method=classification/clip_finetune \
9
+ modelpool=CLIPVisionModelPool/clip-vit-base-patch32_mtl \
10
10
  taskpool=dummy
11
11
  ```
12
12
 
@@ -15,12 +15,14 @@ Fine-tune CLIP-ViT-L/14 on eight GPUs with a per-device per-task batch size of 2
15
15
  ```bash
16
16
  fusion_bench \
17
17
  fabric.devices=8 \
18
- method=clip_finetune \
18
+ method=classification/clip_finetune \
19
19
  method.batch_size=2 \
20
- modelpool=clip-vit-base-patch32_mtl \
20
+ modelpool=CLIPVisionModelPool/clip-vit-base-patch32_mtl \
21
21
  modelpool.models.0.path=openai/clip-vit-large-patch14 \
22
22
  taskpool=dummy
23
23
  ```
24
+
25
+ See `examples/clip_finetune` for more details.
24
26
  """
25
27
 
26
28
  import os
@@ -0,0 +1 @@
1
+ from .dop import ContinualDOPForCLIP
@@ -0,0 +1,366 @@
1
+ """
2
+ Continual Model Merging without Data: Dual Projections for Balancing Stability and Plasticity. NeurIPS, 2025.
3
+
4
+
5
+ Example:
6
+
7
+ fusion_bench \
8
+ method=dop/dop \
9
+ modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only \
10
+ taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TA8
11
+ """
12
+
13
+ import logging
14
+ import os
15
+ import random
16
+ from copy import deepcopy
17
+ from pathlib import Path
18
+ from typing import Dict, List, Literal, Optional, Tuple, cast
19
+
20
+ import lightning as L
21
+ import numpy as np
22
+ import torch
23
+ from omegaconf import DictConfig
24
+ from torch import Tensor, nn
25
+ from torch.autograd import Variable
26
+ from tqdm.auto import tqdm
27
+ from transformers import CLIPVisionModel
28
+
29
+ from fusion_bench import BaseAlgorithm, BaseModelPool, auto_register_config
30
+ from fusion_bench.method.simple_average import simple_average
31
+ from fusion_bench.mixins import LightningFabricMixin
32
+ from fusion_bench.taskpool import CLIPVisionModelTaskPool
33
+ from fusion_bench.utils import seed_everything_by_time
34
+ from fusion_bench.utils.json import save_to_json
35
+
36
+ from .min_norm_solvers import MinNormSolver, gradient_normalizers
37
+ from .utils import is_leaf_module, svd
38
+
39
+ log = logging.getLogger(__name__)
40
+
41
+
42
@auto_register_config
class ContinualDOPForCLIP(BaseAlgorithm, LightningFabricMixin):
    """Data-free Dual Orthogonal Projection (DOP) for continual merging of CLIP vision models.

    Models arrive sequentially and are merged one at a time. For every trainable
    linear layer, the merged weight is optimized so that its difference to each
    endpoint (the previously merged model and the incoming fine-tuned model)
    stays small inside that endpoint's approximate data subspace, obtained from
    a truncated SVD of the endpoint's task vector (finetuned - pretrained).
    The two objectives (stability vs. plasticity) are combined either with a
    fixed weight ``alpha`` or via MGDA (multiple-gradient descent) to obtain a
    Pareto-stationary update.

    Reference:
        Continual Model Merging without Data: Dual Projections for Balancing
        Stability and Plasticity. NeurIPS, 2025.
    """

    def __init__(
        self,
        seed: Optional[int] = None,
        shuffle_order: bool = False,
        save_on_every_step: bool = True,
        evaluate_on_every_step: bool = False,
        lr: float = 1e-4,
        num_steps: int = 200,
        mgda: bool = True,
        ema: bool = True,
        ema_beta: float = 0.99,
        alpha: Optional[float] = None,
        svd_epsilon: float = 1.0,
        svd_proj_space: str = "uv",
        **kwargs,
    ):
        """Initialize the DOP algorithm.

        Args:
            seed: Random seed. If ``None``, a time-based seed is used.
            shuffle_order: Whether to shuffle the order in which models are merged.
            save_on_every_step: Save the merged model after each merge step.
            evaluate_on_every_step: Evaluate the merged model after each merge step.
            lr: Learning rate of the per-layer Adam optimizer.
            num_steps: Number of optimization steps per linear layer.
            mgda: Use MGDA to weight the two objectives; otherwise use a fixed
                ``alpha`` / ``1 - alpha`` weighting.
            ema: Smooth the MGDA solution with an exponential moving average.
            ema_beta: EMA smoothing factor.
            alpha: Weight of the first (previously merged) objective, in [0, 1].
                Required (the ``None`` default is only a placeholder); also used
                to initialize the EMA of the MGDA solution.
            svd_epsilon: Fraction of the cumulative spectrum kept when truncating
                the SVD of each task vector, in [0, 1].
            svd_proj_space: Which projection losses to use: "u", "v", or "uv".
        """
        self.lr = lr
        self.num_steps = num_steps
        self.mgda = mgda
        self.ema = ema
        self.ema_beta = ema_beta
        self.alpha = alpha
        self.svd_epsilon = svd_epsilon
        self.svd_proj_space = svd_proj_space
        self.seed = seed
        self.shuffle_order = shuffle_order
        self.save_on_every_step = save_on_every_step
        self.evaluate_on_every_step = evaluate_on_every_step

        assert (
            self.svd_epsilon >= 0 and self.svd_epsilon <= 1
        ), "The svd_epsilon should be in the range of [0, 1]"
        # `alpha` is required: it is read by both the EMA initialization and the
        # non-MGDA branch. Check for None explicitly so that leaving it unset
        # fails with a clear AssertionError instead of a TypeError from
        # evaluating `None >= 0`.
        assert (
            self.alpha is not None and self.alpha >= 0 and self.alpha <= 1
        ), "The alpha should be in the range of [0, 1]"
        super().__init__(**kwargs)

    def print_params(self, pretrained_model):
        """Print parameter counts and the fraction held by linear layers."""
        total_params = 0
        linear_params = 0
        linear_weight_params = 0
        for module_name, module in pretrained_model.named_modules():
            if not is_leaf_module(module):
                continue
            if isinstance(module, nn.Linear):
                linear_params += sum(p.numel() for n, p in module.named_parameters())
                linear_weight_params += sum(
                    p.numel() for n, p in module.named_parameters() if "weight" in n
                )
            total_params += sum(p.numel() for p in module.parameters())

        linear_ratio = linear_params / total_params * 100
        linear_weight_ratio = linear_weight_params / total_params * 100
        print(f"Total Parameters: {total_params}")
        print(f"Linear Parameters: {linear_params}")
        print(f"Linear Weight Parameters: {linear_weight_params}")
        print(f"Linear Ratio: {linear_ratio:.2f}%")
        print(f"Linear Weight Ratio: {linear_weight_ratio:.2f}%")

    def run(self, modelpool: BaseModelPool):
        """Sequentially merge all models in ``modelpool``.

        Args:
            modelpool: Pool providing the pretrained model and the fine-tuned
                task models.

        Returns:
            nn.Module: The final merged model.
        """
        if self.seed is not None:
            L.seed_everything(self.seed)
        else:
            seed_everything_by_time(self.fabric)

        # Get the model names, shuffle if needed. Work on a copy so shuffling
        # does not mutate the modelpool's own list. The order is saved to the
        # log directory as `model_names.json`.
        model_names = list(modelpool.model_names)
        if self.shuffle_order:
            random.shuffle(model_names)
        if self.log_dir is not None:
            save_to_json(model_names, os.path.join(self.log_dir, "model_names.json"))

        if self.evaluate_on_every_step:
            # Configuration for the test datasets
            self.taskpool = cast(CLIPVisionModelTaskPool, self._program.taskpool)
            self._test_datasets = deepcopy(self.taskpool._test_datasets)

        pretrained_model = modelpool.load_pretrained_model()

        merged_model = None
        for model_idx, model_name in enumerate(model_names):
            print(
                f"--------- Optimizing {model_idx + 1}/{len(model_names)}-th with {model_name} ---------"
            )
            if model_idx == 0:
                # Nothing to merge yet; the first model is taken as-is.
                merged_model = modelpool.load_model(model_names[0])
            else:
                merged_model = self._layer_wise_optimize(
                    model_names=["merged", model_name],
                    pretrained_model=deepcopy(pretrained_model),
                    finetuned_models={
                        "merged": merged_model,
                        model_name: modelpool.load_model(model_name),
                    },
                    model_idx=model_idx,
                )

            if self.save_on_every_step:
                self.save_merged_model(merged_model, model_idx)

            if self.evaluate_on_every_step:
                # Re-run taskpool setup restricted to the tasks seen so far.
                self.taskpool._is_setup = False
                self.taskpool._test_datasets = DictConfig(
                    {n: self._test_datasets[n] for n in model_names[: model_idx + 1]}
                )
                report = self.taskpool.evaluate(deepcopy(merged_model))
                save_to_json(report, Path(self.log_dir) / f"report_{model_idx}.json")

        return merged_model

    def _layer_wise_optimize(
        self,
        model_names: List[str],
        pretrained_model: nn.Module,
        finetuned_models: Dict[str, nn.Module],
        model_idx: int,
    ):
        """Merge two models into ``pretrained_model`` layer by layer.

        Trainable linear weights are merged by :meth:`_optimize_weight`; all
        other leaf modules (and frozen linear weights / biases) are simply
        averaged.

        Args:
            model_names: Keys into ``finetuned_models`` ("merged" and the name
                of the incoming model).
            pretrained_model: A copy of the pretrained model; overwritten in
                place with the merged parameters and returned.
            finetuned_models: The two models to merge.
            model_idx: Index of the current merge step.

        Returns:
            nn.Module: ``pretrained_model`` with merged parameters.
        """
        import time  # hoisted out of the per-layer loop; only used for timing

        time_cost = []
        for module_name, module in pretrained_model.named_modules():
            if not is_leaf_module(module):
                continue

            if isinstance(module, nn.Linear):
                if module.weight.requires_grad:
                    start_time = time.time()
                    merged_weight = self._optimize_weight(
                        module.weight,
                        {
                            model_name: finetuned_models[model_name]
                            .get_submodule(module_name)
                            .weight
                            for model_name in model_names
                        },
                        module_name,
                        model_idx,
                    )
                    end_time = time.time()
                    time_cost.append(end_time - start_time)
                    module.weight.data = merged_weight.data
                else:
                    module.weight.data = simple_average(
                        [
                            finetuned_models[model_name]
                            .get_submodule(module_name)
                            .weight
                            for model_name in model_names
                        ]
                    )
                # Biases are not part of the SVD objective; average them
                # regardless of whether the weight was optimized or averaged.
                if module.bias is not None:
                    module.bias.data = simple_average(
                        [
                            finetuned_models[model_name].get_submodule(module_name).bias
                            for model_name in model_names
                        ]
                    )
            else:
                # Non-linear leaf modules (LayerNorm, embeddings, ...) are
                # averaged in place into the corresponding pretrained module.
                simple_average(
                    [
                        finetuned_models[model_name].get_submodule(module_name)
                        for model_name in model_names
                    ],
                    base_module=module,
                )

        return pretrained_model

    def _optimize_weight(
        self,
        pretrained_weight: Tensor,
        finetuned_weights: Dict[str, Tensor],
        module_name: str,
        model_idx: int,
    ):
        """Optimize the merged weight of a single linear layer.

        The merged weight starts from the simple average of the two endpoint
        weights and is optimized so that its difference to each endpoint has a
        small projection onto that endpoint's task subspace (the leading SVD
        components of ``finetuned - pretrained``).

        Args:
            pretrained_weight: Weight of the pretrained model's layer.
            finetuned_weights: The two endpoint weights, keyed by model name.
            module_name: Name of the layer (for progress logging).
            model_idx: Index of the current merge step.

        Returns:
            Tensor: The optimized merged weight, detached and on CPU.
        """
        assert (
            self.fabric.world_size == 1
        ), "This algorithm is not currently supported in distributed training"

        pretrained_weight = self.fabric.to_device(pretrained_weight.detach())
        finetuned_weights = {
            model_name: self.fabric.to_device(finetuned_weight.detach())
            for model_name, finetuned_weight in finetuned_weights.items()
        }

        # Initialize the merged weight at the average of the two endpoints.
        merged_weight = self.fabric.to_device(
            nn.Parameter(
                simple_average(
                    [
                        finetuned_weight.detach()
                        for finetuned_weight in finetuned_weights.values()
                    ]
                ),
                requires_grad=True,
            )
        )

        # Compute SVD of the difference between the finetuned and pretrained
        # weights; the kept singular directions span the endpoint's approximate
        # data subspace.
        proj_u_dict = {}
        proj_v_dict = {}
        proj_s_dict = {}
        for i, finetuned_weight in enumerate(finetuned_weights.values()):
            finetuned_tv = finetuned_weight - pretrained_weight
            u, s, v = svd(finetuned_tv, full_matrices=True)
            epsilon = 1.0 if self.svd_epsilon > 1.0 else self.svd_epsilon
            cumsum_ratio = s.cumsum(dim=0) / s.sum()
            # NOTE(review): searchsorted returns the first index whose
            # cumulative ratio reaches `epsilon`, and that index itself is
            # excluded from the kept components — confirm this boundary is
            # intended.
            split_rank = torch.searchsorted(cumsum_ratio, epsilon).item()
            u_main = u[:, :split_rank]
            v_main = v[:, :split_rank]
            s_main = s[:split_rank]
            proj_u_dict[i] = u_main
            proj_v_dict[i] = v_main
            proj_s_dict[i] = s_main

        if self.mgda:
            if self.ema:
                # EMA of the MGDA weights, initialized from `alpha`.
                ema_sol = [self.alpha, 1 - self.alpha]
            # This is multiple-gradient descent algorithm (MGDA) optimization
            optimizer = torch.optim.Adam([merged_weight], lr=self.lr)
            all_losses = [[], []]  # per-objective loss history (for inspection)
            all_alphas = [[], []]  # per-objective MGDA weight history
            for step_idx in tqdm(
                range(self.num_steps), desc=f"Optimizing {module_name} weight"
            ):
                # Compute each objective and its gradient separately.
                loss_data = {}
                grads = {}
                for i, finetuned_weight in enumerate(finetuned_weights.values()):
                    proj_u = proj_u_dict[i]
                    proj_v = proj_v_dict[i]
                    proj_s = proj_s_dict[i]
                    delta_tv = merged_weight - finetuned_weight
                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
                    loss_data[i] = float(loss_i.data)

                    all_losses[i].append(float(loss_i.data))

                    optimizer.zero_grad()
                    loss_i.backward()
                    # `torch.autograd.Variable` is deprecated; a detached clone
                    # is the modern equivalent of Variable(..., requires_grad=False).
                    grads[i] = merged_weight.grad.detach().clone()

                # Normalize all gradients
                gn = gradient_normalizers(
                    grads=grads, losses=loss_data, normalization_type="loss"
                )
                for i, _ in enumerate(finetuned_weights.values()):
                    grads[i] = grads[i] / float(gn[i])

                # Frank-Wolfe iteration to compute scales.
                sol, min_norm = MinNormSolver.find_min_norm_element(
                    [[grads[i]] for i in range(len(finetuned_weights.values()))]
                )

                if self.ema:
                    ema_sol = [
                        self.ema_beta * ema_sol[i] + (1 - self.ema_beta) * float(sol[i])
                        for i in range(len(sol))
                    ]
                    sol = ema_sol
                    all_alphas[0].append(ema_sol[0])
                    all_alphas[1].append(ema_sol[1])

                # Scaled back-propagation with the MGDA weights.
                loss = 0
                for i, finetuned_weight in enumerate(finetuned_weights.values()):
                    proj_u = proj_u_dict[i]
                    proj_v = proj_v_dict[i]
                    proj_s = proj_s_dict[i]
                    delta_tv = merged_weight - finetuned_weight
                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
                    loss += float(sol[i]) * loss_i

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        else:
            # Naive fixed weighting: `alpha` for the previously merged model,
            # `1 - alpha` for the incoming model.
            optimizer = torch.optim.Adam([merged_weight], lr=self.lr)
            for step_idx in tqdm(
                range(self.num_steps), desc=f"Optimizing {module_name} weight"
            ):
                loss = 0
                for i, finetuned_weight in enumerate(finetuned_weights.values()):
                    proj_u = proj_u_dict[i]
                    proj_v = proj_v_dict[i]
                    proj_s = proj_s_dict[i]
                    delta_tv = merged_weight - finetuned_weight
                    loss_i = self.cal_loss_i(delta_tv, proj_s, proj_u, proj_v)
                    loss += self.alpha * loss_i if i == 0 else (1 - self.alpha) * loss_i

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        return merged_weight.detach().cpu()

    def cal_loss_i(self, delta_tv, proj_s, proj_u, proj_v):
        """Projection loss of ``delta_tv`` onto one task's SVD subspace.

        Projects the weight difference onto the left (``u``) and/or right
        (``v``) singular subspaces, scaled by the singular values, and returns
        the squared Frobenius norm of the projection(s) selected by
        ``svd_proj_space``.

        Raises:
            ValueError: If ``svd_proj_space`` is not "u", "v", or "uv".
        """
        proj_delta_1 = torch.diag(proj_s) @ proj_u.T @ delta_tv
        proj_delta_2 = delta_tv @ proj_v @ torch.diag(proj_s)
        loss_i_u = torch.linalg.matrix_norm(proj_delta_1, ord="fro") ** 2
        loss_i_v = torch.linalg.matrix_norm(proj_delta_2, ord="fro") ** 2
        if self.svd_proj_space == "uv":
            loss_i = loss_i_u + loss_i_v
        elif self.svd_proj_space == "u":
            loss_i = loss_i_u
        elif self.svd_proj_space == "v":
            loss_i = loss_i_v
        else:
            raise ValueError("Invalid svd_proj_space")

        return loss_i

    def save_merged_model(self, merged_model: CLIPVisionModel, step: int):
        """Save the merged model to ``<log_dir>/checkpoints/merged_model_<step>``."""
        os.makedirs(Path(self.log_dir) / "checkpoints", exist_ok=True)
        merged_model.save_pretrained(
            Path(self.log_dir) / "checkpoints" / f"merged_model_{step}"
        )
@@ -0,0 +1,227 @@
1
+ # This code is from
2
+ # Multi-Task Learning as Multi-Objective Optimization
3
+ # Ozan Sener, Vladlen Koltun
4
+ # Neural Information Processing Systems (NeurIPS) 2018
5
+ # https://github.com/intel-isl/MultiObjectiveOptimization
6
+ from typing import Union
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
def np_sum(x: Union[torch.Tensor, np.ndarray]) -> float:
    """Sum all elements of *x*; accepts either a torch tensor or a numpy array."""
    if torch.is_tensor(x):
        return x.sum().item()
    return np.sum(x)
16
+
17
+
18
def to_numpy(x: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
    """Return *x* as a numpy array, detaching and moving torch tensors to CPU."""
    if not isinstance(x, torch.Tensor):
        return x
    return x.detach().cpu().numpy()
22
+
23
+
24
class MinNormSolver:
    """Find the minimum-norm element in the convex hull of a set of vectors.

    Ported from: Ozan Sener, Vladlen Koltun. "Multi-Task Learning as
    Multi-Objective Optimization", NeurIPS 2018.
    https://github.com/intel-isl/MultiObjectiveOptimization

    Fixes over the upstream code: the iteration counter is actually
    incremented (the upstream ``while`` loops could spin forever when the
    stopping criterion was never met), and the last iterate is returned when
    ``MAX_ITER`` is exhausted (upstream implicitly returned ``None``).
    """

    MAX_ITER = 250
    STOP_CRIT = 1e-5

    @staticmethod
    def _min_norm_element_from2(v1v1, v1v2, v2v2):
        """
        Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2
        d is the distance (objective) optimized
        v1v1 = <x1,x1>
        v1v2 = <x1,x2>
        v2v2 = <x2,x2>
        """
        if v1v2 >= v1v1:
            # Case: Fig 1, third column
            gamma = 0.999
            cost = v1v1
            return gamma, cost
        if v1v2 >= v2v2:
            # Case: Fig 1, first column
            gamma = 0.001
            cost = v2v2
            return gamma, cost
        # Case: Fig 1, second column
        gamma = -1.0 * ((v1v2 - v2v2) / (v1v1 + v2v2 - 2 * v1v2))
        cost = v2v2 + gamma * (v1v2 - v2v2)
        return gamma, cost

    @staticmethod
    def _min_norm_2d(vecs, dps):
        R"""
        Find the minimum norm solution as combination of two points
        This is correct only in 2D
        ie. min_c |\sum c_i x_i|_2^2 st. \sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j

        Also fills ``dps`` with all pairwise inner products <vecs[i], vecs[j]>.
        """
        dmin = 1e8
        for i in range(len(vecs)):
            for j in range(i + 1, len(vecs)):
                if (i, j) not in dps:
                    dps[(i, j)] = 0.0
                    for k in range(len(vecs[i])):
                        dps[(i, j)] += (
                            torch.mul(vecs[i][k], vecs[j][k]).sum().data.cpu()
                        )
                    dps[(j, i)] = dps[(i, j)]
                if (i, i) not in dps:
                    dps[(i, i)] = 0.0
                    for k in range(len(vecs[i])):
                        dps[(i, i)] += (
                            torch.mul(vecs[i][k], vecs[i][k]).sum().data.cpu()
                        )
                if (j, j) not in dps:
                    dps[(j, j)] = 0.0
                    # bug fix: upstream iterated over len(vecs[i]) here, which
                    # is only correct when all vectors have the same number of
                    # parts.
                    for k in range(len(vecs[j])):
                        dps[(j, j)] += (
                            torch.mul(vecs[j][k], vecs[j][k]).sum().data.cpu()
                        )
                c, d = MinNormSolver._min_norm_element_from2(
                    dps[(i, i)], dps[(i, j)], dps[(j, j)]
                )
                if d < dmin:
                    dmin = d
                    sol = [(i, j), c, d]
        return sol, dps

    @staticmethod
    def _projection2simplex(y):
        R"""
        Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i
        """
        m = len(y)
        sorted_y = np.flip(np.sort(y), axis=0)
        tmpsum = 0.0
        tmax_f = (np.sum(y) - 1.0) / m
        for i in range(m - 1):
            tmpsum += sorted_y[i]
            tmax = (tmpsum - 1) / (i + 1.0)
            if tmax > sorted_y[i + 1]:
                tmax_f = tmax
                break
        return np.maximum(y - tmax_f, np.zeros(y.shape))

    @staticmethod
    def _next_point(cur_val, grad, n):
        """One projected-gradient step from ``cur_val`` along ``grad``, projected back onto the simplex."""
        proj_grad = grad - (np.sum(grad) / n)
        tm1 = -1.0 * cur_val[proj_grad < 0] / proj_grad[proj_grad < 0]
        tm2 = (1.0 - cur_val[proj_grad > 0]) / (proj_grad[proj_grad > 0])

        # step size: largest feasible step along proj_grad within the simplex
        t = 1
        if len(tm1[tm1 > 1e-7]) > 0:
            t = np.min(to_numpy(tm1[tm1 > 1e-7]))
        if len(tm2[tm2 > 1e-7]) > 0:
            t = min(t, np.min(to_numpy(tm2[tm2 > 1e-7])))

        next_point = proj_grad * t + to_numpy(cur_val)
        next_point = MinNormSolver._projection2simplex(next_point)
        return next_point

    @staticmethod
    def find_min_norm_element(vecs):
        R"""
        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
        Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence
        """
        # Solution lying at the combination of two points
        dps = {}
        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)

        n = len(vecs)
        sol_vec = np.zeros(n)
        sol_vec[init_sol[0][0]] = init_sol[1]
        sol_vec[init_sol[0][1]] = 1 - init_sol[1]

        if n < 3:
            # This is optimal for n=2, so return the solution
            return sol_vec, init_sol[2]

        iter_count = 0

        grad_mat = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                grad_mat[i, j] = dps[(i, j)]

        while iter_count < MinNormSolver.MAX_ITER:
            iter_count += 1  # bug fix: upstream never incremented the counter
            grad_dir = -1.0 * np.dot(grad_mat, sol_vec)
            new_point = MinNormSolver._next_point(sol_vec, grad_dir, n)
            # Re-compute the inner products for line search
            v1v1 = 0.0
            v1v2 = 0.0
            v2v2 = 0.0
            for i in range(n):
                for j in range(n):
                    v1v1 += sol_vec[i] * sol_vec[j] * dps[(i, j)]
                    v1v2 += sol_vec[i] * new_point[j] * dps[(i, j)]
                    v2v2 += new_point[i] * new_point[j] * dps[(i, j)]
            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
            new_sol_vec = nc * sol_vec + (1 - nc) * new_point
            change = new_sol_vec - sol_vec
            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
                return sol_vec, nd
            sol_vec = new_sol_vec
        # bug fix: return the last iterate when MAX_ITER is exhausted instead
        # of implicitly returning None
        return sol_vec, nd

    @staticmethod
    def find_min_norm_element_FW(vecs):
        R"""
        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
        Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence
        """
        # Solution lying at the combination of two points
        dps = {}
        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)

        n = len(vecs)
        sol_vec = np.zeros(n)
        sol_vec[init_sol[0][0]] = init_sol[1]
        sol_vec[init_sol[0][1]] = 1 - init_sol[1]

        if n < 3:
            # This is optimal for n=2, so return the solution
            return sol_vec, init_sol[2]

        iter_count = 0

        grad_mat = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                grad_mat[i, j] = dps[(i, j)]

        while iter_count < MinNormSolver.MAX_ITER:
            iter_count += 1  # bug fix: upstream never incremented the counter
            t_iter = np.argmin(np.dot(grad_mat, sol_vec))

            v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec))
            v1v2 = np.dot(sol_vec, grad_mat[:, t_iter])
            v2v2 = grad_mat[t_iter, t_iter]

            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
            new_sol_vec = nc * sol_vec
            new_sol_vec[t_iter] += 1 - nc

            change = new_sol_vec - sol_vec
            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
                return sol_vec, nd
            sol_vec = new_sol_vec
        # bug fix: return the last iterate when MAX_ITER is exhausted instead
        # of implicitly returning None
        return sol_vec, nd
207
+
208
+
209
def gradient_normalizers(grads, losses, normalization_type):
    """Compute per-task gradient normalization constants.

    Args:
        grads: Mapping from task key to a list of gradient tensors.
        losses: Mapping from task key to that task's scalar loss value.
        normalization_type: One of "l2", "loss", "loss+", or "none".

    Returns:
        dict: Mapping from task key to its normalization constant.

    Raises:
        ValueError: If ``normalization_type`` is not recognized. (The original
            implementation only printed an error and returned an empty dict,
            which surfaced later as an unrelated KeyError at the call site.)
    """
    gn = {}
    if normalization_type == "l2":
        for t in grads:
            gn[t] = np.sqrt(np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]]))
    elif normalization_type == "loss":
        for t in grads:
            gn[t] = losses[t]
    elif normalization_type == "loss+":
        for t in grads:
            gn[t] = losses[t] * np.sqrt(
                np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]])
            )
    elif normalization_type == "none":
        for t in grads:
            gn[t] = 1.0
    else:
        raise ValueError(f"Invalid normalization type: {normalization_type!r}")
    return gn
@@ -0,0 +1,73 @@
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ from torch import Tensor, nn
5
+
6
+ from fusion_bench.utils.parameters import state_dict_to_vector
7
+ from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
8
+
9
+
10
def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Perform Singular Value Decomposition (SVD) on a tensor.

    Args:
        w (Tensor): The input tensor.
        full_matrices (bool): Whether to compute the full-sized U and V matrices.

    Returns:
        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
        Note that V (not V^H) is returned, i.e. ``w ≈ U @ diag(S) @ V.T``.
    """
    # The "gesvd" driver is only available for CUDA inputs.
    u, s, vh = torch.linalg.svd(
        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
    )
    v = vh.T
    return u, s, v


def svd(
    w: Tensor, full_matrices=True, accelerator=None
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Perform SVD on a tensor, optionally using a specified accelerator.

    Args:
        w (Tensor): The input tensor.
        full_matrices (bool): Whether to compute the full-sized U and V matrices.
        accelerator (str): The device to perform the computation on. If None,
            the computation happens on ``w``'s current device.

    Returns:
        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD,
        moved back to ``w``'s original device.
    """
    if accelerator is None:
        return _svd(w, full_matrices=full_matrices)
    original_device = w.device
    w = w.to(accelerator)
    # Bug fix: `full_matrices` was previously dropped on this path, so the
    # accelerator branch always computed a full SVD regardless of the argument.
    u, s, v = _svd(w, full_matrices=full_matrices)
    return u.to(original_device), s.to(original_device), v.to(original_device)
48
+
49
+
50
def frobenius_inner_product(w1: Tensor, w2: Tensor) -> Tensor:
    """Frobenius inner product <w1, w2>_F = trace(w1^T @ w2)."""
    product = w1.T @ w2
    return torch.trace(product)
52
+
53
+
54
def is_leaf_module(module: nn.Module) -> bool:
    """Return True if *module* has no child modules."""
    for _ in module.children():
        return False
    return True
56
+
57
+
58
def get_task_vector_norm(model: nn.Module, pretrained_model: nn.Module) -> Tensor:
    """
    Get the vector norm of the task model.

    Computes the L2 norm of the task vector, i.e. the parameter-wise
    difference between the task model and the pretrained model, flattened
    into a single vector.

    Args:
        model (nn.Module): The task model.
        pretrained_model (nn.Module): The pretrained model.

    Returns:
        Tensor: The vector norm of the task model.
    """
    task_vector = state_dict_sub(model.state_dict(), pretrained_model.state_dict())
    flat_task_vector = state_dict_to_vector(task_vector)
    return torch.linalg.norm(flat_task_vector)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fusion_bench
3
- Version: 0.2.25
3
+ Version: 0.2.26
4
4
  Summary: A Comprehensive Benchmark of Deep Model Fusion
5
5
  Author-email: Anke Tang <tang.anke@foxmail.com>
6
6
  Project-URL: Repository, https://github.com/tanganke/fusion_bench
@@ -82,7 +82,13 @@ Model merging has emerged as a promising approach for multi-task learning (MTL),
82
82
  </details>
83
83
 
84
84
  <details>
85
- <summary>Anke Tang, et al. Merging Models on the Fly Without Retraining: A Sequential Approach to Scalable Continual Model Merging. Jan 2025. https://arxiv.org/pdf/2501.09522</summary>
85
+ <summary>Enneng Yang, et al. Continual Model Merging without Data: Dual Projections for Balancing Stability and Plasticity. NeurIPS 2025. https://github.com/EnnengYang/DOP</summary>
86
+
87
+ Model merging integrates multiple expert models with diverse capabilities into a unified framework, facilitating collaborative learning. However, most existing methods assume simultaneous access to all models, which is often impractical in real-world scenarios where models are received sequentially. While some studies have investigated continual model merging (CMM)--which involves sequentially merging multiple models--the challenge of balancing prior knowledge (stability) and incorporating new tasks (plasticity) remains unresolved. This paper, for the first time, formally defines the stability and plasticity of CMM from the perspective of orthogonal projection. Subsequently, we analyze the relationships among the spaces spanned by task data, historical gradients, and accumulated gradients. Building on this, we propose a data-free Dual Orthogonal Projection (DOP) method, which eliminates data dependence and mitigates interference between the merged model and models for old and new tasks by projecting their parameter differences onto their respective approximate data spaces. Finally, to solve potential conflicts between stability and plasticity, we reformulate DOP as a multi-objective optimization problem and employ a multi-gradient descent algorithm to obtain a Pareto-optimal solution. Extensive experiments across multiple architectures and task configurations validate that our approach significantly outperforms state-of-the-art CMM methods.
88
+ </details>
89
+
90
+ <details>
91
+ <summary>Anke Tang, et al. Merging Models on the Fly Without Retraining: A Sequential Approach to Scalable Continual Model Merging. NeurIPS 2025. Jan 2025. https://arxiv.org/pdf/2501.09522</summary>
86
92
 
87
93
  Deep model merging represents an emerging research direction that combines multiple fine-tuned models to harness their specialized capabilities across different tasks and domains. Current model merging techniques focus on merging all available models simultaneously, with weight interpolation-based methods being the predominant approaches. However, these conventional approaches are not well-suited for scenarios where models become available sequentially, and they often suffer from high memory requirements and potential interference between tasks. In this study, we propose a training-free projection-based continual merging method that processes models sequentially through orthogonal projections of weight matrices and adaptive scaling mechanisms. Our method operates by projecting new parameter updates onto subspaces orthogonal to existing merged parameter updates while using an adaptive scaling mechanism to maintain stable parameter distances, enabling efficient sequential integration of task-specific knowledge. Our approach maintains constant memory complexity to the number of models, minimizes interference between tasks through orthogonal projections, and retains the performance of previously merged models through adaptive task vector scaling. Extensive experiments on CLIP-ViT models demonstrate that our method achieves a 5-8% average accuracy improvement while maintaining robust performance in different task orderings.
88
94
  </details>
@@ -48,7 +48,7 @@ fusion_bench/dataset/llama/stanford_shp.py,sha256=6ueXKnFXIBBobacU1h5WxGLZrSOtBk
48
48
  fusion_bench/dataset/llama/ultrachat.py,sha256=Go7WvrDAYnm184fdazHGRYLbSY6Xd7jrESyQeUJtOww,1736
49
49
  fusion_bench/dataset/llama/wikitext.py,sha256=9ZHR-nMfXRumd3o-PIj3n7B83YlVeqpGkZ2zJs2B-9Y,2883
50
50
  fusion_bench/dataset/llama/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
- fusion_bench/method/__init__.py,sha256=QOcRQ3AmUpSiDikH1tq-EcxakX7akFPOizcynTLmUwQ,9377
51
+ fusion_bench/method/__init__.py,sha256=-r9Sz5haSc5X4eNvxGvHwowQfS2sLfZ21orUC5ae7ws,9454
52
52
  fusion_bench/method/base_algorithm.py,sha256=OnKSNPQ_nIdIWxryyblW_sko7uoEBN4lGh-eLkJ4kh4,9004
53
53
  fusion_bench/method/dummy.py,sha256=hb1y6LR_geRZ5eRgGwt5zJUcHYorCeIbs5i76CvurUc,1031
54
54
  fusion_bench/method/ensemble.py,sha256=Bjzqxt-tUp5cawT1jIhqKswN5QH3bkYbmuI4LS4uTG0,3619
@@ -77,7 +77,7 @@ fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py,sha256=zC0w5cw
77
77
  fusion_bench/method/bitdelta/bitdelta_utils/data.py,sha256=LGEgv8o8glyyLLYh6Ur5h_sulxPFmy6i-xi-Ap1G-Wc,1052
78
78
  fusion_bench/method/bitdelta/bitdelta_utils/diff.py,sha256=o3ib5sgGDYLgnL8YTfX0YDc4Md6W9_gb03jzftTn5s4,4075
79
79
  fusion_bench/method/classification/__init__.py,sha256=byVJ574JQ_DUvsDv8S6ZM6BKAv4ZZ964Ej4btm0aC7k,867
80
- fusion_bench/method/classification/clip_finetune.py,sha256=QNOw9O-BTOVOsW7lzRu8L-UfbiBpsT_8tS6i6BpbVyA,15726
80
+ fusion_bench/method/classification/clip_finetune.py,sha256=5q5Sr3eVVh8DfYdeSoGjwaKDksC8F2dY2r8Dl-wRaDg,15844
81
81
  fusion_bench/method/classification/continual_clip_finetune.py,sha256=OLhZKS-6aCnafevZkZYcNMKTWDDj3DATB27eZl_i8EY,11530
82
82
  fusion_bench/method/classification/image_classification_finetune.py,sha256=CPMpZvaULWaim01EvJJHlU4C6HQ16OCqZGoMvPBEWtY,8157
83
83
  fusion_bench/method/concrete_subspace/__init__.py,sha256=jJoFcjnQe-jvccsm9DuCXna378m9XBT9vV1fEZbdfR0,464
@@ -101,6 +101,10 @@ fusion_bench/method/doge_ta/__init__.py,sha256=dixO0i5fmhgC_W2_DAQ4PzYnkMCZX5D8t
101
101
  fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py,sha256=4WPG2fhFw-u6oSoT-fBrP2K9YpX-MH-AotBL1DknfpA,1304
102
102
  fusion_bench/method/doge_ta/doge_ta.py,sha256=jrJF52JUBdrB3EGWaXJMFZE-v8syzZGr4smG6rEO74c,13790
103
103
  fusion_bench/method/doge_ta/layer_wise_adamerging.py,sha256=rLk3Nep5d6wMUNCp6q7pC7L0pfBvUwGBIuiGM7CQOf4,9780
104
+ fusion_bench/method/dop/__init__.py,sha256=MD8c44ovLLJX_-v9t2SdLrvKLxVf8PijzFFNjJfvhpE,37
105
+ fusion_bench/method/dop/dop.py,sha256=_wNjN1DSK27aKEyWVay61fqc7prwJ1uiv_3618_bQ20,14160
106
+ fusion_bench/method/dop/min_norm_solvers.py,sha256=a7n2X0BE_YajlaUygyHV0yqW6-x5dTyZ5V0mt_Q69qE,8291
107
+ fusion_bench/method/dop/utils.py,sha256=_q7yy3ENNFUh1qUd5J5DThRL4J1tIxEcknCO2AKmeYM,2102
104
108
  fusion_bench/method/expert_sparsity/__init__.py,sha256=nt7k5cKqA2Bax1aM93ODwsEuibZ_hdFgQsUos_8h2v8,271
105
109
  fusion_bench/method/expert_sparsity/mixtral/__init__.py,sha256=FyKDZIyYUnqvGIdJ5BS639UpzSBj11g28ATHs1Yczdk,545
106
110
  fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py,sha256=zZa4IAKimFZMoxoQ_Oi7z2R9o5H6kxV2QTb0e-t9kDY,5665
@@ -480,7 +484,7 @@ fusion_bench/utils/plot/token_notebook.py,sha256=bsntXf46Zz_RavTxNiB9c3-KvHw7LFw
480
484
  fusion_bench/utils/strenum/__init__.py,sha256=id9ORi1uXrDxhbmVxitJ1KDwLS4H3AAwFpaK5h1cQzw,8531
481
485
  fusion_bench/utils/strenum/_name_mangler.py,sha256=o11M5-bURW2RBvRTYXFQIPNeqLzburdoWLIqk8X3ydw,3397
482
486
  fusion_bench/utils/strenum/_version.py,sha256=6JQRo9LcvODbCOeVFYQb9HNJ_J9XiG_Zbn8ws2A3BV8,18466
483
- fusion_bench-0.2.25.dist-info/licenses/LICENSE,sha256=nhnOJlw4CPuPVE0qvkGmxfFgHmKi-6nzXvTu8t0NUdg,1066
487
+ fusion_bench-0.2.26.dist-info/licenses/LICENSE,sha256=nhnOJlw4CPuPVE0qvkGmxfFgHmKi-6nzXvTu8t0NUdg,1066
484
488
  fusion_bench_config/README.md,sha256=Lc8YSBJ5oxf9KV5kKDivJ9LRyGuraGQPmBbgbdVA-j4,703
485
489
  fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml,sha256=7IxLQoLRz-sRWyV8Vqc5kQcmYE_9YQz2_77pmvAkum8,1207
486
490
  fusion_bench_config/fabric_model_fusion.yaml,sha256=U8BxsaOvsg9bsEZcIpBE-feo9n9G7Y1kQDHqPVxUYAg,2601
@@ -599,14 +603,14 @@ fusion_bench_config/hydra/default.yaml,sha256=Fpi3pV1hqPoPk5QdBncse6NlNOAl2YHzD4
599
603
  fusion_bench_config/hydra/help/fusion_bench_help.yaml,sha256=v8s891Cr5wyxBXGDn_VBBwwRmb0JXOL874Sl-zNoCWA,1880
600
604
  fusion_bench_config/hydra/job_logging/rich_logging.yaml,sha256=_dYGeFTCqaPrRowLXBNMXwzYhw8ns1TkQFfALwK1aCw,441
601
605
  fusion_bench_config/method/clip_finetune.yaml,sha256=yWjcdKYaKvy53sGaygg2ElAjb9-YFCyCGE1s9aB_dPM,677
602
- fusion_bench_config/method/depth_upscaling.yaml,sha256=m2XUK8Znf8nnaPKMNH7Un19DQXJlFwpaHE02UId1nxY,632
603
- fusion_bench_config/method/dummy.yaml,sha256=5qs6OuIfriKOH7FgqvcMXMUoRLmXDZmjA4irpAsc5xo,45
606
+ fusion_bench_config/method/depth_upscaling.yaml,sha256=86YqczaMzZftymLy_k2cb-GMy4C42yTxxP4c4htZTBs,1230
607
+ fusion_bench_config/method/dummy.yaml,sha256=Pw2w6WQiw3K4_KH0THPs4NSM7lZoZLsNbB72iPSVsl8,427
604
608
  fusion_bench_config/method/mixtral_moe_merging.yaml,sha256=AdVhXD6Crw-B3QyNpP4ToHRSg-EeSCIGtazA7lQvPOU,148
605
609
  fusion_bench_config/method/mixtral_moe_upscaling.yaml,sha256=wYDRnWOpZ6SgvL2Fm9wIDomrN2x5Jaq5vg1hjh3druk,210
606
- fusion_bench_config/method/model_recombination.yaml,sha256=RGb4boklWcN_GnI-bB5o_qr5o5vGgfIfi_EnTqQ_qcA,195
607
- fusion_bench_config/method/simple_average.yaml,sha256=GtMNvt0-qWOevRX2V6fjiYUO2BwDvMw-EcxRMS_PhZQ,53
608
- fusion_bench_config/method/task_arithmetic.yaml,sha256=hqkbc8kbzEFPFmBIKbf-6-vT2ZsBYxhhlP7ZmNT13PM,74
609
- fusion_bench_config/method/ties_merging.yaml,sha256=0lsy-q-9SNY5xzfoAOFpeva2AqdwcbLwMxb0ZtTU2PA,292
610
+ fusion_bench_config/method/model_recombination.yaml,sha256=DeyVPdDCL-eyJDlPZXLAIWfKi3p8nN0dLFRx5ydsERc,740
611
+ fusion_bench_config/method/simple_average.yaml,sha256=uB51mNlFKb9S3Go1p6SLGgr3PWJFZs97Ccn1zZZkEug,577
612
+ fusion_bench_config/method/task_arithmetic.yaml,sha256=zQmNpnQrZTHiRv_KmYnHPMScKf8MUMLbQYh9254_1Jg,580
613
+ fusion_bench_config/method/ties_merging.yaml,sha256=c3BjnFo-ZU5hmCrfi-1VQPhd_EYGtftxxYDHTVCMy6s,501
610
614
  fusion_bench_config/method/ada_svd/clip_vision.yaml,sha256=3l0VKCL66rZNx020UKhf_UzXScZ5XZYOUeNm8mqo0So,183
611
615
  fusion_bench_config/method/adamerging/clip.yaml,sha256=NBJaK0a4RxV3D2LRciUeWmTqabRwu6OxZnT7u7iz6ug,753
612
616
  fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml,sha256=DxkZhcuu_-ErIUqBUmWKN5UXYYWKoKPX6IgjV-Txwv0,541
@@ -614,7 +618,7 @@ fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml,sha256=bLz6zc5CofeUO2
614
618
  fusion_bench_config/method/adamerging/llama_sft.yaml,sha256=khKzfhvQ5oxBMH0d-YvyjN-qIgQNeevDodXngS5g9KY,1022
615
619
  fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml,sha256=hxVA4deUr1go1RZl12qD8PekwydWJ9SBQowSqmo3A8I,139
616
620
  fusion_bench_config/method/analysis/task_vector_violin_plot.yaml,sha256=FmBGj0Ib2xYd-49x_xZSeVbExwL-A9-tHhHTMBrT_Fg,134
617
- fusion_bench_config/method/bitdelta/bitdelta.yaml,sha256=b92xQpufqrSHAiU0QFE8g0nQ7RGSowOubGrEz_KugsQ,231
621
+ fusion_bench_config/method/bitdelta/bitdelta.yaml,sha256=uuR5x1IVTWyZjTSd5i1JXd_D8tG7tWBfOpgMBDCBgR0,436
618
622
  fusion_bench_config/method/classification/clip_continual_finetune.yaml,sha256=Ls63kdLb1bLwUEqzfyTtJcpFOdv3HmwzBML0V2JnnAs,791
619
623
  fusion_bench_config/method/classification/clip_finetune.yaml,sha256=yWjcdKYaKvy53sGaygg2ElAjb9-YFCyCGE1s9aB_dPM,677
620
624
  fusion_bench_config/method/classification/image_classification_finetune.yaml,sha256=fl60RFCYwmrwwu3QlaJTFiBLmSmnjHxl-xyq4Gb80iU,401
@@ -631,9 +635,10 @@ fusion_bench_config/method/dare/task_arithmetic.yaml,sha256=cUAweNJ6p2aOv__0dvUL
631
635
  fusion_bench_config/method/dare/ties_merging.yaml,sha256=7gDW4XpezrsccsbJGqqKrbX26JnqAc85A-MY66DGvuE,416
632
636
  fusion_bench_config/method/dawe/dawe_for_clip.yaml,sha256=99P5xpp1YGvIwXGxDcxRtJMLE2FhvEFmFBQjOMEcGoc,1023
633
637
  fusion_bench_config/method/doge_ta/doge_ta.yaml,sha256=CtZI3YPMJNDy225yhOJbSiMKlsc-X5nCFzmVh0dvr-w,78
634
- fusion_bench_config/method/ensemble/max_model_predictor.yaml,sha256=khdpCvKMNytx4nZSgtUJFXv44MVytXu0aqUVd9TixXo,57
635
- fusion_bench_config/method/ensemble/simple_ensemble.yaml,sha256=RKa3IgN3DfFZVmeXVIdTt0NdPVV0jFkpQz6SxLs3Kso,124
636
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml,sha256=2KD3PjFglqL7fjqhjXtOWxZ1mvmYodiNVroXsFd7EGE,261
638
+ fusion_bench_config/method/dop/dop.yaml,sha256=ZgdjuVfTj83kAvrS4RrPgGX7d_QQ7d1lIMlzhjiVeUc,954
639
+ fusion_bench_config/method/ensemble/max_model_predictor.yaml,sha256=ugO9FbEYqQk3RkX7wUDE9UOg-4D0F4Rezv0O-7hTeRg,476
640
+ fusion_bench_config/method/ensemble/simple_ensemble.yaml,sha256=kfPAaPVQIet9dYThKNsEBfe9gHdeCREnsM-snSOPahM,546
641
+ fusion_bench_config/method/ensemble/weighted_ensemble.yaml,sha256=LhlxU2P_inxR8MB0Z62phHWj5S4qxD7ITG4Ly-GUcQo,770
637
642
  fusion_bench_config/method/expert_sparsity/README.md,sha256=CLE0-XblXDWCUTHPaTNtBH-YquXn-uawwTJiYrgjMaA,239
638
643
  fusion_bench_config/method/expert_sparsity/mixtral.yaml,sha256=maFL3LM0zfnQ1eXoNXUslSjgZmpOdUJgl_a31dYUBbc,605
639
644
  fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml,sha256=-m5uDA9hfBg_8vF3s0MnUp0JTl3MqpB4-rlPEg9CHD4,569
@@ -646,23 +651,23 @@ fusion_bench_config/method/gossip/layer_wise_flan_t5.yaml,sha256=2yBqbhwz2vq65wT
646
651
  fusion_bench_config/method/isotropic_merging/iso_c.yaml,sha256=mn_5nyc7s_a7QH1MkEj9ZncjNHtZa0mzfXcUGRJOiAw,81
647
652
  fusion_bench_config/method/isotropic_merging/iso_cts.yaml,sha256=70BODJt69pZ_9xH7S_Z2Yzb299LFIGkXy1bQiHQad6A,110
648
653
  fusion_bench_config/method/linear/expo.yaml,sha256=St3NW6cKVRV3vCn8y0gxQ8k66VTdtsLTEWQTbO9wQ0Y,420
649
- fusion_bench_config/method/linear/linear_interpolation.yaml,sha256=chM6_HRKKcMleTeuKY3-YNI1qaMG2CfnsRwUxAlHsRw,66
654
+ fusion_bench_config/method/linear/linear_interpolation.yaml,sha256=cAL_ekEIJhJD4cfAbKilV0k_lNNPoJqY4sABVEKcM7E,523
650
655
  fusion_bench_config/method/linear/llama_expo.yaml,sha256=SvqamjT06BMObQ58sks5x7Wv6kGpp3-Nlw3ihbD_kSA,621
651
656
  fusion_bench_config/method/linear/llama_expo_with_dare.yaml,sha256=Pp8s2xmEg5XSvaGKtwTYx_PzcGvwRh2gPpZ6u9as4_E,383
652
657
  fusion_bench_config/method/linear/simple_average_for_causallm.yaml,sha256=qqeIr61PJEcfZclZ5vV64GCzyt-8b1zB0FDZu8DsbXQ,322
653
658
  fusion_bench_config/method/linear/task_arithmetic_for_causallm.yaml,sha256=tJA0n0_XVvll4rZYVHQVqFCz8W3Bey6NjPKMIH3-P0U,142
654
659
  fusion_bench_config/method/linear/ties_merging_for_causallm.yaml,sha256=1oEIdxV0OqWjDQ9V_lmXEPUayp4KbKHE2SvpCLmiKOU,489
655
- fusion_bench_config/method/linear/weighted_average.yaml,sha256=uq2gHGCwVHHSa1H-hzcrSlumUTLJ50tfyiY1Mh1pFsk,186
656
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml,sha256=se2aq6t5R1f-ZG6ubUyRr__DBe9BzXrgL81ua3DkQoM,498
660
+ fusion_bench_config/method/linear/weighted_average.yaml,sha256=OjE4EdfDHPYx8PlBJ6xIpCz4ITu_65VsRyefioRXGQ8,408
661
+ fusion_bench_config/method/linear/weighted_average_for_llama.yaml,sha256=886ZKr81gyN7DISqtbrM5WnjSXd_6AlakQyOJQagoYY,518
657
662
  fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml,sha256=QHsRfJK9K4KajsX3LBHG8cDt7ZLJWxOBnJjpHRQSB_s,1348
658
663
  fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml,sha256=c0rFqj2GV11X9RMraHXJtJ9OiMUzZtvDVsTn4tgAeco,1337
659
664
  fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml,sha256=LjGwfTiiC5iQKr62i22XopQTfSKbx9UbsDvEW-byneQ,1622
660
- fusion_bench_config/method/model_stock/model_stock.yaml,sha256=G92eRhG_Zsgi2R2FRnMViGC9QPvo7ge-o_eI4ZZLxao,321
665
+ fusion_bench_config/method/model_stock/model_stock.yaml,sha256=4KHAFCjL4AQ5dxkv7IGkUTxE8g-GCoxDkA3BbnlzQC0,530
661
666
  fusion_bench_config/method/moe_pruner/moe_pruner.yaml,sha256=OYMYLKvLlNEht7BK9phaTEvAE1ySaVi-pvjYiT-OTGw,442
662
- fusion_bench_config/method/opcm/opcm.yaml,sha256=YkjAMVGFDj0xqqxA7XWNr0vmcRyxeYbV387nWe0cUbk,331
663
- fusion_bench_config/method/opcm/task_arithmetic.yaml,sha256=wc9Bz7K_u0feLZbhCBhAuwjeIQTSugJu0I0DCmRNY_c,326
664
- fusion_bench_config/method/opcm/ties_merging.yaml,sha256=XOE1XzSdYXYzqev9bFD4g4prcmE1OiVINkVXsquizAA,541
665
- fusion_bench_config/method/opcm/weight_average.yaml,sha256=SmhftSJ_YXN6tn-0GuzQgjbE2sOd7YXoPYjDWzpY_9E,304
667
+ fusion_bench_config/method/opcm/opcm.yaml,sha256=7NBOGo6W1FDbqdkT8gfM5PI2kHfqB8ofMfgcxVI1suM,686
668
+ fusion_bench_config/method/opcm/task_arithmetic.yaml,sha256=WL_nVXhZWV9fe_ttChShkjYZVJnOCzvZ3i7NBppYsxk,743
669
+ fusion_bench_config/method/opcm/ties_merging.yaml,sha256=1-xR0dVEEFJue9r-oBk1ZfGmGM9vCu4cJBG5aZnJ3C8,917
670
+ fusion_bench_config/method/opcm/weight_average.yaml,sha256=n-eyxVkpRanlRJdFWFK3kppiO_W1S99WNjyjdBLDnw0,668
666
671
  fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml,sha256=Px8LU_UtDz-YHDFfqQ7scEPOproiFOaudKVshrhCTgc,483
667
672
  fusion_bench_config/method/pruning/llama_random_pruning.yaml,sha256=0RiZS8d42PXZzwncPG8zcbnyYJ9vtfr2sOSqS8oDyT4,325
668
673
  fusion_bench_config/method/pruning/llama_sparsegpt_pruning.yaml,sha256=gC6Ss0n2tKSb4gyVfx45BvsFbVBGN-om4-2S1sKS-_w,505
@@ -679,15 +684,15 @@ fusion_bench_config/method/regmean/clip_regmean.yaml,sha256=QfkCHCLK9wbyB1Tq1S7Y
679
684
  fusion_bench_config/method/regmean/gpt2_regmean.yaml,sha256=n94aTboDdwSA7Tki8l_o8tYQkhXxPV8lRf-dRNPIsOs,422
680
685
  fusion_bench_config/method/regmean/regmean.yaml,sha256=ZgVVLx-lHwVgjtjTl4VZUlthh8yyua87QvoJfmNHud4,101
681
686
  fusion_bench_config/method/regmean_plusplus/clip_regmean_plusplus.yaml,sha256=A034ryEwvosqyQzA3KWs7kdp-3CUnoJtCujVywV-uzA,434
682
- fusion_bench_config/method/slerp/slerp.yaml,sha256=xldDUULtfCdwzAkQUb0C8-TmbW7FqcAlIOsPX8p4n6w,116
683
- fusion_bench_config/method/slerp/slerp_lm.yaml,sha256=c5OQ0zD7e0lXQyec09joHOFNxV1LMT4bHuwgk9GWskc,114
687
+ fusion_bench_config/method/slerp/slerp.yaml,sha256=XR3z6iqyHirkoFSdLAeV2bP1yyI25MoWG-LqdE-ypjA,719
688
+ fusion_bench_config/method/slerp/slerp_lm.yaml,sha256=hO07n6elZg_FrqEfSfbdR-tb1hqwT7vaLgAZKdF8O1o,479
684
689
  fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml,sha256=skLwgu_VHShm4m0oEOkqKzcBS5Cz7J29xEj7pTaSm0k,916
685
690
  fusion_bench_config/method/smile_upscaling/error_accumulation.yaml,sha256=6Gui-OuQ3P_4TwO_syh9SWJCNeHiAQzS55aO-ByYKbQ,154
686
691
  fusion_bench_config/method/smile_upscaling/projected_energy.yaml,sha256=M_EBOC3B_pxaBO3tD6mnbXpvy6-EaegSsE-jdJs-HY0,114
687
692
  fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml,sha256=ZMn_ImRjjc2uozf7ocQIzbgvFDpBV7S-34KptbBXVGo,200
688
693
  fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml,sha256=VFMrkbO69d0wCjTQCuKysYGVe6hEwNu792g1QkhU5Mk,383
689
694
  fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml,sha256=MfZ1u1HIJoy_csWiLzR4GLz-eiaVxo2gmNYre224yqo,433
690
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml,sha256=G88mabTTniDUtiUC9Vg3cj_sw6D05mE4_ZdyYI4Omjk,477
695
+ fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml,sha256=38DGdOjpDo-dOMpfy807p3x-eAvibjED-BGtFGnaycA,689
691
696
  fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml,sha256=L-WgNhFjcp_2tocDxZi6STVTtoaSd1v9UOQaKO_QvHM,669
692
697
  fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml,sha256=prTEFH0eu7R_CVNQ0GPWL9QsOLFcT1uM12zZdi3qcFo,636
693
698
  fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml,sha256=Cmg8N4l--3C0qeSHG-HLOgjJZ954eWHoDNgRnx0pLK0,614
@@ -948,8 +953,8 @@ fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml,sha256=3q-KMuFaM
948
953
  fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-16_TA8.yaml,sha256=GjpiiRownrBCpl-TNwWRW2PYePbF-Cl99jlLNPrK5T4,1017
949
954
  fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-32_TA8.yaml,sha256=WwiYMQKehtJixDPnu5o3vcWe4yJksXTWRqOzm3uVWXQ,1017
950
955
  fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-L-14_TA8.yaml,sha256=xGRt0J9joXTzWUew6DvoYprAWlPXhaVFw5AX4im5VQw,1017
951
- fusion_bench-0.2.25.dist-info/METADATA,sha256=hOFNvf8-PM-SP8-58zf4yeOFX27dLWS27Ow1PaPpu30,22621
952
- fusion_bench-0.2.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
953
- fusion_bench-0.2.25.dist-info/entry_points.txt,sha256=iUQ8MCJvda7HP4vYh2n1Teoapb4G9PBVYZkAfcc5SHU,116
954
- fusion_bench-0.2.25.dist-info/top_level.txt,sha256=BuO4TL6iHL_2yPBUX9-LlIrHRczA_BNMIFwweK0PQEI,13
955
- fusion_bench-0.2.25.dist-info/RECORD,,
956
+ fusion_bench-0.2.26.dist-info/METADATA,sha256=BOHkLorLs0w_fgAtRz7tpYVExKxFiClGISLlsnW3BG8,24307
957
+ fusion_bench-0.2.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
958
+ fusion_bench-0.2.26.dist-info/entry_points.txt,sha256=iUQ8MCJvda7HP4vYh2n1Teoapb4G9PBVYZkAfcc5SHU,116
959
+ fusion_bench-0.2.26.dist-info/top_level.txt,sha256=BuO4TL6iHL_2yPBUX9-LlIrHRczA_BNMIFwweK0PQEI,13
960
+ fusion_bench-0.2.26.dist-info/RECORD,,
@@ -1,3 +1,6 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: BitDelta
3
+ # =============================================================================
1
4
  _target_: fusion_bench.method.bitdelta.BitDeltaAlgorithm
2
5
  save_dir: null
3
6
  save_full_model: false
@@ -1,3 +1,12 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Depth Upscaling
3
+ # =============================================================================
4
+ # Constructs a deeper model by stacking/selecting layers from existing models.
5
+ #
6
+ # - layer_indices: list[int | str] specifying which layers to use. Strings are Python
7
+ # expressions evaluated to lists, e.g., "range(6,12)".
8
+ # - Example: [0, 2, 4, "range(6,12)"] selects 1st, 3rd, 5th, and 7th-12th layers.
9
+ # =============================================================================
1
10
  _target_: DepthUpscalingAlgorithm
2
11
  # this should be a list of integers or string, indicating the sequence of layers.
3
12
  # If the entry is an integer, it will use the n-th layer of the model.
@@ -0,0 +1,30 @@
1
+ _target_: fusion_bench.method.dop.dop.ContinualDOPForCLIP
2
+
3
+ # the random seed to use
4
+ seed: null
5
+ # shuffle the order of the models
6
+ shuffle_order: true
7
+ # save the merged model on every step
8
+ save_on_every_step: false
9
+ # evaluate the merged model on every step
10
+ evaluate_on_every_step: true
11
+
12
+ # optimizer (learning rate)
13
+ lr: 1e-4
14
+ # optimizer (num_steps)
15
+ num_steps: 200
16
+
17
+ # weighted loss
18
+ # if mgda is true, use mgda to optimize the loss weights
19
+ mgda: true
20
+ # if mgda is false, this is the weight for the loss of the first task
21
+ alpha: 0.8
22
+ # if mgda is true and ema is true, use an exponential moving average (EMA); alpha is the initial value
23
+ ema: true
24
+ # if mgda is true and ema is true, use an exponential moving average (EMA); beta is the decay rate
25
+ ema_beta: 0.999
26
+
27
+ # epsilon for svd (the proportion of energy retained)
28
+ svd_epsilon: 0.99999
29
+ # the space to project the delta w (left singular vectors, right singular vectors, or both)
30
+ svd_proj_space: uv # u or v or uv
@@ -1 +1,7 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Dummy
3
+ # =============================================================================
4
+ # No-op method for testing pipelines and wiring.
5
+ # Instantiates and exits without modifying models.
6
+ # =============================================================================
1
7
  _target_: fusion_bench.method.DummyAlgorithm
@@ -1 +1,7 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Max Model Predictor
3
+ # =============================================================================
4
+ # Selects the model with maximum confidence or performance per example/task.
5
+ # No additional hyperparameters are required.
6
+ # =============================================================================
1
7
  _target_: fusion_bench.method.MaxModelPredictorAlgorithm
@@ -1,2 +1,9 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Simple Ensemble
3
+ # =============================================================================
4
+ # Averages model predictions uniformly.
5
+ #
6
+ # device_map: leave null for single device or provide a mapping for multi-device setups.
7
+ # =============================================================================
1
8
  _target_: fusion_bench.method.SimpleEnsembleAlgorithm
2
- device_map: null # Set to null for single device, or specify mapping
9
+ device_map: null # Set to null for single device, or specify mapping
@@ -1,3 +1,11 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Weighted Ensemble
3
+ # =============================================================================
4
+ # Ensembles model predictions using specified per-model weights.
5
+ #
6
+ # - Set normalize=true to rescale weights to sum to 1.
7
+ # - weights: one float per model in the pool (order-sensitive). If null, uses equal weights.
8
+ # =============================================================================
1
9
  _target_: fusion_bench.method.WeightedEnsembleAlgorithm
2
10
  normalize: true
3
11
  # this should be a list of floats, one for each model in the ensemble
@@ -1,2 +1,10 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Linear Interpolation
3
+ # =============================================================================
4
+ # Interpolates between two models: (1 - t) * model0 + t * model1
5
+ #
6
+ # - t in [0,1]: 0 returns model0; 1 returns model1.
7
+ # - Only meaningful for two-model pools.
8
+ # =============================================================================
1
9
  _target_: fusion_bench.method.LinearInterpolationAlgorithm
2
10
  t: 0.5
@@ -1,3 +1,6 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Weighted Average (Linear)
3
+ # =============================================================================
1
4
  _target_: fusion_bench.method.WeightedAverageAlgorithm
2
5
  normalize: true # if true, the weights will be normalized before merging
3
6
  weights: # List of weights for each model
@@ -1,4 +1,4 @@
1
- _target_: WeightedAverageForLLama
1
+ _target_: fusion_bench.method.WeightedAverageForLLama
2
2
  normalize: true # if true, the weights will be normalized before merging
3
3
  weights: # List of weights for each model
4
4
  - 0.5
@@ -1,3 +1,11 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Model Recombination
3
+ # =============================================================================
4
+ # Recombines submodules/layers from multiple models to form a new model.
5
+ #
6
+ # - return_modelpool: override run() argument to return model pool instead of merged model.
7
+ # Set to null to respect runtime argument; set to true/false to force behavior.
8
+ # =============================================================================
1
9
  _target_: fusion_bench.method.ModelRecombinationAlgorithm
2
10
  # if `return_model_pool` is not null, the argument `return_modelpool` passed to the `run` method will be ignored.
3
11
  return_modelpool: null
@@ -1,3 +1,6 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Model Stock
3
+ # =============================================================================
1
4
  _target_: fusion_bench.method.model_stock.ModelStock
2
5
  ignore_keys:
3
6
  [
@@ -9,4 +12,4 @@ ignore_keys:
9
12
  "model.ln_final.bias",
10
13
  ]
11
14
  model_save_path: ${path.log_dir}/checkpoint
12
- model_save_kwargs: null
15
+ model_save_kwargs: null
@@ -1,3 +1,8 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: OPCM
3
+ # =============================================================================
4
+ # Incrementally merges models via SVD projection and evaluation per step.
5
+ # =============================================================================
1
6
  _target_: fusion_bench.method.opcm.opcm.OPCMForCLIP
2
7
  # shuffle the order of the models
3
8
  shuffle_order: true
@@ -1,3 +1,9 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Continual Task Arithmetic
3
+ # =============================================================================
4
+ # Applies task arithmetic incrementally across a stream of models.
5
+ # Maintains per-step save/eval similar to OPCM.
6
+ # =============================================================================
1
7
  _target_: fusion_bench.method.opcm.task_arithmetic.ContinualTaskArithmeticForCLIP
2
8
  scaling_factor: 0.3
3
9
  # shuffle the order of the models
@@ -1,3 +1,8 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Continual TIES Merging
3
+ # =============================================================================
4
+ # Continual variant of TIES merging with per-step save/eval instrumentation.
5
+ # =============================================================================
1
6
  _target_: fusion_bench.method.opcm.ties_merging.ContinualTiesMergingForCLIP
2
7
  # Scaling factor $\lambda$
3
8
  scaling_factor: 0.5
@@ -1,3 +1,8 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Continual Weighted Average
3
+ # =============================================================================
4
+ # Incrementally averages model weights as new models arrive.
5
+ # =============================================================================
1
6
  _target_: fusion_bench.method.opcm.weight_average.ContinualWeightAverageForCLIP
2
7
  # shuffle the order of the models
3
8
  shuffle_order: true
@@ -1 +1,10 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Simple Average
3
+ # =============================================================================
4
+ # Equally averages parameters of all models in the model pool.
5
+ #
6
+ # Usage notes
7
+ # - No hyperparameters required; behavior is deterministic given model order.
8
+ # - Ensure models are architecture-compatible (same shapes) before merging.
9
+ # =============================================================================
1
10
  _target_: fusion_bench.method.SimpleAverageAlgorithm
@@ -1,3 +1,12 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Spherical Linear Interpolation (SLERP)
3
+ # =============================================================================
4
+ # Interpolates between two parameter vectors on a hypersphere.
5
+ #
6
+ # - t in [0,1]: interpolation factor; 0 returns model0; 1 returns model1.
7
+ # - DOT_THRESHOLD: threshold to switch to linear interpolation when vectors are near-aligned.
8
+ # - epsilon: small constant to avoid division by zero.
9
+ # =============================================================================
1
10
  _target_: fusion_bench.method.SlerpMergeAlgorithm
2
11
  t: 0.5 # interpolation factor
3
12
  DOT_THRESHOLD: 0.9995
@@ -1,3 +1,8 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: SLERP for Causal LM
3
+ # =============================================================================
4
+ # Spherical linear interpolation between two causal language models.
5
+ # =============================================================================
1
6
  _target_: fusion_bench.method.SlerpForCausalLM
2
7
  t: 0.5
3
8
  model_save_path: ${path.log_dir}/checkpoint
@@ -1,3 +1,6 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: SMILE Upscaling
3
+ # =============================================================================
1
4
  _target_: fusion_bench.method.SmileUpscalingAlgorithm
2
5
  # merge device on cuda can accelerate the SVD computation
3
6
  device: cpu
@@ -1,2 +1,11 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Task Arithmetic
3
+ # =============================================================================
4
+ # Performs task vector arithmetic: base + lambda * \sum_i (task_i - base).
5
+ #
6
+ # Notes
7
+ # - scaling_factor controls the contribution of the task delta.
8
+ # - Model compatibility is required (matching parameter shapes).
9
+ # =============================================================================
1
10
  _target_: fusion_bench.method.TaskArithmeticAlgorithm
2
11
  scaling_factor: 0.3
@@ -1,3 +1,6 @@
1
+ # =============================================================================
2
+ # FusionBench Method Configuration: Ties Merging
3
+ # =============================================================================
1
4
  _target_: fusion_bench.method.TiesMergingAlgorithm
2
5
  # Scaling factor $\lambda$
3
6
  scaling_factor: 0.3