fusion-bench 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +4 -1
- fusion_bench/constants/clip_vision.py +22 -0
- fusion_bench/dataset/clip_dataset.py +10 -2
- fusion_bench/dataset/fer2013.py +1 -0
- fusion_bench/dataset/gsm8k.py +2 -2
- fusion_bench/method/__init__.py +10 -0
- fusion_bench/method/ada_svd/clip_vision.py +4 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +1 -29
- fusion_bench/method/fisher_merging/fisher_merging.py +29 -17
- fusion_bench/method/gossip/__init__.py +3 -0
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +43 -0
- fusion_bench/method/gossip/clip_task_wise_gossip.py +190 -0
- fusion_bench/method/gossip/entropy_loss.py +25 -0
- fusion_bench/method/gossip/flan_t5_layer_wise_gossip.py +388 -0
- fusion_bench/method/gossip/layer_wise_gossip.py +434 -0
- fusion_bench/method/gossip/min_norm_solvers.py +227 -0
- fusion_bench/method/gossip/task_wise_gossip.py +265 -0
- fusion_bench/method/gossip/utils.py +74 -0
- fusion_bench/method/isotropic_merging/__init__.py +1 -1
- fusion_bench/method/opcm/opcm.py +16 -7
- fusion_bench/method/pwe_moe/module.py +1 -1
- fusion_bench/method/pwe_moe/openclip_pwe_moe.py +476 -0
- fusion_bench/method/regmean/regmean.py +25 -17
- fusion_bench/method/smile_upscaling/__init__.py +1 -1
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +46 -145
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +229 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +19 -346
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +7 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +8 -6
- fusion_bench/method/ties_merging/ties_merging.py +36 -31
- fusion_bench/method/we_moe/we_moe.py +14 -15
- fusion_bench/mixins/__init__.py +6 -3
- fusion_bench/mixins/hydra_config.py +49 -0
- fusion_bench/mixins/openclip_classification.py +11 -0
- fusion_bench/mixins/simple_profiler.py +4 -2
- fusion_bench/modelpool/__init__.py +3 -1
- fusion_bench/modelpool/base_pool.py +2 -2
- fusion_bench/modelpool/openclip_vision/__init__.py +1 -0
- fusion_bench/modelpool/openclip_vision/modelpool.py +255 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +2 -203
- fusion_bench/models/modeling_smile_qwen2/__init__.py +8 -0
- fusion_bench/models/modeling_smile_qwen2/configuration_smile_qwen2.py +21 -0
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +922 -0
- fusion_bench/models/modeling_smile_qwen2/register.py +11 -0
- fusion_bench/models/open_clip/__init__.py +6 -0
- fusion_bench/models/open_clip/modeling.py +176 -0
- fusion_bench/models/open_clip/utils.py +311 -0
- fusion_bench/models/open_clip/variables_and_paths.py +56 -0
- fusion_bench/models/parameter_dict.py +54 -13
- fusion_bench/models/rankone_moe.py +2 -88
- fusion_bench/models/smile_moe/linear_from_hf_config.py +373 -0
- fusion_bench/models/smile_moe/{linear.py → linear_from_module.py} +103 -33
- fusion_bench/models/smile_moe/utils/__init__.py +24 -0
- fusion_bench/models/smile_moe/utils/svd_utils.py +46 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +1 -1
- fusion_bench/taskpool/__init__.py +7 -3
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/clip_smile_taskpool.py +102 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/taskpool.py +1 -2
- fusion_bench/taskpool/clip_vision/utils/__init__.py +0 -0
- fusion_bench/taskpool/clip_vision/utils/routing_analysis_utils.py +65 -0
- fusion_bench/taskpool/gpt2_text_classification.py +30 -1
- fusion_bench/taskpool/lm_eval_harness/__init__.py +3 -0
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +87 -0
- fusion_bench/taskpool/openclip_vision/__init__.py +1 -0
- fusion_bench/taskpool/openclip_vision/openclip_taskpool.py +196 -0
- fusion_bench/utils/data.py +12 -0
- fusion_bench/utils/devices.py +14 -0
- fusion_bench/utils/instantiate.py +12 -0
- fusion_bench/utils/misc.py +9 -2
- fusion_bench/utils/packages.py +14 -0
- fusion_bench/utils/parameters.py +1 -1
- fusion_bench/utils/tensorboard.py +1 -1
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/METADATA +22 -2
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/RECORD +209 -157
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/WHEEL +1 -1
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -2
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +1 -1
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +1 -1
- fusion_bench_config/fabric/auto.yaml +0 -1
- fusion_bench_config/fabric/llama_ddp.yaml +0 -1
- fusion_bench_config/fabric/llama_fsdp.yaml +0 -1
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric/strategy/deepspeed.yaml +0 -1
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric_model_fusion.yaml +0 -1
- fusion_bench_config/llama_full_finetune.yaml +0 -2
- fusion_bench_config/llama_model_fusion.yaml +0 -2
- fusion_bench_config/method/ada_svd/clip_vision.yaml +0 -1
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +0 -5
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +0 -5
- fusion_bench_config/method/adamerging/llama_sft.yaml +0 -2
- fusion_bench_config/method/adamerging.yaml +2 -2
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +0 -1
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +0 -1
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_post_defense_AWM.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_post_defense_SAU.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_layer_wise_adamerging.yaml +1 -10
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_task_arithmetic.yaml +1 -14
- fusion_bench_config/method/dare/simple_average.yaml +0 -1
- fusion_bench_config/method/dare/task_arithmetic.yaml +0 -1
- fusion_bench_config/method/dare/ties_merging.yaml +0 -2
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +0 -3
- fusion_bench_config/method/doge_ta/doge_ta.yaml +1 -1
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -1
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +0 -1
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +0 -1
- fusion_bench_config/method/gossip/layer_wise_clip.yaml +30 -0
- fusion_bench_config/method/gossip/layer_wise_flan_t5.yaml +25 -0
- fusion_bench_config/method/isotropic_merging/iso_c.yaml +0 -1
- fusion_bench_config/method/isotropic_merging/iso_cts.yaml +0 -1
- fusion_bench_config/method/linear/linear_interpolation.yaml +0 -1
- fusion_bench_config/method/linear/llama_expo.yaml +0 -3
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +0 -5
- fusion_bench_config/method/linear/weighted_average.yaml +0 -1
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +0 -1
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +0 -4
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +0 -4
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +0 -6
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +1 -2
- fusion_bench_config/method/model_recombination.yaml +0 -1
- fusion_bench_config/method/opcm/opcm.yaml +0 -1
- fusion_bench_config/method/opcm/task_arithmetic.yaml +0 -2
- fusion_bench_config/method/opcm/ties_merging.yaml +0 -2
- fusion_bench_config/method/opcm/weight_average.yaml +0 -1
- fusion_bench_config/method/pwe_moe/epo_for_openclip.yaml +30 -0
- fusion_bench_config/method/pwe_moe/ls_for_openclip.yaml +30 -0
- fusion_bench_config/method/{pwe_moe_ls_for_clip.yaml → pwe_moe/pwe_moe_ls_for_clip.yaml} +7 -6
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +1 -3
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +0 -1
- fusion_bench_config/method/slerp/slerp.yaml +0 -2
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +5 -2
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +13 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +1 -1
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +1 -2
- fusion_bench_config/method/task_arithmetic.yaml +1 -1
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +0 -1
- fusion_bench_config/method/ties_merging.yaml +1 -1
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +0 -1
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +0 -8
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +0 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +0 -3
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +17 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +0 -3
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/README.md +90 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-16_TA8.yaml +27 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA8.yaml +45 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_cars_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_cars.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_individual.yaml +7 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-L-14_TA8.yaml +26 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_tta.yaml +1 -3
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +0 -3
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +0 -4
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +0 -3
- fusion_bench_config/modelpool/gpt-2_glue.yaml +0 -3
- fusion_bench_config/nyuv2_config.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +0 -3
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +12 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-16_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-32_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-L-14_TA8.yaml +24 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +0 -1
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +0 -4
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/top_level.txt +0 -0
|
@@ -3,10 +3,9 @@ defaults:
|
|
|
3
3
|
- fabric: auto
|
|
4
4
|
# --- Model, Method, Task ---
|
|
5
5
|
- modelpool: CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted
|
|
6
|
-
- method: dummy
|
|
6
|
+
- method: dummy # change this to the method you want to use
|
|
7
7
|
- taskpool: CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted
|
|
8
8
|
- _self_
|
|
9
|
-
|
|
10
9
|
_target_: fusion_bench.programs.FabricModelFusionProgram
|
|
11
10
|
_recursive_: false
|
|
12
11
|
fast_dev_run: false # Run a single batch of data to test the model or method
|
|
@@ -6,10 +6,8 @@ defaults:
|
|
|
6
6
|
- modelpool: CausalLMPool/llama_alpaca_cleaned.yaml
|
|
7
7
|
- taskpool: dummy
|
|
8
8
|
- _self_
|
|
9
|
-
|
|
10
9
|
_target_: fusion_bench.programs.FabricModelFusionProgram
|
|
11
10
|
_recursive_: false
|
|
12
|
-
|
|
13
11
|
fast_dev_run: false # Run a single batch of data to test the model or method
|
|
14
12
|
# Run the script without actually running the experiment, use with `print_config=true`.
|
|
15
13
|
# You can also use `--cfg` or `-c` to show the configuration instead of running.
|
|
@@ -3,13 +3,11 @@ defaults:
|
|
|
3
3
|
- override modelpool: CausalLMPool/single_llama_model
|
|
4
4
|
- override taskpool: dummy
|
|
5
5
|
- _self_
|
|
6
|
-
|
|
7
6
|
merged_model_save_path: null # path to save the merged model, use "{log_dir}" to refer to the logger directory, for example `merged_model_save_path=\{log_dir\}/merged_model`
|
|
8
7
|
merged_model_save_kwargs:
|
|
9
8
|
save_tokenizer: true
|
|
10
9
|
# tokenizer_kwargs:
|
|
11
10
|
# unk_token: "<s>" # https://github.com/huggingface/transformers/issues/24318#issuecomment-1596801322
|
|
12
|
-
|
|
13
11
|
modelpool:
|
|
14
12
|
model_kwargs:
|
|
15
13
|
torch_dtype: float16
|
|
@@ -1,23 +1,18 @@
|
|
|
1
1
|
_target_: fusion_bench.method.adamerging.flan_t5_layer_wise_adamerging.FlanT5LayerWiseAdaMergingAlgorithm
|
|
2
2
|
_recursive_: false
|
|
3
|
-
|
|
4
3
|
optimizer:
|
|
5
4
|
_target_: torch.optim.Adam
|
|
6
5
|
lr: 1e-3
|
|
7
|
-
|
|
8
6
|
dataloader_kwargs:
|
|
9
7
|
batch_size: 4
|
|
10
8
|
num_workers: 0
|
|
11
|
-
|
|
12
9
|
init_values: 0.3
|
|
13
10
|
max_steps: 1000
|
|
14
11
|
# if `merging_weights_load_path` is specified, the merging weights are loaded from that file and the training process is skipped
|
|
15
12
|
merging_weights_load_path: null
|
|
16
13
|
merging_weights_save_path: null
|
|
17
|
-
|
|
18
14
|
variant: null
|
|
19
15
|
clamp_weights: false
|
|
20
16
|
tie_weights: false
|
|
21
17
|
strict: false
|
|
22
|
-
|
|
23
18
|
cache_dir: "outputs/cache"
|
|
@@ -1,23 +1,18 @@
|
|
|
1
1
|
_target_: fusion_bench.method.adamerging.gpt2_layer_wise_adamerging.GPT2LayerWiseAdaMergingAlgorithm
|
|
2
2
|
_recursive_: false
|
|
3
|
-
|
|
4
3
|
optimizer:
|
|
5
4
|
_target_: torch.optim.Adam
|
|
6
5
|
lr: 1e-4
|
|
7
|
-
|
|
8
6
|
dataloader_kwargs:
|
|
9
7
|
batch_size: 16
|
|
10
8
|
num_workers: 0
|
|
11
|
-
|
|
12
9
|
init_values: 0.3
|
|
13
10
|
max_steps: 1000
|
|
14
11
|
# if `merging_weights_load_path` is specified, the merging weights are loaded from that file and the training process is skipped
|
|
15
12
|
merging_weights_load_path: null
|
|
16
13
|
merging_weights_save_path: null
|
|
17
|
-
|
|
18
14
|
variant: null
|
|
19
15
|
clamp_weights: false
|
|
20
16
|
tie_weights: true
|
|
21
17
|
strict: false
|
|
22
|
-
|
|
23
18
|
cache_dir: "outputs/cache"
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
_target_: fusion_bench.method.adamerging.llama_adamerging.LayerWiseAdaMergingForLlamaSFT
|
|
2
|
-
|
|
3
2
|
seed: 0
|
|
4
3
|
output_dir: null
|
|
5
4
|
# path to initialize the merging weights
|
|
@@ -26,7 +25,6 @@ fast_dev_run: ${fast_dev_run}
|
|
|
26
25
|
# the path for saving the merging weights
|
|
27
26
|
save_interval: 100
|
|
28
27
|
save_merged_model: true
|
|
29
|
-
|
|
30
28
|
dataloader_kwargs:
|
|
31
29
|
batch_size: 24
|
|
32
30
|
num_workers: 0
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
# this option can be "clip_task_wise_adamerging"
|
|
2
|
-
name:
|
|
1
|
+
# this option can be one of "clip_task_wise_adamerging" or "clip_layer_wise_adamerging"
|
|
2
|
+
name: clip_layer_wise_adamerging
|
|
3
3
|
# these weights can be a list of floats, or a string that points to a *.np or *.pt file containing the weights
|
|
4
4
|
# if weights is specified, skip the test-time adaptation training
|
|
5
5
|
weights: null
|
|
@@ -1,38 +1,27 @@
|
|
|
1
1
|
# Reference: Jinluan Yang, et al. Mitigating the Backdoor Effect for Multi-Task Model Merging via Safety-Aware Subspace. ICLR 2025.
|
|
2
|
-
|
|
3
2
|
name: clip_post_defense_AWM
|
|
4
|
-
|
|
5
3
|
# batch size per gpu
|
|
6
4
|
# if you have multiple gpus, the total batch size will be `batch_size * num_gpus`
|
|
7
5
|
batch_size: 16
|
|
8
6
|
num_workers: 8
|
|
9
|
-
|
|
10
7
|
optimizer: adam
|
|
11
8
|
lr: 1e-3
|
|
12
|
-
|
|
13
9
|
scaling_factor: 0.3
|
|
14
|
-
|
|
15
10
|
###new
|
|
16
|
-
adv_lr: 1e-4
|
|
11
|
+
adv_lr: 1e-4
|
|
17
12
|
trigger_norm: 1000
|
|
18
13
|
adv_weight: 0.01
|
|
19
|
-
|
|
20
|
-
|
|
21
14
|
max_steps: 2000
|
|
22
15
|
save_interval: 500
|
|
23
16
|
initial_logits: 0
|
|
24
17
|
temperature: 0.5
|
|
25
|
-
|
|
26
18
|
# "discrete" or "continuous", this is the mask applied for evaluation, not during training
|
|
27
19
|
# the performance of the final model is expected to be similar in both cases
|
|
28
20
|
eval_mask_type: continuous
|
|
29
|
-
|
|
30
21
|
mask_checkpoint: null
|
|
31
22
|
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
32
23
|
clamp_weights: false
|
|
33
|
-
|
|
34
24
|
# arguments of `functional_call`
|
|
35
25
|
tie_weights: true
|
|
36
26
|
strict: false
|
|
37
|
-
|
|
38
27
|
cache_dir: outputs
|
|
@@ -1,41 +1,30 @@
|
|
|
1
1
|
# Reference: Jinluan Yang, et al. Mitigating the Backdoor Effect for Multi-Task Model Merging via Safety-Aware Subspace. ICLR 2025.
|
|
2
|
-
|
|
3
2
|
name: clip_post_defense_SAU
|
|
4
|
-
|
|
5
3
|
# batch size per gpu
|
|
6
4
|
# if you have multiple gpus, the total batch size will be `batch_size * num_gpus`
|
|
7
5
|
batch_size: 16
|
|
8
6
|
num_workers: 8
|
|
9
|
-
|
|
10
7
|
optimizer: adam
|
|
11
8
|
lr: 1e-3
|
|
12
|
-
|
|
13
9
|
scaling_factor: 0.3
|
|
14
|
-
|
|
15
10
|
###new
|
|
16
|
-
adv_lr: 1e-4
|
|
11
|
+
adv_lr: 1e-4
|
|
17
12
|
trigger_norm: 1000
|
|
18
13
|
adv_weight: 0.01
|
|
19
14
|
shared_weight: 0.01
|
|
20
15
|
beta1: 0.5
|
|
21
16
|
beta2: 0.5
|
|
22
|
-
|
|
23
|
-
|
|
24
17
|
max_steps: 2000
|
|
25
18
|
save_interval: 500
|
|
26
19
|
initial_logits: 0
|
|
27
20
|
temperature: 0.5
|
|
28
|
-
|
|
29
21
|
# "discrete" or "continuous", this is the mask applied for evaluation, not during training
|
|
30
22
|
# the performance of the final model is expected to be similar in both cases
|
|
31
23
|
eval_mask_type: continuous
|
|
32
|
-
|
|
33
24
|
mask_checkpoint: null
|
|
34
25
|
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
35
26
|
clamp_weights: false
|
|
36
|
-
|
|
37
27
|
# arguments of `functional_call`
|
|
38
28
|
tie_weights: true
|
|
39
29
|
strict: false
|
|
40
|
-
|
|
41
30
|
cache_dir: outputs
|
|
@@ -1,39 +1,30 @@
|
|
|
1
1
|
# Reference: Jinluan Yang, et al. Mitigating the Backdoor Effect for Multi-Task Model Merging via Safety-Aware Subspace. ICLR 2025.
|
|
2
|
-
|
|
3
2
|
name: clip_safe_concrete_layer_wise_adamerging
|
|
4
|
-
|
|
5
3
|
# batch size per gpu
|
|
6
4
|
# if you have multiple gpus, the total batch size will be `batch_size * num_gpus`
|
|
7
5
|
batch_size: 16
|
|
8
6
|
num_workers: 8
|
|
9
|
-
|
|
10
7
|
optimizer: adam
|
|
11
8
|
lr: 1e-3
|
|
12
9
|
base_lr: 1
|
|
13
10
|
adamerging_lr: 1e-3
|
|
14
|
-
|
|
15
11
|
scaling_factor: 0.3
|
|
16
|
-
|
|
17
12
|
max_steps: 1000
|
|
18
13
|
max_adamerging_steps: 1000
|
|
19
14
|
save_interval: 500
|
|
20
15
|
initial_logits: 0
|
|
21
16
|
temperature: 0.5
|
|
22
|
-
|
|
23
17
|
###new
|
|
24
|
-
adv_lr: 1e-4
|
|
18
|
+
adv_lr: 1e-4
|
|
25
19
|
trigger_norm: 1000
|
|
26
20
|
adv_weight: 0.1
|
|
27
21
|
# "discrete" or "continuous", this is the mask applied for evaluation, not during training
|
|
28
22
|
# the performance of the final model is expected to be similar in both cases
|
|
29
23
|
eval_mask_type: continuous
|
|
30
|
-
|
|
31
24
|
mask_checkpoint: null
|
|
32
25
|
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
33
26
|
clamp_weights: false
|
|
34
|
-
|
|
35
27
|
# arguments of `functional_call`
|
|
36
28
|
tie_weights: true
|
|
37
29
|
strict: false
|
|
38
|
-
|
|
39
30
|
cache_dir: outputs
|
|
@@ -1,40 +1,27 @@
|
|
|
1
1
|
# Reference: Jinluan Yang, et al. Mitigating the Backdoor Effect for Multi-Task Model Merging via Safety-Aware Subspace. ICLR 2025.
|
|
2
|
-
|
|
3
2
|
name: clip_safe_concrete_task_arithmetic
|
|
4
|
-
|
|
5
3
|
# batch size per gpu
|
|
6
4
|
# if you have multiple gpus, the total batch size will be `batch_size * num_gpus`
|
|
7
5
|
batch_size: 16
|
|
8
6
|
num_workers: 8
|
|
9
|
-
|
|
10
7
|
optimizer: adam
|
|
11
8
|
lr: 1e-3
|
|
12
|
-
|
|
13
9
|
scaling_factor: 0.3
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
10
|
###new
|
|
18
|
-
adv_lr: 1e-4
|
|
11
|
+
adv_lr: 1e-4
|
|
19
12
|
trigger_norm: 1000
|
|
20
13
|
adv_weight: 0.1
|
|
21
|
-
|
|
22
|
-
|
|
23
14
|
max_steps: 2000
|
|
24
15
|
save_interval: 500
|
|
25
16
|
initial_logits: 0
|
|
26
17
|
temperature: 0.5
|
|
27
|
-
|
|
28
18
|
# "discrete" or "continuous", this is the mask applied for evaluation, not during training
|
|
29
19
|
# the performance of the final model is expected to be similar in both cases
|
|
30
20
|
eval_mask_type: continuous
|
|
31
|
-
|
|
32
21
|
mask_checkpoint: null
|
|
33
22
|
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
34
23
|
clamp_weights: false
|
|
35
|
-
|
|
36
24
|
# arguments of `functional_call`
|
|
37
25
|
tie_weights: true
|
|
38
26
|
strict: false
|
|
39
|
-
|
|
40
27
|
cache_dir: outputs
|
|
@@ -4,7 +4,6 @@ merge_mode: task_wise
|
|
|
4
4
|
init_lambda: 0.3
|
|
5
5
|
batch_reduce: true
|
|
6
6
|
eval_batch_reduce: false
|
|
7
|
-
|
|
8
7
|
_dict_feature_extractor_path: microsoft/resnet-18
|
|
9
8
|
dict_processor:
|
|
10
9
|
_target_: fusion_bench.method.dawe.dawe_for_clip.load_resnet_processor
|
|
@@ -18,14 +17,12 @@ gate_hidden_layers: 1
|
|
|
18
17
|
# if task_vector_dtype is null, the task vector will have the same dtype as pretrained model
|
|
19
18
|
task_vector_dtype: null
|
|
20
19
|
task_vector_sparsity: 0
|
|
21
|
-
|
|
22
20
|
# training & logging args
|
|
23
21
|
max_steps: 1000
|
|
24
22
|
save_interval: 200
|
|
25
23
|
learning_rate: 1e-5
|
|
26
24
|
resume_checkpoint_path: null
|
|
27
25
|
skip_training: false
|
|
28
|
-
|
|
29
26
|
# dataloader args
|
|
30
27
|
batch_size: 1
|
|
31
28
|
num_workers: 0
|
|
@@ -1 +1 @@
|
|
|
1
|
-
_target_: fusion_bench.method.MaxModelPredictorAlgorithm
|
|
1
|
+
_target_: fusion_bench.method.MaxModelPredictorAlgorithm
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# this option can be one of "clip_task_wise_gossip" or "clip_layer_wise_gossip"
|
|
2
|
+
name: clip_layer_wise_gossip
|
|
3
|
+
# _target_: fusion_bench.method.CLIPLayerWiseGossipAlgorithm
|
|
4
|
+
# these weights can be a list of floats, or a string that points to a *.np or *.pt file containing the weights
|
|
5
|
+
# if weights is specified, skip the test-time adaptation training
|
|
6
|
+
weights: null
|
|
7
|
+
# learning rate
|
|
8
|
+
optimizer: adam
|
|
9
|
+
lr: 1e-3
|
|
10
|
+
init_values: 0.3
|
|
11
|
+
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
12
|
+
clamp_weights: false
|
|
13
|
+
# arguments of `functional_call`
|
|
14
|
+
tie_weights: true
|
|
15
|
+
strict: false
|
|
16
|
+
# this is overridden by `fabric.devices` if launched from the `fusion_bench` CLI.
|
|
17
|
+
devices: 1
|
|
18
|
+
batch_size: 16
|
|
19
|
+
num_workers: 8
|
|
20
|
+
max_steps: 400 # 1000
|
|
21
|
+
fast_dev_run: ${fast_dev_run}
|
|
22
|
+
# the path for saving the merging weights
|
|
23
|
+
save_merging_weights: 'merging_weights.pt'
|
|
24
|
+
cache_dir: outputs
|
|
25
|
+
# the following parameters configure gossip
|
|
26
|
+
gossip_max_steps: 20
|
|
27
|
+
gossip_skip_adamerging: false
|
|
28
|
+
accuracy_test_interval: 0 # if this value is equal to 1, we will evaluate all models each time after Gossip
|
|
29
|
+
improve_dataset: true
|
|
30
|
+
topo: ring
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
_target_: fusion_bench.method.gossip.flan_t5_layer_wise_gossip.FlanT5LayerWiseGossipAlgorithm
|
|
2
|
+
_recursive_: false
|
|
3
|
+
optimizer:
|
|
4
|
+
_target_: torch.optim.Adam
|
|
5
|
+
lr: 1e-3
|
|
6
|
+
dataloader_kwargs:
|
|
7
|
+
batch_size: 4
|
|
8
|
+
num_workers: 0
|
|
9
|
+
init_values: 0.3
|
|
10
|
+
max_steps: 400
|
|
11
|
+
# if `merging_weights_load_path` is specified, the merging weights are loaded from that file and the training process is skipped
|
|
12
|
+
merging_weights_load_path: null
|
|
13
|
+
merging_weights_save_path: null
|
|
14
|
+
variant: null
|
|
15
|
+
clamp_weights: false
|
|
16
|
+
tie_weights: false
|
|
17
|
+
strict: false
|
|
18
|
+
weights: null
|
|
19
|
+
cache_dir: "outputs/cache"
|
|
20
|
+
# the following parameters configure gossip
|
|
21
|
+
gossip_max_steps: 20
|
|
22
|
+
gossip_skip_adamerging: false
|
|
23
|
+
accuracy_test_interval: 0 # if this value is equal to 1, we will evaluate all models each time after Gossip; it can also be a list, e.g. [1, 5, 10, 15, 20]
|
|
24
|
+
improve_dataset: true
|
|
25
|
+
topo: ring
|
|
@@ -7,13 +7,10 @@
|
|
|
7
7
|
_target_: fusion_bench.method.ExPOAlgorithmForLlama
|
|
8
8
|
extrapolation_factor: 0.1
|
|
9
9
|
attention_scaling_factor: 1.0
|
|
10
|
-
|
|
11
10
|
only_on_backbone: true
|
|
12
11
|
on_linear_weights: true
|
|
13
12
|
on_linear_bias: false
|
|
14
13
|
on_embedding: false
|
|
15
|
-
|
|
16
14
|
fix_last_n_layers: 0
|
|
17
15
|
fix_first_n_layers: 0
|
|
18
|
-
|
|
19
16
|
magnitude_sparsity_ratio: null
|
|
@@ -1,18 +1,13 @@
|
|
|
1
1
|
_target_: fusion_bench.method.linear.llama_expo.ExPOWithDareForLLama
|
|
2
|
-
|
|
3
2
|
extrapolation_factor: 0.1
|
|
4
3
|
attention_scaling_factor: 1.0
|
|
5
|
-
|
|
6
4
|
only_on_backbone: true
|
|
7
5
|
on_linear_weights: true
|
|
8
6
|
on_linear_bias: false
|
|
9
7
|
on_embedding: false
|
|
10
|
-
|
|
11
8
|
fix_last_n_layers: 0
|
|
12
9
|
fix_first_n_layers: 0
|
|
13
|
-
|
|
14
10
|
magnitude_sparsity_ratio: null
|
|
15
|
-
|
|
16
11
|
# dare arguments
|
|
17
12
|
dare_sparsity_ratio: 0.5
|
|
18
13
|
dare_only_on_linear_weights: true
|
|
@@ -1,12 +1,10 @@
|
|
|
1
1
|
_target_: fusion_bench.method.BradleyTerryRewardModeling
|
|
2
2
|
_recursive_: False
|
|
3
|
-
|
|
4
3
|
optimizer:
|
|
5
4
|
_target_: torch.optim.AdamW
|
|
6
5
|
lr: 1e-5
|
|
7
6
|
weight_decay: 0.01
|
|
8
7
|
fused: null
|
|
9
|
-
|
|
10
8
|
lr_scheduler:
|
|
11
9
|
_target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
|
|
12
10
|
T_max: _T_max_ # this will be replaced by the expected number of training steps
|
|
@@ -14,13 +12,11 @@ lr_scheduler:
|
|
|
14
12
|
warmup_steps: 100
|
|
15
13
|
max_lr: ${..optimizer.lr}
|
|
16
14
|
min_lr: 1e-6
|
|
17
|
-
|
|
18
15
|
dataloader_kwargs:
|
|
19
16
|
# per-gpu batch size
|
|
20
17
|
batch_size: 1
|
|
21
18
|
num_workers: 0
|
|
22
19
|
pin_memory: True
|
|
23
|
-
|
|
24
20
|
# Training hyperparameters
|
|
25
21
|
# if max_epochs=-1, max_steps will be used to determine the number of training steps
|
|
26
22
|
max_epochs: 3
|