fusion-bench 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. fusion_bench/__init__.py +22 -2
  2. fusion_bench/_get_started/__init__.py +3 -0
  3. fusion_bench/_get_started/greeting_program.py +49 -0
  4. fusion_bench/compat/method/base_algorithm.py +14 -0
  5. fusion_bench/constants/__init__.py +6 -0
  6. fusion_bench/constants/clip_vision.py +26 -2
  7. fusion_bench/constants/paths.py +4 -0
  8. fusion_bench/constants/runtime.py +57 -0
  9. fusion_bench/dataset/clip_dataset.py +2 -1
  10. fusion_bench/dataset/gpt2_glue.py +9 -9
  11. fusion_bench/dataset/image_corruption/__init__.py +0 -0
  12. fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
  13. fusion_bench/dataset/image_dataset.py +1 -1
  14. fusion_bench/dataset/nyuv2.py +2 -2
  15. fusion_bench/method/__init__.py +24 -5
  16. fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
  17. fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
  18. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
  19. fusion_bench/method/base_algorithm.py +195 -12
  20. fusion_bench/method/bitdelta/__init__.py +5 -0
  21. fusion_bench/method/bitdelta/bitdelta.py +156 -0
  22. fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
  23. fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
  24. fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
  25. fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
  26. fusion_bench/method/classification/clip_finetune.py +1 -1
  27. fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
  28. fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
  29. fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
  30. fusion_bench/method/doge_ta/doge_ta.py +1 -1
  31. fusion_bench/method/ensemble.py +12 -12
  32. fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
  33. fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -6
  34. fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
  35. fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
  36. fusion_bench/method/fw_merging/fw_hard.py +1 -1
  37. fusion_bench/method/fw_merging/fw_soft.py +1 -1
  38. fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
  39. fusion_bench/method/linear/expo.py +2 -1
  40. fusion_bench/method/linear/linear_interpolation.py +6 -4
  41. fusion_bench/method/linear/simple_average_for_llama.py +17 -13
  42. fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
  43. fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
  44. fusion_bench/method/model_recombination.py +2 -5
  45. fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
  46. fusion_bench/method/moe_pruner/utils/data.py +2 -1
  47. fusion_bench/method/moe_pruner/utils/prune.py +6 -1
  48. fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
  49. fusion_bench/method/pruning/wanda_utils/data.py +1 -2
  50. fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
  51. fusion_bench/method/randes/modelsoup.py +1 -3
  52. fusion_bench/method/regmean/clip_regmean.py +2 -2
  53. fusion_bench/method/regmean/gpt2_regmean.py +3 -10
  54. fusion_bench/method/regmean/regmean.py +2 -11
  55. fusion_bench/method/regmean_plusplus/__init__.py +1 -1
  56. fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +24 -17
  57. fusion_bench/method/regmean_plusplus/regmean_plusplus.py +56 -38
  58. fusion_bench/method/simple_average.py +12 -16
  59. fusion_bench/method/slerp/slerp.py +5 -2
  60. fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
  61. fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
  62. fusion_bench/method/smile_upscaling/projected_energy.py +144 -0
  63. fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
  64. fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +71 -51
  65. fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
  66. fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
  67. fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
  68. fusion_bench/method/ties_merging/ties_merging.py +13 -26
  69. fusion_bench/method/we_moe/__init__.py +1 -0
  70. fusion_bench/method/we_moe/clip_we_moe.py +5 -4
  71. fusion_bench/method/we_moe/entropy_loss.py +25 -0
  72. fusion_bench/method/we_moe/flan_t5_we_moe.py +331 -0
  73. fusion_bench/method/we_moe/utils.py +15 -0
  74. fusion_bench/method/we_moe/we_moe.py +6 -6
  75. fusion_bench/method/weighted_average/llama.py +4 -16
  76. fusion_bench/metrics/continual_learning/__init__.py +1 -0
  77. fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
  78. fusion_bench/metrics/nyuv2/__init__.py +2 -2
  79. fusion_bench/metrics/nyuv2/segmentation.py +1 -1
  80. fusion_bench/mixins/__init__.py +10 -2
  81. fusion_bench/mixins/clip_classification.py +15 -45
  82. fusion_bench/mixins/hydra_config.py +105 -7
  83. fusion_bench/mixins/lightning_fabric.py +2 -0
  84. fusion_bench/mixins/serialization.py +275 -48
  85. fusion_bench/modelpool/__init__.py +2 -2
  86. fusion_bench/modelpool/base_pool.py +29 -9
  87. fusion_bench/modelpool/causal_lm/causal_lm.py +41 -33
  88. fusion_bench/modelpool/clip_vision/modelpool.py +1 -3
  89. fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
  90. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
  91. fusion_bench/models/__init__.py +7 -1
  92. fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
  93. fusion_bench/models/hf_utils.py +160 -0
  94. fusion_bench/models/linearized/linearized_model_utils.py +4 -4
  95. fusion_bench/models/linearized/vision_model.py +1 -1
  96. fusion_bench/models/model_card_templates/default.md +46 -0
  97. fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
  98. fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
  99. fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
  100. fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
  101. fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
  102. fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
  103. fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
  104. fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
  105. fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
  106. fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +698 -0
  107. fusion_bench/models/modeling_smile_llama/register.py +8 -0
  108. fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
  109. fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
  110. fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +7 -12
  111. fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
  112. fusion_bench/models/parameter_dict.py +1 -1
  113. fusion_bench/models/sparse_we_moe.py +1 -53
  114. fusion_bench/models/utils.py +26 -0
  115. fusion_bench/models/we_moe.py +1 -53
  116. fusion_bench/models/wrappers/ensemble.py +6 -4
  117. fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
  118. fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
  119. fusion_bench/programs/base_program.py +81 -2
  120. fusion_bench/programs/fabric_fusion_program.py +46 -61
  121. fusion_bench/scripts/cli.py +38 -5
  122. fusion_bench/taskpool/base_pool.py +4 -3
  123. fusion_bench/taskpool/clip_vision/taskpool.py +43 -22
  124. fusion_bench/taskpool/dummy.py +1 -1
  125. fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
  126. fusion_bench/tasks/clip_classification/__init__.py +6 -4
  127. fusion_bench/utils/__init__.py +7 -1
  128. fusion_bench/utils/cache_utils.py +101 -1
  129. fusion_bench/utils/devices.py +14 -4
  130. fusion_bench/utils/fabric.py +2 -2
  131. fusion_bench/utils/instantiate_utils.py +3 -1
  132. fusion_bench/utils/lazy_imports.py +23 -0
  133. fusion_bench/utils/lazy_state_dict.py +38 -3
  134. fusion_bench/utils/modelscope.py +127 -8
  135. fusion_bench/utils/parameters.py +2 -2
  136. fusion_bench/utils/path.py +56 -0
  137. fusion_bench/utils/pylogger.py +1 -1
  138. fusion_bench/utils/rich_utils.py +3 -0
  139. fusion_bench/utils/state_dict_arithmetic.py +25 -23
  140. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/METADATA +24 -47
  141. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/RECORD +184 -145
  142. fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
  143. fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
  144. fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
  145. fusion_bench_config/_get_started/greeting_program.yaml +4 -0
  146. fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
  147. fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
  148. fusion_bench_config/fabric_model_fusion.yaml +45 -17
  149. fusion_bench_config/hydra/default.yaml +6 -2
  150. fusion_bench_config/llama_full_finetune.yaml +1 -0
  151. fusion_bench_config/method/adamerging/clip.yaml +1 -1
  152. fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
  153. fusion_bench_config/method/depth_upscaling.yaml +4 -1
  154. fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
  155. fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
  156. fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
  157. fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
  158. fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
  159. fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +2 -1
  160. fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
  161. fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
  162. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
  163. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +1 -1
  164. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -6
  165. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +1 -1
  166. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +1 -1
  167. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +3 -3
  168. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
  169. fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
  170. fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
  171. fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
  172. fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
  173. fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
  174. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
  175. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
  176. fusion_bench_config/nyuv2_config.yaml +3 -1
  177. fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
  178. fusion_bench_config/path/default.yaml +28 -0
  179. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
  180. fusion_bench_config/method/adamerging.yaml +0 -23
  181. fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
  182. fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
  183. fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
  184. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/WHEEL +0 -0
  185. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/entry_points.txt +0 -0
  186. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/licenses/LICENSE +0 -0
  187. {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/top_level.txt +0 -0
  188. /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
fusion_bench_config/_get_started/clip_evaluate_single_model.yaml
@@ -0,0 +1,21 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+method:
+  _target_: fusion_bench.method.DummyAlgorithm
+modelpool:
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+taskpool:
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
fusion_bench_config/_get_started/clip_simple_average.yaml
@@ -0,0 +1,23 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram # (1)!
+_recursive_: false
+method: # (2)!
+  _target_: fusion_bench.method.SimpleAverageAlgorithm
+modelpool: # (3)!
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+    sun397: tanganke/clip-vit-base-patch32_sun397
+    stanford-cars: tanganke/clip-vit-base-patch32_stanford-cars
+taskpool: # (4)!
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
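Configs like this one are self-contained program definitions, so they can also be composed and executed outside the `fusion_bench` CLI with standard Hydra APIs. Below is a minimal sketch (not code from the package); the relative config path and the `run()` entry point are assumptions based on FusionBench's program interface:

# Hypothetical launcher for the get-started config above; assumes the
# fusion_bench_config directory sits next to this script.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(config_path="fusion_bench_config", version_base=None):
    cfg = compose(config_name="_get_started/clip_simple_average")
program = instantiate(cfg, _recursive_=False)  # FabricModelFusionProgram
program.run()  # assumption: programs expose a run() entry point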
fusion_bench_config/_get_started/clip_task_arithmetic.yaml
@@ -0,0 +1,24 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+method:
+  _target_: fusion_bench.method.TaskArithmeticAlgorithm
+  scaling_factor: 0.7
+modelpool:
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+    sun397: tanganke/clip-vit-base-patch32_sun397
+    stanford-cars: tanganke/clip-vit-base-patch32_stanford-cars
+taskpool:
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
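For reference, `scaling_factor` is the λ of task arithmetic: the merged weights are the pretrained weights plus the scaled sum of task vectors (fine-tuned minus pretrained). A generic sketch over state dicts, not the package's own implementation:

import torch

def task_arithmetic(pretrained, finetuned_list, lam=0.7):
    """Merge: theta = theta_pre + lam * sum_i (theta_i - theta_pre)."""
    merged = {}
    for key, base in pretrained.items():
        task_vector_sum = sum(ft[key] - base for ft in finetuned_list)
        merged[key] = base + lam * task_vector_sum
    return merged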
fusion_bench_config/_get_started/greeting_program.yaml
@@ -0,0 +1,4 @@
+_target_: fusion_bench._get_started.greeting_program.GreetingProgram
+message: "Welcome to FusionBench"
+name: "Developer"
+repeat_count: 3
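The three keys map onto constructor arguments of the new `GreetingProgram` (`fusion_bench/_get_started/greeting_program.py`, +49 lines in this release). A speculative sketch of what such a program class plausibly looks like, assuming it follows the `BaseHydraProgram` interface; the actual file may differ:

from fusion_bench.programs.base_program import BaseHydraProgram

class GreetingProgram(BaseHydraProgram):
    """Toy program used by the get-started docs (sketch, not the real code)."""

    def __init__(self, message: str, name: str, repeat_count: int = 1, **kwargs):
        self.message = message
        self.name = name
        self.repeat_count = repeat_count
        super().__init__(**kwargs)

    def run(self):
        for _ in range(self.repeat_count):
            print(f"{self.message}, {self.name}!")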
fusion_bench_config/fabric/loggers/csv_logger.yaml
@@ -3,9 +3,9 @@ _target_: lightning.fabric.loggers.CSVLogger
 # for example, `outputs/logs/lightning_logs/version_0` and `outputs/logs/lightning_logs/version_1` by default
 
 # root directory for all logging
-root_dir: outputs/logs
+root_dir: ${path.log_dir}
 # the name of the experiment
-name: lightning_logs
-version: null
+name: ""
+version: ""
 prefix: ""
 flush_logs_every_n_steps: 100
fusion_bench_config/fabric/loggers/tensorboard_logger.yaml
@@ -3,9 +3,9 @@ _target_: lightning.fabric.loggers.TensorBoardLogger
 # for example, `outputs/logs/lightning_logs/version_0` and `outputs/logs/lightning_logs/version_1` by default
 
 # root directory for all logging
-root_dir: outputs/logs
+root_dir: ${path.log_dir}
 # the name of the experiment
-name: "lightning_logs"
-version: null
+name: ""
+version: ""
 sub_dir: null
 default_hp_metric: false
fusion_bench_config/fabric_model_fusion.yaml
@@ -1,19 +1,47 @@
+# =============================================================================
+# FusionBench Fabric Model Fusion Configuration
+# =============================================================================
+# This configuration file defines the settings for running model fusion experiments
+# using PyTorch Lightning Fabric framework within FusionBench.
+#
+# The configuration includes:
+#
+# - Hydra framework settings and overrides
+# - PyTorch Lightning Fabric configuration for distributed training
+# - Path management for data, outputs, and logs
+# - (core components) Model pool, fusion method, and task pool specifications
+# - Experiment execution parameters and debugging options
+#
+# =============================================================================
+# Hydra Configuration Defaults
+# =============================================================================
 defaults:
-  - hydra: default
-  - fabric: auto
-  # --- Model, Method, Task ---
-  - modelpool: CLIPVisionModelPool/clip-vit-base-patch32_TA8
-  - method: dummy
-  - taskpool: dummy
-  - _self_
+  - hydra: default # Hydra framework configuration
+  - fabric: auto # PyTorch Lightning Fabric auto-configuration
+  - path: default # Path management configuration
+  # --- Core Components ---
+  - modelpool: CLIPVisionModelPool/clip-vit-base-patch32_TA8 # Model pool specification
+  - method: dummy # Fusion method (placeholder)
+  - taskpool: dummy # Task pool specification (placeholder)
+  - _self_ # Self-reference for override priority
+# =============================================================================
+# Program Configuration
+# =============================================================================
 _target_: fusion_bench.programs.FabricModelFusionProgram
-_recursive_: false
-fast_dev_run: false # Run a single batch of data to test the model or method
-# Run the script without actually running the experiment, use with `print_config=true`.
-# You can also use `--cfg` or `-c` to show the configuration instead of running.
-dry_run: false
-print_config: true # Print the configuration to the console
-merged_model_save_path: null # path to save the merged model, use "{log_dir}" to refer to the logger directory, for example `merged_model_save_path=\{log_dir\}/merged_model`
-merged_model_save_kwargs: null
-report_save_path: "{log_dir}/program_report.json" # path to save the result report
-print_function_call: true # set to false if you don't want to print the details of instantiate calls
+_recursive_: false # Disable recursive instantiation
+# =============================================================================
+# Experiment Execution Settings
+# =============================================================================
+# Development and debugging options
+fast_dev_run: false # This option is for quick testing. For example, run single batch instead of full dataset
+dry_run: false # Show configuration without running experiment
+print_config: true # Display full configuration before execution
+print_function_call: true # Show detailed instantiation calls
+# =============================================================================
+# Output and Logging Configuration
+# =============================================================================
+# Model saving configuration
+merged_model_save_path: null # Path to save merged model.
+merged_model_save_kwargs: null # Additional kwargs for model saving.
+# Report generation
+report_save_path: "{log_dir}/program_report.json" # Experiment results report path
fusion_bench_config/hydra/default.yaml
@@ -2,7 +2,11 @@ defaults:
   - override help: fusion_bench_help
   - override job_logging: rich_logging
 run:
-  dir: outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
+  dir: ${path.log_dir}
 sweep:
-  dir: outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
+  dir: ${path.log_dir}
   subdir: ${hydra.job.num}
+job:
+  env_set:
+    HYDRA_FULL_ERROR: ${oc.env:HYDRA_FULL_ERROR,1}
+output_subdir: ""
fusion_bench_config/llama_full_finetune.yaml
@@ -1,6 +1,7 @@
 defaults:
   - hydra: default
   - fabric: llama_fsdp
+  - path: default
   # --- Model, Method, Task ---
   - method: lm_finetune/fullfinetune_sft.yaml
   - modelpool: CausalLMPool/llama_alpaca_cleaned.yaml
fusion_bench_config/method/adamerging/clip.yaml
@@ -1,5 +1,5 @@
 # this option can be "clip_task_wise_adamerging"
-name: ???
+name: clip_layer_wise_adamerging
 # this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
 # if weights is specified, skip the test-time adaptation training
 weights: null
fusion_bench_config/method/bitdelta/bitdelta.yaml
@@ -0,0 +1,12 @@
+_target_: fusion_bench.method.bitdelta.BitDeltaAlgorithm
+save_dir: null
+save_full_model: false
+# training arguments
+lr: 1e-4
+batch_size: 4
+num_steps: 100
+# dataset arguments
+dataset_name: c4
+subset: en
+split: train
+max_length: 128
fusion_bench_config/method/depth_upscaling.yaml
@@ -1,5 +1,8 @@
 _target_: DepthUpscalingAlgorithm
-# this should be a list of integers or string, indicating the sequence of layers. If the entry is an integer, it will use the n-th layer of the model. If the entry is a string, it will use the layers specified by the string. The string should be a valid python expression that evaluates to a list of integers.
+# this should be a list of integers or string, indicating the sequence of layers.
+# If the entry is an integer, it will use the n-th layer of the model.
+# If the entry is a string, it will use the layers specified by the string.
+# The string should be a valid python expression that evaluates to a list of integers.
 # for example, ["range(0,12)", "range(6,12)"] will use the first 12 layers and the last 6 layers of the model to construct the new model
 # [0, 2, 4, "range(6,12)"] will use the 1st, 3rd, 5th, and the 7th to 12th layers of the model to construct the new model
 layer_indices: null
fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml
@@ -7,7 +7,6 @@ normalize_fisher_weight: true
 minimal_fisher_weight: 1e-6
 # common choices: 256, 512, 1024, 2048
 num_fisher_examples: 256
-zeroshot_weights_cache_dir: outputs/cache/clip_zeroshot_weights
 dataloader_kwargs:
   batch_size: 32
   num_workers: 0
fusion_bench_config/method/linear/simple_average_for_llama.yaml
@@ -1,5 +1,6 @@
 _target_: fusion_bench.method.SimpleAverageForLlama
 # set `merge_backbone` to true if you has a base model and only want to merge the backbone of the experts
 # if `merge_backbone` is False, this is equivalent to `SimpleAverageAlgorithm`
-merge_backbone: true
-model_save_path: null
+merge_backbone: false
+model_save_path: ${path.log_dir}/checkpoint
+show_pbar: true
fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml
@@ -0,0 +1,21 @@
+# Generic SMILE Upscaling Configuration for CausalLM models
+# Supports: Qwen2, Llama, Mistral models
+# The model type will be auto-detected from the base model
+_target_: fusion_bench.method.smile_upscaling.causal_lm_upscaling.SmileCausalLMUpscalingAlgorithm
+
+# Device and computation settings
+device: cuda # device to put the models on
+accelerator: cuda # device to perform SVD on
+
+# Model upscaling parameters
+num_experts_per_tok: 1 # Number of experts to activate per token
+rank_of_router: 8 # Rank for router weights
+rank_of_expert: 64 # Rank for expert weights
+
+# Model saving settings
+model_save_path: ${path.log_dir}/checkpoint # Set to save the merged model
+model_dtype: null # Optional: convert to specific dtype after merging
+save_with_remote_code: true
+
+# Optional: Explicitly specify model type instead of auto-detection
+model_type: null # Options: "qwen2", "llama", "mistral", or null for auto-detection
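The `rank_of_router` and `rank_of_expert` knobs control how aggressively SMILE compresses each linear layer: experts are built from low-rank SVD approximations of the fine-tuned weight deltas. A generic rank-k truncation sketch in PyTorch (the 4096x4096 shape is made up for illustration; this is not the algorithm's actual code):

import torch

W_delta = torch.randn(4096, 4096)  # illustrative expert weight difference
U, S, Vh = torch.linalg.svd(W_delta, full_matrices=False)
k = 64  # rank_of_expert from the config above
W_lowrank = (U[:, :k] * S[:k]) @ Vh[:k]  # best rank-k approximation of W_delta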
fusion_bench_config/method/smile_upscaling/error_accumulation.yaml
@@ -0,0 +1,5 @@
+# Measure error accumulation
+_target_: fusion_bench.method.smile_upscaling.error_accumulation.ErrorAccumulationAnalysisForCLIP
+gate_k: 16
+k: 128
+top_k: 1
fusion_bench_config/method/smile_upscaling/projected_energy.yaml
@@ -0,0 +1,2 @@
+# Measure projected energy
+_target_: fusion_bench.method.smile_upscaling.projected_energy.ProjectedEnergyAnalysis
fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml
@@ -4,10 +4,11 @@ device: cpu
 # device to perform SVD on
 accelerator: cuda
 # path to save/load the model
-model_path: null
+model_save_path: ${path.log_dir}/checkpoint
 model_dtype: null
 # SmileMoE parameters
 num_experts_per_tok: 1
 rank_of_router: 8
 # if rank_of_expert < 0, dense expert is used.
 rank_of_expert: 64
+save_with_remote_code: true
fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml
@@ -0,0 +1,20 @@
+_target_: fusion_bench.method.we_moe.flan_t5_we_moe.FlanT5WeightEnsemblingMoEAlgorithm
+# the path for loading the model weights, if specified, skip the test-time adaptation training
+checkpoint: False
+# the path for saving the model weights.
+save_checkpoint: False
+router_hidden_layers: 2
+init_lambda: 0.3
+batch_reduce: true
+# learning rate
+lr: 1e-4
+optimizer: adam
+# this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+devices: 1
+batch_size: 4
+num_workers: 0
+max_steps: 200
+# if true, we will use the gradient accumulation across tasks to save memory
+use_grad_accumulate: true
+cache_dir: outputs
+fast_dev_run: ${fast_dev_run}
fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml
@@ -2,11 +2,8 @@ _usage_: |
   defaults:
     - CLIPVisionModelPool@: _template
 _target_: fusion_bench.modelpool.CLIPVisionModelPool
-_version_: "0.2"
 _recursive_: False
 models: ???
 train_datasets: null
 test_datasets: null
-processor:
-  _target_: transformers.CLIPProcessor.from_pretrained
-  pretrained_model_name_or_path: openai/clip-vit-base-patch32
+processor: openai/clip-vit-base-patch32
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml
@@ -1,10 +1,5 @@
-defaults:
-  - CLIPVisionModelPool@: _template
+_target_: fusion_bench.modelpool.CLIPVisionModelPool
+_recursive_: False
 models:
-  _pretrained_:
-    _target_: transformers.CLIPVisionModel.from_pretrained
-    pretrained_model_name_or_path: ${...base_model}
-processor:
-  _target_: transformers.CLIPProcessor.from_pretrained
-  pretrained_model_name_or_path: ${..base_model}
-base_model: openai/clip-vit-base-patch32
+  _pretrained_: openai/clip-vit-base-patch32
+processor: ${.models._pretrained_}
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml
@@ -3,5 +3,5 @@ _recursive_: False
 processor: openai/clip-vit-base-patch32
 models:
   _pretrained_: openai/clip-vit-base-patch32
-  stanford-cars: tanganke/clip-vit-base-patch32_stanford-cars
+  finetuned: tanganke/clip-vit-base-patch32_stanford-cars
 platform: hf
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml
@@ -1,9 +1,3 @@
-defaults:
-  - CLIPVisionModelPool@: _template
-  - /model/clip-vit@models:
-      - clip-vit-base-patch32
-      - clip-vit-base-patch32_svhn
-      - clip-vit-base-patch32_mnist
 _target_: fusion_bench.modelpool.CLIPVisionModelPool
 _recursive_: False
 processor: openai/clip-vit-base-patch32
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml
@@ -24,7 +24,7 @@ models:
   _pretrained_: openai/clip-vit-large-patch14
   sun397: tanganke/clip-vit-large-patch14_sun397
   stanford-cars: tanganke/clip-vit-large-patch14_stanford-cars
-  resisc45: tanganke/clip-vit-large-patch14_dtd
+  resisc45: tanganke/clip-vit-large-patch14_resisc45
   eurosat: tanganke/clip-vit-large-patch14_eurosat
   svhn: tanganke/clip-vit-large-patch14_svhn
   gtsrb: tanganke/clip-vit-large-patch14_gtsrb
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml
@@ -5,7 +5,7 @@ models:
   _pretrained_: openai/clip-vit-large-patch14
   sun397: tanganke/clip-vit-large-patch14_sun397
   stanford-cars: tanganke/clip-vit-large-patch14_stanford-cars
-  resisc45: tanganke/clip-vit-large-patch14_dtd
+  resisc45: tanganke/clip-vit-large-patch14_resisc45
   eurosat: tanganke/clip-vit-large-patch14_eurosat
   svhn: tanganke/clip-vit-large-patch14_svhn
   gtsrb: tanganke/clip-vit-large-patch14_gtsrb
fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml
@@ -1,11 +1,11 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 
-load_lazy: false
+enable_lazy_loading: false
 models:
   _pretrained_: Qwen/Qwen2.5-1.5B
-  expert_1: Qwen/Qwen2.5-Math-1.5B
-  expert_2: Qwen/Qwen2.5-Coder-1.5B
+  math: Qwen/Qwen2.5-Math-1.5B
+  code: Qwen/Qwen2.5-Coder-1.5B
 model_kwargs:
   torch_dtype: bfloat16
 tokenizer: Qwen/Qwen2.5-1.5B
fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml
@@ -0,0 +1,9 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+_recursive_: false
+models:
+  _pretrained_: Qwen/Qwen2.5-7B
+  math: Qwen/Qwen2.5-Math-7B
+  code: Qwen/Qwen2.5-Coder-7B
+model_kwargs:
+  torch_dtype: bfloat16
+tokenizer: Qwen/Qwen2.5-7B
fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml
@@ -0,0 +1,6 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+models:
+  _pretrained_: mistralai/Mistral-7B-v0.1
+tokenizer: ${.models._pretrained_}
+model_kwargs:
+  torch_dtype: bfloat16
fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml
@@ -0,0 +1,10 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+models:
+  _pretrained_: path_to_your_pretrained_model
+  expert_1: path_to_your_expert_model_1
+  expert_2: path_to_your_expert_model_2
+  expert_3: path_to_your_expert_model_3
+  expert_4: path_to_your_expert_model_4
+tokenizer: ${.models._pretrained_}
+model_kwargs:
+  torch_dtype: bfloat16
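In the new flat mapping format, each model entry is just a name-to-path pair that the pool expands to the equivalent `AutoModelForCausalLM.from_pretrained` call at load time. A hedged usage sketch (not package code); `load_model` is an assumption based on the base model pool interface:

from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml"
)
pool = instantiate(cfg)  # builds a CausalLMPool; models are loaded on demand
# the path_to_your_* placeholders above must be replaced with real checkpoints
base = pool.load_model("_pretrained_")   # assumption: BaseModelPool exposes load_model
expert = pool.load_model("expert_1")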
fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml
@@ -1,17 +1,9 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 models:
-  _pretrained_:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: Qwen/Qwen2.5-1.5B
-  expert_1:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: Qwen/Qwen2.5-Math-1.5B
-  expert_2:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+  _pretrained_: Qwen/Qwen2.5-1.5B
+  expert_1: Qwen/Qwen2.5-Math-1.5B
+  expert_2: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 model_kwargs:
   torch_dtype: bfloat16
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: Qwen/Qwen2.5-1.5B
+tokenizer: Qwen/Qwen2.5-1.5B
fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml
@@ -1,20 +1,10 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 models:
-  _pretrained_:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: mistralai/Mistral-7B-v0.1
-  expert_1:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: meta-math/MetaMath-Mistral-7B
-  expert_2:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: cognitivecomputations/dolphin-2.1-mistral-7b
-  expert_3:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: uukuguy/speechless-code-mistral-7b-v1.0
+  _pretrained_: mistralai/Mistral-7B-v0.1
+  expert_1: meta-math/MetaMath-Mistral-7B
+  expert_2: cognitivecomputations/dolphin-2.1-mistral-7b
+  expert_3: uukuguy/speechless-code-mistral-7b-v1.0
 model_kwargs:
-  torch_dtype: float16
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: mistralai/Mistral-7B-v0.1
+  torch_dtype: bfloat16
+tokenizer: mistralai/Mistral-7B-v0.1
fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml
@@ -0,0 +1,8 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+_recursive_: false
+models:
+  _pretrained_: meta-llama/Llama-2-7b-hf
+  finetuned_model: lmsys/vicuna-7b-v1.5
+model_kwargs:
+  torch_dtype: bfloat16
+tokenizer: ${.models.finetuned_model}
fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml
@@ -1,4 +1,4 @@
-_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+_target_: fusion_bench.modelpool.SequenceClassificationModelPool
 pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
 models:
   _pretrained_:
fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml
@@ -1,4 +1,4 @@
-_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+_target_: fusion_bench.modelpool.SequenceClassificationModelPool
 pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k
 models:
   _pretrained_:
fusion_bench_config/nyuv2_config.yaml
@@ -1,8 +1,10 @@
 defaults:
   - hydra: default
   - fabric: auto
-  - modelpool: nyuv2_modelpool
+  - path: default
+  # --- Model, Method, Task ---
   - method: simple_average
+  - modelpool: nyuv2_modelpool
   - taskpool: nyuv2_taskpool
   - _self_
 _target_: fusion_bench.programs.FabricModelFusionProgram
fusion_bench_config/nyuv2_mtl_train.yaml
@@ -1,5 +1,6 @@
 defaults:
   - hydra: default
+  - path: default
   - _self_
 fast_dev_run: false
 exp_name: null
fusion_bench_config/path/default.yaml
@@ -0,0 +1,28 @@
+# =============================================================================
+# FusionBench Path Configuration
+# =============================================================================
+# This configuration file defines the directory structure and path settings
+# used throughout the FusionBench framework for model fusion experiments.
+# All paths are configured using Hydra's variable interpolation syntax.
+# Root directory - uses FUSION_BENCH_PROJECT_ROOT env var or current directory
+#
+# By default:
+#
+# root_dir (defaults to current directory)
+# ├── outputs (output_dir)
+# │   ├── cache (cache_dir)
+# │   └── <config_name>
+# │       └── <timestamp> (log_dir)
+# └── data (data_dir)
+#
+root_dir: ${oc.env:FUSION_BENCH_PROJECT_ROOT,"."}
+# Output directory for experiment results and artifacts
+output_dir: ${.root_dir}/outputs
+# Data directory - uses FUSION_BENCH_DATA_DIR env var or root_dir/data
+data_dir: ${oc.env:FUSION_BENCH_DATA_DIR,${.root_dir}/data}
+# Cache directory - uses FUSION_BENCH_CACHE_DIR env var or output_dir/cache
+cache_dir: ${oc.env:FUSION_BENCH_CACHE_DIR,${.output_dir}/cache}
+# Log directory with timestamped subdirectories for each run
+log_dir: ${.output_dir}/${hydra:job.config_name}/${now:%Y-%m-%d_%H-%M-%S}
+# Current working directory at runtime
+work_dir: ${hydra:runtime.cwd}
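The `${oc.env:VAR,default}` entries resolve against environment variables, so the whole directory layout can be relocated without editing any YAML. A standalone OmegaConf snippet (not FusionBench code) showing how this resolution behaves; the `${hydra:...}` entries only resolve inside a Hydra run and are omitted here:

import os
from omegaconf import OmegaConf

os.environ["FUSION_BENCH_PROJECT_ROOT"] = "/tmp/fb"
cfg = OmegaConf.create(
    {
        "root_dir": "${oc.env:FUSION_BENCH_PROJECT_ROOT,.}",
        "output_dir": "${root_dir}/outputs",
        "data_dir": "${oc.env:FUSION_BENCH_DATA_DIR,${root_dir}/data}",
    }
)
print(cfg.output_dir)  # /tmp/fb/outputs
print(cfg.data_dir)    # /tmp/fb/data (FUSION_BENCH_DATA_DIR not set)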
fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml
@@ -0,0 +1,24 @@
+defaults:
+  - /dataset/image_classification/test@test_datasets:
+      - svhn
+      - mnist
+_target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+_recursive_: false
+test_datasets: ??? # The datasets to evaluate the model on
+base_model: openai/clip-vit-base-patch32
+clip_model: ${.base_model} # The base model to use
+processor: ${.base_model} # The base model to use
+data_processor: ${.processor}
+dataloader_kwargs:
+  batch_size: 128 # The batch size for the data loader
+  num_workers: 8 # The number of worker processes for data loading
+  pin_memory: True # Whether to pin memory in data loader
+  drop_last: False # Whether to drop the last incomplete batch
+  shuffle: False # Whether to shuffle the data
+# === layer-wise feature saving ===
+# The path to save the features to, if none then the features are not saved
+# This is the path to a directory, the features of task `task_name` will be saved in `feature_save_path/task_name.csv`
+layer_wise_feature_save_path: null
+layer_wise_feature_first_token_only: true # Whether to save only the first token of the features
+# The maximum number of samples to save the features for
+layer_wise_feature_max_num: 1000
fusion_bench_config/method/adamerging.yaml
@@ -1,23 +0,0 @@
-# this option can be one of "clip_task_wise_adamerging" or "clip_layer_wise_adamerging"
-name: clip_layer_wise_adamerging
-# this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
-# if weights is specified, skip the test-time adaptation training
-weights: null
-# learning rate
-optimizer: adam
-lr: 1e-3
-init_values: 0.3
-# if `clamp_weights` is true, the weights will be clamped to [0, 1]
-clamp_weights: false
-# arguments of `functional_call`
-tie_weights: true
-strict: false
-# this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
-devices: 1
-batch_size: 16
-num_workers: 8
-max_steps: 1000
-fast_dev_run: ${fast_dev_run}
-# the path for saving the merging weights
-save_merging_weights: 'merging_weights.pt'
-cache_dir: outputs
fusion_bench_config/modelpool/mixtral_moe_merging.yaml
@@ -1,14 +0,0 @@
-type: AutoModelForCausalLMPool
-# each model should have a name and a path, and the model is loaded from the path
-# this is equivalent to `AutoModelForCausalLM.from_pretrained(path)`
-models:
-  - name: _pretrained_
-    path: path_to_your_pretrained_model
-  - name: expert_1
-    path: path_to_your_expert_model_1
-  - name: expert_2
-    path: path_to_your_expert_model_2
-  - name: expert_3
-    path: path_to_your_expert_model_3
-  - name: expert_4
-    path: path_to_your_expert_model_4
fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml
@@ -1,6 +0,0 @@
-type: AutoModelForCausalLMPool
-# each model should have a name and a path, and the model is loaded from the path
-# this is equivalent to `AutoModelForCausalLM.from_pretrained(path)`
-models:
-  - name: _pretrained_
-    path: path_to_your_pretrained_model