fusion-bench 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +22 -2
- fusion_bench/_get_started/__init__.py +3 -0
- fusion_bench/_get_started/greeting_program.py +49 -0
- fusion_bench/compat/method/base_algorithm.py +14 -0
- fusion_bench/constants/__init__.py +6 -0
- fusion_bench/constants/clip_vision.py +26 -2
- fusion_bench/constants/paths.py +4 -0
- fusion_bench/constants/runtime.py +57 -0
- fusion_bench/dataset/clip_dataset.py +2 -1
- fusion_bench/dataset/gpt2_glue.py +9 -9
- fusion_bench/dataset/image_corruption/__init__.py +0 -0
- fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
- fusion_bench/dataset/image_dataset.py +1 -1
- fusion_bench/dataset/nyuv2.py +2 -2
- fusion_bench/method/__init__.py +24 -5
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
- fusion_bench/method/base_algorithm.py +195 -12
- fusion_bench/method/bitdelta/__init__.py +5 -0
- fusion_bench/method/bitdelta/bitdelta.py +156 -0
- fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
- fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
- fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
- fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
- fusion_bench/method/classification/clip_finetune.py +1 -1
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
- fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
- fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
- fusion_bench/method/doge_ta/doge_ta.py +1 -1
- fusion_bench/method/ensemble.py +12 -12
- fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -6
- fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
- fusion_bench/method/fw_merging/fw_hard.py +1 -1
- fusion_bench/method/fw_merging/fw_soft.py +1 -1
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
- fusion_bench/method/linear/expo.py +2 -1
- fusion_bench/method/linear/linear_interpolation.py +6 -4
- fusion_bench/method/linear/simple_average_for_llama.py +17 -13
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
- fusion_bench/method/model_recombination.py +2 -5
- fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
- fusion_bench/method/moe_pruner/utils/data.py +2 -1
- fusion_bench/method/moe_pruner/utils/prune.py +6 -1
- fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
- fusion_bench/method/pruning/wanda_utils/data.py +1 -2
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
- fusion_bench/method/randes/modelsoup.py +1 -3
- fusion_bench/method/regmean/clip_regmean.py +2 -2
- fusion_bench/method/regmean/gpt2_regmean.py +3 -10
- fusion_bench/method/regmean/regmean.py +2 -11
- fusion_bench/method/regmean_plusplus/__init__.py +1 -1
- fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +24 -17
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +56 -38
- fusion_bench/method/simple_average.py +12 -16
- fusion_bench/method/slerp/slerp.py +5 -2
- fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
- fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
- fusion_bench/method/smile_upscaling/projected_energy.py +144 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +71 -51
- fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
- fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
- fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
- fusion_bench/method/ties_merging/ties_merging.py +13 -26
- fusion_bench/method/we_moe/__init__.py +1 -0
- fusion_bench/method/we_moe/clip_we_moe.py +5 -4
- fusion_bench/method/we_moe/entropy_loss.py +25 -0
- fusion_bench/method/we_moe/flan_t5_we_moe.py +331 -0
- fusion_bench/method/we_moe/utils.py +15 -0
- fusion_bench/method/we_moe/we_moe.py +6 -6
- fusion_bench/method/weighted_average/llama.py +4 -16
- fusion_bench/metrics/continual_learning/__init__.py +1 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
- fusion_bench/metrics/nyuv2/__init__.py +2 -2
- fusion_bench/metrics/nyuv2/segmentation.py +1 -1
- fusion_bench/mixins/__init__.py +10 -2
- fusion_bench/mixins/clip_classification.py +15 -45
- fusion_bench/mixins/hydra_config.py +105 -7
- fusion_bench/mixins/lightning_fabric.py +2 -0
- fusion_bench/mixins/serialization.py +275 -48
- fusion_bench/modelpool/__init__.py +2 -2
- fusion_bench/modelpool/base_pool.py +29 -9
- fusion_bench/modelpool/causal_lm/causal_lm.py +41 -33
- fusion_bench/modelpool/clip_vision/modelpool.py +1 -3
- fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
- fusion_bench/models/__init__.py +7 -1
- fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
- fusion_bench/models/hf_utils.py +160 -0
- fusion_bench/models/linearized/linearized_model_utils.py +4 -4
- fusion_bench/models/linearized/vision_model.py +1 -1
- fusion_bench/models/model_card_templates/default.md +46 -0
- fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
- fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
- fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
- fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
- fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
- fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
- fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
- fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
- fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
- fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +698 -0
- fusion_bench/models/modeling_smile_llama/register.py +8 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
- fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +7 -12
- fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
- fusion_bench/models/parameter_dict.py +1 -1
- fusion_bench/models/sparse_we_moe.py +1 -53
- fusion_bench/models/utils.py +26 -0
- fusion_bench/models/we_moe.py +1 -53
- fusion_bench/models/wrappers/ensemble.py +6 -4
- fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
- fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
- fusion_bench/programs/base_program.py +81 -2
- fusion_bench/programs/fabric_fusion_program.py +46 -61
- fusion_bench/scripts/cli.py +38 -5
- fusion_bench/taskpool/base_pool.py +4 -3
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -22
- fusion_bench/taskpool/dummy.py +1 -1
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
- fusion_bench/tasks/clip_classification/__init__.py +6 -4
- fusion_bench/utils/__init__.py +7 -1
- fusion_bench/utils/cache_utils.py +101 -1
- fusion_bench/utils/devices.py +14 -4
- fusion_bench/utils/fabric.py +2 -2
- fusion_bench/utils/instantiate_utils.py +3 -1
- fusion_bench/utils/lazy_imports.py +23 -0
- fusion_bench/utils/lazy_state_dict.py +38 -3
- fusion_bench/utils/modelscope.py +127 -8
- fusion_bench/utils/parameters.py +2 -2
- fusion_bench/utils/path.py +56 -0
- fusion_bench/utils/pylogger.py +1 -1
- fusion_bench/utils/rich_utils.py +3 -0
- fusion_bench/utils/state_dict_arithmetic.py +25 -23
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/METADATA +24 -47
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/RECORD +184 -145
- fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
- fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
- fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
- fusion_bench_config/_get_started/greeting_program.yaml +4 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
- fusion_bench_config/fabric_model_fusion.yaml +45 -17
- fusion_bench_config/hydra/default.yaml +6 -2
- fusion_bench_config/llama_full_finetune.yaml +1 -0
- fusion_bench_config/method/adamerging/clip.yaml +1 -1
- fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
- fusion_bench_config/method/depth_upscaling.yaml +4 -1
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
- fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
- fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
- fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +2 -1
- fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -6
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +1 -1
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +3 -3
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
- fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
- fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
- fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
- fusion_bench_config/nyuv2_config.yaml +3 -1
- fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
- fusion_bench_config/path/default.yaml +28 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
- fusion_bench_config/method/adamerging.yaml +0 -23
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/top_level.txt +0 -0
- /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
fusion_bench_config/_get_started/clip_evaluate_single_model.yaml (ADDED)

```diff
@@ -0,0 +1,21 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+method:
+  _target_: fusion_bench.method.DummyAlgorithm
+modelpool:
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+taskpool:
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
```
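For context, here is a sketch (not shipped in the package) of how a config like the one above could be composed and run programmatically; the absolute config directory and the program's `run()` entry point are assumptions:

```python
# Hypothetical usage sketch: compose the config above and instantiate it.
from hydra import compose, initialize_config_dir
from hydra.utils import instantiate

# initialize_config_dir expects an absolute path; adjust to your checkout.
with initialize_config_dir(config_dir="/abs/path/to/fusion_bench_config", version_base=None):
    cfg = compose(config_name="_get_started/clip_evaluate_single_model")

# With `_recursive_: false`, method/modelpool/taskpool stay as raw configs
# and are instantiated later by the program itself.
program = instantiate(cfg)
program.run()  # assumed entry point of FabricModelFusionProgram
```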
fusion_bench_config/_get_started/clip_simple_average.yaml (ADDED)

```diff
@@ -0,0 +1,23 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram # (1)!
+_recursive_: false
+method: # (2)!
+  _target_: fusion_bench.method.SimpleAverageAlgorithm
+modelpool: # (3)!
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+    sun397: tanganke/clip-vit-base-patch32_sun397
+    stanford-cars: tanganke/clip-vit-base-patch32_stanford-cars
+taskpool: # (4)!
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
```
fusion_bench_config/_get_started/clip_task_arithmetic.yaml (ADDED)

```diff
@@ -0,0 +1,24 @@
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+method:
+  _target_: fusion_bench.method.TaskArithmeticAlgorithm
+  scaling_factor: 0.7
+modelpool:
+  _target_: fusion_bench.modelpool.CLIPVisionModelPool
+  models:
+    _pretrained_: openai/clip-vit-base-patch32
+    sun397: tanganke/clip-vit-base-patch32_sun397
+    stanford-cars: tanganke/clip-vit-base-patch32_stanford-cars
+taskpool:
+  _target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+  test_datasets:
+    sun397:
+      _target_: datasets.load_dataset
+      path: tanganke/sun397
+      split: test
+    stanford-cars:
+      _target_: datasets.load_dataset
+      path: tanganke/stanford_cars
+      split: test
+  clip_model: openai/clip-vit-base-patch32
+  processor: openai/clip-vit-base-patch32
```
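For orientation: `scaling_factor` above is the λ of standard task arithmetic, where the merged model is the pretrained model plus a scaled sum of task vectors (the differences between each fine-tuned model and the pretrained one). A minimal state-dict sketch of that rule, not the package's actual `TaskArithmeticAlgorithm` code:

```python
import torch

def task_arithmetic(pretrained_sd, finetuned_sds, scaling_factor=0.7):
    """merged = pretrained + scaling_factor * sum_i (finetuned_i - pretrained)."""
    # assumes all state dicts share keys and hold floating-point tensors
    merged = {k: v.clone() for k, v in pretrained_sd.items()}
    for sd in finetuned_sds:
        for key, base in pretrained_sd.items():
            # task vector for this expert and parameter
            merged[key] += scaling_factor * (sd[key] - base)
    return merged
```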
fusion_bench_config/fabric/loggers/csv_logger.yaml (CHANGED)

```diff
@@ -3,9 +3,9 @@ _target_: lightning.fabric.loggers.CSVLogger
 # for example, `outputs/logs/lightning_logs/version_0` and `outputs/logs/lightning_logs/version_1` by default
 
 # root directory for all logging
-root_dir:
+root_dir: ${path.log_dir}
 # the name of the experiment
-name:
-version:
+name: ""
+version: ""
 prefix: ""
 flush_logs_every_n_steps: 100
```
fusion_bench_config/fabric/loggers/tensorboard_logger.yaml (CHANGED)

```diff
@@ -3,9 +3,9 @@ _target_: lightning.fabric.loggers.TensorBoardLogger
 # for example, `outputs/logs/lightning_logs/version_0` and `outputs/logs/lightning_logs/version_1` by default
 
 # root directory for all logging
-root_dir:
+root_dir: ${path.log_dir}
 # the name of the experiment
-name: "
-version:
+name: ""
+version: ""
 sub_dir: null
 default_hp_metric: false
```
fusion_bench_config/fabric_model_fusion.yaml (CHANGED)

```diff
@@ -1,19 +1,47 @@
+# =============================================================================
+# FusionBench Fabric Model Fusion Configuration
+# =============================================================================
+# This configuration file defines the settings for running model fusion experiments
+# using PyTorch Lightning Fabric framework within FusionBench.
+#
+# The configuration includes:
+#
+# - Hydra framework settings and overrides
+# - PyTorch Lightning Fabric configuration for distributed training
+# - Path management for data, outputs, and logs
+# - (core components) Model pool, fusion method, and task pool specifications
+# - Experiment execution parameters and debugging options
+#
+# =============================================================================
+# Hydra Configuration Defaults
+# =============================================================================
 defaults:
-  - hydra: default
-  - fabric: auto
-
-
-  -
-  -
-  -
+  - hydra: default # Hydra framework configuration
+  - fabric: auto # PyTorch Lightning Fabric auto-configuration
+  - path: default # Path management configuration
+  # --- Core Components ---
+  - modelpool: CLIPVisionModelPool/clip-vit-base-patch32_TA8 # Model pool specification
+  - method: dummy # Fusion method (placeholder)
+  - taskpool: dummy # Task pool specification (placeholder)
+  - _self_ # Self-reference for override priority
+# =============================================================================
+# Program Configuration
+# =============================================================================
 _target_: fusion_bench.programs.FabricModelFusionProgram
-_recursive_: false
-
-#
-#
-
-
-
-
-
-
+_recursive_: false # Disable recursive instantiation
+# =============================================================================
+# Experiment Execution Settings
+# =============================================================================
+# Development and debugging options
+fast_dev_run: false # This option is for quick testing. For example, run single batch instead of full dataset
+dry_run: false # Show configuration without running experiment
+print_config: true # Display full configuration before execution
+print_function_call: true # Show detailed instantiation calls
+# =============================================================================
+# Output and Logging Configuration
+# =============================================================================
+# Model saving configuration
+merged_model_save_path: null # Path to save merged model.
+merged_model_save_kwargs: null # Additional kwargs for model saving.
+# Report generation
+report_save_path: "{log_dir}/program_report.json" # Experiment results report path
```
fusion_bench_config/hydra/default.yaml (CHANGED)

```diff
@@ -2,7 +2,11 @@ defaults:
   - override help: fusion_bench_help
   - override job_logging: rich_logging
 run:
-  dir:
+  dir: ${path.log_dir}
 sweep:
-  dir:
+  dir: ${path.log_dir}
   subdir: ${hydra.job.num}
+job:
+  env_set:
+    HYDRA_FULL_ERROR: ${oc.env:HYDRA_FULL_ERROR,1}
+output_subdir: ""
```
fusion_bench_config/method/adamerging/clip.yaml (CHANGED)

```diff
@@ -1,5 +1,5 @@
 # this option can be "clip_task_wise_adamerging"
-name:
+name: clip_layer_wise_adamerging
 # this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
 # if weights is specified, skip the test-time adaptation training
 weights: null
```
fusion_bench_config/method/depth_upscaling.yaml (CHANGED)

```diff
@@ -1,5 +1,8 @@
 _target_: DepthUpscalingAlgorithm
-# this should be a list of integers or string, indicating the sequence of layers.
+# this should be a list of integers or string, indicating the sequence of layers.
+# If the entry is an integer, it will use the n-th layer of the model.
+# If the entry is a string, it will use the layers specified by the string.
+# The string should be a valid python expression that evaluates to a list of integers.
 # for example, ["range(0,12)", "range(6,12)"] will use the first 12 layers and the last 6 layers of the model to construct the new model
 # [0, 2, 4, "range(6,12)"] will use the 1st, 3rd, 5th, and the 7th to 12th layers of the model to construct the new model
 layer_indices: null
```
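As the comments note, string entries in `layer_indices` are evaluated as Python expressions. A hypothetical helper (not the package's code) illustrating the expansion:

```python
def normalize_layer_indices(layer_indices):
    """Expand entries like [0, 2, 4, "range(6,12)"] into a flat list of ints."""
    indices = []
    for entry in layer_indices:
        if isinstance(entry, int):
            indices.append(entry)
        else:
            # string entries are Python expressions evaluating to an iterable of ints
            indices.extend(eval(entry))
    return indices

print(normalize_layer_indices([0, 2, 4, "range(6,12)"]))
# [0, 2, 4, 6, 7, 8, 9, 10, 11]
```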
fusion_bench_config/method/linear/simple_average_for_llama.yaml (CHANGED)

```diff
@@ -1,5 +1,6 @@
 _target_: fusion_bench.method.SimpleAverageForLlama
 # set `merge_backbone` to true if you has a base model and only want to merge the backbone of the experts
 # if `merge_backbone` is False, this is equivalent to `SimpleAverageAlgorithm`
-merge_backbone:
-model_save_path:
+merge_backbone: false
+model_save_path: ${path.log_dir}/checkpoint
+show_pbar: true
```
fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml (ADDED)

```diff
@@ -0,0 +1,21 @@
+# Generic SMILE Upscaling Configuration for CausalLM models
+# Supports: Qwen2, Llama, Mistral models
+# The model type will be auto-detected from the base model
+_target_: fusion_bench.method.smile_upscaling.causal_lm_upscaling.SmileCausalLMUpscalingAlgorithm
+
+# Device and computation settings
+device: cuda # device to put the models on
+accelerator: cuda # device to perform SVD on
+
+# Model upscaling parameters
+num_experts_per_tok: 1 # Number of experts to activate per token
+rank_of_router: 8 # Rank for router weights
+rank_of_expert: 64 # Rank for expert weights
+
+# Model saving settings
+model_save_path: ${path.log_dir}/checkpoint # Set to save the merged model
+model_dtype: null # Optional: convert to specific dtype after merging
+save_with_remote_code: true
+
+# Optional: Explicitly specify model type instead of auto-detection
+model_type: null # Options: "qwen2", "llama", "mistral", or null for auto-detection
```
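The `rank_of_router` / `rank_of_expert` options control how many singular directions SMILE keeps when it factorizes weights via SVD (performed on the `accelerator` device). A minimal sketch of rank truncation, assuming `torch`; an illustration, not the algorithm's actual routine:

```python
import torch

def low_rank_approx(weight: torch.Tensor, rank: int) -> torch.Tensor:
    """Keep only the top-`rank` singular directions of a 2-D weight matrix."""
    u, s, vh = torch.linalg.svd(weight, full_matrices=False)
    return u[:, :rank] @ torch.diag(s[:rank]) @ vh[:rank, :]

w = torch.randn(512, 512)
print(torch.linalg.matrix_rank(low_rank_approx(w, 8)))  # tensor(8)
```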
fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml (CHANGED)

```diff
@@ -4,10 +4,11 @@ device: cpu
 # device to perform SVD on
 accelerator: cuda
 # path to save/load the model
-
+model_save_path: ${path.log_dir}/checkpoint
 model_dtype: null
 # SmileMoE parameters
 num_experts_per_tok: 1
 rank_of_router: 8
 # if rank_of_expert < 0, dense expert is used.
 rank_of_expert: 64
+save_with_remote_code: true
```
fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml (ADDED)

```diff
@@ -0,0 +1,20 @@
+_target_: fusion_bench.method.we_moe.flan_t5_we_moe.FlanT5WeightEnsemblingMoEAlgorithm
+# the path for loading the model weights, if specified, skip the test-time adaptation training
+checkpoint: False
+# the path for saving the model weights.
+save_checkpoint: False
+router_hidden_layers: 2
+init_lambda: 0.3
+batch_reduce: true
+# learning rate
+lr: 1e-4
+optimizer: adam
+# this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+devices: 1
+batch_size: 4
+num_workers: 0
+max_steps: 200
+# if true, we will use the gradient accumulation across tasks to save memory
+use_grad_accumulate: true
+cache_dir: outputs
+fast_dev_run: ${fast_dev_run}
```
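When `checkpoint` is False, this config drives test-time adaptation, which in WEMoE-style methods minimizes the entropy of model predictions on unlabeled test batches (this release also adds `fusion_bench/method/we_moe/entropy_loss.py`). A schematic of that objective, not the package's implementation:

```python
import torch
import torch.nn.functional as F

def entropy_loss(logits: torch.Tensor) -> torch.Tensor:
    """Mean Shannon entropy of the predictive distribution (lower = more confident)."""
    log_probs = F.log_softmax(logits, dim=-1)
    return -(log_probs.exp() * log_probs).sum(dim=-1).mean()

# During adaptation, only the routing weights are updated to minimize this
# loss on unlabeled test batches (schematic; shapes are illustrative).
loss = entropy_loss(torch.randn(4, 10))
```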
fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml (CHANGED)

```diff
@@ -2,11 +2,8 @@ _usage_: |
   defaults:
     - CLIPVisionModelPool@: _template
 _target_: fusion_bench.modelpool.CLIPVisionModelPool
-_version_: "0.2"
 _recursive_: False
 models: ???
 train_datasets: null
 test_datasets: null
-processor:
-  _target_: transformers.CLIPProcessor.from_pretrained
-  pretrained_model_name_or_path: openai/clip-vit-base-patch32
+processor: openai/clip-vit-base-patch32
```
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml (CHANGED)

```diff
@@ -1,10 +1,5 @@
-
-
+_target_: fusion_bench.modelpool.CLIPVisionModelPool
+_recursive_: False
 models:
-  _pretrained_:
-
-    pretrained_model_name_or_path: ${...base_model}
-processor:
-  _target_: transformers.CLIPProcessor.from_pretrained
-  pretrained_model_name_or_path: ${..base_model}
-base_model: openai/clip-vit-base-patch32
+  _pretrained_: openai/clip-vit-base-patch32
+processor: ${.models._pretrained_}
```
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml (CHANGED)

```diff
@@ -1,9 +1,3 @@
-defaults:
-  - CLIPVisionModelPool@: _template
-  - /model/clip-vit@models:
-      - clip-vit-base-patch32
-      - clip-vit-base-patch32_svhn
-      - clip-vit-base-patch32_mnist
 _target_: fusion_bench.modelpool.CLIPVisionModelPool
 _recursive_: False
 processor: openai/clip-vit-base-patch32
```
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml (CHANGED)

```diff
@@ -24,7 +24,7 @@ models:
   _pretrained_: openai/clip-vit-large-patch14
   sun397: tanganke/clip-vit-large-patch14_sun397
   stanford-cars: tanganke/clip-vit-large-patch14_stanford-cars
-  resisc45: tanganke/clip-vit-large-
+  resisc45: tanganke/clip-vit-large-patch14_resisc45
   eurosat: tanganke/clip-vit-large-patch14_eurosat
   svhn: tanganke/clip-vit-large-patch14_svhn
   gtsrb: tanganke/clip-vit-large-patch14_gtsrb
```
fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml (CHANGED)

```diff
@@ -5,7 +5,7 @@ models:
   _pretrained_: openai/clip-vit-large-patch14
   sun397: tanganke/clip-vit-large-patch14_sun397
   stanford-cars: tanganke/clip-vit-large-patch14_stanford-cars
-  resisc45: tanganke/clip-vit-large-
+  resisc45: tanganke/clip-vit-large-patch14_resisc45
   eurosat: tanganke/clip-vit-large-patch14_eurosat
   svhn: tanganke/clip-vit-large-patch14_svhn
   gtsrb: tanganke/clip-vit-large-patch14_gtsrb
```
fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml (CHANGED)

```diff
@@ -1,11 +1,11 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 
-
+enable_lazy_loading: false
 models:
   _pretrained_: Qwen/Qwen2.5-1.5B
-
-
+  math: Qwen/Qwen2.5-Math-1.5B
+  code: Qwen/Qwen2.5-Coder-1.5B
 model_kwargs:
   torch_dtype: bfloat16
 tokenizer: Qwen/Qwen2.5-1.5B
```
fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml (ADDED)

```diff
@@ -0,0 +1,10 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+models:
+  _pretrained_: path_to_your_pretrained_model
+  expert_1: path_to_your_expert_model_1
+  expert_2: path_to_your_expert_model_2
+  expert_3: path_to_your_expert_model_3
+  expert_4: path_to_your_expert_model_4
+tokenizer: ${.models._pretrained_}
+model_kwargs:
+  torch_dtype: bfloat16
```
fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml (CHANGED)

```diff
@@ -1,17 +1,9 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 models:
-  _pretrained_:
-
-
-  expert_1:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: Qwen/Qwen2.5-Math-1.5B
-  expert_2:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+  _pretrained_: Qwen/Qwen2.5-1.5B
+  expert_1: Qwen/Qwen2.5-Math-1.5B
+  expert_2: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 model_kwargs:
   torch_dtype: bfloat16
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: Qwen/Qwen2.5-1.5B
+tokenizer: Qwen/Qwen2.5-1.5B
```
fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml (CHANGED)

```diff
@@ -1,20 +1,10 @@
 _target_: fusion_bench.modelpool.CausalLMPool
 _recursive_: false
 models:
-  _pretrained_:
-
-
-
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: meta-math/MetaMath-Mistral-7B
-  expert_2:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: cognitivecomputations/dolphin-2.1-mistral-7b
-  expert_3:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: uukuguy/speechless-code-mistral-7b-v1.0
+  _pretrained_: mistralai/Mistral-7B-v0.1
+  expert_1: meta-math/MetaMath-Mistral-7B
+  expert_2: cognitivecomputations/dolphin-2.1-mistral-7b
+  expert_3: uukuguy/speechless-code-mistral-7b-v1.0
 model_kwargs:
-  torch_dtype:
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: mistralai/Mistral-7B-v0.1
+  torch_dtype: bfloat16
+tokenizer: mistralai/Mistral-7B-v0.1
```
fusion_bench_config/nyuv2_config.yaml (CHANGED)

```diff
@@ -1,8 +1,10 @@
 defaults:
   - hydra: default
   - fabric: auto
-  -
+  - path: default
+  # --- Model, Method, Task ---
   - method: simple_average
+  - modelpool: nyuv2_modelpool
   - taskpool: nyuv2_taskpool
   - _self_
 _target_: fusion_bench.programs.FabricModelFusionProgram
```
fusion_bench_config/path/default.yaml (ADDED)

```diff
@@ -0,0 +1,28 @@
+# =============================================================================
+# FusionBench Path Configuration
+# =============================================================================
+# This configuration file defines the directory structure and path settings
+# used throughout the FusionBench framework for model fusion experiments.
+# All paths are configured using Hydra's variable interpolation syntax.
+# Root directory - uses FUSION_BENCH_PROJECT_ROOT env var or current directory
+#
+# By default:
+#
+# root_dir (defaults to current directory)
+# ├── outputs (output_dir)
+# │   ├── cache (cache_dir)
+# │   └── <config_name>
+# │       └── <timestamp> (log_dir)
+# └── data (data_dir)
+#
+root_dir: ${oc.env:FUSION_BENCH_PROJECT_ROOT,"."}
+# Output directory for experiment results and artifacts
+output_dir: ${.root_dir}/outputs
+# Data directory - uses FUSION_BENCH_DATA_DIR env var or root_dir/data
+data_dir: ${oc.env:FUSION_BENCH_DATA_DIR,${.root_dir}/data}
+# Cache directory - uses FUSION_BENCH_CACHE_DIR env var or output_dir/cache
+cache_dir: ${oc.env:FUSION_BENCH_CACHE_DIR,${.output_dir}/cache}
+# Log directory with timestamped subdirectories for each run
+log_dir: ${.output_dir}/${hydra:job.config_name}/${now:%Y-%m-%d_%H-%M-%S}
+# Current working directory at runtime
+work_dir: ${hydra:runtime.cwd}
```
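The `${oc.env:VAR,default}` entries above resolve through OmegaConf's built-in `oc.env` resolver. A small standalone demo of the resolution behavior (the `${hydra:...}` interpolations are omitted because they require a running Hydra app):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "root_dir": "${oc.env:FUSION_BENCH_PROJECT_ROOT,'.'}",
        "output_dir": "${root_dir}/outputs",
        # the default of oc.env may itself be an interpolation
        "cache_dir": "${oc.env:FUSION_BENCH_CACHE_DIR,${output_dir}/cache}",
    }
)
print(cfg.cache_dir)  # ./outputs/cache  (when neither env var is set)
```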
fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml (ADDED)

```diff
@@ -0,0 +1,24 @@
+defaults:
+  - /dataset/image_classification/test@test_datasets:
+      - svhn
+      - mnist
+_target_: fusion_bench.taskpool.CLIPVisionModelTaskPool
+_recursive_: false
+test_datasets: ??? # The datasets to evaluate the model on
+base_model: openai/clip-vit-base-patch32
+clip_model: ${.base_model} # The base model to use
+processor: ${.base_model} # The base model to use
+data_processor: ${.processor}
+dataloader_kwargs:
+  batch_size: 128 # The batch size for the data loader
+  num_workers: 8 # The number of worker processes for data loading
+  pin_memory: True # Whether to pin memory in data loader
+  drop_last: False # Whether to drop the last incomplete batch
+  shuffle: False # Whether to shuffle the data
+# === layer-wise feature saving ===
+# The path to save the features to, if none then the features are not saved
+# This is the path to a directory, the features of task `task_name` will be saved in `feature_save_path/task_name.csv`
+layer_wise_feature_save_path: null
+layer_wise_feature_first_token_only: true # Whether to save only the first token of the features
+# The maximum number of samples to save the features for
+layer_wise_feature_max_num: 1000
```
fusion_bench_config/method/adamerging.yaml (DELETED)

```diff
@@ -1,23 +0,0 @@
-# this option can be one of "clip_task_wise_adamerging" or "clip_layer_wise_adamerging"
-name: clip_layer_wise_adamerging
-# this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
-# if weights is specified, skip the test-time adaptation training
-weights: null
-# learning rate
-optimizer: adam
-lr: 1e-3
-init_values: 0.3
-# if `clamp_weights` is true, the weights will be clamped to [0, 1]
-clamp_weights: false
-# arguments of `functional_call`
-tie_weights: true
-strict: false
-# this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
-devices: 1
-batch_size: 16
-num_workers: 8
-max_steps: 1000
-fast_dev_run: ${fast_dev_run}
-# the path for saving the merging weights
-save_merging_weights: 'merging_weights.pt'
-cache_dir: outputs
```
fusion_bench_config/modelpool/mixtral_moe_merging.yaml (DELETED)

```diff
@@ -1,14 +0,0 @@
-type: AutoModelForCausalLMPool
-# each model should have a name and a path, and the model is loaded from the path
-# this is equivalent to `AutoModelForCausalLM.from_pretrained(path)`
-models:
-  - name: _pretrained_
-    path: path_to_your_pretrained_model
-  - name: expert_1
-    path: path_to_your_expert_model_1
-  - name: expert_2
-    path: path_to_your_expert_model_2
-  - name: expert_3
-    path: path_to_your_expert_model_3
-  - name: expert_4
-    path: path_to_your_expert_model_4
```