fusion-bench 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/method/base_algorithm.py +7 -2
- fusion_bench/compat/modelpool/__init__.py +3 -2
- fusion_bench/compat/taskpool/__init__.py +1 -1
- fusion_bench/dataset/arc_agi/__init__.py +6 -1
- fusion_bench/dataset/arc_agi/arc.py +26 -7
- fusion_bench/dataset/arc_agi/arc_agi.py +156 -25
- fusion_bench/dataset/arc_agi/np_cache.py +0 -1
- fusion_bench/dataset/arc_agi/preprocess.py +51 -9
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +93 -3
- fusion_bench/dataset/llama/collate.py +72 -5
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/method/__init__.py +4 -1
- fusion_bench/method/adamerging/__init__.py +1 -1
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
- fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
- fusion_bench/method/linear/expo.py +39 -0
- fusion_bench/method/lm_finetune/__init__.py +1 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +122 -150
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +102 -157
- fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
- fusion_bench/method/pruning/llama_random_prune.py +2 -2
- fusion_bench/method/pruning/magnitude_diff_pruning.py +2 -1
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/simple_average.py +1 -1
- fusion_bench/method/surgery/__init__.py +3 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/mixins/__init__.py +2 -0
- fusion_bench/mixins/clip_classification.py +60 -12
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +11 -2
- fusion_bench/modelpool/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/__init__.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +50 -9
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
- fusion_bench/models/utils.py +8 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
- fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +0 -2
- fusion_bench/programs/fabric_fusion_program.py +5 -1
- fusion_bench/taskpool/__init__.py +10 -2
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +2 -1
- fusion_bench/utils/hydra_utils.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/type.py +5 -3
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +104 -57
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +1 -1
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +13 -6
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +17 -9
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +1 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/nyuv2_config.yaml +5 -1
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
- fusion_bench_config/llama_weighted_average.yaml +0 -26
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: ??? # mandatory value (OmegaConf `???` marker): must be set by the user
|
|
2
|
+
# the path for loading the model weights, if specified, skip the test-time adaptation training
|
|
3
|
+
checkpoint: False
|
|
4
|
+
# the path for saving the model weights.
|
|
5
|
+
save_checkpoint: False
|
|
6
|
+
router_hidden_layers: 1
|
|
7
|
+
init_lambda: 0.3
|
|
8
|
+
batch_reduce: true
|
|
9
|
+
|
|
10
|
+
# device to compute svd
|
|
11
|
+
svd_accelerator: cuda
|
|
12
|
+
rank_k: 32 # How many experts are added to the pool per task?
|
|
13
|
+
select_k: -1 # How many experts are selected from the pool to merge? Range is (1, rank_k*task_num); the special value -1 selects all the experts in the pool
|
|
14
|
+
|
|
15
|
+
# learning rate
|
|
16
|
+
lr: 1e-4
|
|
17
|
+
optimizer: adam
|
|
18
|
+
# this is overridden by `fabric.devices` if launched from the `fusion_bench` CLI.
|
|
19
|
+
devices: 1
|
|
20
|
+
batch_size: 16
|
|
21
|
+
num_workers: 16
|
|
22
|
+
max_steps: 1000 # default: 1000
|
|
23
|
+
# if true, we will use the gradient accumulation across tasks to save memory
|
|
24
|
+
use_grad_accumulate: true
|
|
25
|
+
cache_dir: outputs
|
|
26
|
+
fast_dev_run: ${fast_dev_run}
|
|
@@ -3,6 +3,7 @@ _target_: fusion_bench.method.RegMeanAlgorithmForCLIP
|
|
|
3
3
|
exclude_param_names_regex: []
|
|
4
4
|
# numbers of examples to compute regmean weights
|
|
5
5
|
num_regmean_examples: 256
|
|
6
|
+
weight_transpose: true
|
|
6
7
|
# float, reduce non-diagonal elements in regmean weights by multiplying this scalar
|
|
7
8
|
reduce_non_diagonal_ratio: 0.6
|
|
8
9
|
dataloader_kwargs:
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# this option can be "clip_task_wise_adamerging"
|
|
2
|
+
name: clip_layer_wise_adamerging_surgery
|
|
3
|
+
# the weights can be a list of floats, or a string that points to a *.np or *.pt file containing the weights
|
|
4
|
+
# if weights is specified, skip the test-time adaptation training
|
|
5
|
+
weights: null
|
|
6
|
+
# learning rate
|
|
7
|
+
optimizer: adam
|
|
8
|
+
lr: 1e-3
|
|
9
|
+
init_values: 0.3
|
|
10
|
+
# if `clamp_weights` is true, the weights will be clamped to [0, 1]
|
|
11
|
+
clamp_weights: false
|
|
12
|
+
# arguments of `functional_call`
|
|
13
|
+
tie_weights: true
|
|
14
|
+
strict: false
|
|
15
|
+
# this is overridden by `fabric.devices` if launched from the `fusion_bench` CLI.
|
|
16
|
+
devices: 1
|
|
17
|
+
batch_size: 16
|
|
18
|
+
num_workers: 8
|
|
19
|
+
max_steps: 1000
|
|
20
|
+
fast_dev_run: ${fast_dev_run}
|
|
21
|
+
# the path for saving the merging weights
|
|
22
|
+
save_merging_weights: 'merging_weights.pt'
|
|
23
|
+
cache_dir: outputs
|
|
24
|
+
|
|
25
|
+
# parameters of Surgery
|
|
26
|
+
eval_iterations: 200
|
|
27
|
+
surgery_steps: 1000
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.CausalLMPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: transformers.AutoModelForCausalLM.from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
|
|
11
|
+
tokenizer:
|
|
12
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
13
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
14
|
+
|
|
15
|
+
train_datasets:
|
|
16
|
+
alpaca-cleaned:
|
|
17
|
+
_target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
|
|
18
|
+
tokenizer: ${...tokenizer}
|
|
19
|
+
path: "yahma/alpaca-cleaned"
|
|
20
|
+
split: train
|
|
21
|
+
cache_path: null
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.CausalLMPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: transformers.AutoModelForCausalLM.from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
|
|
11
|
+
tokenizer:
|
|
12
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
13
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
14
|
+
|
|
15
|
+
train_datasets:
|
|
16
|
+
codealpaca:
|
|
17
|
+
_target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
|
|
18
|
+
tokenizer: ${...tokenizer}
|
|
19
|
+
path: sahil2801/CodeAlpaca-20k
|
|
20
|
+
split: train
|
|
21
|
+
cache_path: null
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.CausalLMPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: transformers.AutoModelForCausalLM.from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
|
|
11
|
+
tokenizer:
|
|
12
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
13
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
14
|
+
|
|
15
|
+
train_datasets:
|
|
16
|
+
metamathqa:
|
|
17
|
+
_target_: fusion_bench.dataset.llama.metamathqa.load_tokenized_metamathqa
|
|
18
|
+
tokenizer: ${...tokenizer}
|
|
19
|
+
cache_path: null
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.CausalLMPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: transformers.AutoModelForCausalLM.from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
|
|
11
|
+
tokenizer:
|
|
12
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
13
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
14
|
+
|
|
15
|
+
train_datasets:
|
|
16
|
+
ultrachat-200k:
|
|
17
|
+
_target_: fusion_bench.dataset.llama.ultrachat.load_tokenized_ultrachat_200k
|
|
18
|
+
tokenizer: ${...tokenizer}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: fusion_bench.modelpool.seq_classification_lm.create_reward_model_from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
use_flash_attention_2: true
|
|
11
|
+
|
|
12
|
+
tokenizer:
|
|
13
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
14
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
15
|
+
pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
|
|
16
|
+
|
|
17
|
+
train_datasets:
|
|
18
|
+
preference_700k:
|
|
19
|
+
_target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
|
|
20
|
+
tokenizer: ${...tokenizer}
|
|
21
|
+
path: hendrydong/preference_700K
|
|
22
|
+
split: train
|
|
23
|
+
cache_path: null
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
|
|
2
|
+
|
|
3
|
+
pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k
|
|
4
|
+
|
|
5
|
+
models:
|
|
6
|
+
_pretrained_:
|
|
7
|
+
_target_: transformers.AutoModelForSequenceClassification.from_pretrained
|
|
8
|
+
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
|
|
9
|
+
torch_dtype: bfloat16
|
|
10
|
+
|
|
11
|
+
tokenizer:
|
|
12
|
+
_target_: transformers.AutoTokenizer.from_pretrained
|
|
13
|
+
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
|
|
14
|
+
pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
|
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
defaults:
|
|
2
2
|
- hydra: default
|
|
3
|
+
- fabric: auto
|
|
3
4
|
- modelpool: nyuv2_modelpool
|
|
4
5
|
- method: simple_average
|
|
5
6
|
- taskpool: nyuv2_taskpool
|
|
6
7
|
- _self_
|
|
8
|
+
|
|
9
|
+
_target_: fusion_bench.programs.FabricModelFusionProgram
|
|
10
|
+
_recursive_: false
|
|
11
|
+
|
|
7
12
|
fast_dev_run: false # Run a single batch of data to test the model or method
|
|
8
13
|
use_lightning: true # Use the fabric to run the experiment
|
|
9
14
|
print_config: true # Print the configuration to the console
|
|
10
15
|
save_report: false # path to save the result report
|
|
11
|
-
fabric: null
|
|
12
16
|
trainer:
|
|
13
17
|
devices: 1
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- CLIPVisionModelTaskPool@: _template
|
|
3
|
+
- /dataset/image_classification/test@test_datasets:
|
|
4
|
+
- sun397
|
|
5
|
+
- stanford-cars
|
|
6
|
+
- resisc45
|
|
7
|
+
- eurosat
|
|
8
|
+
- svhn
|
|
9
|
+
- gtsrb
|
|
10
|
+
- mnist
|
|
11
|
+
- dtd
|
|
12
|
+
- _self_
|
|
13
|
+
|
|
14
|
+
_target_: fusion_bench.taskpool.RankoneWEMoECLIPVisionModelTaskPool
|
|
15
|
+
|
|
16
|
+
# === layer-wise routing weights saving ===
|
|
17
|
+
layer_wise_routing_weights_save_path: null
|
|
18
|
+
layer_wise_routing_weights_max_num: 1000
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
_target_: fusion_bench.taskpool.llama.reward_model.RewardModelEvaluationTaskPool
|
|
2
|
+
|
|
3
|
+
test_datasets:
|
|
4
|
+
preference_700k:
|
|
5
|
+
_target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
|
|
6
|
+
tokenizer: ${...tokenizer}
|
|
7
|
+
path: hendrydong/preference_700K
|
|
8
|
+
split: train
|
|
9
|
+
cache_path: null
|
|
10
|
+
|
|
11
|
+
dataloader_kwargs:
|
|
12
|
+
shuffle: False
|
|
13
|
+
batch_size: 16
|
|
14
|
+
|
|
15
|
+
tokenizer: ${..modelpool.tokenizer}
|
|
16
|
+
|
|
17
|
+
max_num_samples: 1000
|
|
18
|
+
seed: 42
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
defaults:
|
|
2
|
-
- example_config
|
|
3
|
-
- override method: weighted_average_for_llama
|
|
4
|
-
- override modelpool: llama_for_causallm
|
|
5
|
-
- _self_
|
|
6
|
-
modelpool:
|
|
7
|
-
models:
|
|
8
|
-
# the pre-trained model (base model) is optional
|
|
9
|
-
# if not provided, the first model will be used as the base model
|
|
10
|
-
- name: _pretrained_
|
|
11
|
-
path: meta-llama/Meta-Llama-3-8B
|
|
12
|
-
- name: expert_1
|
|
13
|
-
path: meta-llama/Meta-Llama-3-8B
|
|
14
|
-
- name: expert_2
|
|
15
|
-
path: meta-llama/Meta-Llama-3-8B-Instruct
|
|
16
|
-
method:
|
|
17
|
-
normalize: true # if true, the weights will be normalized before merging
|
|
18
|
-
weights: # List of weights for each model
|
|
19
|
-
- 0.5
|
|
20
|
-
- 0.5
|
|
21
|
-
# if true, only the backbone of the model will be merged and the head will be kept as that of the pre-trained model (if the pre-trained model is provided; otherwise the head of the first model will be used)
|
|
22
|
-
# if false, the whole model will be merged
|
|
23
|
-
backbone_only: true
|
|
24
|
-
merged_model_save_path: null
|
|
25
|
-
save_tokenizer: true
|
|
26
|
-
push_to_hub: false
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|