fusion-bench 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. fusion_bench/compat/method/__init__.py +2 -0
  2. fusion_bench/compat/method/base_algorithm.py +7 -2
  3. fusion_bench/compat/modelpool/__init__.py +3 -2
  4. fusion_bench/compat/taskpool/__init__.py +1 -1
  5. fusion_bench/dataset/arc_agi/__init__.py +6 -1
  6. fusion_bench/dataset/arc_agi/arc.py +26 -7
  7. fusion_bench/dataset/arc_agi/arc_agi.py +156 -25
  8. fusion_bench/dataset/arc_agi/np_cache.py +0 -1
  9. fusion_bench/dataset/arc_agi/preprocess.py +51 -9
  10. fusion_bench/dataset/llama/__init__.py +1 -0
  11. fusion_bench/dataset/llama/alpaca.py +93 -3
  12. fusion_bench/dataset/llama/collate.py +72 -5
  13. fusion_bench/dataset/llama/metamathqa.py +50 -0
  14. fusion_bench/dataset/llama/preference_700k.py +70 -0
  15. fusion_bench/dataset/llama/stanford_shp.py +90 -0
  16. fusion_bench/dataset/llama/ultrachat.py +58 -0
  17. fusion_bench/dataset/llama/utils/__init__.py +0 -0
  18. fusion_bench/method/__init__.py +4 -1
  19. fusion_bench/method/adamerging/__init__.py +1 -1
  20. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
  21. fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
  22. fusion_bench/method/linear/expo.py +39 -0
  23. fusion_bench/method/lm_finetune/__init__.py +1 -0
  24. fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
  25. fusion_bench/method/lm_finetune/fullfinetune_sft.py +122 -150
  26. fusion_bench/method/lm_finetune/peftfinetune_sft.py +102 -157
  27. fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
  28. fusion_bench/method/pruning/llama_random_prune.py +2 -2
  29. fusion_bench/method/pruning/magnitude_diff_pruning.py +2 -1
  30. fusion_bench/method/rankone_moe/__init__.py +3 -0
  31. fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
  32. fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
  33. fusion_bench/method/simple_average.py +1 -1
  34. fusion_bench/method/surgery/__init__.py +3 -0
  35. fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
  36. fusion_bench/mixins/__init__.py +2 -0
  37. fusion_bench/mixins/clip_classification.py +60 -12
  38. fusion_bench/mixins/fabric_training.py +320 -0
  39. fusion_bench/mixins/lightning_fabric.py +11 -2
  40. fusion_bench/modelpool/__init__.py +2 -0
  41. fusion_bench/modelpool/causal_lm/__init__.py +1 -1
  42. fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
  43. fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
  44. fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
  45. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
  46. fusion_bench/models/chat_templates/__init__.py +1 -0
  47. fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
  48. fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
  49. fusion_bench/models/hf_clip.py +50 -9
  50. fusion_bench/models/rankone_moe.py +410 -0
  51. fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
  52. fusion_bench/models/utils.py +8 -0
  53. fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
  54. fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
  55. fusion_bench/optim/__init__.py +2 -0
  56. fusion_bench/optim/exception.py +47 -0
  57. fusion_bench/optim/lr_scheduler/__init__.py +1 -0
  58. fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
  59. fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
  60. fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
  61. fusion_bench/optim/mezo.py +0 -2
  62. fusion_bench/programs/fabric_fusion_program.py +5 -1
  63. fusion_bench/taskpool/__init__.py +10 -2
  64. fusion_bench/taskpool/clip_vision/__init__.py +1 -0
  65. fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
  66. fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
  67. fusion_bench/taskpool/llama/reward_model.py +157 -0
  68. fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
  69. fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +2 -1
  70. fusion_bench/utils/hydra_utils.py +22 -0
  71. fusion_bench/utils/plot/__init__.py +0 -0
  72. fusion_bench/utils/plot/token.py +52 -0
  73. fusion_bench/utils/plot/token_notebook.py +127 -0
  74. fusion_bench/utils/type.py +5 -3
  75. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
  76. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +104 -57
  77. fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
  78. fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
  79. fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
  80. fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
  81. fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
  82. fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
  83. fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
  84. fusion_bench_config/fabric_model_fusion.yaml +1 -1
  85. fusion_bench_config/llama_full_finetune.yaml +19 -0
  86. fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
  87. fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +13 -6
  88. fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +17 -9
  89. fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
  90. fusion_bench_config/method/regmean/clip_regmean.yaml +1 -0
  91. fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
  92. fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
  93. fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
  94. fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
  95. fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
  96. fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
  97. fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
  98. fusion_bench_config/nyuv2_config.yaml +5 -1
  99. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
  100. fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
  101. fusion_bench_config/llama_weighted_average.yaml +0 -26
  102. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
  103. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
  104. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
  105. {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0
fusion_bench_config/method/rankone_moe/rankone_moe.yaml (new file)
@@ -0,0 +1,26 @@
+ name: ??? # this can be
+ # the path for loading the model weights, if specified, skip the test-time adaptation training
+ checkpoint: False
+ # the path for saving the model weights.
+ save_checkpoint: False
+ router_hidden_layers: 1
+ init_lambda: 0.3
+ batch_reduce: true
+
+ # device to compute svd
+ svd_accelerator: cuda
+ rank_k: 32 # How many experts are added to the pool per task?
+ select_k: -1 # How many experts are selected from the pool to merge? Range is (1, rank_k*task_num). In particular -1: All the experts in the pool
+
+ # learning rate
+ lr: 1e-4
+ optimizer: adam
+ # this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+ devices: 1
+ batch_size: 16
+ num_workers: 16
+ max_steps: 1000 # default: 1000
+ # if true, we will use the gradient accumulation across tasks to save memory
+ use_grad_accumulate: true
+ cache_dir: outputs
+ fast_dev_run: ${fast_dev_run}
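The two pool parameters above interact: per the inline comments, each task contributes `rank_k` rank-one experts to a shared pool, and `select_k` controls how many are merged, with -1 meaning the whole pool. A minimal sketch of that sizing arithmetic, assuming an 8-task setup such as the CLIP TA8 taskpool listed above (the task count is an assumption for illustration only):

    # Hedged sketch of the pool sizing implied by the rank_k / select_k comments above.
    rank_k = 32        # experts added to the pool per task
    num_tasks = 8      # assumed, e.g. the CLIP TA8 setting
    select_k = -1      # -1 selects every expert in the pool

    pool_size = rank_k * num_tasks                        # 256 experts in total
    num_selected = pool_size if select_k == -1 else select_k
    assert 1 <= num_selected <= pool_size
    print(pool_size, num_selected)                        # 256 256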
fusion_bench_config/method/regmean/clip_regmean.yaml
@@ -3,6 +3,7 @@ _target_: fusion_bench.method.RegMeanAlgorithmForCLIP
  exclude_param_names_regex: []
  # numbers of examples to compute regmean weights
  num_regmean_examples: 256
+ weight_transpose: true
  # float, reduce non-diagonal elements in regmean weights by multiplying this scalar
  reduce_non_diagonal_ratio: 0.6
  dataloader_kwargs:
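The only change here is the new `weight_transpose` flag, which ships as true in 0.2.7. A minimal sketch of flipping it from code, assuming the config file is loaded standalone with OmegaConf (file path taken from the listing above); when running through the `fusion_bench` CLI, the equivalent Hydra-style override would presumably be `method.weight_transpose=false`.

    from omegaconf import OmegaConf

    # Hedged sketch: load the shipped method config and override the new flag.
    cfg = OmegaConf.load("fusion_bench_config/method/regmean/clip_regmean.yaml")
    cfg.weight_transpose = False   # shipped default in 0.2.7 is true
    print(OmegaConf.to_yaml(cfg))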
fusion_bench_config/method/surgery/adamerging_surgery.yaml (new file)
@@ -0,0 +1,27 @@
+ # this option can be "clip_task_wise_adamerging"
+ name: clip_layer_wise_adamerging_surgery
+ # this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
+ # if weights is specified, skip the test-time adaptation training
+ weights: null
+ # learning rate
+ optimizer: adam
+ lr: 1e-3
+ init_values: 0.3
+ # if `clamp_weights` is true, the weights will be clamped to [0, 1]
+ clamp_weights: false
+ # arguments of `functional_call`
+ tie_weights: true
+ strict: false
+ # this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+ devices: 1
+ batch_size: 16
+ num_workers: 8
+ max_steps: 1000
+ fast_dev_run: ${fast_dev_run}
+ # the path for saving the merging weights
+ save_merging_weights: 'merging_weights.pt'
+ cache_dir: outputs
+
+ # parameters of Surgery
+ eval_iterations: 200
+ surgery_steps: 1000
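Per the comments above, `weights` can point at a saved *.pt (or *.np) file to skip the test-time adaptation phase, and `save_merging_weights` controls where a run writes them. A minimal sketch of reusing a previous run's output, assuming it saved `merging_weights.pt` in the working directory:

    import torch

    # Hedged sketch: inspect previously saved merging weights, then feed the file
    # back via `weights: merging_weights.pt` (or a CLI override) to skip adaptation.
    merging_weights = torch.load("merging_weights.pt", map_location="cpu")
    print(type(merging_weights))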
fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml (new file)
@@ -0,0 +1,21 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   alpaca-cleaned:
+     _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+     tokenizer: ${...tokenizer}
+     path: "yahma/alpaca-cleaned"
+     split: train
+     cache_path: null
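The `${..}` and `${...}` references above are OmegaConf relative interpolations: each extra dot walks one level up the config tree, so `${...pretrained_model_name_or_path}` inside `models._pretrained_` and `${..pretrained_model_name_or_path}` inside `tokenizer` both resolve to the top-level `pretrained_model_name_or_path` field. A minimal sketch, assuming the file is loaded standalone with OmegaConf from a source checkout:

    from omegaconf import OmegaConf

    # Hedged sketch: relative interpolations resolve lazily on access.
    cfg = OmegaConf.load(
        "fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml"
    )
    # Both lines print "meta-llama/Llama-3.2-1B-Instruct".
    print(cfg.models._pretrained_.pretrained_model_name_or_path)
    print(cfg.tokenizer.pretrained_model_name_or_path)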
fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml (new file)
@@ -0,0 +1,21 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   codealpaca:
+     _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+     tokenizer: ${...tokenizer}
+     path: sahil2801/CodeAlpaca-20k
+     split: train
+     cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml (new file)
@@ -0,0 +1,19 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   metamathqa:
+     _target_: fusion_bench.dataset.llama.metamathqa.load_tokenized_metamathqa
+     tokenizer: ${...tokenizer}
+     cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml (new file)
@@ -0,0 +1,18 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   ultrachat-200k:
+     _target_: fusion_bench.dataset.llama.ultrachat.load_tokenized_ultrachat_200k
+     tokenizer: ${...tokenizer}
fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml (new file)
@@ -0,0 +1,23 @@
+ _target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: fusion_bench.modelpool.seq_classification_lm.create_reward_model_from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+     use_flash_attention_2: true
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+   pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
+
+ train_datasets:
+   preference_700k:
+     _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+     tokenizer: ${...tokenizer}
+     path: hendrydong/preference_700K
+     split: train
+     cache_path: null
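The tokenizer block here differs from the causal-LM pools above in one detail: it pins `pad_token` to <|end_of_text|> because, as the comment notes, the EOS token terminates each content turn and should not double as padding when training the reward model. A minimal sketch of what that block amounts to when instantiated (direct transformers call shown for illustration; Hydra performs the equivalent via `_target_`):

    from transformers import AutoTokenizer

    # Hedged sketch: extra kwargs to from_pretrained set special tokens,
    # so padding uses <|end_of_text|> instead of the EOS token.
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        pad_token="<|end_of_text|>",
    )
    print(tokenizer.pad_token, tokenizer.pad_token_id)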
fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml (new file)
@@ -0,0 +1,14 @@
+ _target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+ pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForSequenceClassification.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+   pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
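For evaluation, this pool points `_pretrained_` straight at AutoModelForSequenceClassification.from_pretrained, loading the published Bradley-Terry reward model in bfloat16. A minimal sketch of what that `_target_` node amounts to when Hydra instantiates it (direct call shown for illustration; identifiers taken from the config above):

    import torch
    from transformers import AutoModelForSequenceClassification

    # Hedged sketch: equivalent of instantiating the `_pretrained_` node above.
    model = AutoModelForSequenceClassification.from_pretrained(
        "fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k",
        torch_dtype=torch.bfloat16,
    )
    print(model.config.num_labels)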
fusion_bench_config/nyuv2_config.yaml
@@ -1,13 +1,17 @@
  defaults:
    - hydra: default
+   - fabric: auto
    - modelpool: nyuv2_modelpool
    - method: simple_average
    - taskpool: nyuv2_taskpool
    - _self_
+
+ _target_: fusion_bench.programs.FabricModelFusionProgram
+ _recursive_: false
+
  fast_dev_run: false # Run a single batch of data to test the model or method
  use_lightning: true # Use the fabric to run the experiment
  print_config: true # Print the configuration to the console
  save_report: false # path to save the result report
- fabric: null
  trainer:
    devices: 1
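The notable change is that the NYUv2 config now targets fusion_bench.programs.FabricModelFusionProgram with `_recursive_: false`, so Hydra instantiates the program itself but hands the nested method/modelpool/taskpool nodes over un-instantiated, presumably for the program to build lazily. A minimal sketch of that Hydra behavior, using builtins.dict as a stand-in target:

    from hydra.utils import instantiate
    from omegaconf import DictConfig, OmegaConf

    # Hedged sketch: with _recursive_ false, nested nodes stay as DictConfig
    # instead of being instantiated eagerly.
    cfg = OmegaConf.create({
        "_target_": "builtins.dict",        # stand-in for FabricModelFusionProgram
        "_recursive_": False,
        "method": {"_target_": "builtins.dict", "lr": 1e-3},
    })
    obj = instantiate(cfg)
    assert isinstance(obj["method"], DictConfig)   # left for lazy instantiation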
fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml (new file)
@@ -0,0 +1,18 @@
+ defaults:
+   - CLIPVisionModelTaskPool@: _template
+   - /dataset/image_classification/test@test_datasets:
+       - sun397
+       - stanford-cars
+       - resisc45
+       - eurosat
+       - svhn
+       - gtsrb
+       - mnist
+       - dtd
+   - _self_
+
+ _target_: fusion_bench.taskpool.RankoneWEMoECLIPVisionModelTaskPool
+
+ # === layer-wise routing weights saving ===
+ layer_wise_routing_weights_save_path: null
+ layer_wise_routing_weights_max_num: 1000
fusion_bench_config/taskpool/reward_model_evaluation.yaml (new file)
@@ -0,0 +1,18 @@
+ _target_: fusion_bench.taskpool.llama.reward_model.RewardModelEvaluationTaskPool
+
+ test_datasets:
+   preference_700k:
+     _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+     tokenizer: ${...tokenizer}
+     path: hendrydong/preference_700K
+     split: train
+     cache_path: null
+
+ dataloader_kwargs:
+   shuffle: False
+   batch_size: 16
+
+ tokenizer: ${..modelpool.tokenizer}
+
+ max_num_samples: 1000
+ seed: 42
fusion_bench_config/llama_weighted_average.yaml (removed)
@@ -1,26 +0,0 @@
- defaults:
-   - example_config
-   - override method: weighted_average_for_llama
-   - override modelpool: llama_for_causallm
-   - _self_
- modelpool:
-   models:
-     # the pre-trained model (base model) is optional
-     # if not provided, the first model will be used as the base model
-     - name: _pretrained_
-       path: meta-llama/Meta-Llama-3-8B
-     - name: expert_1
-       path: meta-llama/Meta-Llama-3-8B
-     - name: expert_2
-       path: meta-llama/Meta-Llama-3-8B-Instruct
- method:
-   normalize: true # if true, the weights will be normalized before merging
-   weights: # List of weights for each model
-     - 0.5
-     - 0.5
-   # if true, only the backbone of the model will be merged and the head will be keeped as the pre-trained model (if the pre-trained model is provided, otherwise the head of the first model will be used)
-   # if false, the whole model will be merged
-   backbone_only: true
-   merged_model_save_path: null
-   save_tokenizer: true
-   push_to_hub: false