fusion-bench 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. fusion_bench/compat/method/__init__.py +1 -0
  2. fusion_bench/compat/method/base_algorithm.py +7 -1
  3. fusion_bench/compat/modelpool/__init__.py +1 -1
  4. fusion_bench/compat/taskpool/__init__.py +1 -1
  5. fusion_bench/dataset/arc_agi/arc.py +5 -0
  6. fusion_bench/dataset/arc_agi/preprocess.py +1 -1
  7. fusion_bench/dataset/llama/__init__.py +1 -0
  8. fusion_bench/dataset/llama/alpaca.py +93 -3
  9. fusion_bench/dataset/llama/collate.py +62 -2
  10. fusion_bench/dataset/llama/metamathqa.py +50 -0
  11. fusion_bench/dataset/llama/preference_700k.py +70 -0
  12. fusion_bench/dataset/llama/stanford_shp.py +90 -0
  13. fusion_bench/dataset/llama/ultrachat.py +58 -0
  14. fusion_bench/dataset/llama/utils/__init__.py +0 -0
  15. fusion_bench/method/__init__.py +1 -1
  16. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
  17. fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
  18. fusion_bench/method/linear/expo.py +39 -0
  19. fusion_bench/method/lm_finetune/__init__.py +1 -0
  20. fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
  21. fusion_bench/method/lm_finetune/fullfinetune_sft.py +90 -160
  22. fusion_bench/method/lm_finetune/peftfinetune_sft.py +49 -139
  23. fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
  24. fusion_bench/method/pruning/llama_random_prune.py +2 -2
  25. fusion_bench/method/surgery/__init__.py +3 -0
  26. fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
  27. fusion_bench/mixins/__init__.py +2 -0
  28. fusion_bench/mixins/clip_classification.py +58 -5
  29. fusion_bench/mixins/fabric_training.py +320 -0
  30. fusion_bench/mixins/lightning_fabric.py +9 -0
  31. fusion_bench/modelpool/__init__.py +2 -0
  32. fusion_bench/modelpool/causal_lm/__init__.py +1 -1
  33. fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
  34. fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
  35. fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
  36. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
  37. fusion_bench/models/chat_templates/__init__.py +1 -0
  38. fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
  39. fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
  40. fusion_bench/models/hf_clip.py +50 -9
  41. fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
  42. fusion_bench/models/utils.py +8 -0
  43. fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
  44. fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
  45. fusion_bench/optim/__init__.py +2 -0
  46. fusion_bench/optim/exception.py +47 -0
  47. fusion_bench/optim/lr_scheduler/__init__.py +1 -0
  48. fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
  49. fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
  50. fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
  51. fusion_bench/optim/mezo.py +0 -2
  52. fusion_bench/programs/fabric_fusion_program.py +5 -1
  53. fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
  54. fusion_bench/taskpool/llama/reward_model.py +157 -0
  55. fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
  56. fusion_bench/utils/hydra_utils.py +22 -0
  57. fusion_bench/utils/plot/__init__.py +0 -0
  58. fusion_bench/utils/plot/token.py +52 -0
  59. fusion_bench/utils/plot/token_notebook.py +127 -0
  60. fusion_bench/utils/type.py +5 -3
  61. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
  62. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +87 -47
  63. fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
  64. fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
  65. fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
  66. fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
  67. fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
  68. fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
  69. fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
  70. fusion_bench_config/fabric_model_fusion.yaml +1 -1
  71. fusion_bench_config/llama_full_finetune.yaml +19 -0
  72. fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
  73. fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +11 -4
  74. fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +4 -2
  75. fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
  76. fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
  77. fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
  78. fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
  79. fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
  80. fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
  81. fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
  82. fusion_bench_config/nyuv2_config.yaml +5 -1
  83. fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
  84. fusion_bench_config/llama_weighted_average.yaml +0 -26
  85. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
  86. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
  87. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
  88. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0
fusion_bench_config/llama_full_finetune.yaml (new file)
@@ -0,0 +1,19 @@
+ defaults:
+   - hydra: default
+   - fabric: llama_fsdp
+   # --- Model, Method, Task ---
+   - method: lm_finetune/fullfinetune_sft.yaml
+   - modelpool: CausalLMPool/llama_alpaca_cleaned.yaml
+   - taskpool: dummy
+   - _self_
+
+ _target_: fusion_bench.programs.FabricModelFusionProgram
+ _recursive_: false
+
+ fast_dev_run: false # Run a single batch of data to test the model or method
+ # Run the script without actually running the experiment, use with `print_config=true`.
+ # You can also use `--cfg` or `-c` to show the configuration instead of running.
+ dry_run: false
+ print_config: true # Print the configuration to the console
+ report_save_path: null # path to save the result report
+ print_function_call: true # set to false if you don't want to print the details of instantiate calls
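Note: the new top-level `llama_full_finetune.yaml` wires a method, modelpool and taskpool together through Hydra defaults. A minimal sketch of composing and inspecting it with Hydra's compose API, assuming Hydra ≥ 1.2 and that `fusion_bench_config/` is reachable relative to the calling script; the override keys are taken from the configs in this diff, and compose-API handling of the `hydra: default` group may need adjustment:

```python
# Sketch only: compose the new config and print it, mirroring `print_config=true`.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="fusion_bench_config"):
    cfg = compose(
        config_name="llama_full_finetune",
        overrides=["method.optimizer.lr=2e-5", "dry_run=true"],
    )

print(OmegaConf.to_yaml(cfg, resolve=False))
```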
fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml (new file)
@@ -0,0 +1,47 @@
+ _target_: fusion_bench.method.BradleyTerryRewardModeling
+ _recursive_: False
+
+ optimizer:
+   _target_: torch.optim.AdamW
+   lr: 1e-5
+   weight_decay: 0.01
+   fused: null
+
+ lr_scheduler:
+   _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
+   T_max: _T_max_ # this will be replaced by the expected number of training steps
+   init_lr: 0
+   warmup_steps: 100
+   max_lr: ${..optimizer.lr}
+   min_lr: 1e-6
+
+ dataloader_kwargs:
+   # per-gpu batch size
+   batch_size: 1
+   num_workers: 0
+   pin_memory: True
+
+ # Training hyperparameters
+ # if max_epochs=-1, max_steps will be used to determine the number of training steps
+ max_epochs: 3
+ max_steps: -1
+ max_steps_per_epoch: -1
+ accumulate_grad_batches: 1
+ lr_scheduler_interval: step
+ lr_scheduler_frequency: 1
+ # Checkpointing may be done by epoch or step, and at the end of training
+ # `checkpoint_save_interval` can be 'epoch' or 'step'
+ checkpoint_save_interval: epoch
+ checkpoint_save_frequency: 1
+ # Whether to use gradient clipping, and if so, the value and algorithm
+ gradient_clip_val: null
+ gradient_clip_algorithm: norm
+ save_optimizer_state: false
+ # save_full_model must be true when using shared FSDP
+ save_full_model: true
+ # save_ckpt_type can be 'hf' or 'lightning'
+ save_ckpt_type: lightning
+ # Path to checkpoint to load from, used for resuming training
+ ckpt_path: null
+ max_length: 4096
+ fix_token_embedding: true
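Note: the method this config drives, `fusion_bench.method.BradleyTerryRewardModeling`, is named after the standard pairwise Bradley-Terry objective for reward modeling. A minimal sketch of that loss with illustrative tensor names; the actual implementation in `fusion_bench/method/lm_finetune/bradley_terry_rm.py` may handle batching, masking and packing differently:

```python
import torch
import torch.nn.functional as F

def bradley_terry_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    # Maximize the log-probability that the chosen response outranks the rejected one:
    # loss = -log sigmoid(r_chosen - r_rejected), averaged over the batch.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

# Example: scalar rewards for a batch of two preference pairs.
loss = bradley_terry_loss(torch.tensor([1.2, 0.3]), torch.tensor([0.7, 0.9]))
```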
fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml
@@ -3,14 +3,17 @@ _recursive_: False
  
  optimizer:
    _target_: torch.optim.AdamW
-   fused: True
+   lr: 1e-5
    weight_decay: 0.01
-   lr: 5e-5
+   fused: null
  
  lr_scheduler:
-   _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+   _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
    T_max: _T_max_ # this will be replaced by the expected number of training steps
-   eta_min: 1e-6
+   init_lr: 0
+   warmup_steps: 100
+   max_lr: ${..optimizer.lr}
+   min_lr: 1e-6
  
  dataloader_kwargs:
    # per-gpu batch size
@@ -36,5 +39,9 @@ gradient_clip_algorithm: norm
  save_optimizer_state: false
  # save_full_model must be true when using shared FSDP
  save_full_model: true
+ # save_ckpt_type can be 'hf' or 'lightning'
+ save_ckpt_type: lightning
  # Path to checkpoint to load from, used for resuming training
  ckpt_path: null
+ max_length: 4096
+ fix_token_embedding: true
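Note: the scheduler swap replaces plain cosine annealing with `fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup`. A rough sketch of the shape the new keys (`init_lr`, `warmup_steps`, `max_lr`, `min_lr`, `T_max`) suggest, linear warmup followed by cosine decay; this is an assumption about the scheduler's behaviour, not its actual code:

```python
import math

def lr_at(step: int, *, init_lr=0.0, warmup_steps=100, max_lr=1e-5, min_lr=1e-6, T_max=10_000) -> float:
    if step < warmup_steps:
        # Linear warmup from init_lr up to max_lr.
        return init_lr + (max_lr - init_lr) * step / warmup_steps
    # Cosine decay from max_lr down to min_lr over the remaining steps.
    progress = min(1.0, (step - warmup_steps) / max(1, T_max - warmup_steps))
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```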
fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml
@@ -3,9 +3,9 @@ _recursive_: False
  
  optimizer:
    _target_: torch.optim.AdamW
-   fused: True
+   lr: 1e-4
    weight_decay: 0.01
-   lr: 5e-5
+   fused: null
  
  lr_scheduler:
    _target_: torch.optim.lr_scheduler.CosineAnnealingLR
@@ -56,6 +56,8 @@ gradient_clip_algorithm: norm
  save_optimizer_state: false
  # save_full_model must be true when using shared FSDP
  save_full_model: false
+ # save_ckpt_type can be 'peft' or 'lightning'
+ save_ckpt_type: lightning
  # Path to checkpoint to load from, used for resuming training
  ckpt_path: null
  max_length: 4096
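Note: the new `save_ckpt_type` option distinguishes an adapter-only PEFT checkpoint from a Lightning/Fabric state checkpoint. A minimal sketch of what the two flavours typically correspond to, using the public `peft` API; the exact paths and state layout used by fusion_bench are not shown in this diff:

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
peft_model = get_peft_model(model, LoraConfig(task_type=TaskType.CAUSAL_LM))

# save_ckpt_type: peft -> adapter weights only, loadable later with PeftModel.from_pretrained.
peft_model.save_pretrained("outputs/adapter")

# save_ckpt_type: lightning -> a Fabric checkpoint of the training state instead,
# e.g. fabric.save("outputs/checkpoint.ckpt", {"model": peft_model}).
```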
fusion_bench_config/method/surgery/adamerging_surgery.yaml (new file)
@@ -0,0 +1,27 @@
+ # this option can be "clip_task_wise_adamerging"
+ name: clip_layer_wise_adamerging_surgery
+ # this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
+ # if weights is specified, skip the test-time adaptation training
+ weights: null
+ # learning rate
+ optimizer: adam
+ lr: 1e-3
+ init_values: 0.3
+ # if `clamp_weights` is true, the weights will be clamped to [0, 1]
+ clamp_weights: false
+ # arguments of `functional_call`
+ tie_weights: true
+ strict: false
+ # this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+ devices: 1
+ batch_size: 16
+ num_workers: 8
+ max_steps: 1000
+ fast_dev_run: ${fast_dev_run}
+ # the path for saving the merging weights
+ save_merging_weights: 'merging_weights.pt'
+ cache_dir: outputs
+
+ # parameters of Surgery
+ eval_iterations: 200
+ surgery_steps: 1000
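Note: this config combines layer-wise AdaMerging (learned per-layer merging coefficients over task vectors, optionally clamped to [0, 1]) with a representation-surgery stage. A compact sketch of the layer-wise merging rule that family of methods is built on; coefficient learning and the surgery module itself are not shown, and the helper below is illustrative rather than fusion_bench's implementation:

```python
import torch

def layer_wise_merge(pretrained: dict, finetuned: list, coeffs: list) -> dict:
    """pretrained / finetuned[i]: parameter name -> tensor; coeffs[i][name]: scalar per layer."""
    merged = {}
    for name, w0 in pretrained.items():
        # Add each model's task vector (finetuned - pretrained), scaled by its learned coefficient.
        merged[name] = w0 + sum(c[name] * (m[name] - w0) for m, c in zip(finetuned, coeffs))
    return merged
```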
fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml (new file)
@@ -0,0 +1,21 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   alpaca-cleaned:
+     _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+     tokenizer: ${...tokenizer}
+     path: "yahma/alpaca-cleaned"
+     split: train
+     cache_path: null
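Note: the four new `CausalLMPool` configs share one pattern, a single top-level `pretrained_model_name_or_path` that the model, tokenizer and dataset nodes reach via relative interpolation (the first dot refers to the containing node, each extra dot climbs one parent). A small self-contained OmegaConf check of how those references resolve; illustrative snippet, not package code:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
    pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
    models:
      _pretrained_:
        # first dot = this node (_pretrained_), the two extra dots climb past models to the root
        pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
    tokenizer:
      # one extra dot climbs from tokenizer to the root
      pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
    """
)

assert cfg["models"]["_pretrained_"]["pretrained_model_name_or_path"] == cfg["pretrained_model_name_or_path"]
assert cfg["tokenizer"]["pretrained_model_name_or_path"] == cfg["pretrained_model_name_or_path"]
```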
fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml (new file)
@@ -0,0 +1,21 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   codealpaca:
+     _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+     tokenizer: ${...tokenizer}
+     path: sahil2801/CodeAlpaca-20k
+     split: train
+     cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml (new file)
@@ -0,0 +1,19 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   metamathqa:
+     _target_: fusion_bench.dataset.llama.metamathqa.load_tokenized_metamathqa
+     tokenizer: ${...tokenizer}
+     cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml (new file)
@@ -0,0 +1,18 @@
+ _target_: fusion_bench.modelpool.CausalLMPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForCausalLM.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+ train_datasets:
+   ultrachat-200k:
+     _target_: fusion_bench.dataset.llama.ultrachat.load_tokenized_ultrachat_200k
+     tokenizer: ${...tokenizer}
fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml (new file)
@@ -0,0 +1,23 @@
+ _target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+ models:
+   _pretrained_:
+     _target_: fusion_bench.modelpool.seq_classification_lm.create_reward_model_from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+     use_flash_attention_2: true
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+   pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
+
+ train_datasets:
+   preference_700k:
+     _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+     tokenizer: ${...tokenizer}
+     path: hendrydong/preference_700K
+     split: train
+     cache_path: null
fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml (new file)
@@ -0,0 +1,14 @@
+ _target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+ pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k
+
+ models:
+   _pretrained_:
+     _target_: transformers.AutoModelForSequenceClassification.from_pretrained
+     pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+     torch_dtype: bfloat16
+
+ tokenizer:
+   _target_: transformers.AutoTokenizer.from_pretrained
+   pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+   pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
fusion_bench_config/nyuv2_config.yaml
@@ -1,13 +1,17 @@
  defaults:
    - hydra: default
+   - fabric: auto
    - modelpool: nyuv2_modelpool
    - method: simple_average
    - taskpool: nyuv2_taskpool
    - _self_
+
+ _target_: fusion_bench.programs.FabricModelFusionProgram
+ _recursive_: false
+
  fast_dev_run: false # Run a single batch of data to test the model or method
  use_lightning: true # Use the fabric to run the experiment
  print_config: true # Print the configuration to the console
  save_report: false # path to save the result report
- fabric: null
  trainer:
    devices: 1
fusion_bench_config/taskpool/reward_model_evaluation.yaml (new file)
@@ -0,0 +1,18 @@
+ _target_: fusion_bench.taskpool.llama.reward_model.RewardModelEvaluationTaskPool
+
+ test_datasets:
+   preference_700k:
+     _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+     tokenizer: ${...tokenizer}
+     path: hendrydong/preference_700K
+     split: train
+     cache_path: null
+
+ dataloader_kwargs:
+   shuffle: False
+   batch_size: 16
+
+ tokenizer: ${..modelpool.tokenizer}
+
+ max_num_samples: 1000
+ seed: 42
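Note: the new taskpool evaluates a reward model on preference pairs. A minimal sketch of the usual headline metric for that setup, pairwise ranking accuracy (the fraction of pairs where the chosen response is scored above the rejected one); names are illustrative and the taskpool's actual metrics are not shown in this diff:

```python
import torch

@torch.no_grad()
def pairwise_accuracy(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> float:
    # One scalar reward per response; a pair counts as correct when chosen > rejected.
    return (chosen_rewards > rejected_rewards).float().mean().item()
```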
fusion_bench_config/llama_weighted_average.yaml (deleted)
@@ -1,26 +0,0 @@
- defaults:
-   - example_config
-   - override method: weighted_average_for_llama
-   - override modelpool: llama_for_causallm
-   - _self_
- modelpool:
-   models:
-     # the pre-trained model (base model) is optional
-     # if not provided, the first model will be used as the base model
-     - name: _pretrained_
-       path: meta-llama/Meta-Llama-3-8B
-     - name: expert_1
-       path: meta-llama/Meta-Llama-3-8B
-     - name: expert_2
-       path: meta-llama/Meta-Llama-3-8B-Instruct
- method:
-   normalize: true # if true, the weights will be normalized before merging
-   weights: # List of weights for each model
-     - 0.5
-     - 0.5
-   # if true, only the backbone of the model will be merged and the head will be keeped as the pre-trained model (if the pre-trained model is provided, otherwise the head of the first model will be used)
-   # if false, the whole model will be merged
-   backbone_only: true
-   merged_model_save_path: null
-   save_tokenizer: true
-   push_to_hub: false