fusion-bench 0.2.6-py3-none-any.whl → 0.2.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +1 -0
- fusion_bench/compat/method/base_algorithm.py +7 -1
- fusion_bench/compat/modelpool/__init__.py +1 -1
- fusion_bench/compat/taskpool/__init__.py +1 -1
- fusion_bench/dataset/arc_agi/arc.py +5 -0
- fusion_bench/dataset/arc_agi/preprocess.py +1 -1
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +93 -3
- fusion_bench/dataset/llama/collate.py +62 -2
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/method/__init__.py +1 -1
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
- fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
- fusion_bench/method/linear/expo.py +39 -0
- fusion_bench/method/lm_finetune/__init__.py +1 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +90 -160
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +49 -139
- fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
- fusion_bench/method/pruning/llama_random_prune.py +2 -2
- fusion_bench/method/surgery/__init__.py +3 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/mixins/__init__.py +2 -0
- fusion_bench/mixins/clip_classification.py +58 -5
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +9 -0
- fusion_bench/modelpool/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/__init__.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +50 -9
- fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
- fusion_bench/models/utils.py +8 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
- fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +0 -2
- fusion_bench/programs/fabric_fusion_program.py +5 -1
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
- fusion_bench/utils/hydra_utils.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/type.py +5 -3
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +87 -47
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +1 -1
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +11 -4
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +4 -2
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/nyuv2_config.yaml +5 -1
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
- fusion_bench_config/llama_weighted_average.yaml +0 -26
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0
fusion_bench_config/llama_full_finetune.yaml (new file)
@@ -0,0 +1,19 @@
+defaults:
+  - hydra: default
+  - fabric: llama_fsdp
+  # --- Model, Method, Task ---
+  - method: lm_finetune/fullfinetune_sft.yaml
+  - modelpool: CausalLMPool/llama_alpaca_cleaned.yaml
+  - taskpool: dummy
+  - _self_
+
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+
+fast_dev_run: false # Run a single batch of data to test the model or method
+# Run the script without actually running the experiment, use with `print_config=true`.
+# You can also use `--cfg` or `-c` to show the configuration instead of running.
+dry_run: false
+print_config: true # Print the configuration to the console
+report_save_path: null # path to save the result report
+print_function_call: true # set to false if you don't want to print the details of instantiate calls
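For orientation, a minimal sketch (not shipped in the package) of how a top-level config like this could be composed with Hydra's compose API. The relative `config_path`, the assumption of Hydra ≥ 1.2, and the override shown are illustrative only.

    # Sketch: compose the new llama_full_finetune config and print the merged result.
    # Assumes this script sits next to the packaged `fusion_bench_config` directory.
    from hydra import compose, initialize
    from omegaconf import OmegaConf

    with initialize(version_base=None, config_path="fusion_bench_config"):
        cfg = compose(
            config_name="llama_full_finetune",
            overrides=["method.optimizer.lr=2e-5"],  # hypothetical override
        )
    print(OmegaConf.to_yaml(cfg))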
fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml (new file)
@@ -0,0 +1,47 @@
+_target_: fusion_bench.method.BradleyTerryRewardModeling
+_recursive_: False
+
+optimizer:
+  _target_: torch.optim.AdamW
+  lr: 1e-5
+  weight_decay: 0.01
+  fused: null
+
+lr_scheduler:
+  _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
+  T_max: _T_max_ # this will be replaced by the expected number of training steps
+  init_lr: 0
+  warmup_steps: 100
+  max_lr: ${..optimizer.lr}
+  min_lr: 1e-6
+
+dataloader_kwargs:
+  # per-gpu batch size
+  batch_size: 1
+  num_workers: 0
+  pin_memory: True
+
+# Training hyperparameters
+# if max_epochs=-1, max_steps will be used to determine the number of training steps
+max_epochs: 3
+max_steps: -1
+max_steps_per_epoch: -1
+accumulate_grad_batches: 1
+lr_scheduler_interval: step
+lr_scheduler_frequency: 1
+# Checkpointing may be done by epoch or step, and at the end of training
+# `checkpoint_save_interval` can be 'epoch' or 'step'
+checkpoint_save_interval: epoch
+checkpoint_save_frequency: 1
+# Whether to use gradient clipping, and if so, the value and algorithm
+gradient_clip_val: null
+gradient_clip_algorithm: norm
+save_optimizer_state: false
+# save_full_model must be true when using shared FSDP
+save_full_model: true
+# save_ckpt_type can be 'hf' or 'lightning'
+save_ckpt_type: lightning
+# Path to checkpoint to load from, used for resuming training
+ckpt_path: null
+max_length: 4096
+fix_token_embedding: true
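The method name refers to standard Bradley-Terry reward modeling. As an illustration only (not a copy of the package's implementation), the pairwise objective such a method minimizes is the negative log-likelihood that the chosen response outranks the rejected one:

    # Illustrative Bradley-Terry loss: P(chosen > rejected) = sigmoid(r_chosen - r_rejected).
    import torch
    import torch.nn.functional as F

    def bradley_terry_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
        # negative log-likelihood of the chosen response being preferred
        return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

    loss = bradley_terry_loss(torch.tensor([1.2, 0.3]), torch.tensor([0.4, 0.9]))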
fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml
@@ -3,14 +3,17 @@ _recursive_: False
 
 optimizer:
   _target_: torch.optim.AdamW
-
+  lr: 1e-5
   weight_decay: 0.01
-
+  fused: null
 
 lr_scheduler:
-  _target_:
+  _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
   T_max: _T_max_ # this will be replaced by the expected number of training steps
-
+  init_lr: 0
+  warmup_steps: 100
+  max_lr: ${..optimizer.lr}
+  min_lr: 1e-6
 
 dataloader_kwargs:
   # per-gpu batch size
@@ -36,5 +39,9 @@ gradient_clip_algorithm: norm
 save_optimizer_state: false
 # save_full_model must be true when using shared FSDP
 save_full_model: true
+# save_ckpt_type can be 'hf' or 'lightning'
+save_ckpt_type: lightning
 # Path to checkpoint to load from, used for resuming training
 ckpt_path: null
+max_length: 4096
+fix_token_embedding: true
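The scheduler swapped in above (`CosineDecayWithWarmup` with `init_lr`, `warmup_steps`, `max_lr`, `min_lr`) implies a warmup-then-cosine learning-rate curve. The sketch below only approximates that shape; the packaged class may differ in details.

    # Approximate linear-warmup + cosine-decay schedule (illustration only).
    import math

    def lr_at_step(step, warmup_steps=100, t_max=10_000, init_lr=0.0, max_lr=1e-5, min_lr=1e-6):
        if step < warmup_steps:
            return init_lr + (max_lr - init_lr) * step / warmup_steps  # linear warmup
        progress = (step - warmup_steps) / max(1, t_max - warmup_steps)
        return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))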
fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml
@@ -3,9 +3,9 @@ _recursive_: False
 
 optimizer:
   _target_: torch.optim.AdamW
-
+  lr: 1e-4
   weight_decay: 0.01
-
+  fused: null
 
 lr_scheduler:
   _target_: torch.optim.lr_scheduler.CosineAnnealingLR
@@ -56,6 +56,8 @@ gradient_clip_algorithm: norm
 save_optimizer_state: false
 # save_full_model must be true when using shared FSDP
 save_full_model: false
+# save_ckpt_type can be 'peft' or 'lightning'
+save_ckpt_type: lightning
 # Path to checkpoint to load from, used for resuming training
 ckpt_path: null
 max_length: 4096
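A 'peft' checkpoint type conventionally means persisting only the adapter weights rather than a full Lightning state dict; a hedged sketch of what that usually looks like with the PEFT API (not the package's saving code):

    # Sketch: save only the LoRA/PEFT adapter for a PEFT-wrapped model.
    from peft import PeftModel

    def save_adapter(model: PeftModel, output_dir: str) -> None:
        # writes the adapter weights plus adapter_config.json into output_dir,
        # leaving the base model untouched
        model.save_pretrained(output_dir)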
fusion_bench_config/method/surgery/adamerging_surgery.yaml (new file)
@@ -0,0 +1,27 @@
+# this option can be "clip_task_wise_adamerging"
+name: clip_layer_wise_adamerging_surgery
+# this weights can be a list of float, or a string that points to a *.np, *.pt file containing the weights
+# if weights is specified, skip the test-time adaptation training
+weights: null
+# learning rate
+optimizer: adam
+lr: 1e-3
+init_values: 0.3
+# if `clamp_weights` is true, the weights will be clamped to [0, 1]
+clamp_weights: false
+# arguments of `functional_call`
+tie_weights: true
+strict: false
+# this is overrided by `fabric.devices` if launched from the `fusion_bench` CLI.
+devices: 1
+batch_size: 16
+num_workers: 8
+max_steps: 1000
+fast_dev_run: ${fast_dev_run}
+# the path for saving the merging weights
+save_merging_weights: 'merging_weights.pt'
+cache_dir: outputs
+
+# parameters of Surgery
+eval_iterations: 200
+surgery_steps: 1000
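For context, layer-wise AdaMerging learns one merging coefficient per layer and per model. A conceptual sketch under that reading of `init_values` and `clamp_weights` follows; it is not the package's wrapper.

    # Conceptual layer-wise merge: merged = pretrained + sum_i lambda_i * (finetuned_i - pretrained).
    import torch

    def merge_layer(pretrained, finetuned_list, lambdas, clamp_weights=False):
        if clamp_weights:
            lambdas = lambdas.clamp(0.0, 1.0)  # mirrors the `clamp_weights` option
        task_vectors = torch.stack([ft - pretrained for ft in finetuned_list])
        lambdas = lambdas.view(-1, *([1] * pretrained.dim()))
        return pretrained + (lambdas * task_vectors).sum(dim=0)

    # lambdas would start at `init_values` (0.3 here) and be adapted at test time.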
fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml (new file)
@@ -0,0 +1,21 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+
+pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+models:
+  _pretrained_:
+    _target_: transformers.AutoModelForCausalLM.from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+train_datasets:
+  alpaca-cleaned:
+    _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+    tokenizer: ${...tokenizer}
+    path: "yahma/alpaca-cleaned"
+    split: train
+    cache_path: null
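The `${..}` / `${...}` references are OmegaConf relative interpolations that walk up the config tree (one dot per level above the containing node). A small self-contained check of how they resolve, with the structure trimmed from the file above:

    # Demonstrates the relative interpolation pattern used by these modelpool configs.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        "pretrained_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
        "models": {"_pretrained_": {
            "pretrained_model_name_or_path": "${...pretrained_model_name_or_path}"}},
        "tokenizer": {"pretrained_model_name_or_path": "${..pretrained_model_name_or_path}"},
    })
    assert cfg["models"]["_pretrained_"]["pretrained_model_name_or_path"] == "meta-llama/Llama-3.2-1B-Instruct"
    assert cfg["tokenizer"]["pretrained_model_name_or_path"] == "meta-llama/Llama-3.2-1B-Instruct"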
fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml (new file)
@@ -0,0 +1,21 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+
+pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+models:
+  _pretrained_:
+    _target_: transformers.AutoModelForCausalLM.from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+train_datasets:
+  codealpaca:
+    _target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
+    tokenizer: ${...tokenizer}
+    path: sahil2801/CodeAlpaca-20k
+    split: train
+    cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml (new file)
@@ -0,0 +1,19 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+
+pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+models:
+  _pretrained_:
+    _target_: transformers.AutoModelForCausalLM.from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+train_datasets:
+  metamathqa:
+    _target_: fusion_bench.dataset.llama.metamathqa.load_tokenized_metamathqa
+    tokenizer: ${...tokenizer}
+    cache_path: null
fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml (new file)
@@ -0,0 +1,18 @@
+_target_: fusion_bench.modelpool.CausalLMPool
+
+pretrained_model_name_or_path: meta-llama/Llama-3-1B-Instruct
+
+models:
+  _pretrained_:
+    _target_: transformers.AutoModelForCausalLM.from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+
+train_datasets:
+  ultrachat-200k:
+    _target_: fusion_bench.dataset.llama.ultrachat.load_tokenized_ultrachat_200k
+    tokenizer: ${...tokenizer}
fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml (new file)
@@ -0,0 +1,23 @@
+_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+
+models:
+  _pretrained_:
+    _target_: fusion_bench.modelpool.seq_classification_lm.create_reward_model_from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+    use_flash_attention_2: true
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+  pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
+
+train_datasets:
+  preference_700k:
+    _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+    tokenizer: ${...tokenizer}
+    path: hendrydong/preference_700K
+    split: train
+    cache_path: null
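A hedged sketch of what a "reward model" in this pool amounts to: a causal LM backbone with a single-logit sequence-classification head, plus a padding token that is distinct from the end-of-turn token (per the config comment above). This uses the standard Hugging Face API and is not the package's `create_reward_model_from_pretrained` helper.

    # Sketch: scalar-reward head on a Llama backbone via sequence classification.
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(name)
    tokenizer.pad_token = "<|end_of_text|>"  # padding token, as in the config above
    model = AutoModelForSequenceClassification.from_pretrained(
        name, num_labels=1, torch_dtype=torch.bfloat16
    )
    model.config.pad_token_id = tokenizer.pad_token_id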
fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml (new file)
@@ -0,0 +1,14 @@
+_target_: fusion_bench.modelpool.SeqenceClassificationModelPool
+
+pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k
+
+models:
+  _pretrained_:
+    _target_: transformers.AutoModelForSequenceClassification.from_pretrained
+    pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
+    torch_dtype: bfloat16
+
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
+  pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
fusion_bench_config/nyuv2_config.yaml
@@ -1,13 +1,17 @@
 defaults:
   - hydra: default
+  - fabric: auto
   - modelpool: nyuv2_modelpool
   - method: simple_average
   - taskpool: nyuv2_taskpool
   - _self_
+
+_target_: fusion_bench.programs.FabricModelFusionProgram
+_recursive_: false
+
 fast_dev_run: false # Run a single batch of data to test the model or method
 use_lightning: true # Use the fabric to run the experiment
 print_config: true # Print the configuration to the console
 save_report: false # path to save the result report
-fabric: null
 trainer:
   devices: 1
fusion_bench_config/taskpool/reward_model_evaluation.yaml (new file)
@@ -0,0 +1,18 @@
+_target_: fusion_bench.taskpool.llama.reward_model.RewardModelEvaluationTaskPool
+
+test_datasets:
+  preference_700k:
+    _target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
+    tokenizer: ${...tokenizer}
+    path: hendrydong/preference_700K
+    split: train
+    cache_path: null
+
+dataloader_kwargs:
+  shuffle: False
+  batch_size: 16
+
+tokenizer: ${..modelpool.tokenizer}
+
+max_num_samples: 1000
+seed: 42
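The metric such a preference-based taskpool typically reports is pairwise accuracy: how often the reward model scores the chosen response above the rejected one. Illustrative only, not the package code:

    # Pairwise accuracy over chosen/rejected reward scores.
    import torch

    def pairwise_accuracy(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> float:
        return (chosen_rewards > rejected_rewards).float().mean().item()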
fusion_bench_config/llama_weighted_average.yaml (deleted)
@@ -1,26 +0,0 @@
-defaults:
-  - example_config
-  - override method: weighted_average_for_llama
-  - override modelpool: llama_for_causallm
-  - _self_
-modelpool:
-  models:
-    # the pre-trained model (base model) is optional
-    # if not provided, the first model will be used as the base model
-    - name: _pretrained_
-      path: meta-llama/Meta-Llama-3-8B
-    - name: expert_1
-      path: meta-llama/Meta-Llama-3-8B
-    - name: expert_2
-      path: meta-llama/Meta-Llama-3-8B-Instruct
-method:
-  normalize: true # if true, the weights will be normalized before merging
-  weights: # List of weights for each model
-    - 0.5
-    - 0.5
-  # if true, only the backbone of the model will be merged and the head will be keeped as the pre-trained model (if the pre-trained model is provided, otherwise the head of the first model will be used)
-  # if false, the whole model will be merged
-  backbone_only: true
-  merged_model_save_path: null
-  save_tokenizer: true
-  push_to_hub: false
{fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE: File without changes
{fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL: File without changes
{fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt: File without changes
{fusion_bench-0.2.6.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt: File without changes