PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml ADDED Viewed

@@ -0,0 +1,27 @@
+name: clip_concrete_task_wise_adamerging
+# batch size per gpu
+# if you have multiple gpus, the total batch size will be `batch_size * num_gpus`
+batch_size: 16
+num_workers: 8
+merge_dtype: null
+optimizer: adam
+lr: 1e-3
+base_lr: 1
+adamerging_lr: 1e-3
+scaling_factor: 0.3
+max_steps: 1000
+max_adamerging_steps: 1000
+save_interval: 500
+initial_logits: 0
+temperature: 0.5
+# "discrete" or "continuous", this is the mask applied for evaluation, not during training
+# the performance of the final model is expected to be similar with either mask type
+eval_mask_type: continuous
+mask_checkpoint: null
+# if `clamp_weights` is true, the weights will be clamped to [0, 1]
+clamp_weights: false
+# arguments of `functional_call`
+tie_weights: true
+strict: false
+cache_dir: outputs

fusion_bench_config/method/dare/simple_average.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+_target_: fusion_bench.method.DareSimpleAverage
+sparsity_ratio: 0.5
+only_on_linear_weights: false
+rescale: true

fusion_bench_config/method/dare/task_arithmetic.yaml ADDED Viewed

@@ -0,0 +1,6 @@
+_target_: fusion_bench.method.DareTaskArithmetic
+scaling_factor: 0.3
+sparsity_ratio: 0.5
+only_on_linear_weights: false
+rescale: true

fusion_bench_config/method/dare/ties_merging.yaml ADDED Viewed

@@ -0,0 +1,15 @@
+_target_: fusion_bench.method.dare.DareTiesMerging
+# === DARE parameters ===
+sparsity_ratio: 0.5
+only_on_linear_weights: false
+rescale: true
+# === Ties merging parameters ===
+# Scaling factor $\lambda$
+scaling_factor: 0.5
+threshold: 20
+# List of keys to remove from the state dict, default is empty
+remove_keys: []
+# Function to merge the models, default is sum. Options are 'sum', 'mean', and 'max'
+merge_func: sum

fusion_bench_config/method/dawe/dawe_for_clip.yaml ADDED Viewed

@@ -0,0 +1,32 @@
+_target_: fusion_bench.method.DataAdaptiveWeightEnsemblingForCLIP
+_recursive_: false
+merge_mode: task_wise
+init_lambda: 0.3
+batch_reduce: true
+eval_batch_reduce: false
+_dict_feature_extractor_path: microsoft/resnet-18
+dict_processor:
+  _target_: fusion_bench.method.dawe.dawe_for_clip.load_resnet_processor
+  pretrained_model_name_or_path: ${.._dict_feature_extractor_path}
+dict_feature_extractor:
+  _target_: fusion_bench.method.dawe.dawe_for_clip.load_resnet_feature_extractor
+  pretrained_model_name_or_path: ${.._dict_feature_extractor_path}
+# dimension of the extracted embeddings; if this is null, try to infer it from the feature extractor model
+hidden_size: null
+gate_hidden_layers: 1
+# if task_vector_dtype is null, the task vector will have the same dtype as pretrained model
+task_vector_dtype: null
+task_vector_sparsity: 0
+# training & logging args
+max_steps: 1000
+save_interval: 200
+learning_rate: 1e-5
+resume_checkpoint_path: null
+skip_training: false
+# dataloader args
+batch_size: 1
+num_workers: 0
+pin_memory: true

fusion_bench_config/method/depth_upscaling.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+_target_: DepthUpscalingAlgorithm
+# this should be a list of integers or strings, indicating the sequence of layers. If the entry is an integer, it will use the n-th layer of the model. If the entry is a string, it will use the layers specified by the string. The string should be a valid python expression that evaluates to a list of integers.
+# for example, ["range(0,12)", "range(6,12)"] will use the first 12 layers and the last 6 layers of the model to construct the new model
+# [0, 2, 4, "range(6,12)"] will use the 1st, 3rd, 5th, and the 7th to 12th layers of the model to construct the new model
+layer_indices: null

fusion_bench_config/method/dummy.yaml ADDED Viewed

	@@ -0,0 +1 @@
1	+ _target_: fusion_bench.method.DummyAlgorithm

fusion_bench_config/method/ensemble/max_model_predictor.yaml ADDED Viewed

	@@ -0,0 +1 @@
1	+ _target_: fusion_bench.method.MaxModelPredictorAlgorithm

fusion_bench_config/method/ensemble/simple_ensemble.yaml ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ _target_: fusion_bench.method.SimpleEnsembleAlgorithm
2	+

fusion_bench_config/method/ensemble/weighted_ensemble.yaml ADDED Viewed

@@ -0,0 +1,6 @@
+_target_: fusion_bench.method.WeightedEnsembleAlgorithm
+normalize: true
+# this should be a list of floats, one for each model in the ensemble
+# If weights is null, the ensemble will use the default weights, which are equal weights for all models.
+weights: null

fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+_target_: fusion_bench.method.FisherMergingForCLIPVisionModel
+# this should be a list of strings, regular expressions that match the names of the parameters that should be excluded from the fisher merging
+exclude_param_names_regex: []
+# boolean, whether to normalize fisher weights (L2 norm) or not
+normalize_fisher_weight: true
+# float, the minimal value in fisher weights, used for tackling the potential numerical issues
+minimal_fisher_weight: 1e-6
+# common choices: 256, 512, 1024, 2048
+num_fisher_examples: 256
+zeroshot_weights_cache_dir: outputs/cache/clip_zeroshot_weights
+dataloader_kwargs:
+  batch_size: 32
+  num_workers: 0

fusion_bench_config/method/fisher_merging/fisher_merging.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+name: fisher_merging
+# this should be a list of strings, regular expressions that match the names of the parameters that should be excluded from the fisher merging
+exclude_param_names_regex: []
+# boolean, whether to normalize fisher weights (L2 norm) or not
+normalize_fisher_weight: true
+# float, the minimal value in fisher weights, used for tackling the potential numerical issues
+minimal_fisher_weight: 1e-6
+# common choices: 256, 512, 1024, 2048
+num_fisher_examples: 256

fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+_target_: fusion_bench.method.FisherMergingAlgorithmForGPT2
+# this should be a list of strings, regular expressions that match the names of the parameters that should be excluded from the fisher merging
+exclude_param_names_regex: []
+# boolean, whether to normalize fisher weights (L2 norm) or not
+normalize_fisher_weight: true
+# float, the minimal value in fisher weights, used for tackling the potential numerical issues
+minimal_fisher_weight: 1e-6
+# common choices: 256, 512, 1024, 2048
+num_fisher_examples: 256
+cache_dir: outputs
+batch_size: 32
+num_workers: 0

fusion_bench_config/method/linear/expo.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+# This algorithm merges a pretrained model with a finetuned model.
+#
+# $$\theta_{merged} = \theta_{ft} + \alpha (\theta_{ft} - \theta_{pre})$$
+#
+# where $\theta_{merged}$ is the merged model, $\theta_{ft}$ is the finetuned model (medium-aligned model),
+# $\theta_{pre}$ is the pretrained model (base model), and $\alpha$ is the extrapolation factor.
+_target_: fusion_bench.method.ExPOAlgorithm
+extrapolation_factor: 0.1

fusion_bench_config/method/linear/linear_interpolation.yaml ADDED Viewed

@@ -0,0 +1,3 @@
+_target_: fusion_bench.method.LinearInterpolationAlgorithm
+t: 0.5

fusion_bench_config/method/linear/llama_expo.yaml ADDED Viewed

@@ -0,0 +1,19 @@
+# This algorithm merges a pretrained model with a finetuned model.
+#
+# $$\theta_{merged} = \theta_{ft} + \alpha (\theta_{ft} - \theta_{pre})$$
+#
+# where $\theta_{merged}$ is the merged model, $\theta_{ft}$ is the finetuned model (medium-aligned model),
+# $\theta_{pre}$ is the pretrained model (base model), and $\alpha$ is the extrapolation factor.
+_target_: fusion_bench.method.ExPOAlgorithmForLlama
+extrapolation_factor: 0.1
+attention_scaling_factor: 1.0
+only_on_backbone: true
+on_linear_weights: true
+on_linear_bias: false
+on_embedding: false
+fix_last_n_layers: 0
+fix_first_n_layers: 0
+magnitude_sparsity_ratio: null

fusion_bench_config/method/linear/llama_expo_with_dare.yaml ADDED Viewed

@@ -0,0 +1,19 @@
+_target_: fusion_bench.method.linear.llama_expo.ExPOWithDareForLLama
+extrapolation_factor: 0.1
+attention_scaling_factor: 1.0
+only_on_backbone: true
+on_linear_weights: true
+on_linear_bias: false
+on_embedding: false
+fix_last_n_layers: 0
+fix_first_n_layers: 0
+magnitude_sparsity_ratio: null
+# dare arguments
+dare_sparsity_ratio: 0.5
+dare_only_on_linear_weights: true
+dare_rescale: true

fusion_bench_config/method/linear/simple_average_for_llama.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+_target_: fusion_bench.method.SimpleAverageForLlama
+# set `merge_backbone` to true if you have a base model and only want to merge the backbone of the experts
+# if `merge_backbone` is False, this is equivalent to `SimpleAverageAlgorithm`
+merge_backbone: true
+model_save_path: null

fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+_target_: fusion_bench.method.TaskArithmeticForLlama
+scaling_factor: 0.3
+merge_backbone: true
+model_save_path: null

fusion_bench_config/method/linear/weighted_average.yaml ADDED Viewed

@@ -0,0 +1,6 @@
+_target_: fusion_bench.method.WeightedAverageAlgorithm
+normalize: true # if true, the weights will be normalized before merging
+weights: # List of weights for each model
+  - 0.5
+  - 0.5

fusion_bench_config/method/linear/weighted_average_for_llama.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+_target_: WeightedAverageForLLama
+normalize: true # if true, the weights will be normalized before merging
+weights: # List of weights for each model
+  - 0.5
+  - 0.5
+# if true, only the backbone of the model will be merged and the head will be kept as the pre-trained model (if the pre-trained model is provided, otherwise the head of the first model will be used)
+# if false, the whole model will be merged
+backbone_only: true
+merged_model_save_path: null
+save_tokenizer: true
+push_to_hub: false

fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml ADDED Viewed

@@ -0,0 +1,47 @@
+_target_: fusion_bench.method.BradleyTerryRewardModeling
+_recursive_: False
+optimizer:
+  _target_: torch.optim.AdamW
+  lr: 1e-5
+  weight_decay: 0.01
+  fused: null
+lr_scheduler:
+  _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
+  T_max: _T_max_ # this will be replaced by the expected number of training steps
+  init_lr: 0
+  warmup_steps: 100
+  max_lr: ${..optimizer.lr}
+  min_lr: 1e-6
+dataloader_kwargs:
+  # per-gpu batch size
+  batch_size: 1
+  num_workers: 0
+  pin_memory: True
+# Training hyperparameters
+# if max_epochs=-1, max_steps will be used to determine the number of training steps
+max_epochs: 3
+max_steps: -1
+max_steps_per_epoch: -1
+accumulate_grad_batches: 1
+lr_scheduler_interval: step
+lr_scheduler_frequency: 1
+# Checkpointing may be done by epoch or step, and at the end of training
+# `checkpoint_save_interval` can be 'epoch' or 'step'
+checkpoint_save_interval: epoch
+checkpoint_save_frequency: 1
+# Whether to use gradient clipping, and if so, the value and algorithm
+gradient_clip_val: null
+gradient_clip_algorithm: norm
+save_optimizer_state: false
+# save_full_model must be true when using shared FSDP
+save_full_model: true
+# save_ckpt_type can be 'hf' or 'lightning'
+save_ckpt_type: lightning
+# Path to checkpoint to load from, used for resuming training
+ckpt_path: null
+max_length: 4096
+fix_token_embedding: true

fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml ADDED Viewed

@@ -0,0 +1,47 @@
+_target_: fusion_bench.method.FullFinetuneSFT
+_recursive_: False
+optimizer:
+  _target_: torch.optim.AdamW
+  lr: 1e-5
+  weight_decay: 0.01
+  fused: null
+lr_scheduler:
+  _target_: fusion_bench.optim.lr_scheduler.CosineDecayWithWarmup
+  T_max: _T_max_ # this will be replaced by the expected number of training steps
+  init_lr: 0
+  warmup_steps: 100
+  max_lr: ${..optimizer.lr}
+  min_lr: 1e-6
+dataloader_kwargs:
+  # per-gpu batch size
+  batch_size: 1
+  num_workers: 0
+  pin_memory: True
+# Training hyperparameters
+# if max_epochs=-1, max_steps will be used to determine the number of training steps
+max_epochs: 3
+max_steps: -1
+max_steps_per_epoch: -1
+accumulate_grad_batches: 1
+lr_scheduler_interval: step
+lr_scheduler_frequency: 1
+# Checkpointing may be done by epoch or step, and at the end of training
+# `checkpoint_save_interval` can be 'epoch' or 'step'
+checkpoint_save_interval: epoch
+checkpoint_save_frequency: 1
+# Whether to use gradient clipping, and if so, the value and algorithm
+gradient_clip_val: null
+gradient_clip_algorithm: norm
+save_optimizer_state: false
+# save_full_model must be true when using shared FSDP
+save_full_model: true
+# save_ckpt_type can be 'hf' or 'lightning'
+save_ckpt_type: lightning
+# Path to checkpoint to load from, used for resuming training
+ckpt_path: null
+max_length: 4096
+fix_token_embedding: true

fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml ADDED Viewed

@@ -0,0 +1,63 @@
+_target_: fusion_bench.method.PeftFinetuneSFT
+_recursive_: False
+optimizer:
+  _target_: torch.optim.AdamW
+  lr: 1e-4
+  weight_decay: 0.01
+  fused: null
+lr_scheduler:
+  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+  T_max: _T_max_ # this will be replaced by the expected number of training steps
+  eta_min: 1e-6
+dataloader_kwargs:
+  # per-gpu batch size
+  batch_size: 1
+  num_workers: 0
+  pin_memory: True
+peft_config:
+  _target_: peft.LoraConfig
+  task_type: peft.TaskType.CAUSAL_LM
+  target_modules:
+    # lora attention modules
+    - q_proj
+    - v_proj
+    # lora mlp modules
+    - gate_proj
+    - down_proj
+    - up_proj
+  r: 64
+  lora_alpha: 16
+  lora_dropout: 0
+  bias: none
+adapter_name: default
+# whether to merge and unload the adapter after training
+merge_and_unload: false
+# Training hyperparameters
+# if max_epochs=-1, max_steps will be used to determine the number of training steps
+max_epochs: 3
+max_steps: -1
+max_steps_per_epoch: -1
+accumulate_grad_batches: 1
+lr_scheduler_interval: step
+lr_scheduler_frequency: 1
+# Checkpointing may be done by epoch or step, and at the end of training
+# `checkpoint_save_interval` can be 'epoch' or 'step'
+checkpoint_save_interval: epoch
+checkpoint_save_frequency: 1
+# Whether to use gradient clipping, and if so, the value and algorithm
+gradient_clip_val: null
+gradient_clip_algorithm: norm
+save_optimizer_state: false
+# save_full_model must be true when using shared FSDP
+save_full_model: false
+# save_ckpt_type can be 'peft' or 'lightning'
+save_ckpt_type: lightning
+# Path to checkpoint to load from, used for resuming training
+ckpt_path: null
+max_length: 4096

fusion_bench_config/method/mixtral_moe_merging.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+name: mixtral_moe_upscaling # or "mixtral_for_causal_lm_moe_upscaling"
+experts_per_token: 2
+# path to save the upscaled model
+save_checkpoint: null

fusion_bench_config/method/mixtral_moe_upscaling.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+# or fusion_bench.method.MixtralUpscalingAlgorithm
+_target_: fusion_bench.method.MixtralForCausalLMUpscalingAlgorithm
+num_experts: 4
+experts_per_token: 2
+# path to save the upscaled model
+save_checkpoint: null

fusion_bench_config/method/model_recombination.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+_target_: fusion_bench.method.ModelRecombinationAlgorithm
+# if `return_modelpool` is not null, the argument `return_modelpool` passed to the `run` method will be ignored.
+return_modelpool: null

fusion_bench_config/method/opcm/opcm.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+_target_: fusion_bench.method.opcm.opcm.OPCMForCLIP
+# shuffle the order of the models
+shuffle_order: true
+# the scaling factor for the SVD projection
+alpha: 0.5
+# the random seed to use
+seed: null
+# save the merged model on every step
+save_on_every_step: true
+# evaluate the merged model on every step
+evaluate_on_every_step: true

fusion_bench_config/method/opcm/task_arithmetic.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+_target_: fusion_bench.method.opcm.task_arithmetic.ContinualTaskArithmeticForCLIP
+scaling_factor: 0.3
+# shuffle the order of the models
+shuffle_order: true
+# the random seed to use
+seed: null
+# save the merged model on every step
+save_on_every_step: true
+# evaluate the merged model on every step
+evaluate_on_every_step: true

fusion_bench_config/method/opcm/ties_merging.yaml ADDED Viewed

@@ -0,0 +1,18 @@
+_target_: fusion_bench.method.opcm.ties_merging.ContinualTiesMergingForCLIP
+# Scaling factor $\lambda$
+scaling_factor: 0.5
+threshold: 20
+# List of keys to remove from the state dict, default is empty
+remove_keys: []
+# Function to merge the models, default is sum. Options are 'sum', 'mean', and 'max'
+merge_func: sum
+# shuffle the order of the models
+shuffle_order: true
+# the random seed to use
+seed: null
+# save the merged model on every step
+save_on_every_step: true
+# evaluate the merged model on every step
+evaluate_on_every_step: true

fusion_bench_config/method/opcm/weight_average.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+_target_: fusion_bench.method.opcm.weight_average.ContinualWeightAverageForCLIP
+# shuffle the order of the models
+shuffle_order: true
+# the random seed to use
+seed: null
+# save the merged model on every step
+save_on_every_step: true
+# evaluate the merged model on every step
+evaluate_on_every_step: true

fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+_target_: fusion_bench.method.MagnitudePruningForLlama
+_recursive_: false
+# `prune_type` can be either `unstructured` or `semistructured`
+prune_type: unstructured
+# device and dtype to compute the pruning mask
+device: cuda
+dtype: null
+# === options for unstructured pruning ===
+# `sparsity_ratio` is the ratio of weights to be pruned, 1 means all weights are pruned
+sparsity_ratio: 0.5
+# === options for semistructured pruning ===
+# 2:4 means 2 out of 4 weights are pruned
+n: 2
+m: 4

fusion_bench_config/method/pruning/llama_random_pruning.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+_target_: fusion_bench.method.RandomPruningForLlama
+prune_type: unstructured
+# === options for unstructured pruning ===
+# `sparsity_ratio` is the ratio of weights to be pruned, 1 means all weights are pruned
+sparsity_ratio: 0.5
+# === options for semistructured pruning ===
+# 2:4 means 2 out of 4 weights are pruned
+n: 2
+m: 4

fusion_bench_config/method/pruning/llama_wanda_pruning.yaml ADDED Viewed

@@ -0,0 +1,16 @@
+_target_: fusion_bench.method.WandaPruningForLlama
+nsamples: 128
+seed: 0
+use_variant: false
+# `prune_type` can be either `unstructured` or `semistructured`
+prune_type: unstructured
+# device and dtype to compute the pruning mask
+device: cuda
+dtype: null
+# === options for unstructured pruning ===
+# `sparsity_ratio` is the ratio of weights to be pruned, 1 means all weights are pruned
+sparsity_ratio: 0.5
+# === options for semistructured pruning ===
+# 2:4 means 2 out of 4 weights are pruned
+n: 2
+m: 4

fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+_target_: fusion_bench.method.MagnitudeDiffPruningAlgorithm
+prune_ratio: 0.5
+rescale: false
+extract_names: null
+prune_type: minor

fusion_bench_config/method/pwe_moe_ls_for_clip.yaml ADDED Viewed

@@ -0,0 +1,22 @@
+_target_: fusion_bench.method.PWEMoELinearScalarizationForCLIP # or PWEMoExactParetoOptimalForCLIP
+upscale_mlp: true
+upscale_attn: true
+# scaling factor for the remaining parameters
+init_lambda: 0.3
+router_hidden_layers: 2
+lr: 1e-5
+num_steps: 8000
+save_interval: 2000
+alpha: 1 # alpha for dirichlet, if alpha=1, then it is uniform
+# load model from this checkpoint
+checkpoint_path: null
+# evaluation grid
+eval_grid: true
+eval_grid_n: 8
+eval_grid_m: 2
+dataloader_kwargs:
+  # per-device batch size
+  batch_size: 16
+  num_workers: 4

fusion_bench_config/method/rankone_moe/rankone_moe.yaml ADDED Viewed

@@ -0,0 +1,26 @@
+name: ??? # this can be
+# the path for loading the model weights, if specified, skip the test-time adaptation training
+checkpoint: False
+# the path for saving the model weights.
+save_checkpoint: False
+router_hidden_layers: 1
+init_lambda: 0.3
+batch_reduce: true
+# device to compute svd
+svd_accelerator: cuda
+rank_k: 32 # How many experts are added to the pool per task?
+select_k: -1  # How many experts are selected from the pool to merge? Range is (1, rank_k*task_num). In particular -1: All the experts in the pool
+# learning rate
+lr: 1e-4
+optimizer: adam
+# this is overridden by `fabric.devices` if launched from the `fusion_bench` CLI.
+devices: 1
+batch_size: 16
+num_workers: 16
+max_steps: 1000 # default: 1000
+# if true, we will use the gradient accumulation across tasks to save memory
+use_grad_accumulate: true
+cache_dir: outputs
+fast_dev_run: ${fast_dev_run}

fusion_bench_config/method/regmean/clip_regmean.yaml ADDED Viewed

@@ -0,0 +1,11 @@
+_target_: fusion_bench.method.RegMeanAlgorithmForCLIP
+# list, regular expression of names of parameters that need to be excluded
+exclude_param_names_regex: []
+# numbers of examples to compute regmean weights
+num_regmean_examples: 256
+weight_transpose: true
+# float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+reduce_non_diagonal_ratio: 0.6
+dataloader_kwargs:
+  batch_size: 32
+  num_workers: 0

fusion_bench_config/method/regmean/gpt2_regmean.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+_target_: fusion_bench.method.RegMeanAlgorithmForGPT2
+# list, regular expression of names of parameters that need to be excluded
+exclude_param_names_regex: []
+# numbers of examples to compute regmean weights
+num_regmean_examples: 256
+# float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+reduce_non_diagonal_ratio: 0.6
+weight_transpose: false
+cache_dir: outputs
+batch_size: 32
+num_workers: 0

fusion_bench_config/method/regmean/regmean.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+_target_: ???
+num_regmean_examples: 256
+reduce_non_diagonal_ratio: 0.1
+exclude_param_names_regex: []

fusion_bench_config/method/simple_average.yaml ADDED Viewed

	@@ -0,0 +1 @@
1	+ _target_: fusion_bench.method.SimpleAverageAlgorithm

fusion_bench_config/method/slerp/slerp.yaml ADDED Viewed

@@ -0,0 +1,6 @@
+_target_: fusion_bench.method.SlerpMergeAlgorithm
+t: 0.5 # interpolation factor
+DOT_THRESHOLD: 0.9995
+epsilon: 1e-8

fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+name: singular_projection_merging
+# merge device on cuda can accelerate the SVD computation
+device: cuda
+k: 128
+rank: low # or high
+full_matrices: false
+# path to save/load the model
+model_path: null

fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+name: smile_mistral_upscaling
+device: cpu
+accelerator: cuda
+# path to save/load the model
+model_path: null
+model_dtype: float16
+num_experts_per_tok: 1
+rank_of_router: 8
+# if rank_of_expert < 0, dense expert is used.
+rank_of_expert: 512