fusion-bench 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. fusion_bench/__init__.py +1 -0
  2. fusion_bench/_get_started/__init__.py +3 -0
  3. fusion_bench/_get_started/greeting_program.py +49 -0
  4. fusion_bench/compat/method/base_algorithm.py +14 -0
  5. fusion_bench/constants/__init__.py +5 -0
  6. fusion_bench/constants/clip_vision.py +26 -2
  7. fusion_bench/constants/paths.py +4 -0
  8. fusion_bench/dataset/clip_dataset.py +2 -1
  9. fusion_bench/dataset/gpt2_glue.py +9 -9
  10. fusion_bench/dataset/image_corruption/__init__.py +0 -0
  11. fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
  12. fusion_bench/dataset/image_dataset.py +1 -1
  13. fusion_bench/dataset/nyuv2.py +2 -2
  14. fusion_bench/method/__init__.py +16 -1
  15. fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
  16. fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
  17. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
  18. fusion_bench/method/base_algorithm.py +195 -12
  19. fusion_bench/method/bitdelta/__init__.py +4 -0
  20. fusion_bench/method/bitdelta/bitdelta.py +156 -0
  21. fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
  22. fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
  23. fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
  24. fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
  25. fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
  26. fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
  27. fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
  28. fusion_bench/method/doge_ta/doge_ta.py +1 -1
  29. fusion_bench/method/ensemble.py +12 -12
  30. fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
  31. fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -2
  32. fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
  33. fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
  34. fusion_bench/method/fw_merging/fw_hard.py +1 -1
  35. fusion_bench/method/fw_merging/fw_soft.py +1 -1
  36. fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
  37. fusion_bench/method/linear/expo.py +2 -1
  38. fusion_bench/method/linear/linear_interpolation.py +6 -4
  39. fusion_bench/method/linear/simple_average_for_llama.py +16 -6
  40. fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
  41. fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
  42. fusion_bench/method/model_recombination.py +2 -5
  43. fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
  44. fusion_bench/method/moe_pruner/utils/data.py +2 -1
  45. fusion_bench/method/moe_pruner/utils/prune.py +6 -1
  46. fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
  47. fusion_bench/method/pruning/wanda_utils/data.py +1 -2
  48. fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
  49. fusion_bench/method/randes/modelsoup.py +1 -3
  50. fusion_bench/method/regmean/clip_regmean.py +2 -2
  51. fusion_bench/method/regmean/gpt2_regmean.py +3 -10
  52. fusion_bench/method/regmean/regmean.py +2 -11
  53. fusion_bench/method/regmean_plusplus/__init__.py +3 -0
  54. fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +199 -0
  55. fusion_bench/method/regmean_plusplus/regmean_plusplus.py +383 -0
  56. fusion_bench/method/simple_average.py +16 -4
  57. fusion_bench/method/slerp/slerp.py +5 -2
  58. fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
  59. fusion_bench/method/smile_upscaling/projected_energy.py +145 -0
  60. fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +39 -28
  61. fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
  62. fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
  63. fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
  64. fusion_bench/method/ties_merging/ties_merging.py +13 -26
  65. fusion_bench/method/we_moe/clip_we_moe.py +5 -4
  66. fusion_bench/method/we_moe/we_moe.py +6 -6
  67. fusion_bench/method/weighted_average/llama.py +4 -16
  68. fusion_bench/metrics/continual_learning/__init__.py +1 -0
  69. fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
  70. fusion_bench/metrics/nyuv2/__init__.py +2 -2
  71. fusion_bench/metrics/nyuv2/segmentation.py +1 -1
  72. fusion_bench/mixins/__init__.py +10 -2
  73. fusion_bench/mixins/clip_classification.py +4 -3
  74. fusion_bench/mixins/hydra_config.py +105 -7
  75. fusion_bench/mixins/lightning_fabric.py +2 -0
  76. fusion_bench/mixins/serialization.py +265 -48
  77. fusion_bench/modelpool/__init__.py +2 -2
  78. fusion_bench/modelpool/base_pool.py +29 -9
  79. fusion_bench/modelpool/causal_lm/causal_lm.py +9 -0
  80. fusion_bench/modelpool/clip_vision/modelpool.py +43 -12
  81. fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
  82. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
  83. fusion_bench/models/__init__.py +2 -1
  84. fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
  85. fusion_bench/models/hf_utils.py +182 -0
  86. fusion_bench/models/linearized/linearized_model_utils.py +4 -4
  87. fusion_bench/models/linearized/vision_model.py +1 -1
  88. fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
  89. fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
  90. fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
  91. fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
  92. fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
  93. fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
  94. fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
  95. fusion_bench/models/modeling_smile_llama/__init__.py +0 -0
  96. fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
  97. fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +705 -0
  98. fusion_bench/models/modeling_smile_llama/register.py +8 -0
  99. fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
  100. fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
  101. fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +6 -7
  102. fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
  103. fusion_bench/models/parameter_dict.py +1 -1
  104. fusion_bench/models/sparse_we_moe.py +1 -53
  105. fusion_bench/models/utils.py +26 -0
  106. fusion_bench/models/we_moe.py +1 -53
  107. fusion_bench/models/wrappers/ensemble.py +6 -4
  108. fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
  109. fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
  110. fusion_bench/programs/base_program.py +81 -2
  111. fusion_bench/programs/fabric_fusion_program.py +24 -8
  112. fusion_bench/scripts/cli.py +6 -6
  113. fusion_bench/taskpool/base_pool.py +4 -3
  114. fusion_bench/taskpool/clip_vision/taskpool.py +34 -18
  115. fusion_bench/taskpool/dummy.py +1 -1
  116. fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
  117. fusion_bench/tasks/clip_classification/__init__.py +6 -4
  118. fusion_bench/utils/__init__.py +6 -1
  119. fusion_bench/utils/devices.py +14 -4
  120. fusion_bench/utils/instantiate_utils.py +3 -1
  121. fusion_bench/utils/misc.py +48 -2
  122. fusion_bench/utils/modelscope.py +265 -0
  123. fusion_bench/utils/parameters.py +2 -2
  124. fusion_bench/utils/rich_utils.py +3 -0
  125. fusion_bench/utils/state_dict_arithmetic.py +34 -27
  126. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/METADATA +31 -24
  127. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/RECORD +189 -153
  128. fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
  129. fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
  130. fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
  131. fusion_bench_config/_get_started/greeting_program.yaml +4 -0
  132. fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
  133. fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
  134. fusion_bench_config/fabric_model_fusion.yaml +45 -17
  135. fusion_bench_config/hydra/default.yaml +6 -2
  136. fusion_bench_config/llama_full_finetune.yaml +1 -0
  137. fusion_bench_config/method/adamerging/clip.yaml +1 -1
  138. fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
  139. fusion_bench_config/method/depth_upscaling.yaml +4 -1
  140. fusion_bench_config/method/regmean/clip_regmean.yaml +1 -1
  141. fusion_bench_config/method/regmean_plusplus/clip_regmean_plusplus.yaml +11 -0
  142. fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
  143. fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
  144. fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +1 -0
  145. fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
  146. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +73 -8
  147. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +27 -7
  148. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +34 -4
  149. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +14 -17
  150. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +14 -3
  151. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL10.yaml +39 -5
  152. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL12.yaml +49 -5
  153. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +55 -5
  154. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +21 -4
  155. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL16.yaml +61 -5
  156. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL18.yaml +67 -5
  157. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +73 -5
  158. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +26 -3
  159. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
  160. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +7 -5
  161. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +6 -10
  162. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_and_cars.yaml +6 -7
  163. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_and_dtd.yaml +6 -7
  164. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_cars_and_dtd.yaml +7 -8
  165. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +8 -6
  166. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +4 -6
  167. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +32 -7
  168. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +14 -6
  169. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +73 -8
  170. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +27 -7
  171. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +6 -10
  172. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +2 -2
  173. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
  174. fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
  175. fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
  176. fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
  177. fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
  178. fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
  179. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
  180. fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
  181. fusion_bench_config/nyuv2_config.yaml +3 -1
  182. fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
  183. fusion_bench_config/path/default.yaml +28 -0
  184. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
  185. fusion_bench_config/method/adamerging.yaml +0 -23
  186. fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
  187. fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
  188. fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
  189. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/WHEEL +0 -0
  190. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/entry_points.txt +0 -0
  191. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/licenses/LICENSE +0 -0
  192. {fusion_bench-0.2.19.dist-info → fusion_bench-0.2.21.dist-info}/top_level.txt +0 -0
  193. /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py (new file)
@@ -0,0 +1,705 @@
import logging
from functools import partial
from typing import Callable, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from transformers import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from transformers.models.llama.modeling_llama import (
    LLAMA_INPUTS_DOCSTRING,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
    LossKwargs,
    add_start_docstrings_to_model_forward,
    can_return_tuple,
    is_torch_flex_attn_available,
    replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg

from fusion_bench.models.smile_moe.linear_from_hf_config import SmileLinear

from .configuration_smile_llama import SmileLlamaConfig

if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask
    from transformers.integrations.flex_attention import make_flex_block_causal_mask


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "SmileLlamaConfig"


class SmileLlamaMLP(nn.Module):
    def __init__(self, config: SmileLlamaConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        # * --- replace nn.Linear with SmileLinear ---
        self.gate_proj = SmileLinear(
            config, self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.up_proj = SmileLinear(
            config, self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.down_proj = SmileLinear(
            config, self.intermediate_size, self.hidden_size, bias=config.mlp_bias
        )
        # * --- end of replacement ---
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class SmileLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: SmileLlamaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # * --- replace nn.Linear with SmileLinear ---
        self.q_proj = SmileLinear(
            config,
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = SmileLinear(
            config,
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = SmileLinear(
            config,
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = SmileLinear(
            config,
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )
        # * --- end of replacement ---

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get(
                "output_attentions", False
            ):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[
                    self.config._attn_implementation
                ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class SmileLlamaDecoderLayer(nn.Module):

    def __init__(self, config: SmileLlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = SmileLlamaAttention(config=config, layer_idx=layer_idx)

        self.mlp = SmileLlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class SmileLlamaPreTrainedModel(PreTrainedModel):
    config_class = SmileLlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SmileLlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class SmileLlamaModel(SmileLlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: SmileLlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                SmileLlamaDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LlamaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError(
                "The `past_key_values` should be either a `Cache` object or `None`."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            if isinstance(attention_mask, BlockMask):
                return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not using_static_cache
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
                    :, None, None, :
                ].to(causal_mask.device)
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)

        return causal_mask


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class SmileLlamaForCausalLM(SmileLlamaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: SmileLlamaConfig):
        super().__init__(config)
        self.model = SmileLlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
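
Note (editorial, not part of the diff): the new file above defines SmileLlamaConfig-driven classes (SmileLlamaModel, SmileLlamaForCausalLM) that mirror the Hugging Face Llama implementation but swap every nn.Linear projection for SmileLinear. The diff also adds a small companion file, fusion_bench/models/modeling_smile_llama/register.py (+8 lines). The sketch below is a hedged guess at how such classes are typically wired into the transformers Auto* registries; the module paths come from the file list, but the exact contents of register.py and the value of SmileLlamaConfig.model_type (defined in configuration_smile_llama.py, not shown here) are assumptions.

# Minimal sketch: registering the SmileLlama classes with transformers' Auto* factories.
# Assumes SmileLlamaConfig.model_type is set in configuration_smile_llama.py.
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fusion_bench.models.modeling_smile_llama.configuration_smile_llama import (
    SmileLlamaConfig,
)
from fusion_bench.models.modeling_smile_llama.modeling_smile_llama import (
    SmileLlamaForCausalLM,
    SmileLlamaModel,
)

# Map the config's model_type to the config class, then map the config class
# to the model classes, so Auto* can resolve checkpoints saved in this format.
AutoConfig.register(SmileLlamaConfig.model_type, SmileLlamaConfig)
AutoModel.register(SmileLlamaConfig, SmileLlamaModel)
AutoModelForCausalLM.register(SmileLlamaConfig, SmileLlamaForCausalLM)

# After registration, a checkpoint saved with this architecture could be loaded
# via the usual Auto API (the path below is a placeholder):
# model = AutoModelForCausalLM.from_pretrained("path/to/smile_llama_checkpoint")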