ipex-llm 2.2.0b20250107__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250109__py3-none-manylinux2010_x86_64.whl

This diff compares the contents of two publicly released versions of this package as published to their public registry. It is provided for informational purposes only.
Files changed (63)
  1. ipex_llm/libs/libbloom_amx.so +0 -0
  2. ipex_llm/libs/libbloom_avx.so +0 -0
  3. ipex_llm/libs/libbloom_avx2.so +0 -0
  4. ipex_llm/libs/libbloom_avx512.so +0 -0
  5. ipex_llm/libs/libbloom_avxvnni.so +0 -0
  6. ipex_llm/libs/libgptneox_amx.so +0 -0
  7. ipex_llm/libs/libgptneox_avx.so +0 -0
  8. ipex_llm/libs/libgptneox_avx2.so +0 -0
  9. ipex_llm/libs/libgptneox_avx512.so +0 -0
  10. ipex_llm/libs/libgptneox_avxvnni.so +0 -0
  11. ipex_llm/libs/libllama_amx.so +0 -0
  12. ipex_llm/libs/libllama_avx.so +0 -0
  13. ipex_llm/libs/libllama_avx2.so +0 -0
  14. ipex_llm/libs/libllama_avx512.so +0 -0
  15. ipex_llm/libs/libllama_avxvnni.so +0 -0
  16. ipex_llm/libs/libstarcoder_amx.so +0 -0
  17. ipex_llm/libs/libstarcoder_avx.so +0 -0
  18. ipex_llm/libs/libstarcoder_avx2.so +0 -0
  19. ipex_llm/libs/libstarcoder_avx512.so +0 -0
  20. ipex_llm/libs/libstarcoder_avxvnni.so +0 -0
  21. ipex_llm/libs/quantize-bloom +0 -0
  22. ipex_llm/libs/quantize-gptneox +0 -0
  23. ipex_llm/libs/quantize-llama +0 -0
  24. ipex_llm/libs/quantize-starcoder +0 -0
  25. ipex_llm/transformers/convert.py +20 -50
  26. ipex_llm/transformers/loader.py +1 -1
  27. ipex_llm/transformers/low_bit_linear.py +10 -25
  28. ipex_llm/transformers/model.py +0 -7
  29. ipex_llm/transformers/models/baichuan.py +7 -36
  30. ipex_llm/transformers/models/bert.py +2 -13
  31. ipex_llm/transformers/models/chatglm2.py +8 -31
  32. ipex_llm/transformers/models/chatglm4.py +9 -4
  33. ipex_llm/transformers/models/chatglm4v.py +2 -1
  34. ipex_llm/transformers/models/common.py +3 -1
  35. ipex_llm/transformers/models/glm.py +4 -2
  36. ipex_llm/transformers/models/internlm.py +6 -3
  37. ipex_llm/transformers/models/llama.py +2 -2
  38. ipex_llm/transformers/models/minicpm.py +3 -2
  39. ipex_llm/transformers/models/minicpm3.py +3 -1
  40. ipex_llm/transformers/models/minicpmv.py +1 -0
  41. ipex_llm/transformers/models/mistral.py +1 -1
  42. ipex_llm/transformers/models/mllama.py +1 -1
  43. ipex_llm/transformers/models/phi3.py +6 -2
  44. ipex_llm/transformers/models/qwen.py +4 -2
  45. ipex_llm/transformers/models/qwen2.py +4 -3
  46. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  47. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  48. ipex_llm/transformers/models/stablelm.py +3 -1
  49. ipex_llm/transformers/models/starcoder2.py +3 -1
  50. ipex_llm/transformers/models/utils.py +10 -19
  51. ipex_llm/transformers/models/yuan.py +2 -1
  52. ipex_llm/transformers/speculative.py +2 -14
  53. ipex_llm/transformers/utils.py +2 -14
  54. ipex_llm/transformers/xpu_ops.py +25 -19
  55. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
  56. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +62 -63
  57. ipex_llm/transformers/models/gptj.py +0 -441
  58. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init +0 -0
  59. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat +0 -0
  60. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli +0 -0
  61. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
  62. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
  63. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0

ipex_llm/transformers/convert.py

@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
  optimize_lm_head=optimize_lm_head
  )
  device = module.weight.data.device
- from ipex_llm.transformers.utils import get_ipex_version
- if get_ipex_version() < "2.1.10+xpu":
- new_linear._parameters['weight'] = nn.Parameter(module.weight)
- else:
- # only from 2.1, ipex provides matmul_bias_out
- # so we need to transpose weight
- new_weight = module.weight.transpose(0, 1).contiguous()
- new_linear._parameters['weight'] = nn.Parameter(new_weight)
- new_linear.weight_type = 2
+ new_linear._parameters['weight'] = nn.Parameter(module.weight)
  if module.bias is not None:
- new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
- .to(device)
+ new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
  elif qtype == ggml_tensor_qtype["bf16"]:
  module.to(torch.bfloat16)
  if _USE_VLLM:

@@ -856,18 +847,9 @@ def replace_with_low_bit_linear_for_module(model, qtype, module_name=None,
  mp_group=mp_group,
  )
  device = module.weight.data.device
- from ipex_llm.transformers.utils import get_ipex_version
- if get_ipex_version() < "2.1.10+xpu":
- new_linear._parameters['weight'] = nn.Parameter(module.weight)
- else:
- # only from 2.1, ipex provides matmul_bias_out
- # so we need to transpose weight
- new_weight = module.weight.transpose(0, 1).contiguous()
- new_linear._parameters['weight'] = nn.Parameter(new_weight)
- new_linear.weight_type = 2
+ new_linear._parameters['weight'] = nn.Parameter(module.weight)
  if module.bias is not None:
- new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
- .to(device)
+ new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
  elif qtype == ggml_tensor_qtype["bf16"]:
  module.to(torch.bfloat16)
  new_linear = BF16Linear(

@@ -1343,7 +1325,6 @@ def _optimize_post(model):
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
  from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward
  from ipex_llm.transformers.models.chatglm2 import mlp_forward

@@ -1356,9 +1337,7 @@ def _optimize_post(model):
  convert_forward(model,
  module.ChatGLMModel,
  chatglm2_model_forward)
- convert_forward(model,
- module.RMSNorm,
- chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
  convert_forward(model, module.MLP, mlp_forward)
  # for codegeex-nano
  if hasattr(model.config, "rope_ratio"):

@@ -1376,8 +1355,7 @@ def _optimize_post(model):
  # glm4 family
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
- convert_forward(model, module.RMSNorm, chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)

  if hasattr(model.transformer, "vision"):
  # glm4 vision family

@@ -1429,6 +1407,7 @@ def _optimize_post(model):
  convert_forward(model, module.GlmRMSNorm, rms_norm_forward)
  convert_forward(model, module.GlmMLP, mlp_silu_forward)
  convert_forward(model, module.GlmAttention, glm_attention_forward)
+ convert_forward(model, module.GlmSdpaAttention, glm_attention_forward)
  glm_model_forward = glm_model_forward_wrapper(module.GlmModel.forward)
  convert_forward(model, module.GlmModel, glm_model_forward)

@@ -1437,10 +1416,12 @@ def _optimize_post(model):
  vision_module_name = model.model.vision.__class__.__module__
  vision_module = importlib.import_module(vision_module_name)
  from transformers.models.siglip.modeling_siglip import SiglipAttention
+ from transformers.models.siglip.modeling_siglip import SiglipSdpaAttention
  from ipex_llm.transformers.models.chatglm4v import vision_model_forward
  from ipex_llm.transformers.models.minicpmv import siglip_attention_forward
  convert_forward(model, vision_module.VisionModel, vision_model_forward)
  convert_forward(model, SiglipAttention, siglip_attention_forward)
+ convert_forward(model, SiglipSdpaAttention, siglip_attention_forward)

  elif "mpt" in model.config.model_type:
  if model.config.architectures is not None:

@@ -1452,21 +1433,6 @@ def _optimize_post(model):
  module.MultiheadAttention,
  mpt_multihead_attention_forward
  )
- elif "gptj" in model.config.model_type:
- # dolly-v1-6b
- modeling_module_name = model.__class__.__module__
- module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
- gptj_block_forward
- convert_forward(model,
- module.GPTJAttention,
- gptj_attention_forward)
- convert_forward(model,
- module.GPTJModel,
- gptj_model_forward)
- convert_forward(model,
- module.GPTJBlock,
- gptj_block_forward)
  elif "bloom" in model.config.model_type:
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)

@@ -1478,8 +1444,8 @@ def _optimize_post(model):
  elif model.config.model_type == "baichuan":
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.baichuan import baichuan_mlp_forward
- convert_forward(model, module.MLP, baichuan_mlp_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
+ convert_forward(model, module.MLP, mlp_silu_forward)

  if model.config.hidden_size in [4096, 2048]:
  # baichuan-7B and baichuan2-7B

@@ -1488,7 +1454,6 @@ def _optimize_post(model):
  for i in range(len(model.model.layers)):
  setattr(model.model.layers[i].self_attn, "layer_idx", i)
  convert_forward(model, module.Attention, baichuan_attention_forward_7b)
- convert_forward(model, module.RMSNorm, rms_norm_forward)
  if model.config.vocab_size == 125696:
  # baichuan2-7B
  convert_forward(model, module.BaichuanModel, baichuan_model_7b_forward)

@@ -1498,9 +1463,7 @@ def _optimize_post(model):
  elif model.config.hidden_size == 5120:
  # baichuan-13B and baichuan2-13B
  from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b
- from ipex_llm.transformers.models.baichuan import baichuan_13b_rms_norm_forward
  convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b)
- convert_forward(model, module.RMSNorm, baichuan_13b_rms_norm_forward)

  if model.config.vocab_size == 125696:
  # baichaun2-13B

@@ -1595,7 +1558,6 @@ def _optimize_post(model):
  from ipex_llm.transformers.models.qwen import qwen_attention_forward
  from ipex_llm.transformers.models.qwen import qwen_attention_forward_registered
  from ipex_llm.transformers.models.qwen import qwen_mlp_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.qwen import qwen_model_forward
  if model.config.max_position_embeddings == 8192 \
  and model.config.hidden_size == 4096:

@@ -1610,7 +1572,7 @@ def _optimize_post(model):
  )
  convert_forward(model,
  module.RMSNorm,
- chatglm_rms_norm_forward)
+ rms_norm_forward)
  convert_forward(model,
  module.QWenMLP,
  qwen_mlp_forward)

@@ -1691,8 +1653,10 @@ def _optimize_post(model):
  convert_forward(model, module.Qwen2MLP, qwen2_mlp_forward)
  model.visual.get_dtype = MethodType(qwen2_vision_get_dtype, model.visual)
  convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
+ convert_forward(model, module.VisionSdpaAttention, qwen2_vision_attention_forward)
  convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
  convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
+ convert_forward(model, module.Qwen2VLSdpaAttention, qwen2_vl_attention_forward)
  elif model.config.model_type == "aquila":
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)

@@ -1838,6 +1802,7 @@ def _optimize_post(model):
  from ipex_llm.transformers.models.starcoder2 import attention_forward
  from ipex_llm.transformers.models.starcoder2 import model_forward
  convert_forward(model, module.Starcoder2Attention, attention_forward)
+ convert_forward(model, module.Starcoder2SdpaAttention, attention_forward)
  convert_forward(model, module.Starcoder2Model, model_forward)
  elif model.config.model_type == "phi":
  # for phi-2

@@ -1853,6 +1818,7 @@ def _optimize_post(model):
  module = importlib.import_module(modeling_module_name)
  from ipex_llm.transformers.models.phi3 import attention_forward
  convert_forward(model, module.Phi3Attention, attention_forward)
+ convert_forward(model, module.Phi3SdpaAttention, attention_forward)
  from ipex_llm.transformers.models.phi3 import mlp_forward
  convert_forward(model, module.Phi3MLP, mlp_forward)
  from ipex_llm.transformers.models.common import rms_norm_forward

@@ -1896,6 +1862,8 @@ def _optimize_post(model):
  module.StableLmAttention,
  stablelm_attention_forward
  )
+ if hasattr(module, "StableLmSdpaAttention"):
+ convert_forward(model, module.StableLmSdpaAttention, stablelm_attention_forward)
  convert_forward(model,
  module.StableLmMLP,
  mlp_silu_forward)

@@ -1910,6 +1878,7 @@ def _optimize_post(model):
  from ipex_llm.transformers.models.minicpm import minicpm_model_forward_wrapper
  from ipex_llm.transformers.models.minicpm import minicpm_decoder_layer_forward
  convert_forward(model, module.MiniCPMAttention, minicpm_attention_forward)
+ convert_forward(model, module.MiniCPMSdpaAttention, minicpm_attention_forward)
  convert_forward(model, module.MiniCPMMLP, mlp_silu_forward)
  convert_forward(model, module.MiniCPMRMSNorm, rms_norm_forward)
  convert_forward(model, module.MiniCPMDecoderLayer, minicpm_decoder_layer_forward)

@@ -1925,6 +1894,7 @@ def _optimize_post(model):
  convert_forward(model, module.MiniCPMRMSNorm, rms_norm_forward)
  convert_forward(model, module.MiniCPMMLP, mlp_silu_forward)
  convert_forward(model, module.MiniCPMAttention, minicpm3_attention_forward)
+ convert_forward(model, module.MiniCPMSdpaAttention, minicpm3_attention_forward)
  minicpm3_model_forward = minicpm3_model_forward_wrapper(module.MiniCPM3Model.forward)
  convert_forward(model, module.MiniCPM3Model, minicpm3_model_forward)
  elif model.config.model_type == "minicpmv":

ipex_llm/transformers/loader.py

@@ -22,7 +22,7 @@ import time
  from datetime import date
  import argparse
  from ipex_llm.utils.common import invalidInputError
- from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
+ from transformers import AutoTokenizer, LlamaTokenizer

  LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']


ipex_llm/transformers/low_bit_linear.py

@@ -51,8 +51,7 @@ from torch import Tensor, device, dtype, nn
  from operator import mul
  from functools import reduce
  from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
- from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
- get_ipex_version
+ from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name
  from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

  T = TypeVar("T", bound="torch.nn.Module")

@@ -286,7 +285,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
  or (
  qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
  and batch_size <= 48
- and device_name in ["arc", "pvc", "mtl", "lnl", "arl"]
+ and device_name in ["arc", "pvc", "mtl", "arl"]
  and x.shape[1] % 256 == 0
  and output_len % 32 == 0
  )

@@ -759,9 +758,9 @@ class FP16Linear(nn.Linear):
  self.weight_length = self.out_len * self.in_len
  self.qtype = ggml_tensor_qtype["fp16"]
  self.mp_group = mp_group
- # weigh_type = 1 means original weight
- # weigh_type = 2 means weight has been transposed
- # weigh_type = 3 means weight has been transposed by esimd method
+ # weight_type = 1 means original weight
+ # weight_type = 2 means weight has been transposed
+ # weight_type = 3 means weight has been transposed by esimd method
  self.weight_type = 1
  self.optimize_lm_head = optimize_lm_head
  self.disable_fp16_opt = False

@@ -775,28 +774,14 @@ class FP16Linear(nn.Linear):

  x = x.to(torch.float16)
  if self.bias is not None and self.bias.dtype != x.dtype:
- self.bias.data = self.bias.data.to(x.dtype)
+ self.bias.data = self.bias.data.to(x.dtype)
  if self.weight is not None and self.weight.dtype != x.dtype:
  self.weight.data = self.weight.data.to(x.dtype)

  if not self.use_esimd_kernel(x):
- if (
- get_ipex_version() < "2.1.10+xpu"
- or get_xpu_device_name(x.device) not in ["arc", "pvc"]
- or self.disable_fp16_opt
- ):
- if self.weight_type == 2:
- self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
- requires_grad=False)
- self.weight_type = 1
- result = F.linear(x, self.weight, self.bias)
- else:
- if self.weight_type == 1:
- self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
- requires_grad=False)
- self.weight_type = 2
- result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
- self.weight, self.bias)
+ invalidInputError(self.weight_type == 1, "weight_type should be 1")
+ result = F.linear(x, self.weight, self.bias)
+
  if self.mp_group is not None:
  if get_use_vllm():
  result = self.mp_group.all_reduce(result)

@@ -852,7 +837,7 @@ class FP16Linear(nn.Linear):
  if self.disable_fp16_opt:
  return False
  # esimd kernel can only be used for Arc and Flex
- if gpu_type not in ["arc", "flex"]:
+ if gpu_type not in ["arc"]:
  return False
  # now esimd kernel can only be used for specific cases (llama2-7b shape)
  if self.in_len == 11008 and self.out_features == 4096:

ipex_llm/transformers/model.py

@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
  self.to(origin_device)


- def _load_pre():
- from transformers import GPTJModel
- from ipex_llm.transformers.models.gptj import gptj_model_new_init
- GPTJModel.__init__ = gptj_model_new_init
-
-
  class _BaseAutoModelClass:
  HF_MODEL = None


@@ -495,7 +489,6 @@ class _BaseAutoModelClass:
  else:
  if quant_config is not None:
  kwargs["quantization_config"] = quant_config
- _load_pre()
  try:
  # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
  kwargs.pop('device_map', None)

ipex_llm/transformers/models/baichuan.py

@@ -47,38 +47,6 @@ def pre_compute_inv_freq(module: torch.nn.Module):
  module.register_buffer("inv_freq", inv_freq, persistent=False)


- def baichuan_13b_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon)
- return self.weight * hidden_states.to(input_dtype)
-
-
- def baichuan_mlp_forward(
- self,
- x: torch.Tensor,
- ) -> torch.Tensor:
- x_2d = x.view(-1, x.shape[-1])
- qtype = getattr(self.gate_proj, "qtype", None)
- if mlp_fusion_check(x_2d, qtype, self.training):
- import xe_linear
- if not x_2d.is_contiguous():
- x_2d = x_2d.contiguous()
- return self.down_proj(xe_linear.mlp_forward_xpu(
- x_2d, self.gate_proj.weight.data, self.up_proj.weight.data,
- x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len,
- SILU, qtype
- ))
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
  def baichuan_model_7b_forward(
  self,
  input_ids: torch.LongTensor = None,

@@ -105,7 +73,9 @@ def baichuan_model_7b_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs)
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
+ self.config.num_attention_heads,
+ self.config.num_attention_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:

@@ -278,8 +248,6 @@ def baichuan_attention_forward_7b(
  key_states = key_states.to(hidden_states.dtype)

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
-
  # [CompressKV]
  if use_compresskv:
  enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value,

@@ -290,6 +258,8 @@
  query_states, attention_mask, 1,
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH)
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

@@ -340,7 +310,8 @@ def baichuan_attention_forward_13b(
  kv_seq_len += past_key_value[0].shape[2]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

ipex_llm/transformers/models/bert.py

@@ -36,24 +36,13 @@ import math
  import torch
  from typing import Optional, Tuple
  from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder
+ from ipex_llm.transformers.models.common import merge_linear
  from ipex_llm.utils.common import invalidInputError


  def merge_qkv(module: torch.nn.Module):
  if isinstance(module, BertSelfAttention):
- q_w = module.query.weight.data
- k_w = module.key.weight.data
- v_w = module.value.weight.data
- q_b = module.query.bias.data
- k_b = module.key.bias.data
- v_b = module.value.bias.data
- new_w = torch.cat([q_w, k_w, v_w], dim=0)
- new_b = torch.cat([q_b, k_b, v_b], dim=-1)
- qkv = torch.nn.Linear(0, 0, bias=True)
- qkv.weight = torch.nn.Parameter(new_w, requires_grad=False)
- qkv.bias = torch.nn.Parameter(new_b, requires_grad=False)
- qkv.in_features = module.query.in_features
- qkv.out_features = module.query.out_features * 3
+ qkv = merge_linear([module.query, module.key, module.value])
  module.qkv = qkv
  del module.query
  del module.key
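
Note: the hunk above swaps the hand-rolled q/k/v concatenation for the shared merge_linear helper. The sketch below is illustrative only, not the ipex_llm.transformers.models.common implementation; its behavior is inferred from the call site and from the removed code. It merges several nn.Linear layers with the same in_features into one.

import torch

def merge_linear_sketch(linears):
    # Concatenate the weights (and biases, if all layers have one) of several
    # same-input Linear layers into a single wider Linear, as the removed
    # bert.py code did by hand for query/key/value.
    weight = torch.cat([lin.weight.data for lin in linears], dim=0)
    has_bias = all(lin.bias is not None for lin in linears)
    merged = torch.nn.Linear(linears[0].in_features, weight.size(0), bias=has_bias)
    merged.weight = torch.nn.Parameter(weight, requires_grad=False)
    if has_bias:
        bias = torch.cat([lin.bias.data for lin in linears], dim=-1)
        merged.bias = torch.nn.Parameter(bias, requires_grad=False)
    return merged

q, k, v = (torch.nn.Linear(16, 16) for _ in range(3))
x = torch.randn(2, 16)
merged = merge_linear_sketch([q, k, v])
# The merged projection produces q, k and v in one GEMM.
assert torch.allclose(merged(x), torch.cat([q(x), k(x), v(x)], dim=-1), atol=1e-5)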

ipex_llm/transformers/models/chatglm2.py

@@ -33,34 +33,6 @@ from ipex_llm.transformers.kv import DynamicCompressCache, DynamicCompressFp8Cac
  KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
- """
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states
- go from (batch, num_key_value_heads, seqlen, head_dim) to
- (batch, num_attention_heads, seqlen, head_dim)
- """
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
- if n_rep == 1:
- return hidden_states
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads,
- n_rep, slen, head_dim)
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
- def chatglm_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.eps)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
- return self.weight * hidden_states.to(input_dtype)
-
-
  def chatglm2_model_forward(
  self,
  input_ids,

@@ -91,8 +63,13 @@ def chatglm2_model_forward(

  if use_cache:
  use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[1])
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
  use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- input_ids)
+ input_ids, n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
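
Note: this hunk (and the matching chatglm4 and baichuan hunks) shows use_quantize_kv_cache now taking attention-head and kv-head counts. A minimal standalone sketch of how those counts fall out of a ChatGLM-style config; FakeChatGLMConfig is a hypothetical stand-in, not part of ipex-llm:

from dataclasses import dataclass

@dataclass
class FakeChatGLMConfig:
    # ChatGLM-style configs expose multi_query_attention / multi_query_group_num
    # instead of a num_key_value_heads field.
    num_attention_heads: int = 32
    multi_query_attention: bool = True
    multi_query_group_num: int = 2

def kv_head_counts(config):
    n_heads = config.num_attention_heads
    n_kv_heads = config.multi_query_group_num if config.multi_query_attention else n_heads
    return n_heads, n_kv_heads

print(kv_head_counts(FakeChatGLMConfig()))  # (32, 2)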

@@ -285,8 +262,6 @@ def chatglm2_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig

@@ -300,6 +275,8 @@
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/chatglm4.py

@@ -55,8 +55,13 @@ def chatglm4_model_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- inputs)
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
+ use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj, inputs,
+ n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:

@@ -211,8 +216,6 @@ def chatglm4_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig

@@ -226,6 +229,8 @@
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/chatglm4v.py

@@ -230,7 +230,7 @@ def chatglm4v_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states, n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

@@ -301,6 +301,7 @@ def patch_embedding_forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L,

  def merge_qkv(module: torch.nn.Module):
  merge_qkv_base(module, "SiglipAttention")
+ merge_qkv_base(module, "SiglipSdpaAttention")


  def vision_model_forward(self: torch.nn.Module, image: torch.Tensor):

ipex_llm/transformers/models/common.py

@@ -157,8 +157,10 @@ def rms_norm_forward(self, hidden_states: torch.Tensor):
  weight = self.weight
  if hasattr(self, "variance_epsilon"):
  eps = self.variance_epsilon
- else:
+ elif hasattr(self, "epsilon"):
  eps = self.epsilon
+ else:
+ eps = self.eps

  if hidden_states.device.type == 'xpu' and hidden_states.dtype in [torch.float, torch.half]:
  import xe_addons
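
Note: the hunk above makes the shared rms_norm_forward tolerant of the different epsilon attribute names used across RMSNorm variants (variance_epsilon in most HF models, epsilon in baichuan-13B, eps in chatglm), which is what allows the chatglm/baichuan-specific norm forwards earlier in this diff to be dropped. A small standalone sketch of that attribute probing; the classes are illustrative stand-ins, not real model code:

import torch

class HFStyleRMSNorm(torch.nn.Module):       # exposes variance_epsilon
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

class ChatGLMStyleRMSNorm(torch.nn.Module):  # exposes eps
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

def resolve_eps(module):
    # Same precedence as the patched rms_norm_forward: variance_epsilon, then epsilon, then eps.
    for name in ("variance_epsilon", "epsilon", "eps"):
        if hasattr(module, name):
            return getattr(module, name)
    raise AttributeError("no epsilon attribute found")

print(resolve_eps(HFStyleRMSNorm(8)), resolve_eps(ChatGLMStyleRMSNorm(8)))  # 1e-06 1e-06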

ipex_llm/transformers/models/glm.py

@@ -37,6 +37,7 @@ import torch

  from typing import Optional, Tuple
  from transformers.cache_utils import Cache
+ from transformers.models.glm.modeling_glm import GlmAttention
  from transformers.models.glm.modeling_glm import apply_rotary_pos_emb
  from ipex_llm.transformers.kv import DynamicNormalCache, DynamicFp8Cache
  from ipex_llm.transformers.models.common import merge_qkv_base

@@ -46,8 +47,9 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache


  def merge_qkv(module: torch.nn.Module):
- merge_qkv_base(module, "GlmAttention")
+ merge_qkv_base(module, GlmAttention)
  merge_qkv_base(module, "SiglipAttention")
+ merge_qkv_base(module, "SiglipSdpaAttention")


  def split_mlp(module: torch.nn.Module):

@@ -145,7 +147,7 @@ def glm_model_forward_wrapper(origin_forward):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)

  if use_cache: