ipex-llm 2.2.0b20250107__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250108__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/libbloom_amx.so +0 -0
- ipex_llm/libs/libbloom_avx.so +0 -0
- ipex_llm/libs/libbloom_avx2.so +0 -0
- ipex_llm/libs/libbloom_avx512.so +0 -0
- ipex_llm/libs/libbloom_avxvnni.so +0 -0
- ipex_llm/libs/libgptneox_amx.so +0 -0
- ipex_llm/libs/libgptneox_avx.so +0 -0
- ipex_llm/libs/libgptneox_avx2.so +0 -0
- ipex_llm/libs/libgptneox_avx512.so +0 -0
- ipex_llm/libs/libgptneox_avxvnni.so +0 -0
- ipex_llm/libs/libllama_amx.so +0 -0
- ipex_llm/libs/libllama_avx.so +0 -0
- ipex_llm/libs/libllama_avx2.so +0 -0
- ipex_llm/libs/libllama_avx512.so +0 -0
- ipex_llm/libs/libllama_avxvnni.so +0 -0
- ipex_llm/libs/libstarcoder_amx.so +0 -0
- ipex_llm/libs/libstarcoder_avx.so +0 -0
- ipex_llm/libs/libstarcoder_avx2.so +0 -0
- ipex_llm/libs/libstarcoder_avx512.so +0 -0
- ipex_llm/libs/libstarcoder_avxvnni.so +0 -0
- ipex_llm/libs/quantize-bloom +0 -0
- ipex_llm/libs/quantize-gptneox +0 -0
- ipex_llm/libs/quantize-llama +0 -0
- ipex_llm/libs/quantize-starcoder +0 -0
- ipex_llm/transformers/convert.py +15 -37
- ipex_llm/transformers/loader.py +1 -1
- ipex_llm/transformers/low_bit_linear.py +10 -25
- ipex_llm/transformers/model.py +0 -7
- ipex_llm/transformers/models/chatglm4v.py +1 -0
- ipex_llm/transformers/models/glm.py +3 -1
- ipex_llm/transformers/models/llama.py +1 -1
- ipex_llm/transformers/models/minicpm.py +2 -1
- ipex_llm/transformers/models/minicpmv.py +1 -0
- ipex_llm/transformers/models/utils.py +3 -16
- ipex_llm/transformers/speculative.py +2 -14
- ipex_llm/transformers/utils.py +2 -14
- ipex_llm/transformers/xpu_ops.py +25 -19
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250108.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250108.dist-info}/RECORD +45 -46
- ipex_llm/transformers/models/gptj.py +0 -441
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250108.data}/scripts/ipex-llm-init +0 -0
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250108.data}/scripts/llm-chat +0 -0
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250108.data}/scripts/llm-cli +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250108.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250108.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250108.dist-info}/top_level.txt +0 -0
ipex_llm/libs/libbloom_amx.so
CHANGED
Binary file
ipex_llm/libs/libbloom_avx.so
CHANGED
Binary file
ipex_llm/libs/libbloom_avx2.so
CHANGED
Binary file
ipex_llm/libs/libbloom_avx512.so
CHANGED
Binary file
ipex_llm/libs/libbloom_avxvnni.so
CHANGED
Binary file
ipex_llm/libs/libgptneox_amx.so
CHANGED
Binary file
ipex_llm/libs/libgptneox_avx.so
CHANGED
Binary file
ipex_llm/libs/libgptneox_avx2.so
CHANGED
Binary file
ipex_llm/libs/libgptneox_avx512.so
CHANGED
Binary file
ipex_llm/libs/libgptneox_avxvnni.so
CHANGED
Binary file
ipex_llm/libs/libllama_amx.so
CHANGED
Binary file
ipex_llm/libs/libllama_avx.so
CHANGED
Binary file
ipex_llm/libs/libllama_avx2.so
CHANGED
Binary file
ipex_llm/libs/libllama_avx512.so
CHANGED
Binary file
ipex_llm/libs/libllama_avxvnni.so
CHANGED
Binary file
ipex_llm/libs/libstarcoder_amx.so
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avx.so
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avx2.so
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avx512.so
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avxvnni.so
CHANGED
Binary file
ipex_llm/libs/quantize-bloom
CHANGED
Binary file
ipex_llm/libs/quantize-gptneox
CHANGED
Binary file
ipex_llm/libs/quantize-llama
CHANGED
Binary file
ipex_llm/libs/quantize-starcoder
CHANGED
Binary file
ipex_llm/transformers/convert.py
CHANGED
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     optimize_lm_head=optimize_lm_head
                 )
                 device = module.weight.data.device
-
-                if get_ipex_version() < "2.1.10+xpu":
-                    new_linear._parameters['weight'] = nn.Parameter(module.weight)
-                else:
-                    # only from 2.1, ipex provides matmul_bias_out
-                    # so we need to transpose weight
-                    new_weight = module.weight.transpose(0, 1).contiguous()
-                    new_linear._parameters['weight'] = nn.Parameter(new_weight)
-                    new_linear.weight_type = 2
+                new_linear._parameters['weight'] = nn.Parameter(module.weight)
                 if module.bias is not None:
-                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
-                        .to(device)
+                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
             elif qtype == ggml_tensor_qtype["bf16"]:
                 module.to(torch.bfloat16)
                 if _USE_VLLM:
@@ -856,18 +847,9 @@ def replace_with_low_bit_linear_for_module(model, qtype, module_name=None,
                 mp_group=mp_group,
             )
             device = module.weight.data.device
-
-            if get_ipex_version() < "2.1.10+xpu":
-                new_linear._parameters['weight'] = nn.Parameter(module.weight)
-            else:
-                # only from 2.1, ipex provides matmul_bias_out
-                # so we need to transpose weight
-                new_weight = module.weight.transpose(0, 1).contiguous()
-                new_linear._parameters['weight'] = nn.Parameter(new_weight)
-                new_linear.weight_type = 2
+            new_linear._parameters['weight'] = nn.Parameter(module.weight)
             if module.bias is not None:
-                new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
-                    .to(device)
+                new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
         elif qtype == ggml_tensor_qtype["bf16"]:
             module.to(torch.bfloat16)
             new_linear = BF16Linear(
@@ -1429,6 +1411,7 @@ def _optimize_post(model):
         convert_forward(model, module.GlmRMSNorm, rms_norm_forward)
         convert_forward(model, module.GlmMLP, mlp_silu_forward)
         convert_forward(model, module.GlmAttention, glm_attention_forward)
+        convert_forward(model, module.GlmSdpaAttention, glm_attention_forward)
         glm_model_forward = glm_model_forward_wrapper(module.GlmModel.forward)
         convert_forward(model, module.GlmModel, glm_model_forward)
 
@@ -1437,10 +1420,12 @@ def _optimize_post(model):
         vision_module_name = model.model.vision.__class__.__module__
         vision_module = importlib.import_module(vision_module_name)
         from transformers.models.siglip.modeling_siglip import SiglipAttention
+        from transformers.models.siglip.modeling_siglip import SiglipSdpaAttention
         from ipex_llm.transformers.models.chatglm4v import vision_model_forward
         from ipex_llm.transformers.models.minicpmv import siglip_attention_forward
         convert_forward(model, vision_module.VisionModel, vision_model_forward)
         convert_forward(model, SiglipAttention, siglip_attention_forward)
+        convert_forward(model, SiglipSdpaAttention, siglip_attention_forward)
 
     elif "mpt" in model.config.model_type:
         if model.config.architectures is not None:
@@ -1452,21 +1437,6 @@ def _optimize_post(model):
                         module.MultiheadAttention,
                         mpt_multihead_attention_forward
                         )
-    elif "gptj" in model.config.model_type:
-        # dolly-v1-6b
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
-            gptj_block_forward
-        convert_forward(model,
-                        module.GPTJAttention,
-                        gptj_attention_forward)
-        convert_forward(model,
-                        module.GPTJModel,
-                        gptj_model_forward)
-        convert_forward(model,
-                        module.GPTJBlock,
-                        gptj_block_forward)
     elif "bloom" in model.config.model_type:
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1691,8 +1661,10 @@ def _optimize_post(model):
         convert_forward(model, module.Qwen2MLP, qwen2_mlp_forward)
         model.visual.get_dtype = MethodType(qwen2_vision_get_dtype, model.visual)
         convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
+        convert_forward(model, module.VisionSdpaAttention, qwen2_vision_attention_forward)
         convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
         convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
+        convert_forward(model, module.Qwen2VLSdpaAttention, qwen2_vl_attention_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1838,6 +1810,7 @@ def _optimize_post(model):
         from ipex_llm.transformers.models.starcoder2 import attention_forward
         from ipex_llm.transformers.models.starcoder2 import model_forward
         convert_forward(model, module.Starcoder2Attention, attention_forward)
+        convert_forward(model, module.Starcoder2SdpaAttention, attention_forward)
         convert_forward(model, module.Starcoder2Model, model_forward)
     elif model.config.model_type == "phi":
         # for phi-2
@@ -1853,6 +1826,7 @@ def _optimize_post(model):
         module = importlib.import_module(modeling_module_name)
         from ipex_llm.transformers.models.phi3 import attention_forward
         convert_forward(model, module.Phi3Attention, attention_forward)
+        convert_forward(model, module.Phi3SdpaAttention, attention_forward)
         from ipex_llm.transformers.models.phi3 import mlp_forward
         convert_forward(model, module.Phi3MLP, mlp_forward)
         from ipex_llm.transformers.models.common import rms_norm_forward
@@ -1896,6 +1870,8 @@ def _optimize_post(model):
                         module.StableLmAttention,
                         stablelm_attention_forward
                         )
+        if hasattr(module, "StableLmSdpaAttention"):
+            convert_forward(model, module.StableLmSdpaAttention, stablelm_attention_forward)
         convert_forward(model,
                         module.StableLmMLP,
                         mlp_silu_forward)
@@ -1910,6 +1886,7 @@ def _optimize_post(model):
         from ipex_llm.transformers.models.minicpm import minicpm_model_forward_wrapper
         from ipex_llm.transformers.models.minicpm import minicpm_decoder_layer_forward
         convert_forward(model, module.MiniCPMAttention, minicpm_attention_forward)
+        convert_forward(model, module.MiniCPMSdpaAttention, minicpm_attention_forward)
         convert_forward(model, module.MiniCPMMLP, mlp_silu_forward)
         convert_forward(model, module.MiniCPMRMSNorm, rms_norm_forward)
         convert_forward(model, module.MiniCPMDecoderLayer, minicpm_decoder_layer_forward)
@@ -1925,6 +1902,7 @@ def _optimize_post(model):
         convert_forward(model, module.MiniCPMRMSNorm, rms_norm_forward)
         convert_forward(model, module.MiniCPMMLP, mlp_silu_forward)
         convert_forward(model, module.MiniCPMAttention, minicpm3_attention_forward)
+        convert_forward(model, module.MiniCPMSdpaAttention, minicpm3_attention_forward)
         minicpm3_model_forward = minicpm3_model_forward_wrapper(module.MiniCPM3Model.forward)
         convert_forward(model, module.MiniCPM3Model, minicpm3_model_forward)
     elif model.config.model_type == "minicpmv":
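Note: most of the convert.py churn above registers the transformers SdpaAttention subclasses (GlmSdpaAttention, Phi3SdpaAttention, MiniCPMSdpaAttention, ...) with the same replacement forwards already used for their eager counterparts, drops the GPT-J branch, and removes the IPEX-version gate around FP16Linear weights. The snippet below is only a minimal sketch of the general forward-patching pattern that convert_forward-style helpers rely on; patch_forward and target_cls are illustrative names, not ipex-llm's actual implementation.

import types
import torch

def patch_forward(model: torch.nn.Module, target_cls: type, new_forward) -> None:
    # Walk the module tree and rebind `forward` on every instance of target_cls.
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = types.MethodType(new_forward, module)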
ipex_llm/transformers/loader.py
CHANGED
@@ -22,7 +22,7 @@ import time
 from datetime import date
 import argparse
 from ipex_llm.utils.common import invalidInputError
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, LlamaTokenizer
 
 LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']
 
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -51,8 +51,7 @@ from torch import Tensor, device, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
-    get_ipex_version
+from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm
 
 T = TypeVar("T", bound="torch.nn.Module")
@@ -286,7 +285,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
         or (
             qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
             and batch_size <= 48
-            and device_name in ["arc", "pvc", "mtl", "
+            and device_name in ["arc", "pvc", "mtl", "arl"]
             and x.shape[1] % 256 == 0
             and output_len % 32 == 0
         )
@@ -759,9 +758,9 @@ class FP16Linear(nn.Linear):
         self.weight_length = self.out_len * self.in_len
         self.qtype = ggml_tensor_qtype["fp16"]
         self.mp_group = mp_group
-        #
-        #
-        #
+        # weight_type = 1 means original weight
+        # weight_type = 2 means weight has been transposed
+        # weight_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
         self.disable_fp16_opt = False
@@ -775,28 +774,14 @@ class FP16Linear(nn.Linear):
 
         x = x.to(torch.float16)
         if self.bias is not None and self.bias.dtype != x.dtype:
-
+            self.bias.data = self.bias.data.to(x.dtype)
         if self.weight is not None and self.weight.dtype != x.dtype:
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-
-
-
-                    or self.disable_fp16_opt
-            ):
-                if self.weight_type == 2:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 1
-                result = F.linear(x, self.weight, self.bias)
-            else:
-                if self.weight_type == 1:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 2
-                result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
-                                                              self.weight, self.bias)
+            invalidInputError(self.weight_type == 1, "weight_type should be 1")
+            result = F.linear(x, self.weight, self.bias)
+
             if self.mp_group is not None:
                 if get_use_vllm():
                     result = self.mp_group.all_reduce(result)
@@ -852,7 +837,7 @@ class FP16Linear(nn.Linear):
         if self.disable_fp16_opt:
             return False
         # esimd kernel can only be used for Arc and Flex
-        if gpu_type not in ["arc"
+        if gpu_type not in ["arc"]:
             return False
         # now esimd kernel can only be used for specific cases (llama2-7b shape)
         if self.in_len == 11008 and self.out_features == 4096:
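Note: the low_bit_linear.py hunks above remove the IPEX-version-dependent weight transpose and the torch.ops.torch_ipex.matmul_bias_out path, so FP16Linear now keeps weights in their original layout (weight_type == 1) and always goes through F.linear. A minimal sketch of the resulting fp16 path with the class machinery stripped out (function and argument names here are illustrative, not ipex-llm's API):

from typing import Optional

import torch
import torch.nn.functional as F

def fp16_linear(x: torch.Tensor, weight: torch.Tensor,
                bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Cast once to fp16, then use the standard linear kernel on the untransposed weight.
    x = x.to(torch.float16)
    weight = weight.to(torch.float16)
    if bias is not None:
        bias = bias.to(torch.float16)
    return F.linear(x, weight, bias)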
ipex_llm/transformers/model.py
CHANGED
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
     self.to(origin_device)
 
 
-def _load_pre():
-    from transformers import GPTJModel
-    from ipex_llm.transformers.models.gptj import gptj_model_new_init
-    GPTJModel.__init__ = gptj_model_new_init
-
-
 class _BaseAutoModelClass:
     HF_MODEL = None
 
@@ -495,7 +489,6 @@ class _BaseAutoModelClass:
            else:
                if quant_config is not None:
                    kwargs["quantization_config"] = quant_config
-           _load_pre()
            try:
                # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
                kwargs.pop('device_map', None)
ipex_llm/transformers/models/chatglm4v.py
CHANGED
@@ -301,6 +301,7 @@ def patch_embedding_forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L,
 
 def merge_qkv(module: torch.nn.Module):
     merge_qkv_base(module, "SiglipAttention")
+    merge_qkv_base(module, "SiglipSdpaAttention")
 
 
 def vision_model_forward(self: torch.nn.Module, image: torch.Tensor):
ipex_llm/transformers/models/glm.py
CHANGED
@@ -37,6 +37,7 @@ import torch
 
 from typing import Optional, Tuple
 from transformers.cache_utils import Cache
+from transformers.models.glm.modeling_glm import GlmAttention
 from transformers.models.glm.modeling_glm import apply_rotary_pos_emb
 from ipex_llm.transformers.kv import DynamicNormalCache, DynamicFp8Cache
 from ipex_llm.transformers.models.common import merge_qkv_base
@@ -46,8 +47,9 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache
 
 
 def merge_qkv(module: torch.nn.Module):
-    merge_qkv_base(module,
+    merge_qkv_base(module, GlmAttention)
     merge_qkv_base(module, "SiglipAttention")
+    merge_qkv_base(module, "SiglipSdpaAttention")
 
 
 def split_mlp(module: torch.nn.Module):
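Note: the merge_qkv changes in glm.py (and in the chatglm4v, minicpm and minicpmv hunks around it) extend the existing q/k/v fusion to the Sdpa attention subclasses. Conceptually, a merge_qkv_base-style fusion concatenates the three projection weights so one GEMM produces q, k and v together. The sketch below shows that idea in isolation, assuming the usual Hugging Face q_proj/k_proj/v_proj attribute names; it is not ipex-llm's actual helper.

import torch
from torch import nn

def fuse_qkv(attn: nn.Module) -> None:
    q, k, v = attn.q_proj, attn.k_proj, attn.v_proj
    fused = nn.Linear(q.in_features,
                      q.out_features + k.out_features + v.out_features,
                      bias=q.bias is not None)
    # Stack the projections row-wise so fused(x) == cat([q(x), k(x), v(x)], dim=-1).
    fused.weight.data = torch.cat([q.weight.data, k.weight.data, v.weight.data], dim=0)
    if q.bias is not None:
        fused.bias.data = torch.cat([q.bias.data, k.bias.data, v.bias.data], dim=0)
    attn.qkv_proj = fused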
ipex_llm/transformers/models/minicpm.py
CHANGED
@@ -51,7 +51,8 @@ from transformers.cache_utils import Cache
 
 
 def merge_qkv(module: torch.nn.Module):
-
+    merge_qkv_base(module, "MiniCPMAttention")
+    merge_qkv_base(module, "MiniCPMSdpaAttention")
 
 
 def apply_residual_scale(module: torch.nn.Module):
ipex_llm/transformers/models/minicpmv.py
CHANGED
@@ -36,6 +36,7 @@ from transformers.generation.logits_process import RepetitionPenaltyLogitsProces
 # MiniCPM-V-2_5 and MiniCPM-V-2_6
 def merge_qkv(module: torch.nn.Module):
     merge_qkv_base(module, "SiglipAttention")
+    merge_qkv_base(module, "SiglipSdpaAttention")
     merge_qkv_base(module, "Idefics2VisionAttention")
 
 
ipex_llm/transformers/models/utils.py
CHANGED
@@ -19,7 +19,7 @@ import torch
 import warnings
 from ipex_llm.utils.common import invalidInputError
 from ipex_llm.ggml.quantize import ggml_tensor_qtype
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
     FP6, ASYM_INT4
 
@@ -168,7 +168,7 @@ def should_use_fuse_rope(hidden_states, position_ids, training):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
     if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox", "mistral",
-                        "
+                        "qwen2", "yuan", "stablelm", "qwen2_moe"]:
         # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
         cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
         sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
@@ -183,7 +183,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
         q_embed = (q * cos) + (rotate_half(q) * sin)
         k_embed = (k * cos) + (rotate_half(k) * sin)
         return q_embed, k_embed
-    elif model_family in ["
+    elif model_family in ["chatglm"]:
         q_embed = (q * cos) + (rotate_every_two(q) * sin)
         k_embed = (k * cos) + (rotate_every_two(k) * sin)
         return q_embed, k_embed
@@ -192,19 +192,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
                           f"{model_family} is not supported.")
 
 
-def apply_ipex_rotate_every_two(q, k, cos, sin):
-    # ipex's apply_rotary_embedding_two_qk can change the origin storage,
-    # so q/k will get the result directly.
-    from ipex_llm.transformers.utils import get_ipex_version
-    if get_ipex_version() >= "2.1.10+xpu":
-        torch.ops.torch_ipex.apply_rotary_embedding_two_qk(
-            q, k, sin, cos, q, k
-        )
-    else:
-        torch.ops.torch_ipex.apply_rotary_embedding(q, sin, cos, q)
-        torch.ops.torch_ipex.apply_rotary_embedding(k, sin, cos, k)
-
-
 def is_enough_kv_cache_room_4_36(past_key_value, idx, seq_len=1):
     # to determinate if is enough kv cache room in transformers==4.36
     # seq_len for current seq len
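Note: apply_rotary_pos_emb above applies rotary position embeddings as q_embed = q * cos + rotate_half(q) * sin for the llama-style families, and the rotate_every_two variant for chatglm. rotate_half itself is not shown in the hunk; a minimal sketch matching the usual llama-style convention is:

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the head dimension in two and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)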
ipex_llm/transformers/speculative.py
CHANGED
@@ -432,8 +432,7 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l
     from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
         extend_kv_cache
     enough_kv_room = True
-    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral",
-                          "gptj", "opt"]:
+    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", "opt"]:
         return past_key_values, False
     cache_k = past_key_values[0][0]
     if model_type == "chatglm":
@@ -527,7 +526,7 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
              v[:-(new_cache_size), :, :, :])
            for k, v in past_key_values
        ]
-    elif self.config.model_type in ["baichuan"
+    elif self.config.model_type in ["baichuan"]:
        past_key_values = [
            (k[:, :, :-(new_cache_size), :],
             v[:, :, :-(new_cache_size), :])
@@ -796,13 +795,6 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_
                                    device=verify_input_ids.device)
        position_ids = position_ids.unsqueeze(0).repeat(1, 1) + past_key_value_len
        forward_args["position_ids"] = position_ids
-    elif self.config.model_type == "gptj":
-        past_length = past_key_values[0][0].size(2)
-        input_len = verify_input_ids.shape[1]
-        position_ids = torch.arange(past_length, input_len + past_length,
-                                    dtype=torch.long, device=verify_input_ids.device)
-        position_ids = position_ids.unsqueeze(0).view(-1, input_len)
-        forward_args["position_ids"] = position_ids
 
     return self(**forward_args)
 
@@ -971,10 +963,6 @@ def speculative_generate(self,
                past_key_value_len = past_key_values[0][0].shape[0]
                position_ids = torch.Tensor([[past_key_value_len + step_draft]]).long()
                forward_args["position_ids"] = position_ids
-            elif self.config.model_type == "gptj":
-                past_length = draft_past_key_values[0][0].size(2)
-                position_ids = torch.Tensor([[past_length]]).long().to(self.device)
-                forward_args["position_ids"] = position_ids
 
            if _enable_ipex:
                if any(keyword in self.config.model_type
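Note: besides dropping the gptj branches, the -527 hunk keeps the generic cache-cropping logic of _crop_past_key_values visible: legacy (key, value) caches are shortened along the sequence dimension when draft tokens are rejected. In isolation, the batch-first case looks roughly like the sketch below, which ignores the per-model_type dispatch and uses illustrative names.

def crop_kv_cache(past_key_values, new_cache_size: int):
    # Drop the last `new_cache_size` positions from each (key, value) pair;
    # assumes the common [batch, heads, seq, head_dim] layout.
    return [
        (k[:, :, :-new_cache_size, :], v[:, :, :-new_cache_size, :])
        for k, v in past_key_values
    ]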
ipex_llm/transformers/utils.py
CHANGED
@@ -154,24 +154,12 @@ def get_autocast_dtype(x):
                          f"Device {x.device} is not supported.")
 
 
-_ipex_version = None
-
-
-def get_ipex_version():
-
-    global _ipex_version
-    if _ipex_version is not None:
-        return _ipex_version
-
-    import intel_extension_for_pytorch as ipex
-    _ipex_version = ipex.__version__
-    return _ipex_version
-
-
 def get_xpu_device_name(device: torch.device):
     if device.type != "xpu":
         return device.type
     else:
+        # possiable device name:
+        # ["arc", "pvc", "mtl", "lnl", "bmg", "arl", "legacy", "unknown"]
         import xe_linear
         return xe_linear.get_xpu_device_name(device)
 
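Note: this hunk deletes the module-level get_ipex_version helper, and its call sites disappear in the convert.py, low_bit_linear.py and models/utils.py hunks above, so nothing in the package branches on the IPEX version anymore. For reference, the deleted helper was a simple memoized version lookup along these lines (reproduced from the removed code, not part of the new release):

_ipex_version = None

def get_ipex_version() -> str:
    # Cache the version string so intel_extension_for_pytorch is imported at most once.
    global _ipex_version
    if _ipex_version is None:
        import intel_extension_for_pytorch as ipex
        _ipex_version = ipex.__version__
    return _ipex_version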
ipex_llm/transformers/xpu_ops.py
CHANGED
@@ -20,9 +20,9 @@ import xe_batch
 import xe_addons
 
 
-@torch.library.register_fake("ipex_llm::forward_new")
-def _(x, weight, qtype, input_size):
-
+# @torch.library.register_fake("ipex_llm::forward_new")
+# def _(x, weight, qtype, input_size):
+#     return ???
 
 
 # @torch.library.register_fake("ipex_llm::dequant")
@@ -32,32 +32,38 @@ def _(x, weight, qtype, input_size):
 
 @torch.library.register_fake("ipex_llm::mlp_forward_xpu")
 def _(x, weight1, weight2, batch_size, state_size, output_size, act_type, qtype):
-    return torch.
+    return torch.empty([batch_size, output_size],
+                       dtype=x.dtype, device=x.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
+def _(time_decay, time_first, key, value, num_state, den_state, max_state):
+    return torch.empty_like(key)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
+def _(time_decay, time_first, receptance, key, value, state):
+    bsz, n_heads, seq_len, head_dim = key.shape
+    return torch.empty([bsz, seq_len, n_heads, head_dim],
+                       dtype=key.dtype, device=key.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_time_shift")
+def _(hidden, shifted, mix):
+    bsz, seq_len, hidden_size = hidden.shape
+    return torch.empty([mix.size(0), bsz, seq_len, hidden_size],
+                       dtype=hidden.dtype, device=hidden.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::dequantize_rows")
+def _(x, weight, qtype, state_size, output_size):
+    return torch.empty([x.size(0), x.size(1), state_size],
+                       dtype=torch.float, device=weight.device)
 
 
-@torch.library.register_fake("ipex_llm::batch_forward")
-def _(x, weight, qtype):
-
+# @torch.library.register_fake("ipex_llm::batch_forward")
+# def _(x, weight, qtype):
+#     return ???
 
 
 @torch.library.register_fake("ipex_llm::sdp")
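Note: xpu_ops.py fills in fake (meta) kernels for several custom ipex_llm ops via torch.library.register_fake (available since PyTorch 2.4). A fake kernel never runs the real XPU code; it only returns an empty tensor with the right shape, dtype and device so torch.compile and meta-device tracing can propagate shapes. A self-contained illustration with a made-up op follows; mylib::scale_rows is not part of ipex-llm.

import torch

@torch.library.custom_op("mylib::scale_rows", mutates_args=())
def scale_rows(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Real (eager) implementation used when the op actually runs.
    return x * factor

@torch.library.register_fake("mylib::scale_rows")
def _(x, factor):
    # Fake kernel: describe the output without computing it.
    return torch.empty_like(x)

# Meta tracing and torch.compile can now reason about the op without executing it.
out = torch.ops.mylib.scale_rows(torch.randn(2, 3), 2.0)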