ipex-llm 2.2.0b20250107__py3-none-win_amd64.whl → 2.2.0b20250109__py3-none-win_amd64.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.
Files changed (68)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +20 -50
  31. ipex_llm/transformers/loader.py +1 -1
  32. ipex_llm/transformers/low_bit_linear.py +10 -25
  33. ipex_llm/transformers/model.py +0 -7
  34. ipex_llm/transformers/models/baichuan.py +7 -36
  35. ipex_llm/transformers/models/bert.py +2 -13
  36. ipex_llm/transformers/models/chatglm2.py +8 -31
  37. ipex_llm/transformers/models/chatglm4.py +9 -4
  38. ipex_llm/transformers/models/chatglm4v.py +2 -1
  39. ipex_llm/transformers/models/common.py +3 -1
  40. ipex_llm/transformers/models/glm.py +4 -2
  41. ipex_llm/transformers/models/internlm.py +6 -3
  42. ipex_llm/transformers/models/llama.py +2 -2
  43. ipex_llm/transformers/models/minicpm.py +3 -2
  44. ipex_llm/transformers/models/minicpm3.py +3 -1
  45. ipex_llm/transformers/models/minicpmv.py +1 -0
  46. ipex_llm/transformers/models/mistral.py +1 -1
  47. ipex_llm/transformers/models/mllama.py +1 -1
  48. ipex_llm/transformers/models/phi3.py +6 -2
  49. ipex_llm/transformers/models/qwen.py +4 -2
  50. ipex_llm/transformers/models/qwen2.py +4 -3
  51. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  52. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  53. ipex_llm/transformers/models/stablelm.py +3 -1
  54. ipex_llm/transformers/models/starcoder2.py +3 -1
  55. ipex_llm/transformers/models/utils.py +10 -19
  56. ipex_llm/transformers/models/yuan.py +2 -1
  57. ipex_llm/transformers/speculative.py +2 -14
  58. ipex_llm/transformers/utils.py +2 -14
  59. ipex_llm/transformers/xpu_ops.py +25 -19
  60. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
  61. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +67 -68
  62. ipex_llm/transformers/models/gptj.py +0 -441
  63. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init.bat +0 -0
  64. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat.ps1 +0 -0
  65. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli.ps1 +0 -0
  66. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
  67. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
  68. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/internlm.py
@@ -87,7 +87,8 @@ def internlm_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
  query_states, key_states, cos, sin, position_ids, "internlm")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/llama.py
@@ -72,7 +72,7 @@ def llama_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -116,7 +116,7 @@ def llama_model_forward(


  def merge_qkv(module: torch.nn.Module):
- return merge_qkv_base(module, LlamaAttention)
+ merge_qkv_base(module, LlamaAttention)


  def llama_attention_forward(
ipex_llm/transformers/models/minicpm.py
@@ -51,7 +51,8 @@ from transformers.cache_utils import Cache


  def merge_qkv(module: torch.nn.Module):
- return merge_qkv_base(module, "MiniCPMAttention")
+ merge_qkv_base(module, "MiniCPMAttention")
+ merge_qkv_base(module, "MiniCPMSdpaAttention")


  def apply_residual_scale(module: torch.nn.Module):
@@ -158,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/minicpm3.py
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/minicpmv.py
@@ -36,6 +36,7 @@ from transformers.generation.logits_process import RepetitionPenaltyLogitsProces
  # MiniCPM-V-2_5 and MiniCPM-V-2_6
  def merge_qkv(module: torch.nn.Module):
  merge_qkv_base(module, "SiglipAttention")
+ merge_qkv_base(module, "SiglipSdpaAttention")
  merge_qkv_base(module, "Idefics2VisionAttention")

ipex_llm/transformers/models/mistral.py
@@ -71,7 +71,7 @@ def mistral_model_forward(
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/mllama.py
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
ipex_llm/transformers/models/phi3.py
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  inputs = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
  if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen.py
@@ -107,7 +107,8 @@ def qwen_attention_forward(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/qwen2.py
@@ -113,10 +113,10 @@ def qwen2_model_forward(
  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(

  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
  and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/qwen2_moe.py
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- input = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input)
+ inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen2_vl.py
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
  # IPEX-LLM OPT start: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/stablelm.py
@@ -69,8 +69,10 @@ def stablelm_model_forward(
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  use_cache = use_cache if use_cache is not None else self.config.use_cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+ num_heads, num_kv_heads))
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/starcoder2.py
@@ -132,7 +132,9 @@ def model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/utils.py
@@ -19,7 +19,7 @@ import torch
  import warnings
  from ipex_llm.utils.common import invalidInputError
  from ipex_llm.ggml.quantize import ggml_tensor_qtype
- from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
+ from ipex_llm.transformers.utils import get_xpu_device_name
  from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
  FP6, ASYM_INT4

@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
  return new_cache_k, new_cache_v


- def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+ num_heads: int, num_kv_heads: int) -> bool:
  if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
  warnings.warn(
  "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
  else:
  device_name = get_xpu_device_name(x.device)
  return (
- device_name in ["mtl", "lnl", "arl"] and kv_group == 1
- or device_name in ["arc", "bmg"] and x.size(0) > 1
+ num_kv_heads >= 4
+ and (
+ device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+ or device_name in ["arc", "bmg"] and x.size(0) > 1
+ )
  )

@@ -168,7 +172,7 @@ def should_use_fuse_rope(hidden_states, position_ids, training):

  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox", "mistral",
- "mixtral", "qwen2", "yuan", "stablelm", "qwen2_moe"]:
+ "qwen2", "yuan", "stablelm", "qwen2_moe"]:
  # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
  cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
  sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
@@ -183,7 +187,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  q_embed = (q * cos) + (rotate_half(q) * sin)
  k_embed = (k * cos) + (rotate_half(k) * sin)
  return q_embed, k_embed
- elif model_family in ["gptj", "chatglm"]:
+ elif model_family in ["chatglm"]:
  q_embed = (q * cos) + (rotate_every_two(q) * sin)
  k_embed = (k * cos) + (rotate_every_two(k) * sin)
  return q_embed, k_embed
@@ -192,19 +196,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  f"{model_family} is not supported.")


- def apply_ipex_rotate_every_two(q, k, cos, sin):
- # ipex's apply_rotary_embedding_two_qk can change the origin storage,
- # so q/k will get the result directly.
- from ipex_llm.transformers.utils import get_ipex_version
- if get_ipex_version() >= "2.1.10+xpu":
- torch.ops.torch_ipex.apply_rotary_embedding_two_qk(
- q, k, sin, cos, q, k
- )
- else:
- torch.ops.torch_ipex.apply_rotary_embedding(q, sin, cos, q)
- torch.ops.torch_ipex.apply_rotary_embedding(k, sin, cos, k)
-
-
  def is_enough_kv_cache_room_4_36(past_key_value, idx, seq_len=1):
  # to determinate if is enough kv cache room in transformers==4.36
  # seq_len for current seq len
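Note: the `use_quantize_kv_cache` hunks above change the helper's contract. It now takes the raw attention-head and KV-head counts instead of a precomputed `kv_group` ratio, which is what all of the per-model call-site updates earlier in this diff pass to it. Read together, the new branch amounts to roughly the following decision rule (a simplified sketch that mirrors the diff; the standalone `should_quantize_kv` name is only for illustration, `batch_size` stands in for `x.size(0)`, and the `BIGDL_QUANTIZE_KV_CACHE` environment-variable override handled earlier in the function is omitted):

def should_quantize_kv(device_name: str, batch_size: int,
                       num_heads: int, num_kv_heads: int) -> bool:
    # Quantized (FP8) KV cache is only considered when the model has at least
    # four KV heads; on integrated GPUs (mtl/lnl/arl) the GQA ratio must also
    # be at most 4, while on discrete GPUs (arc/bmg) it is used for batched input.
    return (
        num_kv_heads >= 4
        and (
            (device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4)
            or (device_name in ["arc", "bmg"] and batch_size > 1)
        )
    )

For example, a model with 32 attention heads and 8 KV heads (GQA ratio 4) now qualifies on an MTL iGPU, whereas the previous `kv_group == 1` test only admitted models without grouped-query attention.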
ipex_llm/transformers/models/yuan.py
@@ -158,7 +158,8 @@ def yuan_attention_forward(
  "yuan")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  None if past_key_value is None else (past_key_value[0], past_key_value[1]),
  key_states, value_states,
ipex_llm/transformers/speculative.py
@@ -432,8 +432,7 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l
  from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
  extend_kv_cache
  enough_kv_room = True
- if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral",
- "gptj", "opt"]:
+ if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", "opt"]:
  return past_key_values, False
  cache_k = past_key_values[0][0]
  if model_type == "chatglm":
@@ -527,7 +526,7 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
  v[:-(new_cache_size), :, :, :])
  for k, v in past_key_values
  ]
- elif self.config.model_type in ["baichuan", "gptj"]:
+ elif self.config.model_type in ["baichuan"]:
  past_key_values = [
  (k[:, :, :-(new_cache_size), :],
  v[:, :, :-(new_cache_size), :])
@@ -796,13 +795,6 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_
  device=verify_input_ids.device)
  position_ids = position_ids.unsqueeze(0).repeat(1, 1) + past_key_value_len
  forward_args["position_ids"] = position_ids
- elif self.config.model_type == "gptj":
- past_length = past_key_values[0][0].size(2)
- input_len = verify_input_ids.shape[1]
- position_ids = torch.arange(past_length, input_len + past_length,
- dtype=torch.long, device=verify_input_ids.device)
- position_ids = position_ids.unsqueeze(0).view(-1, input_len)
- forward_args["position_ids"] = position_ids

  return self(**forward_args)

@@ -971,10 +963,6 @@ def speculative_generate(self,
  past_key_value_len = past_key_values[0][0].shape[0]
  position_ids = torch.Tensor([[past_key_value_len + step_draft]]).long()
  forward_args["position_ids"] = position_ids
- elif self.config.model_type == "gptj":
- past_length = draft_past_key_values[0][0].size(2)
- position_ids = torch.Tensor([[past_length]]).long().to(self.device)
- forward_args["position_ids"] = position_ids

  if _enable_ipex:
  if any(keyword in self.config.model_type
ipex_llm/transformers/utils.py
@@ -154,24 +154,12 @@ def get_autocast_dtype(x):
  f"Device {x.device} is not supported.")


- _ipex_version = None
-
-
- def get_ipex_version():
-
- global _ipex_version
- if _ipex_version is not None:
- return _ipex_version
-
- import intel_extension_for_pytorch as ipex
- _ipex_version = ipex.__version__
- return _ipex_version
-
-
  def get_xpu_device_name(device: torch.device):
  if device.type != "xpu":
  return device.type
  else:
+ # possiable device name:
+ # ["arc", "pvc", "mtl", "lnl", "bmg", "arl", "legacy", "unknown"]
  import xe_linear
  return xe_linear.get_xpu_device_name(device)

ipex_llm/transformers/xpu_ops.py
@@ -20,9 +20,9 @@ import xe_batch
  import xe_addons


- @torch.library.register_fake("ipex_llm::forward_new")
- def _(x, weight, qtype, input_size):
- return torch.empty_like(x)
+ # @torch.library.register_fake("ipex_llm::forward_new")
+ # def _(x, weight, qtype, input_size):
+ # return ???


  # @torch.library.register_fake("ipex_llm::dequant")
@@ -32,32 +32,38 @@ def _(x, weight, qtype, input_size):

  @torch.library.register_fake("ipex_llm::mlp_forward_xpu")
  def _(x, weight1, weight2, batch_size, state_size, output_size, act_type, qtype):
- return torch.empty_like(x)
+ return torch.empty([batch_size, output_size],
+ dtype=x.dtype, device=x.device)


- # @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
- # def _(time_decay, time_first, key, value, num_state, den_state, max_state)
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
+ def _(time_decay, time_first, key, value, num_state, den_state, max_state):
+ return torch.empty_like(key)


- # @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
- # def _(time_decay, time_first, receptance, key, value, state)
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
+ def _(time_decay, time_first, receptance, key, value, state):
+ bsz, n_heads, seq_len, head_dim = key.shape
+ return torch.empty([bsz, seq_len, n_heads, head_dim],
+ dtype=key.dtype, device=key.device)


- # @torch.library.register_fake("ipex_llm::rwkv_time_shift")
- # def _(hidden, shifted, mix):
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_time_shift")
+ def _(hidden, shifted, mix):
+ bsz, seq_len, hidden_size = hidden.shape
+ return torch.empty([mix.size(0), bsz, seq_len, hidden_size],
+ dtype=hidden.dtype, device=hidden.device)


- # @torch.library.register_fake("ipex_llm::dequantize_rows")
- # def _(x, weight, qtype, state_size, output_size):
- # return ???
+ @torch.library.register_fake("ipex_llm::dequantize_rows")
+ def _(x, weight, qtype, state_size, output_size):
+ return torch.empty([x.size(0), x.size(1), state_size],
+ dtype=torch.float, device=weight.device)


- @torch.library.register_fake("ipex_llm::batch_forward")
- def _(x, weight, qtype):
- return torch.empty_like(x)
+ # @torch.library.register_fake("ipex_llm::batch_forward")
+ # def _(x, weight, qtype):
+ # return ???


  @torch.library.register_fake("ipex_llm::sdp")
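For context on the xpu_ops.py hunks above: `torch.library.register_fake` (formerly `torch.library.impl_abstract`) registers a shape-propagation "fake" kernel for a custom operator, which torch.compile and meta-device tracing use to reason about the op without executing the real XPU kernel. Such a kernel only needs to return an empty tensor with the correct shape, dtype and device, which is why the placeholder `torch.empty_like(x)` stubs are replaced with explicitly shaped outputs. A minimal, self-contained sketch using a made-up operator (`mylib::row_sum` is not part of ipex-llm) on a PyTorch build that provides `register_fake`:

import torch

# Hypothetical custom op that sums each row of a 2-D tensor; a real backend
# kernel would be registered separately (e.g. from C++/SYCL).
torch.library.define("mylib::row_sum", "(Tensor x) -> Tensor")

@torch.library.register_fake("mylib::row_sum")
def _(x):
    # No computation here: only describe the output's shape/dtype/device so
    # the compiler can trace through calls to torch.ops.mylib.row_sum.
    return torch.empty([x.size(0)], dtype=x.dtype, device=x.device)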
{ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250107
+ Version: 2.2.0b20250109
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
  Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp-arl'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
  Requires-Dist: setuptools ; extra == 'cpp-arl'
  Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
  Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250107 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250107 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
  Provides-Extra: xpu-arc
  Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'