ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +0 -1
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
- ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
- ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py

@@ -18,108 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob,
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,

@@ -197,69 +97,17 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
 
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                        const_parameter, mode="decode",
                         keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -297,14 +145,14 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-
+        const_parameter = False
         keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if
-        post_attn_layernorm_weights=[layer_norm_1] if
+        input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+        post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
         cached_cos=cached_cos,
         cached_sin=cached_sin,
         num_heads=num_heads,

@@ -334,7 +182,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
             # llama-2-7B & llama-3-8B
-            if
+            if const_parameter:
                 st_idx = 5
             else:
                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")

@@ -344,7 +192,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                 st_idx = 7
         else:
             # llama-3.2-3B & llama-3.2-1B
-            if
+            if const_parameter:
                 st_idx = 6
             else:
                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")

@@ -375,7 +223,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                              const_parameter, mode="decode",
                               keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -446,6 +294,10 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+
         fused_decoder = LowBitLlamaMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/minicpm.py

@@ -301,7 +301,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                          const_parameter, mode="decode",
                           keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -333,12 +333,12 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-
+        const_parameter = False
 
     single_decoder = LowBitMinicpmMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if
-        post_attn_layernorm_weights=[layer_norm_1] if
+        input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+        post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
        cached_cos=cached_cos,
        cached_sin=cached_sin,
        num_heads=num_heads,

@@ -364,7 +364,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
-        if
+        if const_parameter:
             st_idx = 5
         else:
             input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")

@@ -394,7 +394,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                const_parameter, mode="decode",
                                 keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -461,6 +461,10 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+
         fused_decoder = LowBitMinicpmMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/qwen.py

@@ -18,13 +18,14 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob,
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0,
+                                  convert_model=False, group_size=0, max_prompt_len=1,
                                   keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim

@@ -107,30 +108,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, keep_ir=keep_ir,
-                                                             compile_blob=compile_blob)
-        os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                       const_parameter, mode="decode",
                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-
-
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(

@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,

@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if const_parameter:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-
-
-
-
-
+            # transformers >= 4.45.0
+            if const_parameter:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")

@@ -234,7 +241,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                             const_parameter, mode="decode",
                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
         weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+            cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+        else:
+            # transformers >= 4.45.0
+            cached_cos = None
+            cached_sin = None
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
         layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

@@ -313,6 +325,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+            q_biases = None
+            k_biases = None
+            v_biases = None
+
         fused_decoder = LowBitQwenMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.
+Version: 2.2.0b20250208
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors

@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250208 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Provides-Extra: llama-index
 Requires-Dist: py-cpuinfo ; extra == 'llama-index'
 Requires-Dist: protobuf ; extra == 'llama-index'

@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250208 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'

@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'

@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'

@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250208 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'

@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'

@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'

@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'