ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +0 -1
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +7 -2
- ipex_llm/transformers/npu_pipeline_model/llama.py +6 -158
- ipex_llm/transformers/npu_pipeline_model/qwen.py +44 -32
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +44 -44
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll: CHANGED (binary file)
ipex_llm/libs/bloom.dll: CHANGED (binary file)
ipex_llm/libs/gptneox-api.dll: CHANGED (binary file)
ipex_llm/libs/gptneox.dll: CHANGED (binary file)
ipex_llm/libs/libbloom_avx.dll: CHANGED (binary file)
ipex_llm/libs/libbloom_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libgptneox_avx.dll: CHANGED (binary file)
ipex_llm/libs/libgptneox_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libllama_avx.dll: CHANGED (binary file)
ipex_llm/libs/libllama_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libstarcoder_avx.dll: CHANGED (binary file)
ipex_llm/libs/libstarcoder_vnni.dll: CHANGED (binary file)
ipex_llm/libs/llama-api.dll: CHANGED (binary file)
ipex_llm/libs/llama.dll: CHANGED (binary file)
ipex_llm/libs/main-bloom.exe: CHANGED (binary file)
ipex_llm/libs/main-gptneox.exe: CHANGED (binary file)
ipex_llm/libs/main-llama.exe: CHANGED (binary file)
ipex_llm/libs/main-starcoder.exe: CHANGED (binary file)
ipex_llm/libs/pipeline.dll: CHANGED (binary file)
ipex_llm/libs/quantize-bloom.exe: CHANGED (binary file)
ipex_llm/libs/quantize-bloom_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-gptneox.exe: CHANGED (binary file)
ipex_llm/libs/quantize-gptneox_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-llama.exe: CHANGED (binary file)
ipex_llm/libs/quantize-llama_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-starcoder.exe: CHANGED (binary file)
ipex_llm/libs/quantize-starcoder_vnni.exe: CHANGED (binary file)
ipex_llm/libs/starcoder-api.dll: CHANGED (binary file)
ipex_llm/libs/starcoder.dll: CHANGED (binary file)

ipex_llm/transformers/model.py
CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode

ipex_llm/transformers/npu_model.py
CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")

ipex_llm/transformers/npu_models/qwen2_mp.py
CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers
 
-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)
 
-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)
 
         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input
 
         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],

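The net effect of the qwen2_mp.py hunks above: when the Hugging Face rotary embedding no longer exposes cos_cached/sin_cached (transformers >= 4.45), the decoder graph stops baking cos/sin in as constants and instead takes them as runtime inputs of shape (batch, cos_len, head_dim), keeping position_ids as a graph input only for prefill. A minimal NumPy sketch of the cos/sin tensors a host-side caller would then compute and feed each step (a hypothetical helper, not the NNFactory code from the package):

import numpy as np

def rotary_cos_sin(inv_freq, position_ids, attention_scaling=1.0):
    # inv_freq: (head_dim // 2,) array taken from the HF rotary embedding
    # position_ids: (batch, seq_len) integer positions
    # returns cos, sin of shape (batch, seq_len, head_dim)
    freqs = position_ids[..., None].astype(np.float32) * inv_freq[None, None, :]
    emb = np.concatenate([freqs, freqs], axis=-1)
    return np.cos(emb) * attention_scaling, np.sin(emb) * attention_scaling

# decode step for one token at position 41 with head_dim = 128
inv_freq = 1.0 / (10000.0 ** (np.arange(0, 64, dtype=np.float32) / 64.0))
cos, sin = rotary_cos_sin(inv_freq, np.array([[41]]))
print(cos.shape, sin.shape)  # (1, 1, 128) (1, 1, 128), i.e. (batch, cos_len, head_dim)
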
ipex_llm/transformers/npu_pipeline_model/common.py
CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()
 
 
+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path

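One detail of the new Llama32Embedding graph worth calling out is the padding handling: instead of a padding-aware lookup, it zeroes the padding row of the weight table once and then does a plain gather. A small NumPy sketch (toy sizes, plain arrays rather than NNFactory ops) showing that this matches an embedding lookup that returns zeros for padding_idx:

import numpy as np

vocab_size, dim, padding_idx = 6, 4, 2
weight = np.random.rand(vocab_size, dim).astype(np.float16)

# what the graph builds: zero the padding row up front, then gather by token id
masked = weight.copy()
masked[padding_idx, :] = 0.0
token_ids = np.array([[2, 5, 0]])
graph_like = masked[token_ids]

# reference: ordinary lookup, then zero the positions that hit padding_idx
reference = weight[token_ids]
reference[token_ids == padding_idx] = 0.0

assert np.array_equal(graph_like, reference)
print(graph_like.shape)  # (1, 3, 4)
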
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers
 
 
 def generate(
@@ -456,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)
 
     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -476,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -493,8 +498,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                           group_size, layernorm_const, "prefill",
                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size,
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False

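Both new qwen2 flags come from the same probe: whether the first decoder layer's rotary_emb still carries a cos_cached buffer (transformers < 4.45) or not (>= 4.45, where cos/sin must be fed at runtime and the embedding_post blobs are exported). A sketch of that probe on stand-in objects (SimpleNamespace is used here only to make the snippet runnable; the real argument is the loaded model):

from types import SimpleNamespace

def uses_runtime_cos_sin(model) -> bool:
    # Mirrors the hasattr check above: True when the rotary embedding no longer
    # caches cos/sin, so the pipeline must feed them as runtime inputs.
    rotary = model.model.layers[0].self_attn.rotary_emb
    return not hasattr(rotary, "cos_cached")

old_style = SimpleNamespace(model=SimpleNamespace(layers=[SimpleNamespace(
    self_attn=SimpleNamespace(rotary_emb=SimpleNamespace(cos_cached="...")))]))
new_style = SimpleNamespace(model=SimpleNamespace(layers=[SimpleNamespace(
    self_attn=SimpleNamespace(rotary_emb=SimpleNamespace(inv_freq="...")))]))

print(uses_runtime_cos_sin(old_style))   # False -> cos/sin stay baked into the blobs
print(uses_runtime_cos_sin(new_style))   # True  -> cos_sin_input / embedding_post enabled
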
ipex_llm/transformers/npu_pipeline_model/llama.py
CHANGED
@@ -18,108 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
@@ -197,62 +97,10 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-        # llama-2-7B & llama-3-8B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = LLMEmbedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
 
     return first_blob_path, last_blob_path
 

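With the classes moved into common.py, llama.py only has to pass max_prompt_len through so that the shared helper can build the prefill variant of the post-embedding graph. The shape consequence of that input_len switch, per the Llama32PostEmbedding code above, is sketched below with NumPy stand-ins (not the NNFactory graph): decode keeps (1, input_len, head_dim), while prefill picks up an extra axis from the unsqueeze.

import numpy as np

def post_embedding_cos_shape(inv_freq_len, input_len):
    position_ids = np.arange(input_len, dtype=np.float32).reshape(1, 1, input_len)
    inv_freq = np.ones((1, inv_freq_len, 1), dtype=np.float32)
    freqs = np.transpose(inv_freq * position_ids, (0, 2, 1))  # (1, input_len, inv_freq_len)
    emb = np.concatenate([freqs, freqs], axis=2)              # (1, input_len, head_dim)
    cos = np.cos(emb)
    if input_len > 1:                                         # mirrors unsqueeze(cos, [1])
        cos = np.expand_dims(cos, 1)
    return cos.shape

print(post_embedding_cos_shape(64, 1))    # (1, 1, 128)       decode graph
print(post_embedding_cos_shape(64, 512))  # (1, 1, 512, 128)  prefill graph (max_prompt_len)
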
ipex_llm/transformers/npu_pipeline_model/qwen.py
CHANGED
@@ -18,13 +18,14 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0,
+                                  convert_model=False, group_size=0, max_prompt_len=1,
                                   keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -107,24 +108,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    embedding_layer = model.model.embed_tokens
-    new_embedding = LLMEmbedding(
-        vocab_size=model.config.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, keep_ir=keep_ir,
-                                                             compile_blob=compile_blob)
-        os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if layernorm_const:
-            st_idx = 3
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-            st_idx = 5
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

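The decode-mode index bookkeeping above is easy to misread in diff form; the added branch only shifts the starting slot of the q/k/v bias bins by one when cos/sin become runtime inputs. A hypothetical helper (not part of the package) that reproduces the st_idx values from the hunk:

def layer_input_start_index(has_cached_cos: bool, layernorm_const: bool) -> int:
    # inputs 0-2 are input_embed / attention_mask / position_id; without cached
    # cos/sin (transformers >= 4.45) the runtime cos/sin input shifts things by one,
    # and non-constant layernorm weights take two more slots before the q/k/v biases
    st_idx = 3 if has_cached_cos else 4
    if not layernorm_const:
        st_idx += 2
    return st_idx

for has_cached in (True, False):
    for const in (True, False):
        print(has_cached, const, layer_input_start_index(has_cached, const))
# (True, True) -> 3, (True, False) -> 5, (False, True) -> 4, (False, False) -> 6,
# matching the st_idx assignments in the hunk above
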
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250206
+Version: 2.2.0b20250207
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250207 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Provides-Extra: llama-index
 Requires-Dist: py-cpuinfo ; extra == 'llama-index'
 Requires-Dist: protobuf ; extra == 'llama-index'
@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250207 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250207 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'

{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD
CHANGED
@@ -41,35 +41,35 @@ ipex_llm/langchain/llms/transformerspipelinellm.py,sha256=vm522YPPwWxxAPVvQBtxRf
 ipex_llm/langchain/vllm/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/langchain/vllm/vllm.py,sha256=6dxc-ZISZQrJilEa_HA827l75Dv9rcHpY_G6FdJ8BVs,7793
 ipex_llm/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ipex_llm/libs/bloom-api.dll,sha256=
-ipex_llm/libs/bloom.dll,sha256=
-ipex_llm/libs/gptneox-api.dll,sha256=
-ipex_llm/libs/gptneox.dll,sha256=
-ipex_llm/libs/libbloom_avx.dll,sha256=
-ipex_llm/libs/libbloom_vnni.dll,sha256=
-ipex_llm/libs/libgptneox_avx.dll,sha256=
-ipex_llm/libs/libgptneox_vnni.dll,sha256=
-ipex_llm/libs/libllama_avx.dll,sha256=
-ipex_llm/libs/libllama_vnni.dll,sha256=
-ipex_llm/libs/libstarcoder_avx.dll,sha256=
-ipex_llm/libs/libstarcoder_vnni.dll,sha256=
-ipex_llm/libs/llama-api.dll,sha256=
-ipex_llm/libs/llama.dll,sha256=
-ipex_llm/libs/main-bloom.exe,sha256=
-ipex_llm/libs/main-gptneox.exe,sha256=
-ipex_llm/libs/main-llama.exe,sha256=
-ipex_llm/libs/main-starcoder.exe,sha256=
-ipex_llm/libs/pipeline.dll,sha256=
-ipex_llm/libs/quantize-bloom.exe,sha256=
-ipex_llm/libs/quantize-bloom_vnni.exe,sha256=
-ipex_llm/libs/quantize-gptneox.exe,sha256=
-ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=
-ipex_llm/libs/quantize-llama.exe,sha256=
-ipex_llm/libs/quantize-llama_vnni.exe,sha256=
-ipex_llm/libs/quantize-starcoder.exe,sha256=
-ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=
-ipex_llm/libs/starcoder-api.dll,sha256=
-ipex_llm/libs/starcoder.dll,sha256=
+ipex_llm/libs/bloom-api.dll,sha256=R0zcv1M0D8y8inrrCUO2xCSTRb0IChVyLa6YQo9zne8,36352
+ipex_llm/libs/bloom.dll,sha256=eBzUhLMeOAb9InMPp9_KC5VhJC9F-YKNlJn6HyfOAb0,507904
+ipex_llm/libs/gptneox-api.dll,sha256=9_mq8IntnMiU7-_kDxiLojnEc1nu3rrxZZAIes7Nd4k,24576
+ipex_llm/libs/gptneox.dll,sha256=kR3dyhN7tNUxVIWoqudW57V0MIGqr-Mxkmw7kwR8VWs,568320
+ipex_llm/libs/libbloom_avx.dll,sha256=0iRHd_QIzEG_NI0RkFKmCX_HG-3E21t33sxrmbCpQwo,536576
+ipex_llm/libs/libbloom_vnni.dll,sha256=dL1TzKzoki8KDsCmka6QfzBH24T06WokxT3F4M5a3lk,508416
+ipex_llm/libs/libgptneox_avx.dll,sha256=SPi9xXxB5jLp63CfgVhmMA-rCoyCCji2nuWz-rv5y3E,596992
+ipex_llm/libs/libgptneox_vnni.dll,sha256=NV3xykgHJGxNTDWAA_yhwlBG_dbHPX0__5s9uHCPmfc,568832
+ipex_llm/libs/libllama_avx.dll,sha256=EbZ-lpHHtM-zS9aiuDU8cBVueVAtRi3UqerARH41qC8,591360
+ipex_llm/libs/libllama_vnni.dll,sha256=67XqNSyXI1nuaA1-xcSOhYIHZaH7aZBvwMetGpTriIk,563200
+ipex_llm/libs/libstarcoder_avx.dll,sha256=kAqXHfoZfmyqIbNbGpzQjXNCMz9pkG5KVRECzEDEwhM,627712
+ipex_llm/libs/libstarcoder_vnni.dll,sha256=c02B9jpBvST282jRXJtkRwJKkZnzhkz5MLdFfjH9T8I,599552
+ipex_llm/libs/llama-api.dll,sha256=SA2frHXocsnAN9z3LZfWT_FjY1waSMS26bHM6ot_07c,25600
+ipex_llm/libs/llama.dll,sha256=Ls7CKimo2SNy-uJt6lLz16yz1O9E358dRgP8E0svF98,562688
+ipex_llm/libs/main-bloom.exe,sha256=-HCik31DRGrozp_Uy420O1l-Sk_7e9V1bjg4XaLPFvA,103424
+ipex_llm/libs/main-gptneox.exe,sha256=pqxQCGKBrsoDtvuKhCwk6uOAGt4GGvzoAdQbHB9qrFI,98816
+ipex_llm/libs/main-llama.exe,sha256=sPKj3WRmI97jyNhO4A5Lz4eF-tsZZojv6z2VaNzAKAU,99840
+ipex_llm/libs/main-starcoder.exe,sha256=7vyW8v2qO1J_fkRq4uzk44UsV4AhDGmcWHUwMiez8WY,157696
+ipex_llm/libs/pipeline.dll,sha256=vHFtLO6vUZQVwtzXICv1Q5Ork32Dw5Ipqa8pbr6TtmM,72704
+ipex_llm/libs/quantize-bloom.exe,sha256=8rUxXU7Z4AZ7mFHI3sGpwGG18_DkapunwTzzUTjCCbo,126464
+ipex_llm/libs/quantize-bloom_vnni.exe,sha256=gA9kKUkmFOIzT_CmFFvG-fG6d6bZuEWSTeyPvhCsDLs,128000
+ipex_llm/libs/quantize-gptneox.exe,sha256=YsrviyLjQU9uxD1p6TfdBAPXG72-QzZFGpt7lDmK_gM,104448
+ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=mYmUHza3rZztjTogXv9FxuIM20z0gHfyjbF6b6ADEK0,104960
+ipex_llm/libs/quantize-llama.exe,sha256=h-7nbo0uIswViTdxf_vHmE3sZdnQ79dDMUHzqjtyMKs,110080
+ipex_llm/libs/quantize-llama_vnni.exe,sha256=OEPzGySIaa-O9IhPY-u2slHnhMDzp6mL8e_Qr2WUgKc,110592
+ipex_llm/libs/quantize-starcoder.exe,sha256=4U-jT0MC4Iz4kP_6WpKkMOSk_hTlqAwgSVlGLGa-imA,127488
+ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=Xc4jW9KH_RNSfJIYJinDRIx-BbWmqxx4h-kc9jowZpk,128512
+ipex_llm/libs/starcoder-api.dll,sha256=2lF73SE1AyICwtpQSZUfkiAbE1WJQ5gEbikL1Lsvzhg,21504
+ipex_llm/libs/starcoder.dll,sha256=NBh51OQS90ppaqMAJAFCa6HptcUnnPx7tUL1J95QwMk,599040
 ipex_llm/llamaindex/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/llamaindex/llms/__init__.py,sha256=KP1lEdGqDuxPoxL1ZSH25Pm2kKMPJBWUTLR0ckSLMIU,1139
 ipex_llm/llamaindex/llms/bigdlllm.py,sha256=FQBzq1KOjfc6uofTXAha3O7TqpJkNfOFepXQmOVlbnI,26314
@@ -95,9 +95,9 @@ ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s
 ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
 ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
 ipex_llm/transformers/low_bit_linear.py,sha256=3EtbiCAq5HU_r2pGJ9beSDK4NPTN8Jj-aHMqm1jqX18,39177
-ipex_llm/transformers/model.py,sha256=
+ipex_llm/transformers/model.py,sha256=FyHrEQhkHxG3FbGkhTjVOP2rgFMjc3AXcjDwvvB0HqU,40798
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=
+ipex_llm/transformers/npu_model.py,sha256=zgXOiLIJ-3p-1Kejgv4jUFK8OiBZbezMZrRyn0_6_8c,40306
 ipex_llm/transformers/patches.py,sha256=G9KcXxo42H1HJEDaroq4JbBN5P0P0lty7U7kk7-g4tw,991
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=qV9Y6G5kAaet77LLA3oXn3qQY4ayyAPZ7NAjOlHCS7g,14967
@@ -202,17 +202,17 @@ ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
 ipex_llm/transformers/npu_models/qwen2.py,sha256=RDNtPK8kxMk3z8A4S53saTrw2klgkzo4oa7voJLwr1o,12085
-ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=
+ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=EKiI80rnQ43WUF_2wWCy75mx-rbjAbRQSB49OgjZFNo,45003
 ipex_llm/transformers/npu_models/stablelm.py,sha256=0iUhdjFqFd0svuTd09wP60mbEtobPkNSj-1I1vfuhsU,7778
 ipex_llm/transformers/npu_models/xlm_mp.py,sha256=sj8OVun8xJprM7ZJp0XzWa55rqlSIzNMdKmI9i6jlDU,28332
 ipex_llm/transformers/npu_pipeline_model/__init__.py,sha256=b2IXvVqQ5cItki021h8s3ymW12RPu8QNPprq4Mn3bDM,586
 ipex_llm/transformers/npu_pipeline_model/baichuan.py,sha256=ICxRzFQ4OIANDkkVi2_4xOeQXmfFXYMx3H52KuE1xR4,6208
-ipex_llm/transformers/npu_pipeline_model/common.py,sha256=
-ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=
-ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/common.py,sha256=faooJmM75qnVyZYuQLx9gJpVlotcVF4qXRCnOrknfk4,14776
+ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=_l4RFmyBMbREo8vzKpHXAMtE202JVQ41Y2lPg1qCOMI,29846
+ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=j2sipfFSrzV2VgLKPOClMHwWIDXqDsL1jIQJK25hneo,14397
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=H7j_UaHj-IwEBriQ-bunle0-8s2NmvqnL9eYuixnmFc,21398
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=6MNtCL1CXoR19B4tKZSgv2e5gtma9bqDG7DOYMCnPt0,16013
 ipex_llm/utils/__init__.py,sha256=LlUgrD03rfw4iY8zWPtHH6p65Gw76waVOLHaqagETw0,1425
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -248,11 +248,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.0b20250206.data/scripts/ipex-llm-init.bat,sha256=
-ipex_llm-2.2.0b20250206.data/scripts/llm-chat.ps1,sha256=
-ipex_llm-2.2.0b20250206.data/scripts/llm-cli.ps1,sha256=
-ipex_llm-2.2.0b20250206.dist-info/METADATA,sha256=
-ipex_llm-2.2.0b20250206.dist-info/WHEEL,sha256=
-ipex_llm-2.2.0b20250206.dist-info/entry_points.txt,sha256=
-ipex_llm-2.2.0b20250206.dist-info/top_level.txt,sha256=
-ipex_llm-2.2.0b20250206.dist-info/RECORD,,
+ipex_llm-2.2.0b20250207.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
+ipex_llm-2.2.0b20250207.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
+ipex_llm-2.2.0b20250207.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
+ipex_llm-2.2.0b20250207.dist-info/METADATA,sha256=d1hx5hE5Xeb3lHGWqeF35SK9GZOX6syXJ_Syu5b35IU,12369
+ipex_llm-2.2.0b20250207.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
+ipex_llm-2.2.0b20250207.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250207.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250207.dist-info/RECORD,,

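The sha256= values in RECORD use the standard wheel encoding (SHA-256 digest, URL-safe base64 with the trailing padding stripped), so the new entries can be re-checked against an unpacked wheel. A short sketch, assuming that standard encoding:

import base64
import hashlib

def record_hash(path):
    # sha256 digest, urlsafe base64 without '=' padding, as in the RECORD entries above
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g., after unpacking the 20250207 wheel (path illustrative):
# record_hash("ipex_llm/libs/bloom-api.dll")
# -> "sha256=R0zcv1M0D8y8inrrCUO2xCSTRb0IChVyLa6YQo9zne8"
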
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat: File without changes
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1: File without changes
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt: File without changes