PyPI - ipex-llm - Versions diffs - 2.2.0b20250101__py3-none-win_amd64.whl → 2.2.0b20250102__py3-none-win_amd64.whl - Mend

ipex-llm 2.2.0b20250101__py3-none-win_amd64.whl → 2.2.0b20250102__py3-none-win_amd64.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (49)

ipex_llm/libs/bloom-api.dll +0 -0
ipex_llm/libs/bloom.dll +0 -0
ipex_llm/libs/gptneox-api.dll +0 -0
ipex_llm/libs/gptneox.dll +0 -0
ipex_llm/libs/libbloom_avx.dll +0 -0
ipex_llm/libs/libbloom_vnni.dll +0 -0
ipex_llm/libs/libgptneox_avx.dll +0 -0
ipex_llm/libs/libgptneox_vnni.dll +0 -0
ipex_llm/libs/libllama_avx.dll +0 -0
ipex_llm/libs/libllama_vnni.dll +0 -0
ipex_llm/libs/libstarcoder_avx.dll +0 -0
ipex_llm/libs/libstarcoder_vnni.dll +0 -0
ipex_llm/libs/llama-api.dll +0 -0
ipex_llm/libs/llama.dll +0 -0
ipex_llm/libs/main-bloom.exe +0 -0
ipex_llm/libs/main-gptneox.exe +0 -0
ipex_llm/libs/main-llama.exe +0 -0
ipex_llm/libs/main-starcoder.exe +0 -0
ipex_llm/libs/pipeline.dll +0 -0
ipex_llm/libs/quantize-bloom.exe +0 -0
ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
ipex_llm/libs/quantize-gptneox.exe +0 -0
ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
ipex_llm/libs/quantize-llama.exe +0 -0
ipex_llm/libs/quantize-llama_vnni.exe +0 -0
ipex_llm/libs/quantize-starcoder.exe +0 -0
ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
ipex_llm/libs/starcoder-api.dll +0 -0
ipex_llm/libs/starcoder.dll +0 -0
ipex_llm/optimize.py +3 -1
ipex_llm/transformers/convert.py +3 -2
ipex_llm/transformers/low_bit_linear.py +8 -2
ipex_llm/transformers/model.py +3 -1
ipex_llm/transformers/models/baichuan.py +5 -11
ipex_llm/transformers/models/chatglm.py +2 -2
ipex_llm/transformers/models/qwen.py +34 -46
ipex_llm/transformers/models/qwen2.py +5 -19
ipex_llm/transformers/npu_model.py +3 -3
ipex_llm/transformers/npu_models/convert.py +40 -18
ipex_llm/transformers/npu_models/npu_llm_cpp.py +18 -9
ipex_llm/transformers/npu_pipeline_model/qwen.py +4 -0
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/METADATA +19 -19
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/RECORD +49 -49
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/ipex-llm-init.bat +0 -0
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-chat.ps1 +0 -0
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-cli.ps1 +0 -0
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/WHEEL +0 -0
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/entry_points.txt +0 -0
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/top_level.txt +0 -0

ipex_llm/libs/bloom-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/bloom.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama.dll CHANGED Viewed

Binary file

ipex_llm/libs/main-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/pipeline.dll CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/starcoder-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/starcoder.dll CHANGED Viewed

Binary file

ipex_llm/optimize.py CHANGED Viewed

@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                                  torch_dtype=torch_dtype,
                                  optimize_model=optimize_llm,
                                  modules_to_not_convert=modules_to_not_convert,
-                                 cpu_embedding=cpu_embedding)
+                                 cpu_embedding=cpu_embedding,
+                                 disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                                 False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()

ipex_llm/transformers/convert.py CHANGED Viewed

@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
     act_order = False

ipex_llm/transformers/low_bit_linear.py CHANGED Viewed

@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
             self.weight.data = self.weight.data.to(x.dtype)
         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False

ipex_llm/transformers/model.py CHANGED Viewed

@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
                                      torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                      imatrix_data=imatrix_data,
                                      embedding_qtype=embedding_qtype,
-                                     mixed_precision=mixed_precision)
+                                     mixed_precision=mixed_precision,
+                                     disable_optimize_pre=disable_optimize_pre)
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding

ipex_llm/transformers/models/baichuan.py CHANGED Viewed

@@ -29,7 +29,7 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp
     should_use_compresskv
 from ipex_llm.transformers.models.utils import update_past_key_value
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
-from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+from ipex_llm.transformers.models.utils import use_sdp
 from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
 from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36
@@ -301,16 +301,10 @@ def baichuan_attention_forward_7b(
     # IPEX-LLM OPT: sdp
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

ipex_llm/transformers/models/chatglm.py CHANGED Viewed

@@ -23,7 +23,7 @@ import torch.utils.checkpoint
 import torch.nn.functional as F
 from typing import Optional, Tuple
 from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
-from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+from ipex_llm.transformers.models.utils import use_sdp
 def rotate_half(x):
@@ -41,7 +41,7 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
 def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
-    if use_flash_attention(query, key, attention_mask) or query.device.type == 'cpu':
+    if query.device.type == 'cpu':
         context_layer = F.scaled_dot_product_attention(query.to(key.dtype),
                                                        key,
                                                        value,

ipex_llm/transformers/models/qwen.py CHANGED Viewed

@@ -33,7 +33,6 @@ from ipex_llm.transformers.models.utils import update_past_key_value, should_use
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache
 from ipex_llm.transformers.models.utils import rotate_half, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.utils.common import invalidInputError
 from transformers.modeling_outputs import BaseModelOutputWithPast
@@ -116,33 +115,28 @@ def qwen_attention_forward(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
-    # IPEX-LLM OPT: sdp
+    # IPEX-LLM OPT: sdpa
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = torch.tril(
+            torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
+        ).view(1, 1, kv_seq_len, kv_seq_len)
+        causal_mask = causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        if q_len > 1 and q_len != kv_seq_len:
-            causal_mask = torch.tril(
-                torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
-            ).view(1, 1, kv_seq_len, kv_seq_len)
-            causal_mask = causal_mask[
-                :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-            ]
-            attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                         device=query_states.device)
-            attention_mask.masked_fill_(causal_mask.logical_not(),
-                                        torch.finfo(attention_mask.dtype).min)
-            attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-        else:
-            attention_mask = None
+        attention_mask = None
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)
@@ -219,31 +213,25 @@ def qwen_attention_forward_registered(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
-    # IPEX-LLM OPT: sdp
+    # IPEX-LLM OPT: sdpa
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = registered_causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        if q_len > 1 and q_len != kv_seq_len:
-            causal_mask = registered_causal_mask[
-                :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-            ]
-            attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                         device=query_states.device)
-            attention_mask.masked_fill_(causal_mask.logical_not(),
-                                        torch.finfo(attention_mask.dtype).min)
-            attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-        else:
-            attention_mask = None
+        attention_mask = None
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)

ipex_llm/transformers/models/qwen2.py CHANGED Viewed

@@ -38,12 +38,10 @@
 #
 import os
-import math
 from typing import Optional, Tuple, Union, List
 import torch
 from torch.nn import CrossEntropyLoss
-from torch.nn.functional import scaled_dot_product_attention as sdpa
 from ipex_llm.transformers.models.common import merge_qkv_base
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
@@ -51,13 +49,12 @@ from ipex_llm.transformers.models.utils import SILU, mlp_fusion_check
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
     should_use_compresskv, is_enough_kv_cache_room_4_36
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
     DynamicCompressCache, DynamicCompressFp8Cache
 from ipex_llm.utils.common import invalidInputError
 from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
-from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb, repeat_kv
+from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.cache_utils import Cache
 from transformers import logging
@@ -580,21 +577,10 @@ def qwen2_attention_forward(
                                                              self.layer_idx, None)
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        if attention_mask is not None:
-            attention_mask = attention_mask[:, :, :, :kv_seq_len]
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        attn_output = sdpa(query_states.to(device, dtype=torch.float16),
-                           key_states.to(device, dtype=torch.float16),
-                           value_states.to(device, dtype=torch.float16),
-                           is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

ipex_llm/transformers/npu_model.py CHANGED Viewed

@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
         model.share_memory()
         if not pipeline:
-            if (not hasattr(model, 'llm') and
-                    model.config.model_type in ["qwen2", "llama", "minicpm"]):
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                 optimize_llm_single_process(
                     llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                     group_size=quantization_group_size,
                     qtype=qtype,
                     save_directory=save_directory,
-                    fuse_layers=fuse_layers
+                    fuse_layers=fuse_layers,
+                    has_llm=hasattr(model, "llm")
                 )
             else:
                 optimize_llm(

ipex_llm/transformers/npu_models/convert.py CHANGED Viewed

@@ -449,7 +449,8 @@ def optimize_llm_single_process(
     group_size: int,
     qtype: str,
     save_directory: str,
-    fuse_layers: int=None
+    fuse_layers: int=None,
+    has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        model.vocab_size = model.config.vocab_size
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False,
                           "False to InitLLMPipeline.")
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    import types
-    model.original_generate = model.generate
-    model.generate = types.MethodType(generate, model)
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
     return model
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else:  # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    model_inputs = {
-        "input_ids": input_ids
-    }
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
@@ -511,17 +519,31 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if isinstance(input_ids[0], torch.Tensor):
-        input_list = input_ids[0].flatten().tolist()
-    else:
-        input_list = input_ids[0]
-    input_length = len(input_list)
-    if input_length > 1:
-        logits = run_prefill_with_logits(self.model_ptr, input_list,
-                                         self.logits_buffer, self.vocab_size)
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
+            input_list = input_ids[0]
+        input_length = len(input_list)
+        if input_length > 1:
+            logits = run_prefill_with_logits(self.model_ptr, input_list,
+                                             self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
     else:
-        logits = run_decode_with_logits(self.model_ptr, input_list[0],
-                                        self.logits_buffer, self.vocab_size)
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
     return CausalLMOutputWithPast(
         loss=None,

ipex_llm/transformers/npu_models/npu_llm_cpp.py CHANGED Viewed

@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
 _lib.run_prefill_with_logits.restype = None
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     return new_token
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
-    input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
-    input_len = len(input_ids)
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
     return logits

ipex_llm/transformers/npu_pipeline_model/qwen.py CHANGED Viewed

@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
     if not isinstance(lm_head, SlicedLMHead):
         asym = lm_head.qtype == "asym_int4_rtn"
         if asym:

{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250101
+Version: 2.2.0b20250102
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250101 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250101 ; extra == 'cpp-arl'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -65,7 +65,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250101 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250102 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -85,9 +85,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250101 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250101 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250101 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -102,9 +102,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -119,9 +119,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -141,9 +141,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -163,9 +163,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'

{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ ipex_llm/__init__.py,sha256=kSA9JjVLPlpN4YchWtfOybRh4XiP6d_VTYvzbAouPSU,2118
 ipex_llm/convert_model.py,sha256=jopEe6wu88ZPZfNFhgnQUu7807iciiWW_EMyTsVni5A,6816
 ipex_llm/llm_patching.py,sha256=becMYcawtR8lgl2yeRQhvvX6CLaq09WZGm9dDmLJWL0,3232
 ipex_llm/models.py,sha256=XROP6GLLrGQDlogGXpXZENbV143YNi6j0VPJeOdQ3Cg,1063
-ipex_llm/optimize.py,sha256=4VYz8vgxSnrqBJhz__eB7hCJSwrkNx_t_wvTLxPlPyI,12253
+ipex_llm/optimize.py,sha256=ml-qEpzsrWGcd-Wia6IxPBC1PhqT5pi_lp3VTOH_ns0,12415
 ipex_llm/cli/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
 ipex_llm/cli/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
 ipex_llm/cli/prompts/chat-with-llm.txt,sha256=PpSyd4FQQd-T7ptfXL9jZp7dgstevu1fsxWFa0IQ5Oc,216
@@ -41,35 +41,35 @@ ipex_llm/langchain/llms/transformerspipelinellm.py,sha256=vm522YPPwWxxAPVvQBtxRf
 ipex_llm/langchain/vllm/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/langchain/vllm/vllm.py,sha256=6dxc-ZISZQrJilEa_HA827l75Dv9rcHpY_G6FdJ8BVs,7793
 ipex_llm/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ipex_llm/libs/bloom-api.dll,sha256=2IyCxPjsahzooryTFYEq-7R6nkrliaZm8h3-Y6n35C4,36352
-ipex_llm/libs/bloom.dll,sha256=E2AQed3DUyh7u36R8_vi0Om92aDzkO-MP_aK4vehb58,506880
-ipex_llm/libs/gptneox-api.dll,sha256=dCc4_7Xl-Isu_xlGe9iZQfbSabN2V8WOByWyDIZLp_4,24576
-ipex_llm/libs/gptneox.dll,sha256=q3M6r7NEOL_CRooJqq3AZ2icota9FwlWsr4nab6izd0,567296
-ipex_llm/libs/libbloom_avx.dll,sha256=3KbBuBjRFd1k9fSaXzkIbXNB4zPuyQVEZt-9bx_NJOc,535040
-ipex_llm/libs/libbloom_vnni.dll,sha256=x_rnRwlhmUs0d7E_mkn77VBFd-_MtiwfoWCb6TOyKCc,506880
-ipex_llm/libs/libgptneox_avx.dll,sha256=VG1twX6up-CqWVgH_bhYXXAg6MhL8oP8CZL6oiaM8pg,595456
-ipex_llm/libs/libgptneox_vnni.dll,sha256=r6Pdq6XjvouI0NispOQco-CM4xSewQQThOBRmijgwyQ,567808
-ipex_llm/libs/libllama_avx.dll,sha256=mUHLFVirXIR2viOeKkugcuHEBJpSLmsc3d6V9Y0zDz4,589824
-ipex_llm/libs/libllama_vnni.dll,sha256=FXWgzdFnPw4o_UAAktlEZNDev8CT7SCdwzs3zV8mlMY,561664
-ipex_llm/libs/libstarcoder_avx.dll,sha256=ErlXweTeHBexsQQLwoaiPspJjRYu7R22GTYLMidXQCw,626688
-ipex_llm/libs/libstarcoder_vnni.dll,sha256=l54B9SqZDjmLRQlbsZqXauYfGo58xEiOctAaidVqnJw,598528
-ipex_llm/libs/llama-api.dll,sha256=cw4PwQE9d5eYEYi-7vY0aG83a28pFPfzlprW_YXzBg8,25600
-ipex_llm/libs/llama.dll,sha256=AiSnrnpJRvZrB3HIMTlecXpCblR0o7_fnHMdKclvCsg,561152
-ipex_llm/libs/main-bloom.exe,sha256=HfhuIi1jJa1gdkROK3yt97k8Q5noiYdfwXXKeI1d4XE,103424
-ipex_llm/libs/main-gptneox.exe,sha256=RX9FcsWS2oB9EKKT3DXZwkPWwSP9TMjysVxpk4e_FLQ,98816
-ipex_llm/libs/main-llama.exe,sha256=QQRN1avtgAnmrNLfrmGWL5BtOk9Id90nL8Y04OJzmMo,99840
-ipex_llm/libs/main-starcoder.exe,sha256=xVhlJ1m-H5W4c9YrdveFXumyedg23m4g0xZYqhaEbn4,157696
-ipex_llm/libs/pipeline.dll,sha256=XO75bZ1_OgLYxdPVeW3BNcd5d26YcO6sxVq0HHCueVU,72704
-ipex_llm/libs/quantize-bloom.exe,sha256=lU6EmSLCnlYi8vRHsXbq8sEGR8q6dPpyKS4cBQXNLyA,126464
-ipex_llm/libs/quantize-bloom_vnni.exe,sha256=rK2NcJzLcqd_I8_D_oX-VHsw8VCcLNYrF937D2LLaDU,127488
-ipex_llm/libs/quantize-gptneox.exe,sha256=UMzUlM4x5WT-J58IMhwTy2Hu0ZsNzC5UjjwtVapHQAQ,104448
-ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=WIAcb-27zWDc9jgaODw5mHa4YK7MU5QgGPVbdYcftPA,104960
-ipex_llm/libs/quantize-llama.exe,sha256=H_M_xrapNgrXuWfvSvvO5jPlGG45hn9nuceFj-hnhwA,109568
-ipex_llm/libs/quantize-llama_vnni.exe,sha256=8x-57l7IUfre4gHT-RM63-tsRayGCFHSIpAfYapP27E,110592
-ipex_llm/libs/quantize-starcoder.exe,sha256=U0-IhAZl2jRHLuf90PUSrw4d21CTNnWdoFtoqfMNdq4,127488
-ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=zCC8EJs9KUmglJklqndbPNrGfo9IKzB5Pwqo7aBZSBU,128512
-ipex_llm/libs/starcoder-api.dll,sha256=aX-nqvXEzk6GxiDB4nC0Qy1KBftizAhV_h1u6YhVzrw,21504
-ipex_llm/libs/starcoder.dll,sha256=fSQg0niB930j4GxEpEAFFMPhPlxZ_DnbLUy4OrM08Q4,598016
+ipex_llm/libs/bloom-api.dll,sha256=D4QwdMzMWEpvWCt6Qhf3TNufORG3i1vGsvrxKX0UeG0,36352
+ipex_llm/libs/bloom.dll,sha256=HiUWN81LFRa4ylyTNLlGbIiufHTlHr60HSpuvlaNcoM,506880
+ipex_llm/libs/gptneox-api.dll,sha256=U9_GilTEOKSvrLXbDd3-iAUjjJUtP0Ud-KOsi-51Xjo,24576
+ipex_llm/libs/gptneox.dll,sha256=RL0yZh9g1sj98wA8Ekx7xr2S_PSad5Ll-uhdF05ev64,567296
+ipex_llm/libs/libbloom_avx.dll,sha256=2ouFAhOxkBYHMerLJHkefV1XbgRHDFqN-KBneiH9g-I,535040
+ipex_llm/libs/libbloom_vnni.dll,sha256=Ab9mZrSYvKE82yJ70VQDpl8qI_0GaIOOidEWLsJxbAo,506880
+ipex_llm/libs/libgptneox_avx.dll,sha256=3dV2YKFjy_u5tYZayi2URq7hpmDIqGafalP7WQEK-Tw,595456
+ipex_llm/libs/libgptneox_vnni.dll,sha256=-xfG7NBOcISipqhp8qOhtxwpCKdHtWxQai3FMoLQLhQ,567808
+ipex_llm/libs/libllama_avx.dll,sha256=gKtT8c3qklcvigz1POAbRtBwAdvsPFyXS7yZVipOTBM,589824
+ipex_llm/libs/libllama_vnni.dll,sha256=i9wyOe9xqShLO_DlYbo7cn1b47uMJ4pwPwVx88EyGzY,561664
+ipex_llm/libs/libstarcoder_avx.dll,sha256=RQNEbNTNCSv3qgDcKJCx87GB3eBMysL-UEPQdYj3WTE,626688
+ipex_llm/libs/libstarcoder_vnni.dll,sha256=7luUJRSUDj9AyEEemYTxJzs4bcx8MqM0sLukiZfnVxc,598528
+ipex_llm/libs/llama-api.dll,sha256=CwGyWmsT8XHnRiuZ4OzBIIlHwRlkiu_IaovirJqwbRY,25600
+ipex_llm/libs/llama.dll,sha256=LAyZioZRUs_CCv4vo5cqzUWpz1MQMc_c4n0pLaygFvA,561152
+ipex_llm/libs/main-bloom.exe,sha256=qMBUiZQZcr2RLc0U2cnnGUig9S3F70bqHRjwUsKV_Ns,103424
+ipex_llm/libs/main-gptneox.exe,sha256=wsi7_iK9sTA4PNBW5_y_Jpa1gqcE1eK97DFv_lB8RgY,98816
+ipex_llm/libs/main-llama.exe,sha256=BSBjl3RX7QzMq7cy5BVrXEzGcXiauSsKZ9VL_y8w-qI,99840
+ipex_llm/libs/main-starcoder.exe,sha256=qzk78keT_JNdVi7cdf-ycpX2QD6N0ZkO0Z9GDUqZgqc,157696
+ipex_llm/libs/pipeline.dll,sha256=C5E1tWcpdFJZVVjr7CfbcJDULeQom9hIjX6Zgb_rbso,72704
+ipex_llm/libs/quantize-bloom.exe,sha256=35E6hdsDi_njtgEcAaSE9WwelkHd6yT9IV8fWROx-ts,126464
+ipex_llm/libs/quantize-bloom_vnni.exe,sha256=_ECFuoYcxXPDob77sTS6vZvjufPg2su3BhcUZtP__AU,127488
+ipex_llm/libs/quantize-gptneox.exe,sha256=dJsoz68WhbSOF0I5LD3D5AIypsAPJC-aJAqIG-XHjrA,104448
+ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=4rXbo9XEh8Ka_bYyXCgIW68qLSHGkYwkrwFhiNkLeXM,104960
+ipex_llm/libs/quantize-llama.exe,sha256=s1jEeBhAy4DBAUPQ7DpZuqavX5Thy1yff-U9ENcYZPA,109568
+ipex_llm/libs/quantize-llama_vnni.exe,sha256=a4C5W-pxItLAaiVYGrHXkHsEH_nB0sVmnd5fyQv19ec,110592
+ipex_llm/libs/quantize-starcoder.exe,sha256=Uhz8MG9ElW__KmkjR89QgHhqJvESste0iISLw0pvgXw,127488
+ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=lv3V22NpQs_H_6HgORxigYv47OzEECECrz4IzYmxcjs,128512
+ipex_llm/libs/starcoder-api.dll,sha256=-6pbKFCVrDXJ5N-KPAIN4tvmRlphbKxUOoYMR_1iFYc,21504
+ipex_llm/libs/starcoder.dll,sha256=e40-bxKji-RYl12eW2VwJB-m70A6bJQyR75TsBZGmT8,598016
 ipex_llm/llamaindex/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/llamaindex/llms/__init__.py,sha256=KP1lEdGqDuxPoxL1ZSH25Pm2kKMPJBWUTLR0ckSLMIU,1139
 ipex_llm/llamaindex/llms/bigdlllm.py,sha256=FQBzq1KOjfc6uofTXAha3O7TqpJkNfOFepXQmOVlbnI,26314
@@ -87,17 +87,17 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
-ipex_llm/transformers/convert.py,sha256=B4oI836JHEqg_qT3dcl2RaJdQs7rOyigMwj-racxhkc,106379
+ipex_llm/transformers/convert.py,sha256=V4KDyi-2FVWSYZAxe4PlAxGGZbauSbOCuqq56ME9yyQ,106461
 ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
 ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s,3289
 ipex_llm/transformers/loader.py,sha256=cOgX93xOC-4dt01GTJ5wyd7PjZ8S43r4mctkR2YxVuw,6893
 ipex_llm/transformers/lookup.py,sha256=c4ETIha6ZLbWvhcclSKRDdi5Ipuet4mfUnOkBa0E8kk,19607
-ipex_llm/transformers/low_bit_linear.py,sha256=TJfEqNp6zB6YnNEUASga302WQXzNdrmU_miGCM0u-F8,41504
-ipex_llm/transformers/model.py,sha256=N-g9IQVvBiBhbL5Fo5DTWbmHPZY52sjfFuq0B8Qu6h4,40952
+ipex_llm/transformers/low_bit_linear.py,sha256=dyyYyCqw0GK8hzaUGanrg-uIhU1HTLEEbvbxXMlm-80,41668
+ipex_llm/transformers/model.py,sha256=KcRjkauGg48BYrUBoUZaVMpg7Piuz5JrfIpVZd3EIjs,41105
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=wPFEB4W1rYbpO_XqepREMef69dzo-zkFoqFRb_mqneA,37862
+ipex_llm/transformers/npu_model.py,sha256=a1mkyc6EqD7AJhqbYzokGhFubNpt5trIMuZT_dQKlTk,37861
 ipex_llm/transformers/patches.py,sha256=halPWm__ORh2fRFSIFPiCNg3LQBfrRkTPtmtRpBJCZQ,1286
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=jtPGsvWFjbTUGzDBCdfftnCis_0nJQNRpACSwXUbbGU,14943
@@ -136,10 +136,10 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
 ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
 ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
 ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
-ipex_llm/transformers/models/baichuan.py,sha256=0dkTSPqGPgSnwa8zSNKroam0pvSyQLSRpbb43-OgIlc,19815
+ipex_llm/transformers/models/baichuan.py,sha256=oJCAEENSG8oQhJ-QPN2SiapARjAGdOM6nEbyCcYOMCo,19334
 ipex_llm/transformers/models/bert.py,sha256=bJNic2pt1kph0kBwdK5MRGyWupFfx2Ts0V3D1L-5kWo,6085
 ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
-ipex_llm/transformers/models/chatglm.py,sha256=xCEhYzaXyTDBXqz111Uw4IW5x4TLbtBbYfmBT623gRI,12669
+ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
 ipex_llm/transformers/models/chatglm2.py,sha256=kfJThuKYb3unAB1XCzfop1iDW1gOkyFOjSr-lEjUdS0,24781
 ipex_llm/transformers/models/chatglm4.py,sha256=AAhAFFDDas5DBQPfh2Mwl7a2v7taKf6xphoeeNNFaBI,16593
 ipex_llm/transformers/models/chatglm4v.py,sha256=YRfuf9g1E0MQ_7wbHAOMvadFnO-j3LqI_k1SaRkDs0M,14055
@@ -167,8 +167,8 @@ ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifs
 ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
 ipex_llm/transformers/models/phi3.py,sha256=jkiadJ85ToHpymY5GOM6orWlnx6LKN8_-v1MUcfGWPg,15159
 ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
-ipex_llm/transformers/models/qwen.py,sha256=iP4wcjdIZ0CvqbM8muM96y-rghpEnZSR3TgjiOBIq5k,20475
-ipex_llm/transformers/models/qwen2.py,sha256=k_FcPfPAXIotRE03ULFBYA5hWE0M5CfktAhCWbEy8Yw,26369
+ipex_llm/transformers/models/qwen.py,sha256=XIJ_bLzediBURWU-OOS3H6WBIGXQue6jDdUHJsAabwY,19391
+ipex_llm/transformers/models/qwen2.py,sha256=b49HO4GSudwGJ3n6uHVno1oo3DgRt3jOjtQnLOB3cdY,25530
 ipex_llm/transformers/models/qwen2_moe.py,sha256=EA_OYxYAEgrvi7VpDW192AJXG9Fwe2aBtOAZPkOAJk4,19350
 ipex_llm/transformers/models/qwen2_vl.py,sha256=jIm4yZSd751BkRqgj3wR1QBkDIh-TMCLAMM8SZ8n6Qo,13419
 ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
@@ -185,7 +185,7 @@ ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLr
 ipex_llm/transformers/npu_models/chatglm.py,sha256=YzpGLZ7ORt6qkwW9mCwZ_xhOAI8uHSDHJrmqWgNM234,10511
 ipex_llm/transformers/npu_models/chatglm4.py,sha256=J4523DzhIzZxIvlf1V9qU4auzEGKvC80YqyxuCJygjw,9795
 ipex_llm/transformers/npu_models/common.py,sha256=tTUJL7IxVrJSnXle6nla35wTUrBf2sOEt7Ya1qyMezY,4853
-ipex_llm/transformers/npu_models/convert.py,sha256=cX10r7Q0a2qFcEIhRcvmw1eSmUtmeOcoNn8kVqv3Su8,24224
+ipex_llm/transformers/npu_models/convert.py,sha256=FILSGnoltcR9FMrCkw0eOKh6p3sbBI5i0Ms8AsJc04E,25342
 ipex_llm/transformers/npu_models/convert_mp.py,sha256=t7160V4MmYpnex2NfuLTcqoc1meGEXdYi4AAPotfbzk,24518
 ipex_llm/transformers/npu_models/glm_edge.py,sha256=VsJex-6530h4ZQk35TxRe1MnttAHT41omE8LV47LgBE,6723
 ipex_llm/transformers/npu_models/kv.py,sha256=2OSFO9Z6e4nGdVxXEM-Bq2qa_npYYbGmQt3lcCZxTlU,9201
@@ -198,7 +198,7 @@ ipex_llm/transformers/npu_models/minicpm_mp.py,sha256=0iCRWN9UIUQp5tSKyu-orpGCOx
 ipex_llm/transformers/npu_models/minicpmv_mp.py,sha256=m11WT6s_H5wkFtlz7aHMOL9b_CoL_G5MhoL5te4la_Q,20147
 ipex_llm/transformers/npu_models/mistral.py,sha256=iRdmIQI_bbbZxRCYRvnV4rWjX2t-6vkHNl1ICAsLoy4,10759
 ipex_llm/transformers/npu_models/mp_models_base.py,sha256=rY-5tq8DfxRsiaIITl0PQOTiPLJnUm_5L-oWzbK12N8,28429
-ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=SL1p5UBjheufhum-GktNQQ4iXjRlZ9Hgatzb3NFr6Bs,3900
+ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=B40sBujvy31ETFBgcYAf4CN23UuTCBEJVaxjIMaoEHk,4268
 ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ3P5p4lNp7p6wMV7KrtCU,37871
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
@@ -213,7 +213,7 @@ ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=953Gua2tFKLI
 ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=MnvHRytLt3oy5jIPUBe8AeEJ6PtPWLbhQ5a9WqjZ1TQ,19905
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=MDMesYlVbECKdK0xxkt1LwHgpkJOO7ZwBExYAwMGQa0,20637
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=iVUNrcRLXE5eQGZIetgkLTINo8RW15RHM7SUetvJNRQ,14748
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=3paMXr1viuztybhmVLqQ9XvM3EZbxncDuNSNwLF8OI0,14849
 ipex_llm/utils/__init__.py,sha256=NdB_InYE65dNgW4ruEPUOlgKEO2ELcsJoqkP7O5kpog,1391
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -244,11 +244,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.0b20250101.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
-ipex_llm-2.2.0b20250101.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
-ipex_llm-2.2.0b20250101.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
-ipex_llm-2.2.0b20250101.dist-info/METADATA,sha256=J_SK__bQX0TB_vOUTfpk9DV4DhTzVc-LzVzN_LmERV8,11374
-ipex_llm-2.2.0b20250101.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
-ipex_llm-2.2.0b20250101.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
-ipex_llm-2.2.0b20250101.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
-ipex_llm-2.2.0b20250101.dist-info/RECORD,,
+ipex_llm-2.2.0b20250102.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
+ipex_llm-2.2.0b20250102.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
+ipex_llm-2.2.0b20250102.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
+ipex_llm-2.2.0b20250102.dist-info/METADATA,sha256=fF_EkmZQW5wODRZlaJEQgQnS6Xieiem4h1vZcvsRxRE,11374
+ipex_llm-2.2.0b20250102.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
+ipex_llm-2.2.0b20250102.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250102.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250102.dist-info/RECORD,,

{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/ipex-llm-init.bat RENAMED Viewed

File without changes

{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-chat.ps1 RENAMED Viewed

File without changes

{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-cli.ps1 RENAMED Viewed

File without changes

{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/WHEEL RENAMED Viewed

File without changes

{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/top_level.txt RENAMED Viewed

File without changes