ipex-llm 2.2.0b20250101__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250103__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ipex_llm/optimize.py CHANGED
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
  torch_dtype=torch_dtype,
  optimize_model=optimize_llm,
  modules_to_not_convert=modules_to_not_convert,
- cpu_embedding=cpu_embedding)
+ cpu_embedding=cpu_embedding,
+ disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+ False))
  # add save_low_bit to pretrained model dynamically
  import types
  model._bigdl_config = dict()
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
  torch_dtype="auto",
  imatrix_data=None,
  embedding_qtype=None,
- mixed_precision=False):
+ mixed_precision=False,
+ disable_optimize_pre=False):
  if qtype in ggml_tensor_qtype.values():
  index = list(ggml_tensor_qtype.values()).index(qtype)
  logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
  model = _optimize_ipex(model, qtype)
  return model

- if optimize_model:
+ if optimize_model and not disable_optimize_pre:
  model = _optimize_pre(model, qtype)

  act_order = False
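The new disable_optimize_pre flag is popped from **kwargs in optimize_model (first hunk) and forwarded into ggml_convert_low_bit, where it skips the _optimize_pre graph rewrites while leaving low-bit conversion itself untouched. A minimal usage sketch, assuming the public ipex_llm.optimize_model entry point and default settings otherwise:

    from ipex_llm import optimize_model

    # model: any PyTorch LLM already loaded in fp16/fp32
    # hypothetical call: quantize to sym_int4 but skip the _optimize_pre rewrites
    model = optimize_model(model, low_bit="sym_int4", disable_optimize_pre=True)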
@@ -1983,16 +1984,9 @@ def _optimize_post(model):
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
  from ipex_llm.transformers.models.yuan import yuan_attention_forward
- # from ipex_llm.transformers.models.yuan import yuan_mlp_forward
- convert_forward(model,
- module.YuanAttention,
- yuan_attention_forward
- )
- # disable able mlp_forward for quantize_kv on mtl.
- # convert_forward(model,
- # module.YuanMLP,
- # yuan_mlp_forward
- # )
+ convert_forward(model, module.YuanAttention, yuan_attention_forward)
+ # from ipex_llm.transformers.models.common import mlp_silu_forward
+ # convert_forward(model, module.YuanMLP, mlp_silu_forward)
  elif model.config.model_type == 'bert' and (
  not model.config.is_decoder and
  model.config.position_embedding_type == "absolute"
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
  # weigh_type = 3 means weight has been transposed by esimd method
  self.weight_type = 1
  self.optimize_lm_head = optimize_lm_head
+ self.disable_fp16_opt = False

  def forward(self, x: torch.Tensor):
  # only work for GPU
@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
  self.weight.data = self.weight.data.to(x.dtype)

  if not self.use_esimd_kernel(x):
- if get_ipex_version() < "2.1.10+xpu" \
- or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+ if (
+ get_ipex_version() < "2.1.10+xpu"
+ or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+ or self.disable_fp16_opt
+ ):
  if self.weight_type == 2:
  self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
  requires_grad=False)
@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):

  def use_esimd_kernel(self, x):
  gpu_type = get_xpu_device_type(x)
+ if self.disable_fp16_opt:
+ return False
  # esimd kernel can only be used for Arc and Flex
  if gpu_type not in ["arc", "flex"]:
  return False
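The new disable_fp16_opt flag lets a caller force FP16Linear back onto the plain PyTorch linear path, bypassing both the ESIMD kernel and the transposed-weight fast path. This diff only adds the attribute (default False); where it gets flipped is not shown here, so the following is a hypothetical sketch (the FP16Linear import path is an assumption):

    import torch
    from ipex_llm.transformers.low_bit_linear import FP16Linear  # assumed module path

    def disable_fp16_fast_paths(model: torch.nn.Module) -> None:
        # hypothetical helper: flip the new flag on every FP16Linear so that
        # forward() and use_esimd_kernel() take the non-optimized branch
        for module in model.modules():
            if isinstance(module, FP16Linear):
                module.disable_fp16_opt = True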
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
  mixed_precision = kwargs.pop("mixed_precision", False)
  if embedding_qtype is not None:
  embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+ disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
  _args = copy.deepcopy(args)
  _kwargs = copy.deepcopy(kwargs)
  awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
  torch_dtype=kwargs.get("torch_dtype", 'auto'),
  imatrix_data=imatrix_data,
  embedding_qtype=embedding_qtype,
- mixed_precision=mixed_precision)
+ mixed_precision=mixed_precision,
+ disable_optimize_pre=disable_optimize_pre)

  if disk_embedding:
  from ipex_llm.transformers.embedding import DiskEmbedding
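The same flag is also accepted by the Hugging Face-style loader, since from_pretrained pops it from kwargs and forwards it to ggml_convert_low_bit as shown above. A minimal sketch, assuming the usual ipex_llm.transformers auto class and an illustrative model id:

    from ipex_llm.transformers import AutoModelForCausalLM

    # load in 4-bit but skip the pre-conversion model rewrites
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-7B-Instruct",   # illustrative model id
        load_in_4bit=True,
        trust_remote_code=True,
        disable_optimize_pre=True,
    )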
@@ -29,7 +29,7 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp
  should_use_compresskv
  from ipex_llm.transformers.models.utils import update_past_key_value
  from ipex_llm.transformers.models.utils import should_use_fuse_rope
- from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+ from ipex_llm.transformers.models.utils import use_sdp
  from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU
  from ipex_llm.transformers.models.utils import mlp_fusion_check
  from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36
@@ -301,16 +301,10 @@ def baichuan_attention_forward_7b(

  # IPEX-LLM OPT: sdp
  attn_weights = None
- if use_flash_attention(query_states, key_states, attention_mask):
- attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
- key_states.to(dtype=torch.float16),
- value_states.to(dtype=torch.float16),
- is_causal=True).to(hidden_states.dtype)
- else:
- attn_output = scaled_dot_product_attention(
- query_states, key_states, value_states,
- attention_mask, q_len == kv_seq_len
- )
+ attn_output = scaled_dot_product_attention(
+ query_states, key_states, value_states,
+ attention_mask, q_len == kv_seq_len
+ )

  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
@@ -23,7 +23,7 @@ import torch.utils.checkpoint
  import torch.nn.functional as F
  from typing import Optional, Tuple
  from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
- from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+ from ipex_llm.transformers.models.utils import use_sdp


  def rotate_half(x):
@@ -41,7 +41,7 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):


  def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
- if use_flash_attention(query, key, attention_mask) or query.device.type == 'cpu':
+ if query.device.type == 'cpu':
  context_layer = F.scaled_dot_product_attention(query.to(key.dtype),
  key,
  value,
@@ -33,7 +33,6 @@ from ipex_llm.transformers.models.utils import update_past_key_value, should_use
  from ipex_llm.transformers.models.utils import use_quantize_kv_cache
  from ipex_llm.transformers.models.utils import rotate_half, SILU
  from ipex_llm.transformers.models.utils import mlp_fusion_check
- from ipex_llm.transformers.models.utils import use_flash_attention
  from ipex_llm.utils.common import invalidInputError
  from transformers.modeling_outputs import BaseModelOutputWithPast

@@ -116,33 +115,28 @@ def qwen_attention_forward(
  past_key_value = (key_states.transpose(1, 2),
  value_states.transpose(1, 2)) if use_cache else None

- # IPEX-LLM OPT: sdp
+ # IPEX-LLM OPT: sdpa
  attn_weights = None
- if use_flash_attention(query_states, key_states, attention_mask):
- attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
- key_states.to(dtype=torch.float16),
- value_states.to(dtype=torch.float16),
- is_causal=True).to(hidden_states.dtype)
+
+ if q_len > 1 and q_len != kv_seq_len:
+ causal_mask = torch.tril(
+ torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
+ ).view(1, 1, kv_seq_len, kv_seq_len)
+ causal_mask = causal_mask[
+ :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+ ]
+ attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+ device=query_states.device)
+ attention_mask.masked_fill_(causal_mask.logical_not(),
+ torch.finfo(attention_mask.dtype).min)
+ attention_mask = attention_mask.expand([bsz, -1, -1, -1])
  else:
- if q_len > 1 and q_len != kv_seq_len:
- causal_mask = torch.tril(
- torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
- ).view(1, 1, kv_seq_len, kv_seq_len)
- causal_mask = causal_mask[
- :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
- ]
- attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
- device=query_states.device)
- attention_mask.masked_fill_(causal_mask.logical_not(),
- torch.finfo(attention_mask.dtype).min)
- attention_mask = attention_mask.expand([bsz, -1, -1, -1])
- else:
- attention_mask = None
+ attention_mask = None

- attn_output = scaled_dot_product_attention(
- query_states, key_states, value_states,
- attention_mask, q_len == kv_seq_len
- )
+ attn_output = scaled_dot_product_attention(
+ query_states, key_states, value_states,
+ attention_mask, q_len == kv_seq_len
+ )

  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.view(bsz, q_len, self.hidden_size)
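For reference, the branch added above builds a standard additive causal mask restricted to the last q_len query rows, then hands it to the common scaled_dot_product_attention helper. A standalone illustration of the same construction (toy shapes, independent of the Qwen code):

    import torch

    bsz, q_len, kv_seq_len = 1, 2, 5
    causal = torch.tril(torch.ones(kv_seq_len, kv_seq_len, dtype=torch.bool))
    causal = causal.view(1, 1, kv_seq_len, kv_seq_len)[:, :, kv_seq_len - q_len:, :]
    mask = torch.zeros(causal.shape)
    mask.masked_fill_(causal.logical_not(), torch.finfo(mask.dtype).min)
    mask = mask.expand(bsz, -1, -1, -1)
    # mask has shape (1, 1, 2, 5); key positions after each query's own
    # position hold the dtype's minimum value, everything else is 0.0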
@@ -219,31 +213,25 @@ def qwen_attention_forward_registered(
  past_key_value = (key_states.transpose(1, 2),
  value_states.transpose(1, 2)) if use_cache else None

- # IPEX-LLM OPT: sdp
+ # IPEX-LLM OPT: sdpa
  attn_weights = None

- if use_flash_attention(query_states, key_states, attention_mask):
- attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
- key_states.to(dtype=torch.float16),
- value_states.to(dtype=torch.float16),
- is_causal=True).to(hidden_states.dtype)
+ if q_len > 1 and q_len != kv_seq_len:
+ causal_mask = registered_causal_mask[
+ :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+ ]
+ attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+ device=query_states.device)
+ attention_mask.masked_fill_(causal_mask.logical_not(),
+ torch.finfo(attention_mask.dtype).min)
+ attention_mask = attention_mask.expand([bsz, -1, -1, -1])
  else:
- if q_len > 1 and q_len != kv_seq_len:
- causal_mask = registered_causal_mask[
- :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
- ]
- attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
- device=query_states.device)
- attention_mask.masked_fill_(causal_mask.logical_not(),
- torch.finfo(attention_mask.dtype).min)
- attention_mask = attention_mask.expand([bsz, -1, -1, -1])
- else:
- attention_mask = None
+ attention_mask = None

- attn_output = scaled_dot_product_attention(
- query_states, key_states, value_states,
- attention_mask, q_len == kv_seq_len
- )
+ attn_output = scaled_dot_product_attention(
+ query_states, key_states, value_states,
+ attention_mask, q_len == kv_seq_len
+ )

  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.view(bsz, q_len, self.hidden_size)
@@ -38,12 +38,10 @@
  #

  import os
- import math
  from typing import Optional, Tuple, Union, List

  import torch
  from torch.nn import CrossEntropyLoss
- from torch.nn.functional import scaled_dot_product_attention as sdpa

  from ipex_llm.transformers.models.common import merge_qkv_base
  from ipex_llm.transformers.models.common import scaled_dot_product_attention
@@ -51,13 +49,12 @@ from ipex_llm.transformers.models.utils import SILU, mlp_fusion_check
  from ipex_llm.transformers.models.utils import should_use_fuse_rope
  from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
  should_use_compresskv, is_enough_kv_cache_room_4_36
- from ipex_llm.transformers.models.utils import use_flash_attention
  from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
  DynamicCompressCache, DynamicCompressFp8Cache
  from ipex_llm.utils.common import invalidInputError

  from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
- from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb, repeat_kv
+ from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
  from transformers.cache_utils import Cache
  from transformers import logging
@@ -580,21 +577,10 @@ def qwen2_attention_forward(
  self.layer_idx, None)

  attn_weights = None
- if use_flash_attention(query_states, key_states, attention_mask):
- if attention_mask is not None:
- attention_mask = attention_mask[:, :, :, :kv_seq_len]
- # repeat k/v heads if n_kv_heads < n_heads
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
- attn_output = sdpa(query_states.to(device, dtype=torch.float16),
- key_states.to(device, dtype=torch.float16),
- value_states.to(device, dtype=torch.float16),
- is_causal=True).to(hidden_states.dtype)
- else:
- attn_output = scaled_dot_product_attention(
- query_states, key_states, value_states,
- attention_mask, q_len == kv_seq_len
- )
+ attn_output = scaled_dot_product_attention(
+ query_states, key_states, value_states,
+ attention_mask, q_len == kv_seq_len
+ )

  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
@@ -20,17 +20,15 @@
  # https://huggingface.co/IEITYuan/Yuan2-2B-hf/blob/7ab7b3c18eb8e5232ce2a3f720d4e6f4b53a2806/README.md#%E5%A3%B0%E6%98%8E%E4%B8%8E%E5%8D%8F%E8%AE%AEterms-and-conditions
  #

- import math
  from typing import Optional, Tuple

  import torch

  from ipex_llm.utils.common import invalidInputError
  from ipex_llm.transformers.models.common import scaled_dot_product_attention
- from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \
- mlp_fusion_check, fp16_fusion_check
+ from ipex_llm.transformers.models.utils import apply_rotary_pos_emb
  from ipex_llm.transformers.models.utils import use_quantize_kv_cache
- from ipex_llm.transformers.models.utils import SILU, update_past_key_value
+ from ipex_llm.transformers.models.utils import update_past_key_value
  from ipex_llm.transformers.models.utils import should_use_fuse_rope


@@ -98,52 +96,6 @@ def yuan_localized_filtering_forward(
  return lf_output


- def yuan_mlp_forward(
- self,
- x: torch.Tensor,
- residual=None
- ) -> torch.Tensor:
- x_2d = x.view(-1, x.shape[-1])
- bsz, hidden_size = x_2d.shape
- qtype = getattr(self.up_proj, "qtype", None)
- if mlp_fusion_check(x_2d, qtype, self.training):
- import xe_linear
- if not x_2d.is_contiguous():
- x_2d = x_2d.contiguous()
- out = self.down_proj(xe_linear.mlp_forward_xpu(
- x_2d, self.up_proj.weight.data, self.gate_proj.weight.data,
- x_2d.shape[0], x_2d.shape[1], self.up_proj.out_len,
- SILU, qtype
- ))
- if residual is not None:
- return out + residual
- else:
- return out
- elif fp16_fusion_check(self.up_proj, x, self.training) and \
- hidden_size == 4096 and bsz == 1:
- hidden_states1 = torch.ops.torch_ipex.mm_silu(x, self.up_proj.weight)
- hidden_states = torch.ops.torch_ipex.mm_resmul(
- x, self.gate_proj.weight, hidden_states1
- )
- if residual is None:
- hidden_states = torch.matmul(hidden_states, self.down_proj.weight)
- else:
- attn_output = torch.addmm(
- residual.flatten(0, -2),
- hidden_states.flatten(0, -2),
- self.down_proj.weight,
- beta=1,
- )
- hidden_states = attn_output.view(x.shape)
- return hidden_states
- else:
- out = self.down_proj(self.act_fn(self.up_proj(x)) * self.gate_proj(x))
- if residual is not None:
- return out + residual
- else:
- return out
-
-
  def yuan_attention_forward(
  self,
  hidden_states: torch.Tensor,
@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
  model.share_memory()

  if not pipeline:
- if (not hasattr(model, 'llm') and
- model.config.model_type in ["qwen2", "llama", "minicpm"]):
+ if model.config.model_type in ["qwen2", "llama", "minicpm"]:
  from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
  optimize_llm_single_process(
  llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
  group_size=quantization_group_size,
  qtype=qtype,
  save_directory=save_directory,
- fuse_layers=fuse_layers
+ fuse_layers=fuse_layers,
+ has_llm=hasattr(model, "llm")
  )
  else:
  optimize_llm(
@@ -449,7 +449,8 @@ def optimize_llm_single_process(
  group_size: int,
  qtype: str,
  save_directory: str,
- fuse_layers: int=None
+ fuse_layers: int=None,
+ has_llm: bool=False
  ):
  from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
  from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
  model.kv_len = kv_len
  model.model_ptr = model_ptr
  model.save_directory = save_directory
- model.vocab_size = model.config.vocab_size
+ if model.config.vocab_size == 151666:
+ # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+ model.vocab_size = 152064
+ else:
+ model.vocab_size = model.config.vocab_size
  model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+ model.max_prompt_len = max_prompt_len
  except:
  invalidInputError(False,
  "False to InitLLMPipeline.")
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
  general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
  general_convert(model, PreTrainedModel, causal_lm_forward)
  # patch generate function
- import types
- model.original_generate = model.generate
- model.generate = types.MethodType(generate, model)
+ if not has_llm:
+ import types
+ model.original_generate = model.generate
+ model.generate = types.MethodType(generate, model)
  return model


@@ -491,9 +498,10 @@ def prepare_input_ids(
  else: # prefill, reset the model here
  from .npu_llm_cpp import reset
  reset(self.model_ptr)
- model_inputs = {
- "input_ids": input_ids
- }
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
  return model_inputs


@@ -511,17 +519,31 @@ def causal_lm_forward(
  return_dict: Optional[bool] = None,
  ) -> Union[Tuple, CausalLMOutputWithPast]:
  from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
- if isinstance(input_ids[0], torch.Tensor):
- input_list = input_ids[0].flatten().tolist()
- else:
- input_list = input_ids[0]
- input_length = len(input_list)
- if input_length > 1:
- logits = run_prefill_with_logits(self.model_ptr, input_list,
- self.logits_buffer, self.vocab_size)
+ if input_ids is not None:
+ if isinstance(input_ids[0], torch.Tensor):
+ input_list = input_ids[0].flatten().tolist()
+ else:
+ input_list = input_ids[0]
+ input_length = len(input_list)
+ if input_length > 1:
+ logits = run_prefill_with_logits(self.model_ptr, input_list,
+ self.logits_buffer, self.vocab_size)
+ else:
+ logits = run_decode_with_logits(self.model_ptr, input_list[0],
+ self.logits_buffer, self.vocab_size)
+ elif inputs_embeds is not None:
+ seq_len = inputs_embeds.shape[1]
+ pad_len = self.max_prompt_len - seq_len
+ inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+ (0, 0, 0, pad_len), value=0.0)
+ logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+ self.vocab_size, inputs_embeds, seq_len)
  else:
- logits = run_decode_with_logits(self.model_ptr, input_list[0],
- self.logits_buffer, self.vocab_size)
+ invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+
+ if self.config.vocab_size == 151666:
+ # for MiniCPM-V 2.6
+ logits = logits[:, :, :151666]

  return CausalLMOutputWithPast(
  loss=None,
@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
  _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
  _lib.load_model_from_file.restype = ctypes.c_void_p

- _lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
- ctypes.c_float]
+ _lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+ ctypes.c_float, ctypes.c_bool]
  _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)

  _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
  _lib.reset.argtypes = [ctypes.c_void_p]
  _lib.reset.restype = None

- _lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
- ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+ _lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+ ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+ ctypes.c_int, ctypes.c_bool]
+
  _lib.run_prefill_with_logits.restype = None

  _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
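With these signature changes, run_prefill and run_prefill_with_logits now take a generic ctypes.c_void_p for the input buffer (token ids or raw fp16 embeddings) plus a trailing ctypes.c_bool telling the native side which one it received. A standalone sketch of the pointer handoff used by the Python wrappers below (toy shapes; not tied to the real library):

    import ctypes
    import torch

    # a contiguous fp16 tensor whose storage address is handed to C as a void*
    embeds = torch.zeros(1, 8, 3584, dtype=torch.float16).contiguous()
    input_ptr = ctypes.cast(embeds.data_ptr(), ctypes.c_void_p)
    # the tensor must stay alive and contiguous for as long as the C call uses input_ptr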
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
  def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
  input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
  input_len = len(input_ids)
- plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+ plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
  new_token = _lib.llm_sample_token(plogits, True, vocab_size)
  return new_token

@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
  return new_token


- def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
- input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
- input_len = len(input_ids)
+ def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+ inputs_embeds=None, seq_len=None):
+ if input_ids is not None:
+ input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+ input_len = len(input_ids)
+ else:
+ input_ptr = inputs_embeds.contiguous().data.data_ptr()
+ input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+ input_len = seq_len
  logits_ptr = logits.data.data_ptr()
  logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
- _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+ _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+ vocab_size, (input_ids is None))
  return logits


@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
  lm_head_n_splits = 1
  asym = getattr(model.config, "asym", False)

+ if vocab_size == 151666:
+ # for MiniCPM-V 2.6 lm_head on NPU
+ vocab_size = 152064
+
  if not isinstance(lm_head, SlicedLMHead):
  asym = lm_head.qtype == "asym_int4_rtn"
  if asym: