ipex-llm 2.2.0b20250106__py3-none-win_amd64.whl → 2.2.0b20250106.post1__py3-none-win_amd64.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +19 -158
- ipex_llm/transformers/loader.py +1 -1
- ipex_llm/transformers/lookup.py +2 -2
- ipex_llm/transformers/low_bit_linear.py +15 -29
- ipex_llm/transformers/model.py +0 -7
- ipex_llm/transformers/models/chatglm2.py +1 -192
- ipex_llm/transformers/models/minicpmv.py +2 -2
- ipex_llm/transformers/models/sd.py +2 -2
- ipex_llm/transformers/models/utils.py +16 -104
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
- ipex_llm/transformers/speculative.py +2 -14
- ipex_llm/transformers/utils.py +7 -20
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/METADATA +40 -19
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/RECORD +49 -53
- ipex_llm/transformers/models/cohere.py +0 -589
- ipex_llm/transformers/models/falcon.py +0 -829
- ipex_llm/transformers/models/gptj.py +0 -441
- ipex_llm/transformers/models/mixtral.py +0 -576
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/chatglm2.py
CHANGED
@@ -269,7 +269,7 @@ def chatglm2_attention_forward(
     # IPEX-LLM OPT: fuse rope
     inv_freq, position_ids = rotary_pos_emb
     rot_dim = inv_freq.size(-1) * 2
-    if should_use_fuse_rope(hidden_states,
+    if should_use_fuse_rope(hidden_states, position_ids, self.training):
         import xe_addons
         xe_addons.rotary_two_inplaced(inv_freq, position_ids,
                                       query_states[..., :rot_dim], key_states[..., :rot_dim])
@@ -321,197 +321,6 @@ def chatglm2_attention_forward(
     return output, past_key_value
 
 
-@torch.jit.script
-def apply_rotary_pos_emb_original(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
-    # x: [sq, b, np, hn]
-    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
-    rot_dim = rope_cache.shape[-2] * 2
-    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-    # truncate to support variable sizes
-    rope_cache = rope_cache[:sq]
-    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
-    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
-    x_out2 = torch.stack(
-        [
-            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
-            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
-        ],
-        -1,
-    )
-    x_out2 = x_out2.flatten(3)
-    return torch.cat((x_out2, x_pass), dim=-1)
-
-
-def codegeex_model_forward(
-        self,
-        input_ids,
-        position_ids: Optional[torch.Tensor]=None,
-        attention_mask: Optional[torch.BoolTensor]=None,
-        full_attention_mask: Optional[torch.BoolTensor]=None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]=None,
-        inputs_embeds: Optional[torch.Tensor]=None,
-        use_cache: Optional[bool]=None,
-        output_hidden_states: Optional[bool]=None,
-        return_dict: Optional[bool]=None,
-):
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-    if inputs_embeds is None:
-        batch_size, seq_length = input_ids.shape
-        inputs_embeds = self.embedding(input_ids)
-    else:
-        inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
-        seq_length, batch_size, _ = inputs_embeds.shape
-        input_ids = torch.empty((batch_size, seq_length),
-                                dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-
-    if full_attention_mask is None:
-        if (attention_mask is not None and not attention_mask.all()) or (
-                past_key_values and seq_length != 1):
-            full_attention_mask = self.get_masks(input_ids,
-                                                 past_key_values,
-                                                 padding_mask=attention_mask)
-
-    # ipex-llm changes begin
-    # 1. replace `rotary_pos_emb` with `inv_freq` and `position_ids`
-    # 2. generate `causal_mask` and replace `full_attention_mask` with it
-    if position_ids is None:
-        if past_key_values is None:
-            position_ids = torch.arange(seq_length, dtype=torch.int64, device=inputs_embeds.device)
-        else:
-            if isinstance(past_key_values, DynamicCompressCache):
-                kv_length = past_key_values.get_seq_length()
-            else:
-                kv_length = past_key_values[0][0].size(0)
-            position_ids = torch.arange(kv_length, kv_length + seq_length,
-                                        dtype=torch.int64, device=inputs_embeds.device)
-        position_ids = position_ids.repeat(batch_size, 1)
-    use_fuse_rope = input_ids.device.type == "xpu" and not self.training
-
-    # Rotary positional embeddings
-    rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
-    if position_ids is not None:
-        rotary_pos_emb = rotary_pos_emb[position_ids]
-    else:
-        rotary_pos_emb = rotary_pos_emb[None, :seq_length]
-    if use_fuse_rope:
-        # Repeat cos sin here, call only once for each token.
-        # Chatglm2's rotary embedding is similar to gptj's, is rotate_every_two.
-        # If put this to attension forward, it will generate too many times.
-        cos, sin = rotary_pos_emb.split(rotary_pos_emb.shape[-1] // 2, dim=-1)
-        cos = cos.squeeze(-1)
-        sin = sin.squeeze(-1)
-        cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
-        sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
-        rotary_pos_emb = (cos, sin)
-    else:
-        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
-
-    # `full_attention_mask` is not None only when
-    # `past_key_values` is not None and `seq_length` > 1
-    if full_attention_mask is not None:
-        causal_mask = torch.zeros([batch_size, 1, seq_length, full_attention_mask.size(-1)],
-                                  dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-        mask_value = torch.finfo(inputs_embeds.dtype).min
-        causal_mask.masked_fill_(full_attention_mask, mask_value)
-    elif self.training or (inputs_embeds.device.type != "xpu" and past_key_values is None):
-        full_attention_mask = self.get_masks(input_ids,
-                                             past_key_values,
-                                             padding_mask=attention_mask)
-        causal_mask = torch.zeros([batch_size, 1, seq_length, full_attention_mask.size(-1)],
-                                  dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-        mask_value = torch.finfo(inputs_embeds.dtype).min
-        causal_mask.masked_fill_(full_attention_mask, mask_value)
-    else:
-        causal_mask = None
-
-    # Run encoder.
-    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
-        inputs_embeds, causal_mask,
-        rotary_pos_emb=rotary_pos_emb,
-        kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
-    )
-    # ipex-llm changes end
-
-    if not return_dict:
-        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions]
-                     if v is not None)
-
-    return BaseModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=presents,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attentions,
-    )
-
-
-def codegeex_attention_forward(
-    self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
-):
-    q_len, bsz, _ = hidden_states.size()
-    n_head = self.num_attention_heads_per_partition
-    n_kv_head = self.num_multi_query_groups_per_partition if self.multi_query_attention else n_head
-    head_dim = self.hidden_size_per_attention_head
-
-    past_key_value = None if kv_cache is None else (kv_cache[0].permute(1, 2, 0, 3),
-                                                    kv_cache[1].permute(1, 2, 0, 3))
-    qkv = self.query_key_value(hidden_states)
-    qkv = qkv.view(q_len, bsz, n_head + 2 * n_kv_head, head_dim)
-    # [seq_len, bsz, n_head, head_dim] -> [bsz, n_head, seq_len, head_dim]
-    qkv = qkv.permute(1, 2, 0, 3)
-    query_layer, key_layer, value_layer = qkv.split([n_head,
-                                                     n_kv_head,
-                                                     n_kv_head], dim=1)
-    kv_seq_len = key_layer.shape[2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[2]
-
-    # apply relative positional encoding (rotary embedding)
-    if len(rotary_pos_emb) == 2 and isinstance(rotary_pos_emb, tuple):
-        cos, sin = rotary_pos_emb
-        rot_dim = cos.shape[-1]
-        query_layer = query_layer.transpose(1, 2)
-        key_layer = key_layer.transpose(1, 2)
-        query_layer_cur = query_layer[..., :rot_dim]
-        key_layer_cur = key_layer[..., :rot_dim]
-        # ipex_llm's apply_rotary_embedding can change the origin storage,
-        # so query_layer will get the result directly.
-        torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur)
-        torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur)
-        query_layer = query_layer.transpose(1, 2)
-        key_layer = key_layer.transpose(1, 2)
-    else:
-        query_layer = apply_rotary_pos_emb_original(query_layer, rotary_pos_emb)
-        key_layer = apply_rotary_pos_emb_original(key_layer, rotary_pos_emb)
-
-    key_layer, value_layer = update_past_key_value(
-        past_key_value, key_layer, value_layer,
-        kv_seq_len, False, hidden_states.device
-    )
-    # past_key_value: [bsz, n_kv_head, seq_len, head_dim] -> [seq_len, bsz, n_kv_head, head_dim]
-    past_key_value = (key_layer.permute(2, 0, 1, 3),
-                      value_layer.permute(2, 0, 1, 3)) if use_cache else None
-
-    # =================
-    # Output. [sq, b, h]
-    # =================
-    context_layer = scaled_dot_product_attention(
-        query_layer, key_layer, value_layer,
-        attention_mask, q_len == kv_seq_len
-    )
-
-    context_layer = context_layer.permute(2, 0, 1, 3).contiguous().view(q_len,
-                                                                        bsz,
-                                                                        n_head * head_dim)
-    output = self.dense(context_layer)
-
-    return output, past_key_value
-
 import torch.nn.functional as F
 
 
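The removed `apply_rotary_pos_emb_original` was the eager fallback for ChatGLM/CodeGeeX's interleaved rotary embedding; the retained path relies on the fused `xe_addons.rotary_two_inplaced` kernel shown in the first hunk. As a reference for what that kernel computes, here is a minimal eager-mode sketch of the interleaved ("rotate every two") rotary update; the helper names and shapes are illustrative assumptions, not ipex-llm's API.

```python
# Illustrative only: an eager-mode sketch of the interleaved ("rotate every
# two") rotary update that the fused xe_addons.rotary_two_inplaced call applies
# in place on XPU. Helper names and shapes are assumptions, not ipex-llm's API.
import torch


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...)
    x_even = x[..., ::2]
    x_odd = x[..., 1::2]
    return torch.stack((-x_odd, x_even), dim=-1).flatten(-2)


def apply_interleaved_rope(q, k, cos, sin, rot_dim):
    # cos/sin are already repeat-interleaved to rot_dim channels, as in the
    # removed codegeex_model_forward; only the first rot_dim channels rotate.
    q_rot, k_rot = q[..., :rot_dim], k[..., :rot_dim]
    q[..., :rot_dim] = q_rot * cos + rotate_every_two(q_rot) * sin
    k[..., :rot_dim] = k_rot * cos + rotate_every_two(k_rot) * sin
    return q, k
```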
ipex_llm/transformers/models/minicpmv.py
CHANGED
@@ -53,10 +53,10 @@ def siglip_attention_forward(
     qkv = qkv.transpose(1, 2)
     query_states, key_states, value_states = qkv.chunk(3, dim=1)
 
-    from ipex_llm.transformers.utils import
+    from ipex_llm.transformers.utils import get_xpu_device_name
    if (
        self.head_dim == 72
-        and
+        and get_xpu_device_name(query_states.device) == "arc" and
        query_states.dtype in [torch.float, torch.half]
    ):
        n_heads, kv_length = query_states.size(1), key_states.size(2)
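The gate above enables the custom SDP path only for `head_dim == 72` on "arc" devices; otherwise the code pads the head dimension (via `padding_qkv_hd`) so a supported kernel can be used. The sketch below shows the general padding idea with stock PyTorch SDPA only; it is not ipex-llm's kernel path, and the 72→80 sizes are just an example.

```python
# Illustrative sketch (not ipex-llm's kernel path): zero-pad an unsupported
# head size such as 72 up to 80 before calling stock PyTorch SDPA, then slice
# the padding back off. Requires PyTorch >= 2.1 for the explicit `scale`.
import torch
import torch.nn.functional as F


def sdpa_with_padded_head_dim(q, k, v, target_head_dim=80):
    # q, k, v: [batch, heads, seq, head_dim]
    orig_dim = q.size(-1)
    pad = target_head_dim - orig_dim
    if pad > 0:
        q, k, v = (F.pad(t, (0, pad)) for t in (q, k, v))
    # keep the softmax scale of the original head size; zero-padded channels
    # contribute nothing to the dot products
    out = F.scaled_dot_product_attention(q, k, v, scale=orig_dim ** -0.5)
    return out[..., :orig_dim]
```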
ipex_llm/transformers/models/sd.py
CHANGED
@@ -36,7 +36,7 @@ import math
 import torch
 from typing import Optional
 
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.models.common import padding_qkv_hd
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
 from diffusers.models.attention_processor import Attention
@@ -144,7 +144,7 @@ class AttnProcessor2_0:
 
 def upcast_vae(self):
     # workaround overflow and ipex's bugs
-    if
+    if get_xpu_device_name(self.vae.post_quant_conv.weight.device) == "arc":
         self.vae.to(torch.bfloat16)
     else:
         self.vae.decoder.up_blocks.to(torch.bfloat16)
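`upcast_vae` now branches on the device name: on "arc" the whole VAE is moved to bfloat16, elsewhere only the decoder's up-blocks are upcast. The toy snippet below just illustrates that `Module.to(dtype)` can be applied to a single submodule, which is all the selective branch relies on; it does not use the diffusers VAE.

```python
# Toy illustration (not the diffusers VAE): Module.to(dtype) can be applied to
# a single submodule, which is all the selective branch above relies on.
import torch
import torch.nn as nn

vae = nn.ModuleDict({
    "encoder": nn.Linear(8, 8),
    "decoder": nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8)),
}).to(torch.float16)

full_upcast = False  # stands in for the device-name check ("arc") in the diff
if full_upcast:
    vae.to(torch.bfloat16)             # cast every parameter
else:
    vae["decoder"].to(torch.bfloat16)  # cast only the overflow-prone part

print(vae["encoder"].weight.dtype, vae["decoder"][0].weight.dtype)
# -> torch.float16 torch.bfloat16 when full_upcast is False
```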
ipex_llm/transformers/models/utils.py
CHANGED
@@ -19,7 +19,7 @@ import torch
 import warnings
 from ipex_llm.utils.common import invalidInputError
 from ipex_llm.ggml.quantize import ggml_tensor_qtype
-from ipex_llm.transformers.utils import get_ipex_version,
+from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
 from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
     FP6, ASYM_INT4
 
@@ -85,16 +85,14 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
+    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+        return False
     else:
-
-
-
-
-
-def kv_cache_device_check(x: torch.Tensor, kv_group: int) -> bool:
-    return (get_xpu_device_type(x) in ["mtl", "lnl"] and kv_group <= 1) or \
-        ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and
-            1 < x.size(0) and x.size(0) <= 8)
+        device_name = get_xpu_device_name(x.device)
+        return (
+            device_name in ["mtl", "lnl", "arl"] and kv_group == 1
+            or device_name in ["arc", "bmg"] and x.size(0) > 1
+        )
 
 
 def init_fp8_kv_cache(batch_size, num_heads, current_length, head_dim, device):
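After this hunk, `use_quantize_kv_cache` returns `False` outright for fp16/bf16 linears and folds the old `kv_cache_device_check` helper into an inline device-name test. A condensed sketch of the resulting decision order is shown below; it imports the `get_xpu_device_name` helper this release adds to `ipex_llm/transformers/utils.py`, and the string-based qtype check is a simplification of the `ggml_tensor_qtype` comparison in the real code.

```python
# Condensed sketch of the decision order after this change; not the exact
# ipex-llm source. The qtype strings simplify the ggml_tensor_qtype comparison.
import os
import torch
from ipex_llm.transformers.utils import get_xpu_device_name


def should_quantize_kv_cache(linear_qtype: str, x: torch.Tensor, kv_group: int = 1) -> bool:
    # 1. explicit environment overrides win
    if os.environ.get("IPEX_LLM_QUANTIZE_KV_CACHE") is not None:
        return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
    if os.environ.get("IPEX_LLM_LOW_MEM") is not None:
        return os.environ["IPEX_LLM_LOW_MEM"] == "1"
    # 2. fp16/bf16 weights never use the quantized cache
    if linear_qtype in ("fp16", "bf16"):
        return False
    # 3. otherwise a device/batch heuristic decides
    device_name = get_xpu_device_name(x.device)  # e.g. "mtl", "arc", "bmg"
    return (
        device_name in ["mtl", "lnl", "arl"] and kv_group == 1
        or device_name in ["arc", "bmg"] and x.size(0) > 1
    )
```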
@@ -170,7 +168,7 @@ def should_use_fuse_rope(hidden_states, position_ids, training):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
     if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox", "mistral",
-                        "
+                        "qwen2", "yuan", "stablelm", "qwen2_moe"]:
         # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
         cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
         sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
@@ -185,7 +183,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
         q_embed = (q * cos) + (rotate_half(q) * sin)
         k_embed = (k * cos) + (rotate_half(k) * sin)
         return q_embed, k_embed
-    elif model_family in ["
+    elif model_family in ["chatglm"]:
         q_embed = (q * cos) + (rotate_every_two(q) * sin)
         k_embed = (k * cos) + (rotate_every_two(k) * sin)
         return q_embed, k_embed
@@ -194,19 +192,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
                           f"{model_family} is not supported.")
 
 
-def apply_ipex_rotate_every_two(q, k, cos, sin):
-    # ipex's apply_rotary_embedding_two_qk can change the origin storage,
-    # so q/k will get the result directly.
-    from ipex_llm.transformers.utils import get_ipex_version
-    if get_ipex_version() >= "2.1.10+xpu":
-        torch.ops.torch_ipex.apply_rotary_embedding_two_qk(
-            q, k, sin, cos, q, k
-        )
-    else:
-        torch.ops.torch_ipex.apply_rotary_embedding(q, sin, cos, q)
-        torch.ops.torch_ipex.apply_rotary_embedding(k, sin, cos, k)
-
-
 def is_enough_kv_cache_room_4_36(past_key_value, idx, seq_len=1):
     # to determinate if is enough kv cache room in transformers==4.36
     # seq_len for current seq len
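For contrast with the interleaved form used by the "chatglm" branch, the llama-family branch above applies the half-rotation form of RoPE. A standard reference implementation, following Hugging Face conventions rather than copied from ipex-llm, looks like this:

```python
# Reference sketch of the half-rotation RoPE used by the llama-family branch
# (q * cos + rotate_half(q) * sin), following Hugging Face conventions; it is
# not copied from ipex-llm.
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_half_rotation_rope(q, k, cos, sin, position_ids):
    # cos/sin: [seq_len, dim] after the squeezes above; gather current positions
    cos = cos[position_ids].unsqueeze(1)  # [bsz, 1, q_len, dim]
    sin = sin[position_ids].unsqueeze(1)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```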
@@ -226,57 +211,6 @@ def is_enough_kv_cache_room_4_31(past_key_value, seq_len=1):
         (past_key_value[0].size(2) + seq_len) * past_key_value[0].size(3)
 
 
-def use_flash_attention(query, key, attention_mask=None):
-    # here we support query's shape is always [batch_size, head_num, q_len, head_dim],
-    # key's shape is always [batch_size, head_num, k_len, head_dim]
-    invalidInputError(query.dim() == 4,
-                      "Here query input of use_flash_attention should be [batch_size, "
-                      "head_num, q_len, head_dim]")
-    invalidInputError(key.dim() == 4,
-                      "Here key input of use_flash_attention should be [batch_size, "
-                      "head_num, k_len, head_dim]")
-    bsz, _, q_len, _ = query.size()
-    k_len = key.size()[2]
-    # check whether ipex flash attention can be used
-    if q_len != k_len:
-        # now only use flash attention for first token
-        # as it seems have no performance benifit for rest token now
-        return False
-    if query.device.type != "xpu":
-        # ipex flash attention only support for xpu
-        return False
-    ipex_version = get_ipex_version()
-    if ipex_version <= "2.0.110+xpu":
-        # ipex flash attention is supported from ipex 2.1
-        return False
-    if not torch.xpu.has_xetla():
-        # ipex flash attention is only supported for xetla
-        # may update this later
-        return False
-    elif get_xpu_device_type(query) != "pvc":
-        return False
-    if query.dtype not in [torch.float32, torch.float16]:
-        # only use flash attention for fp32/fp16 input
-        return False
-    if bsz > 1:
-        # as flash attention doesn't support attn_mask in ipex 2.1,
-        # so it will cause output error for padded batch input
-        if attention_mask is None:
-            return True
-        else:
-            # TODO: below logic may change for different model
-            # attention mask shape : [bsz, 1, q_len, k_len]
-            if attention_mask[0].squeeze()[0, 0].item() != 0:
-                # first batch contains padding
-                # otherwise we suppose it should be a upper triangular matrix
-                # at the same time, the diagonal is also 0
-                return False
-            elif not attention_mask.equal(attention_mask[0].repeat(bsz, 1, 1, 1)):
-                # check whether mask of every batch is the same
-                return False
-    return True
-
-
 def use_sdp(q_len, kv_len, head_dim, query_states):
     return (
         query_states.device.type == "xpu"
@@ -315,38 +249,16 @@ def mlp_fusion_check(x, qtype, training):
     if training or x.requires_grad:
         return False
     if qtype == FP6:
-        device =
-        if device in ["mtl", "lnl"]:
+        device = get_xpu_device_name(x.device)
+        if device in ["mtl", "lnl", "arl"]:
             return False
     return True
 
 
-def use_decoding_fast_path(proj,
-                           use_fuse_rope,
-                           enough_kv_room,
-                           bs,
-                           qtype_check=decoding_fast_path_qtype_check):
-    if proj is None:
-        return False
-    device = get_xpu_device_type(proj.weight)
-    if not qtype_check(proj):
-        return False
-    if not use_fuse_rope:
-        return False
-    if not enough_kv_room:
-        return False
-    if bs != 1:
-        return False
-
-    if device in ["uhd"]:
-        return False
-    return True
-
-
 def use_xmx(x: torch.Tensor, qtype: int):
-    device =
+    device = get_xpu_device_name(x.device)
     return (
-        device in ["arc", "
+        device in ["arc", "pvc"]
         and qtype in [SYM_INT4, SYM_INT8, FP8E4, FP8E5]
         and (
             (device == "pvc" and 1 < x.size(0) <= 16)
@@ -370,7 +282,7 @@ def fp16_fusion_check(proj, x, training):
         return False
     if x.requires_grad:
         return False
-    device_type =
+    device_type = get_xpu_device_name(x.device)
     if device_type != "pvc":
         return False
     return True
@@ -439,7 +351,7 @@ def should_use_compresskv(x: torch.Tensor, prompt_len: int):
     else:
         if use_compress_kv is None:
             return (
-
+                get_xpu_device_name(x.device) in ["mtl", "lnl", "arl"]
                 and prompt_len >= 1800
                 and prompt_len <= 4500
            )
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -473,10 +473,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)
-        if model.can_generate():
-            model.generation_config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
@@ -537,8 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)
 
         from .llama import convert_llama_layer, convert_fused_llama_layer
         from .llama import convert_lm_head_and_embedding
@@ -577,8 +571,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)
 
         from .minicpm import convert_minicpm_layer, convert_fused_minicpm_layer
         from .minicpm import convert_lm_head_and_embedding
@@ -595,3 +587,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                        save_directory, weight_dir,
                                        convert_model=True,
                                        max_prompt_len=max_prompt_len)
+
+    model.config.update(update_dict)
+    model.config.save_pretrained(save_directory)
+    if model.can_generate():
+        model.generation_config.save_pretrained(save_directory)
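The net effect of these four hunks is that `convert_llm_for_deploy` no longer saves the updated config inside each per-model branch; a single consolidated block at the end updates the config, saves it, and also saves the generation config. A hedged sketch of the resulting order, using standard transformers APIs, with `convert_layers` standing in for the per-model conversion logic above:

```python
# Hedged sketch of the consolidated tail of convert_llm_for_deploy after this
# change: run the per-model conversion first, then persist the updated config
# exactly once. `convert_layers` and `update_dict` stand in for the per-model
# logic above; the config calls are standard transformers APIs.
def finish_conversion(model, update_dict, save_directory, convert_layers):
    convert_layers(model, save_directory)          # per-model NPU conversion
    model.config.update(update_dict)               # record the NPU export settings
    model.config.save_pretrained(save_directory)   # write config.json once
    if model.can_generate():
        model.generation_config.save_pretrained(save_directory)
```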
ipex_llm/transformers/speculative.py
CHANGED
@@ -432,8 +432,7 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l
     from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
         extend_kv_cache
     enough_kv_room = True
-    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral",
-                          "gptj", "opt"]:
+    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", "opt"]:
         return past_key_values, False
     cache_k = past_key_values[0][0]
     if model_type == "chatglm":
@@ -527,7 +526,7 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
              v[:-(new_cache_size), :, :, :])
            for k, v in past_key_values
        ]
-    elif self.config.model_type in ["baichuan"
+    elif self.config.model_type in ["baichuan"]:
        past_key_values = [
            (k[:, :, :-(new_cache_size), :],
             v[:, :, :-(new_cache_size), :])
@@ -796,13 +795,6 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_
                                    device=verify_input_ids.device)
        position_ids = position_ids.unsqueeze(0).repeat(1, 1) + past_key_value_len
        forward_args["position_ids"] = position_ids
-    elif self.config.model_type == "gptj":
-        past_length = past_key_values[0][0].size(2)
-        input_len = verify_input_ids.shape[1]
-        position_ids = torch.arange(past_length, input_len + past_length,
-                                    dtype=torch.long, device=verify_input_ids.device)
-        position_ids = position_ids.unsqueeze(0).view(-1, input_len)
-        forward_args["position_ids"] = position_ids
 
     return self(**forward_args)
 
@@ -971,10 +963,6 @@ def speculative_generate(self,
                past_key_value_len = past_key_values[0][0].shape[0]
                position_ids = torch.Tensor([[past_key_value_len + step_draft]]).long()
                forward_args["position_ids"] = position_ids
-            elif self.config.model_type == "gptj":
-                past_length = draft_past_key_values[0][0].size(2)
-                position_ids = torch.Tensor([[past_length]]).long().to(self.device)
-                forward_args["position_ids"] = position_ids
 
            if _enable_ipex:
                if any(keyword in self.config.model_type
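With gptj dropped from the speculative-decoding paths, the remaining `_crop_past_key_values` branches handle caches in either [seq, batch, heads, head_dim] order or the usual [batch, heads, seq, head_dim] layout. A minimal sketch of the latter case (the "baichuan" branch above), trimming rejected draft tokens from every layer:

```python
# Minimal sketch for the usual [batch, heads, seq_len, head_dim] cache layout
# (the "baichuan" branch above): drop the last n_extra rejected draft tokens
# from every layer's key/value cache.
from typing import List, Tuple
import torch


def crop_kv_cache(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
                  n_extra: int) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    if n_extra <= 0:
        return past_key_values
    return [
        (k[:, :, :-n_extra, :], v[:, :, :-n_extra, :])
        for k, v in past_key_values
    ]
```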
ipex_llm/transformers/utils.py
CHANGED
@@ -168,27 +168,14 @@ def get_ipex_version():
     return _ipex_version
 
 
-def
-    if
-        return
-    name = torch.xpu.get_device_name(x.device.index)
-    if name.startswith("Intel(R) Arc(TM) A"):
-        return "arc"
-    elif name.startswith("Intel(R) Graphics [0xe20b]"):
-        return "bmg"
-    elif name.startswith("Intel(R) Arc(TM)"):
-        if 'V' in name:
-            return "lnl"
-        else:
-            return "mtl"
-    elif name.startswith("Intel(R) Data Center GPU Flex"):
-        return "flex"
-    elif name.startswith("Intel(R) Data Center GPU Max"):
-        return "pvc"
-    elif name.startswith("Intel(R) UHD"):
-        return "uhd"
+def get_xpu_device_name(device: torch.device):
+    if device.type != "xpu":
+        return device.type
     else:
-
+        # possiable device name:
+        # ["arc", "pvc", "mtl", "lnl", "bmg", "arl", "legacy", "unknown"]
+        import xe_linear
+        return xe_linear.get_xpu_device_name(device)
 
 
 def load_imatrix_data(imatrix_file):
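The Python-side marketing-name mapping of the removed `get_xpu_device_type` is replaced by `get_xpu_device_name`, which defers to the native `xe_linear` extension on XPU and simply returns `device.type` elsewhere. For reference, a hedged pure-Python fallback reproducing the old prefix mapping shown in the removed lines:

```python
# Reference only: a pure-Python fallback reproducing the marketing-name
# prefixes the removed get_xpu_device_type matched. The new get_xpu_device_name
# defers to the native xe_linear extension instead, which can also report
# names such as "arl", "legacy" and "unknown".
import torch

_PREFIX_TO_NAME = [
    ("Intel(R) Arc(TM) A", "arc"),
    ("Intel(R) Graphics [0xe20b]", "bmg"),
    ("Intel(R) Arc(TM)", None),  # old code: "lnl" if 'V' in name else "mtl"
    ("Intel(R) Data Center GPU Flex", "flex"),
    ("Intel(R) Data Center GPU Max", "pvc"),
    ("Intel(R) UHD", "uhd"),
]


def xpu_device_name_fallback(device: torch.device) -> str:
    if device.type != "xpu":
        return device.type
    name = torch.xpu.get_device_name(device.index)
    for prefix, short in _PREFIX_TO_NAME:
        if name.startswith(prefix):
            if short is None:
                return "lnl" if "V" in name else "mtl"
            return short
    return "unknown"
```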