ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/low_bit_linear.py +5 -4
  31. ipex_llm/transformers/model.py +0 -1
  32. ipex_llm/transformers/npu_model.py +17 -5
  33. ipex_llm/transformers/npu_models/convert.py +6 -2
  34. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  35. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  36. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
  37. ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
  38. ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
  39. ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
  40. ipex_llm/transformers/qlora.py +2 -2
  41. ipex_llm/transformers/utils.py +19 -6
  42. ipex_llm/transformers/xpu_customize_fwd.py +6 -4
  43. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
  44. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
  45. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
  46. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
  48. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
  49. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
  50. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -18,112 +18,13 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +76,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, False)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -195,62 +97,18 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-        # llama-2-7B & llama-3-8B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = LLMEmbedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, True, False)
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
+
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +175,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False,
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +223,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +317,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir,
+                                          compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
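
The llama.py changes drop the file-local Llama32Embedding/Llama32PostEmbedding graph builders, delegate embedding export to the shared obtain_embedding_from_model helper in npu_pipeline_model/common.py, thread new keep_ir/compile_blob flags through every convert function, and delete the intermediate *.bin after each blob export. Below is a minimal driver sketch of the updated call signatures; the wrapper function and its argument values are illustrative assumptions, while the imported names and keyword parameters come from the diff above.

    # Hypothetical driver, for illustration only: shows how keep_ir/compile_blob
    # now flow through the refactored llama.py entry points.
    from ipex_llm.transformers.npu_pipeline_model.llama import (
        convert_lm_head_and_embedding, convert_llama_layer)

    def export_llama_blobs(model, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, kv_len, group_size,
                           keep_ir=False, compile_blob=True):
        # Embedding export now happens inside this call, via
        # common.obtain_embedding_from_model.
        first_blob, last_blob = convert_lm_head_and_embedding(
            model, n_splits_linear, temp_dir, weight_dir,
            convert_model=True, max_prompt_len=512,
            keep_ir=keep_ir, compile_blob=compile_blob)
        for idx in range(len(model.model.layers)):
            # The temporary decoder .bin is removed after export, so only the
            # compiled blob (and the IR, if keep_ir=True) remains on disk.
            convert_llama_layer(model, idx, n_splits_linear, n_splits_down_proj,
                                temp_dir, weight_dir, transpose_value_cache=True,
                                kv_len=kv_len, group_size=group_size,
                                layernorm_const=True, mode="decode",
                                keep_ir=keep_ir, compile_blob=compile_blob)
        return first_blob, last_blob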
ipex_llm/transformers/npu_pipeline_model/minicpm.py
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                              dtype=np.float16,
                                              scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill,
                                            "embedding_post_prefill",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir, True, False)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
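
minicpm.py applies the same pattern at every export site: forward keep_ir/compile_blob to update_names_of_IR_and_export_blob, then delete the temporary <name>.bin left next to the exported blob. A small sketch of that export-then-cleanup idiom follows; the wrapper name is hypothetical, only the imported function and its keyword arguments appear in the diff.

    import os
    from ipex_llm.transformers.npu_pipeline_model.common import (
        update_names_of_IR_and_export_blob)

    def export_blob_and_cleanup(factory, name, out_dir, keep_ir=False, compile_blob=True):
        # Hypothetical helper mirroring the repeated pattern in this release:
        # export the blob, then drop the intermediate <name>.bin from out_dir.
        blob_path = update_names_of_IR_and_export_blob(factory, name, out_dir,
                                                       keep_ir=keep_ir,
                                                       compile_blob=compile_blob)
        os.remove(os.path.join(out_dir, name + ".bin"))
        return blob_path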
ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -18,13 +18,15 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +86,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, False)
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -104,28 +108,17 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
             bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
             weight.tofile(bin_file)
 
-    embedding_layer = model.model.embed_tokens
-    new_embedding = LLMEmbedding(
-        vocab_size=model.config.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -139,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -152,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -179,23 +179,38 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if layernorm_const:
-            st_idx = 3
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-            st_idx = 5
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -226,7 +241,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -252,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
            attn_layer = curr_layer.self_attn
            mlp_layer = curr_layer.mlp
            weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-           cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-           cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+           if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+               cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+               cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+           else:
+               # transformers >= 4.45.0
+               cached_cos = None
+               cached_sin = None
            layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
            layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -330,6 +351,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
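
qwen.py now probes the rotary embedding before reading cached tables: transformers releases before 4.45 expose rotary_emb.cos_cached/sin_cached, while newer releases compute cos/sin per call, so the exporter passes None and feeds runtime cos/sin (cos_len, keep_position_ids) as graph inputs instead. A sketch of that probe in isolation, with an illustrative function name:

    import torch

    def cached_rotary_tables(decoder_layer):
        # Older transformers (< 4.45.0) keep precomputed cos/sin tables on the
        # rotary embedding; newer versions do not, so the exporter falls back to
        # runtime cos/sin inputs in the decoder graph.
        rotary_emb = decoder_layer.self_attn.rotary_emb
        if hasattr(rotary_emb, "cos_cached"):
            return (rotary_emb.cos_cached.to(torch.float16),
                    rotary_emb.sin_cached.to(torch.float16))
        return None, None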
ipex_llm/transformers/qlora.py
@@ -109,7 +109,7 @@ class LoraLowBitLinear(Module, LoraLayer):
         self.qa_pool = torch.nn.Identity()
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
@@ -177,7 +177,7 @@ class LoraBF16Linear(Module, LoraLayer):
         self.is_target_conv_1d_layer = is_target_conv_1d_layer
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
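
In qlora.py the only change is the call convention of get_autocast_dtype, which now takes a device-type string instead of a tensor. A minimal sketch of the updated usage in a forward path, assuming the same bf16-on-XPU policy as the surrounding code (the helper function here is illustrative):

    import torch
    from ipex_llm.transformers.utils import get_autocast_dtype

    def cast_lora_input(x: torch.Tensor) -> torch.Tensor:
        # New convention: pass x.device.type ("xpu" or "cpu"), not the tensor itself.
        autocast_dtype = get_autocast_dtype(x.device.type)
        if x.device.type == "xpu":
            return x.to(torch.bfloat16)  # bf16 is forced on XPU, as in qlora.py
        return x if autocast_dtype is None else x.to(autocast_dtype)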
ipex_llm/transformers/utils.py
@@ -138,26 +138,39 @@ def fix_key(key):
     return key
 
 
-def get_autocast_dtype(x):
+def is_autocast_enabled(device_type: str):
     if torch.__version__ >= '2.3':
-        if torch.is_autocast_enabled(x.device.type):
-            return torch.get_autocast_dtype(x.device.type)
+        return torch.is_autocast_enabled(device_type)
+    else:
+        if device_type == "xpu":
+            return torch.xpu.is_autocast_xpu_enabled()
+        elif device_type == "cpu":
+            return torch.is_autocast_cpu_enabled()
+        else:
+            invalidInputError(False,
+                              f"Device type {device_type} is not supported.")
+
+
+def get_autocast_dtype(device_type: str):
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(device_type):
+            return torch.get_autocast_dtype(device_type)
         else:
             return None
     else:
-        if x.device.type == "xpu":
+        if device_type == "xpu":
             if torch.xpu.is_autocast_xpu_enabled():
                 return torch.xpu.get_autocast_xpu_dtype()
             else:
                 return None
-        elif x.device.type == "cpu":
+        elif device_type == "cpu":
             if torch.is_autocast_cpu_enabled():
                 return torch.get_autocast_cpu_dtype()
             else:
                 return None
         else:
             invalidInputError(False,
-                              f"Device {x.device} is not supported.")
+                              f"Device type {device_type} is not supported.")
 
 
 def get_xpu_device_name(device: torch.device):
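
utils.py splits the old tensor-based helper into is_autocast_enabled and get_autocast_dtype, both keyed on a device-type string: on torch >= 2.3 they call the unified torch.is_autocast_enabled / torch.get_autocast_dtype, otherwise they fall back to the legacy per-device XPU/CPU functions. A short usage sketch under those assumptions:

    import torch
    from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype

    # Inside an autocast region the helpers report the active state and dtype for
    # the given device type; outside it, get_autocast_dtype returns None.
    with torch.autocast("cpu", dtype=torch.bfloat16):
        print(is_autocast_enabled("cpu"))   # True
        print(get_autocast_dtype("cpu"))    # torch.bfloat16
    print(get_autocast_dtype("cpu"))        # None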
ipex_llm/transformers/xpu_customize_fwd.py
@@ -107,6 +107,8 @@ except ModuleNotFoundError:
     np = None  # type: ignore[assignment]
 from typing import Any
 
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+
 
 def _cast(value, dtype):
     if isinstance(value, torch.Tensor):
@@ -155,12 +157,12 @@ def custom_fwd(fwd=None, *, cast_inputs=None):
 
     @functools.wraps(fwd)
     def decorate_fwd(*args, **kwargs):
-        args[0]._dtype = torch.xpu.get_autocast_xpu_dtype()
+        args[0]._dtype = get_autocast_dtype("xpu")
         if cast_inputs is None:
-            args[0]._fwd_used_autocast = torch.xpu.is_autocast_xpu_enabled()
+            args[0]._fwd_used_autocast = is_autocast_enabled("xpu")
             return fwd(*args, **kwargs)
         else:
-            autocast_context = torch.xpu.is_autocast_xpu_enabled()
+            autocast_context = is_autocast_enabled("xpu")
             args[0]._fwd_used_autocast = False
             if autocast_context:
                 with torch.xpu.autocast(enabled=False):
@@ -184,7 +186,7 @@ def custom_bwd(bwd):
 
     @functools.wraps(bwd)
     def decorate_bwd(*args, **kwargs):
-        with torch.xpu.autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
+        with torch.autocast("xpu", enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
            return bwd(*args, **kwargs)
 
    return decorate_bwd
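
xpu_customize_fwd.py now records autocast state through the shared helpers and restores it in the backward pass with the device-agnostic torch.autocast("xpu", ...) context manager rather than the legacy torch.xpu.autocast. A brief sketch of that restore step; the wrapper function is illustrative, and ctx._fwd_used_autocast / ctx._dtype are the attributes recorded by the matching custom_fwd in this file.

    import torch

    def rerun_bwd_under_saved_autocast(bwd, ctx, *grad_outputs):
        # Replays the backward under the same autocast settings that were active
        # during the forward, as recorded by custom_fwd on the ctx object.
        with torch.autocast("xpu", enabled=ctx._fwd_used_autocast, dtype=ctx._dtype):
            return bwd(ctx, *grad_outputs)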