ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/low_bit_linear.py +5 -4
  31. ipex_llm/transformers/model.py +0 -1
  32. ipex_llm/transformers/npu_model.py +17 -5
  33. ipex_llm/transformers/npu_models/convert.py +6 -2
  34. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  35. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  36. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
  37. ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
  38. ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
  39. ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
  40. ipex_llm/transformers/qlora.py +2 -2
  41. ipex_llm/transformers/utils.py +19 -6
  42. ipex_llm/transformers/xpu_customize_fwd.py +6 -4
  43. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
  44. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
  45. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
  46. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
  48. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
  49. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
  50. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
Binary files under ipex_llm/libs/ (the DLLs and EXEs listed above, items 1-29) changed; binary contents are not shown.
ipex_llm/transformers/low_bit_linear.py CHANGED
@@ -51,7 +51,8 @@ from torch import Tensor, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

 T = TypeVar("T", bound="torch.nn.Module")
@@ -527,8 +528,8 @@ class MatMulLowBit(torch.autograd.Function):
         A, weight = ctx.tensors
         grad_A, grad_weight = None, None
         if req_gradA:
-            if torch.xpu.is_autocast_xpu_enabled():
-                grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype())
+            if is_autocast_enabled("xpu"):
+                grad_output = grad_output.to(get_autocast_dtype("xpu"))
             if weight.qtype == NF4:
                 dequant_weight = xe_linear.dequant(A,
                                                    weight.data.view(torch.uint8),
@@ -615,7 +616,7 @@ class LowBitLinear(nn.Linear):
         is_training = self.training and not torch.is_inference_mode_enabled()
         if is_training:
             # below logic is only for training
-            autocast_dtype = get_autocast_dtype(x)
+            autocast_dtype = get_autocast_dtype(x.device.type)
             if self.compute_dtype is not None and x.device.type == "xpu":
                 x = x.to(self.compute_dtype)  # solve GC issue for unlora module
             elif autocast_dtype is not None:
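The two low_bit_linear.py hunks above replace the XPU-only autocast queries (torch.xpu.is_autocast_xpu_enabled / torch.xpu.get_autocast_xpu_dtype) with device-agnostic helpers imported from ipex_llm.transformers.utils; the hunk for utils.py itself is not shown in this diff. A minimal sketch of what such helpers could look like, assuming the device-aware torch.is_autocast_enabled / torch.get_autocast_dtype APIs available in PyTorch 2.4 and later:

# Hypothetical sketch only; the actual ipex_llm.transformers.utils implementation is not part of this diff.
import torch

def is_autocast_enabled(device_type: str) -> bool:
    # Device-agnostic replacement for torch.xpu.is_autocast_xpu_enabled()
    return torch.is_autocast_enabled(device_type)

def get_autocast_dtype(device_type: str):
    # Return the active autocast dtype for the device, or None when autocast is off,
    # matching how LowBitLinear.forward checks `autocast_dtype is not None`.
    if not torch.is_autocast_enabled(device_type):
        return None
    return torch.get_autocast_dtype(device_type)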
ipex_llm/transformers/model.py CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]

-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
ipex_llm/transformers/npu_model.py CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
@@ -139,8 +138,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop('fuse_layers', None)
-        imatrix_file = kwargs.pop('imatrix_file', None)
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)

         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +237,8 @@ class _BaseAutoModelClass:
             "fuse_layers": fuse_layers,
             "imatrix_data": imatrix_data,
             "skip_npu_logic": mock_device == "dummy",
+            "keep_ir": keep_ir,
+            "compile_blob": compile_blob,
         }
         # Dummy will skip npu related logic and save the quantized model
         if mock_device == "dummy":
@@ -280,9 +283,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
+
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save blob or save IR either.")

         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +331,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 save_directory=save_directory,
                 fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
+                has_llm=hasattr(model, "llm"),
+                keep_ir=keep_ir,
+                compile_blob=compile_blob
             )
         else:
             optimize_llm(
@@ -346,7 +356,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 convert_model=convert_model,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")
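The npu_model.py hunks above thread two new from_pretrained kwargs, keep_ir and compile_blob, through the NPU convert flow (the added invalidInputError requires at least one of them to be True). A hedged usage sketch, assuming the ipex_llm.transformers.npu_model.AutoModelForCausalLM entry point and using placeholder model id and paths:

# Illustrative only; the model id, low-bit format and paths are placeholders, not taken from this diff.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",   # placeholder model id
    load_in_low_bit="sym_int4",           # assumed low-bit format
    optimize_model=True,
    convert_model=True,                   # export a deployable NPU model
    save_directory="./npu-converted",     # required by the convert path
    keep_ir=False,                        # whether to keep intermediate IR files
    compile_blob=True,                    # whether to compile blobs; keep_ir or compile_blob must be True
)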
ipex_llm/transformers/npu_models/convert.py CHANGED
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
        qtype: str,
        save_directory: str,
        fuse_layers: int=None,
-       has_llm: bool=False
+       has_llm: bool=False,
+       keep_ir: bool=False,
+       compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
ipex_llm/transformers/npu_models/qwen2_mp.py CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers

-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)

         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input

         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],
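In the qwen2_mp.py hunks above, when cached_cos is None the decoder graph now declares cos and sin as runtime float32 inputs of shape (batch_size, cos_len, head_dim) instead of baking cached tables in as constants. A small sketch of how a caller could build those host-side arrays from inv_freq and position_ids, mirroring the rotary math used elsewhere in this diff (function and argument names are illustrative, not part of the package):

# Illustrative host-side construction of the new cos/sin inputs; not part of the diff.
import numpy as np

def build_cos_sin(inv_freq, position_ids, attention_scaling=1.0):
    # inv_freq: (head_dim // 2,), position_ids: (batch_size, cos_len)
    freqs = np.einsum("d,bs->bsd", inv_freq.astype(np.float32),
                      position_ids.astype(np.float32))
    emb = np.concatenate([freqs, freqs], axis=-1)   # (batch_size, cos_len, head_dim)
    cos = (np.cos(emb) * attention_scaling).astype(np.float32)
    sin = (np.sin(emb) * attention_scaling).astype(np.float32)
    return cos, sin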
ipex_llm/transformers/npu_pipeline_model/common.py CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()


+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
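Both LLMEmbedding and the new Llama32Embedding above implement padding_idx by zeroing the corresponding row of the embedding weight before the gather. A self-contained NumPy check of that masking trick (purely illustrative, not part of the diff):

# Zeroing the padding row before the gather yields zero vectors for padding tokens
# and the original rows for every other token.
import numpy as np

vocab_size, hidden_size, padding_idx = 8, 4, 2
weight = np.random.rand(vocab_size, hidden_size).astype(np.float16)

masked = weight.copy()
masked[padding_idx, :] = 0.0          # same masking as the NPU graphs
token_ids = np.array([[2, 3, 5]])
out = masked[token_ids]               # gather along axis 0

assert np.all(out[token_ids == padding_idx] == 0)
assert np.array_equal(out[token_ids != padding_idx],
                      weight[token_ids[token_ids != padding_idx]])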
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
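When convert_model is True, obtain_embedding_from_model above writes the embedding table to model_weights/model_embedding_input_0.bin as raw float16 values instead of exporting an embedding blob. A hedged sketch of reading that file back; the shape values are placeholders for a llama-3.2-1B-like config and are not taken from this diff:

# Illustrative round-trip of the exported embedding weights; vocab/hidden sizes are placeholders.
import numpy as np

vocab_size, hidden_size = 128256, 2048
emb = np.fromfile("model_weights/model_embedding_input_0.bin", dtype=np.float16)
emb = emb.reshape(vocab_size, hidden_size)   # same layout as embed_tokens.weight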
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers


 def generate(
@@ -196,7 +197,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None):
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +223,9 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_down_proj,
                                group_size,
                                save_directory,
-                               fuse_layers=fuse_layers)
+                               fuse_layers=fuse_layers,
+                               keep_ir=keep_ir,
+                               compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +433,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None):
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -450,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)

     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -470,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -479,14 +490,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill")
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size)
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +554,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode")
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill")
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +594,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode")
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill")
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)

     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)