ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/model.py +0 -1
  31. ipex_llm/transformers/npu_model.py +0 -1
  32. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  33. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  34. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
  35. ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
  36. ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
  37. ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
  38. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
  39. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
  40. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/libs/*.dll, ipex_llm/libs/*.exe CHANGED
Binary files not shown
ipex_llm/transformers/model.py CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
ipex_llm/transformers/npu_model.py CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
ipex_llm/transformers/npu_models/qwen2_mp.py CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers
 
-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)
 
-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)
 
         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input
 
         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
        for i in range(num_layers):
            hidden_states, new_key_states, new_value_states = self.build_decoder(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
-               position_ids=position_ids,
+               position_ids=position_ids if cos_condition else None,
                input_layernorm_weight=input_layernorm_weights[i],
                post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                q_bias=q_biases[i],
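Note on the hunks above (illustrative, not part of the diff): when cached_cos is None, cos and sin are no longer baked into the graph as constants; they become float32 runtime inputs of shape (batch_size, cos_len, head_dim) that the graph then casts to fp16. A minimal host-side sketch, assuming a HuggingFace-style rotary embedding module (transformers >= 4.45) that returns (cos, sin) from hidden states and position ids; the helper name make_cos_sin_inputs is hypothetical:

# Hypothetical host-side helper (not from the diff): prepares the cos/sin
# tensors that the decoder graph above now takes as runtime inputs.
import torch

def make_cos_sin_inputs(rotary_emb, hidden_states, position_ids):
    # rotary_emb(hidden_states, position_ids) is assumed to return tensors of
    # shape (batch_size, seq_len, head_dim), matching cos_len == seq_len here.
    cos, sin = rotary_emb(hidden_states, position_ids)
    # The graph declares these inputs as np.float32 and converts them to fp16
    # internally (see convert_to_fp16 above), so hand them over as float32.
    return cos.to(torch.float32), sin.to(torch.float32)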
ipex_llm/transformers/npu_pipeline_model/common.py CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()
 
 
+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
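For reference (an illustrative sketch, not part of the diff): the rotary subgraph built by Llama32Embedding and Llama32PostEmbedding above corresponds to the following plain PyTorch computation, assuming inv_freq has shape (head_dim // 2,) and position_ids has shape (1, input_len):

# Illustrative PyTorch equivalent of the cos/sin subgraph encoded above.
import torch

def rotary_cos_sin(inv_freq, position_ids, attention_scaling):
    inv_freq = inv_freq.reshape(1, -1, 1).float()   # (1, head_dim // 2, 1)
    pos = position_ids.reshape(1, 1, -1).float()    # (1, 1, input_len)
    freqs = (inv_freq * pos).transpose(1, 2)        # (1, input_len, head_dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)         # (1, input_len, head_dim)
    cos = emb.cos() * attention_scaling
    sin = emb.sin() * attention_scaling
    return cos, sin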
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
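A hypothetical call of the new helper (argument values are placeholders, not taken from the diff):

# Placeholder values, for illustration only.
first_blob_path = obtain_embedding_from_model(
    model,                   # a llama-family model prepared for NPU conversion
    convert_model=True,      # dump embedding weights as .bin and export post-embedding blobs
    temp_dir="npu_temp",
    weight_dir="model_weights",
    max_prompt_len=512,
    keep_ir=False,
    compile_blob=True,
)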
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers
 
 
 def generate(
@@ -200,7 +201,7 @@ def convert_llm(model: torch.nn.Module,
                keep_ir: bool=False,
                compile_blob: bool=True):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype in ["sym_int8_rtn", "asym_int4_rtn"]:
@@ -239,7 +240,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_llama_layer, param_list)
 
@@ -266,7 +267,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline(model_type, kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -283,7 +284,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_baichuan_layer, param_list)
 
@@ -307,7 +308,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("baichuan", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -324,7 +325,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_minicpm_layer, param_list)
 
@@ -347,12 +348,12 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("minicpm", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
+        const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
@@ -370,7 +371,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_qwen_layer, param_list)
 
@@ -395,7 +396,7 @@ def convert_llm(model: torch.nn.Module,
                            "head_dim": model.head_dim,
                            "transpose_value_cache": transpose_value_cache,
                            "max_prompt_len": max_prompt_len,
-                           "layernorm_const": layernorm_const,
+                           "const_parameter": const_parameter,
                            "group_size": group_size}
             model.config.update(update_dict)
             model.config.save_pretrained(save_directory)
@@ -404,7 +405,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("qwen", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -440,7 +441,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
     weight_dir = os.path.join(save_directory, "model_weights")
     if not os.path.exists(weight_dir):
         os.mkdir(weight_dir)
-    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
+    if keep_ir:
+        const_parameter = False
 
     lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn")
     if hasattr(model, "lm_head") and not isinstance(model.lm_head, SlicedLMHead):
@@ -456,6 +459,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)
 
     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -469,13 +474,15 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": True,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -485,16 +492,16 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode",
+                                 group_size, const_parameter, "decode",
                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill",
+                           group_size, const_parameter, "prefill",
                            keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size,
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
@@ -530,7 +537,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -554,12 +561,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode",
+                                  group_size, const_parameter, "decode",
                                   keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill",
+                            group_size, const_parameter, "prefill",
                             keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
@@ -571,7 +578,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -589,12 +596,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode",
+                                    group_size, const_parameter, "decode",
                                     keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill",
+                              group_size, const_parameter, "prefill",
                               keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,