ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/model.py +0 -1
  31. ipex_llm/transformers/npu_model.py +0 -1
  32. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  33. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  34. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
  35. ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
  36. ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
  37. ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
  38. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
  39. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
  40. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/libs/*.dll, ipex_llm/libs/*.exe CHANGED
Binary files not shown
ipex_llm/transformers/model.py CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
ipex_llm/transformers/npu_model.py CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
ipex_llm/transformers/npu_models/qwen2_mp.py CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers
 
-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)
 
-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)
 
         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input
 
         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
        for i in range(num_layers):
            hidden_states, new_key_states, new_value_states = self.build_decoder(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
-               position_ids=position_ids,
+               position_ids=position_ids if cos_condition else None,
                input_layernorm_weight=input_layernorm_weights[i],
                post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                q_bias=q_biases[i],
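Note on the hunks above (illustrative, not part of the diff): when cached_cos is None, cos and sin are no longer baked into the graph as constants; they become float32 runtime inputs of shape (batch_size, cos_len, head_dim) that the graph then casts to fp16. A minimal host-side sketch, assuming a HuggingFace-style rotary embedding module (transformers >= 4.45) that returns (cos, sin) from hidden states and position ids; the helper name make_cos_sin_inputs is hypothetical:

# Hypothetical host-side helper (not from the diff): prepares the cos/sin
# tensors that the decoder graph above now takes as runtime inputs.
import torch

def make_cos_sin_inputs(rotary_emb, hidden_states, position_ids):
    # rotary_emb(hidden_states, position_ids) is assumed to return tensors of
    # shape (batch_size, seq_len, head_dim), matching cos_len == seq_len here.
    cos, sin = rotary_emb(hidden_states, position_ids)
    # The graph declares these inputs as np.float32 and converts them to fp16
    # internally (see convert_to_fp16 above), so hand them over as float32.
    return cos.to(torch.float32), sin.to(torch.float32)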
ipex_llm/transformers/npu_pipeline_model/common.py CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()
 
 
+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
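For reference (an illustrative sketch, not part of the diff): the rotary subgraph built by Llama32Embedding and Llama32PostEmbedding above corresponds to the following plain PyTorch computation, assuming inv_freq has shape (head_dim // 2,) and position_ids has shape (1, input_len):

# Illustrative PyTorch equivalent of the cos/sin subgraph encoded above.
import torch

def rotary_cos_sin(inv_freq, position_ids, attention_scaling):
    inv_freq = inv_freq.reshape(1, -1, 1).float()   # (1, head_dim // 2, 1)
    pos = position_ids.reshape(1, 1, -1).float()    # (1, 1, input_len)
    freqs = (inv_freq * pos).transpose(1, 2)        # (1, input_len, head_dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)         # (1, input_len, head_dim)
    cos = emb.cos() * attention_scaling
    sin = emb.sin() * attention_scaling
    return cos, sin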
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
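A hypothetical call of the new helper (argument values are placeholders, not taken from the diff):

# Placeholder values, for illustration only.
first_blob_path = obtain_embedding_from_model(
    model,                   # a llama-family model prepared for NPU conversion
    convert_model=True,      # dump embedding weights as .bin and export post-embedding blobs
    temp_dir="npu_temp",
    weight_dir="model_weights",
    max_prompt_len=512,
    keep_ir=False,
    compile_blob=True,
)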
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers
 
 
 def generate(
@@ -200,7 +201,7 @@ def convert_llm(model: torch.nn.Module,
                keep_ir: bool=False,
                compile_blob: bool=True):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype in ["sym_int8_rtn", "asym_int4_rtn"]:
@@ -239,7 +240,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_llama_layer, param_list)
 
@@ -266,7 +267,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline(model_type, kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -283,7 +284,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_baichuan_layer, param_list)
 
@@ -307,7 +308,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("baichuan", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -324,7 +325,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_minicpm_layer, param_list)
 
@@ -347,12 +348,12 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("minicpm", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
+        const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
@@ -370,7 +371,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                   layernorm_const))
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_qwen_layer, param_list)
 
@@ -395,7 +396,7 @@ def convert_llm(model: torch.nn.Module,
                            "head_dim": model.head_dim,
                            "transpose_value_cache": transpose_value_cache,
                            "max_prompt_len": max_prompt_len,
-                           "layernorm_const": layernorm_const,
+                           "const_parameter": const_parameter,
                            "group_size": group_size}
             model.config.update(update_dict)
             model.config.save_pretrained(save_directory)
@@ -404,7 +405,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("qwen", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"), layernorm_const)
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -440,7 +441,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
     weight_dir = os.path.join(save_directory, "model_weights")
     if not os.path.exists(weight_dir):
         os.mkdir(weight_dir)
-    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
+    if keep_ir:
+        const_parameter = False
 
     lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn")
     if hasattr(model, "lm_head") and not isinstance(model.lm_head, SlicedLMHead):
@@ -456,6 +459,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)
 
     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -469,13 +474,15 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": True,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -485,16 +492,16 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode",
+                                 group_size, const_parameter, "decode",
                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill",
+                           group_size, const_parameter, "prefill",
                            keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size,
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
@@ -530,7 +537,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -554,12 +561,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode",
+                                  group_size, const_parameter, "decode",
                                   keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill",
+                            group_size, const_parameter, "prefill",
                             keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
@@ -571,7 +578,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "layernorm_const": layernorm_const,
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -589,12 +596,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode",
+                                    group_size, const_parameter, "decode",
                                     keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill",
+                              group_size, const_parameter, "prefill",
                               keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,