ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/low_bit_linear.py +5 -4
  31. ipex_llm/transformers/model.py +0 -1
  32. ipex_llm/transformers/npu_model.py +17 -5
  33. ipex_llm/transformers/npu_models/convert.py +6 -2
  34. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  35. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  36. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
  37. ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
  38. ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
  39. ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
  40. ipex_llm/transformers/qlora.py +2 -2
  41. ipex_llm/transformers/utils.py +19 -6
  42. ipex_llm/transformers/xpu_customize_fwd.py +6 -4
  43. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
  44. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
  45. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
  46. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
  48. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
  49. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
  50. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
Binary files under ipex_llm/libs/ (the DLLs and EXEs listed above, items 1-29) changed; binary contents are not shown.
ipex_llm/transformers/low_bit_linear.py CHANGED
@@ -51,7 +51,8 @@ from torch import Tensor, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

 T = TypeVar("T", bound="torch.nn.Module")
@@ -527,8 +528,8 @@ class MatMulLowBit(torch.autograd.Function):
         A, weight = ctx.tensors
         grad_A, grad_weight = None, None
         if req_gradA:
-            if torch.xpu.is_autocast_xpu_enabled():
-                grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype())
+            if is_autocast_enabled("xpu"):
+                grad_output = grad_output.to(get_autocast_dtype("xpu"))
             if weight.qtype == NF4:
                 dequant_weight = xe_linear.dequant(A,
                                                    weight.data.view(torch.uint8),
@@ -615,7 +616,7 @@ class LowBitLinear(nn.Linear):
         is_training = self.training and not torch.is_inference_mode_enabled()
         if is_training:
             # below logic is only for training
-            autocast_dtype = get_autocast_dtype(x)
+            autocast_dtype = get_autocast_dtype(x.device.type)
             if self.compute_dtype is not None and x.device.type == "xpu":
                 x = x.to(self.compute_dtype)  # solve GC issue for unlora module
             elif autocast_dtype is not None:
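The two low_bit_linear.py hunks above replace the XPU-only autocast queries (torch.xpu.is_autocast_xpu_enabled / torch.xpu.get_autocast_xpu_dtype) with device-agnostic helpers imported from ipex_llm.transformers.utils; the hunk for utils.py itself is not shown in this diff. A minimal sketch of what such helpers could look like, assuming the device-aware torch.is_autocast_enabled / torch.get_autocast_dtype APIs available in PyTorch 2.4 and later:

# Hypothetical sketch only; the actual ipex_llm.transformers.utils implementation is not part of this diff.
import torch

def is_autocast_enabled(device_type: str) -> bool:
    # Device-agnostic replacement for torch.xpu.is_autocast_xpu_enabled()
    return torch.is_autocast_enabled(device_type)

def get_autocast_dtype(device_type: str):
    # Return the active autocast dtype for the device, or None when autocast is off,
    # matching how LowBitLinear.forward checks `autocast_dtype is not None`.
    if not torch.is_autocast_enabled(device_type):
        return None
    return torch.get_autocast_dtype(device_type)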
ipex_llm/transformers/model.py CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]

-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode
ipex_llm/transformers/npu_model.py CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
@@ -139,8 +138,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop('fuse_layers', None)
-        imatrix_file = kwargs.pop('imatrix_file', None)
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)

         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +237,8 @@ class _BaseAutoModelClass:
             "fuse_layers": fuse_layers,
             "imatrix_data": imatrix_data,
             "skip_npu_logic": mock_device == "dummy",
+            "keep_ir": keep_ir,
+            "compile_blob": compile_blob,
         }
         # Dummy will skip npu related logic and save the quantized model
         if mock_device == "dummy":
@@ -280,9 +283,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
+
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save blob or save IR either.")

         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +331,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 save_directory=save_directory,
                 fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
+                has_llm=hasattr(model, "llm"),
+                keep_ir=keep_ir,
+                compile_blob=compile_blob
             )
         else:
             optimize_llm(
@@ -346,7 +356,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 convert_model=convert_model,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")
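The npu_model.py hunks above thread two new from_pretrained kwargs, keep_ir and compile_blob, through the NPU convert flow (the added invalidInputError requires at least one of them to be True). A hedged usage sketch, assuming the ipex_llm.transformers.npu_model.AutoModelForCausalLM entry point and using placeholder model id and paths:

# Illustrative only; the model id, low-bit format and paths are placeholders, not taken from this diff.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",   # placeholder model id
    load_in_low_bit="sym_int4",           # assumed low-bit format
    optimize_model=True,
    convert_model=True,                   # export a deployable NPU model
    save_directory="./npu-converted",     # required by the convert path
    keep_ir=False,                        # whether to keep intermediate IR files
    compile_blob=True,                    # whether to compile blobs; keep_ir or compile_blob must be True
)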
ipex_llm/transformers/npu_models/convert.py CHANGED
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
        qtype: str,
        save_directory: str,
        fuse_layers: int=None,
-       has_llm: bool=False
+       has_llm: bool=False,
+       keep_ir: bool=False,
+       compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
ipex_llm/transformers/npu_models/qwen2_mp.py CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers

-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)

         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input

         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],
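In the qwen2_mp.py hunks above, when cached_cos is None the decoder graph now declares cos and sin as runtime float32 inputs of shape (batch_size, cos_len, head_dim) instead of baking cached tables in as constants. A small sketch of how a caller could build those host-side arrays from inv_freq and position_ids, mirroring the rotary math used elsewhere in this diff (function and argument names are illustrative, not part of the package):

# Illustrative host-side construction of the new cos/sin inputs; not part of the diff.
import numpy as np

def build_cos_sin(inv_freq, position_ids, attention_scaling=1.0):
    # inv_freq: (head_dim // 2,), position_ids: (batch_size, cos_len)
    freqs = np.einsum("d,bs->bsd", inv_freq.astype(np.float32),
                      position_ids.astype(np.float32))
    emb = np.concatenate([freqs, freqs], axis=-1)   # (batch_size, cos_len, head_dim)
    cos = (np.cos(emb) * attention_scaling).astype(np.float32)
    sin = (np.sin(emb) * attention_scaling).astype(np.float32)
    return cos, sin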
ipex_llm/transformers/npu_pipeline_model/common.py CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()


+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
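Both LLMEmbedding and the new Llama32Embedding above implement padding_idx by zeroing the corresponding row of the embedding weight before the gather. A self-contained NumPy check of that masking trick (purely illustrative, not part of the diff):

# Zeroing the padding row before the gather yields zero vectors for padding tokens
# and the original rows for every other token.
import numpy as np

vocab_size, hidden_size, padding_idx = 8, 4, 2
weight = np.random.rand(vocab_size, hidden_size).astype(np.float16)

masked = weight.copy()
masked[padding_idx, :] = 0.0          # same masking as the NPU graphs
token_ids = np.array([[2, 3, 5]])
out = masked[token_ids]               # gather along axis 0

assert np.all(out[token_ids == padding_idx] == 0)
assert np.array_equal(out[token_ids != padding_idx],
                      weight[token_ids[token_ids != padding_idx]])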
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
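When convert_model is True, obtain_embedding_from_model above writes the embedding table to model_weights/model_embedding_input_0.bin as raw float16 values instead of exporting an embedding blob. A hedged sketch of reading that file back; the shape values are placeholders for a llama-3.2-1B-like config and are not taken from this diff:

# Illustrative round-trip of the exported embedding weights; vocab/hidden sizes are placeholders.
import numpy as np

vocab_size, hidden_size = 128256, 2048
emb = np.fromfile("model_weights/model_embedding_input_0.bin", dtype=np.float16)
emb = emb.reshape(vocab_size, hidden_size)   # same layout as embed_tokens.weight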
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers


 def generate(
@@ -196,7 +197,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None):
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +223,9 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_down_proj,
                                group_size,
                                save_directory,
-                               fuse_layers=fuse_layers)
+                               fuse_layers=fuse_layers,
+                               keep_ir=keep_ir,
+                               compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +433,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None):
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -450,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)

     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -470,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -479,14 +490,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill")
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size)
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +554,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode")
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill")
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +594,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode")
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill")
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)

     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)