ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/model.py +0 -1
  31. ipex_llm/transformers/npu_model.py +0 -1
  32. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  33. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  34. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
  35. ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
  36. ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
  37. ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
  38. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
  39. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
  40. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -18,108 +18,8 @@
  import torch
  import numpy as np
  import os
- from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-     obtain_weight_from_single_layer
- from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
- class Llama32Embedding(NNFactory):
-     def __init__(
-         self,
-         vocab_size,
-         embedding_dim,
-         embedding_weight,
-         padding_idx,
-         inv_freq,
-         attention_scaling,
-         dtype,  # fp16
-         device: str = "NPU",
-     ):
-         super().__init__(False, device)
-         self.vocab_size = vocab_size
-         self.embedding_dim = embedding_dim
-         self.padding_idx = padding_idx
-         self.attention_scaling = attention_scaling
-         self.dtype = dtype
-
-         # define input
-         weight = self.constant(embedding_weight)
-         input = self.parameter((1, 1), dtype=np.int32)
-         position_ids = self.parameter((1, 1), dtype=np.int64)
-         inv_freq = self.constant(inv_freq)
-
-         # embed_tokens module
-         if padding_idx == -1:
-             padding_idx += vocab_size
-
-         axis_node = self.constant(np.array([0], dtype=np.int64))
-         if padding_idx is not None:
-             masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-             masked_embeddings[padding_idx, :] = 0.0  # mask
-
-             node_mask = self.constant(masked_embeddings)
-             node_masked_w = self.eltwise_mul(weight, node_mask)
-             res = self.gather(node_masked_w, input, axis_node, 0)
-         else:
-             res = self.gather(weight, input, axis_node, 0)
-
-         # rotary_emb module
-         inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-         position_ids = self.reshape(position_ids, (1, 1, 1))
-         freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                  self.convert_to_fp32(position_ids))
-         freqs = self.transpose(freqs, [0, 2, 1])
-         emb = self.concat(freqs, freqs, axis=2)
-         cos = self.cos(emb)
-         sin = self.sin(emb)
-         cos = cos * self.attention_scaling
-         sin = sin * self.attention_scaling
-
-         # define outputs
-         res = self.convert_to_fp16(res)
-         cos = self.convert_to_fp32(cos)
-         sin = self.convert_to_fp32(sin)
-
-         print("start compiling")
-         self.compile()
-
-
- class Llama32PostEmbedding(NNFactory):
-     def __init__(
-         self,
-         inv_freq,
-         attention_scaling,
-         input_len: int = 1,
-         device: str = "NPU",
-     ):
-         super().__init__(False, device)
-         self.attention_scaling = attention_scaling
-
-         # define input
-         position_ids = self.parameter((1, input_len), dtype=np.int64)
-         inv_freq = self.constant(inv_freq)
-
-         # rotary_emb module
-         inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-         position_ids = self.reshape(position_ids, (1, 1, input_len))
-         freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                  self.convert_to_fp32(position_ids))
-         freqs = self.transpose(freqs, [0, 2, 1])
-         emb = self.concat(freqs, freqs, axis=2)
-         cos = self.cos(emb)
-         sin = self.sin(emb)
-         cos = cos * self.attention_scaling
-         sin = sin * self.attention_scaling
-         if input_len > 1:
-             cos = self.unsqueeze(cos, [1])
-             sin = self.unsqueeze(sin, [1])
-
-         # define outputs
-         cos = self.convert_to_fp32(cos)
-         sin = self.convert_to_fp32(sin)
-
-         print("start compiling")
-         self.compile()
+ from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+     obtain_weight_from_single_layer, obtain_embedding_from_model


  def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
@@ -197,69 +97,17 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
          bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
          weight.tofile(bin_file)

-     if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-         # llama-2-7B & llama-3-8B
-         embedding_layer = model.model.embed_tokens
-         new_embedding = LLMEmbedding(
-             vocab_size=model.config.vocab_size,
-             embedding_dim=model.config.hidden_size,
-             embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-             padding_idx=model.config.pad_token_id,
-             dtype=np.float16,
-         )
-         if convert_model:
-             bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-             embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-             first_blob_path = None
-         else:
-             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                  temp_dir, keep_ir=keep_ir,
-                                                                  compile_blob=compile_blob)
-             os.remove(os.path.join(temp_dir, "embedding.bin"))
-     else:
-         # llama-3.2-3B & llama-3.2-1B
-         embedding_layer = model.model.embed_tokens
-         new_embedding = Llama32Embedding(
-             vocab_size=model.config.vocab_size,
-             embedding_dim=model.config.hidden_size,
-             embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-             padding_idx=model.config.pad_token_id,
-             inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-             attention_scaling=model.model.rotary_emb.attention_scaling,
-             dtype=np.float16,
-         )
-         if convert_model:
-             bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-             embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-             first_blob_path = None
-             # save embedding post module
-             inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-             attention_scaling = model.model.rotary_emb.attention_scaling
-             embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                   attention_scaling=attention_scaling,
-                                                   input_len=1)
-             update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                                temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-             embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                           attention_scaling=attention_scaling,
-                                                           input_len=max_prompt_len)
-             update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                                "embedding_post_prefill",
-                                                temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-             os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-             os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-         else:
-             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                  temp_dir, keep_ir=keep_ir,
-                                                                  compile_blob=compile_blob)
-             os.remove(os.path.join(temp_dir, "embedding.bin"))
+     first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                   temp_dir, weight_dir,
+                                                   max_prompt_len,
+                                                   keep_ir, compile_blob)

      return first_blob_path, last_blob_path


  def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                          temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                         layernorm_const, mode="decode",
+                         const_parameter, mode="decode",
                          keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -297,14 +145,14 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      else:
          input_len = kv_len
          decoder_name = "decoder_layer_prefill"
-         layernorm_const = False
+         const_parameter = False
          keep_position_ids = False
          npu_dpu_groups = 6

      single_decoder = LowBitLlamaMultiDecoderlayer(
          [1, input_len, num_heads * head_dim],
-         input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-         post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
+         input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+         post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
          cached_cos=cached_cos,
          cached_sin=cached_sin,
          num_heads=num_heads,
@@ -334,7 +182,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      if mode == "decode":
          if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
              # llama-2-7B & llama-3-8B
-             if layernorm_const:
+             if const_parameter:
                  st_idx = 5
              else:
                  input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
@@ -344,7 +192,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                  st_idx = 7
          else:
              # llama-3.2-3B & llama-3.2-1B
-             if layernorm_const:
+             if const_parameter:
                  st_idx = 6
              else:
                  input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
@@ -375,7 +223,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

  def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                               layernorm_const, mode="decode",
+                               const_parameter, mode="decode",
                                keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -446,6 +294,10 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
          else:  # FP16 Linear
              np_dtype = np.float16

+         if not const_parameter:
+             input_layer_norm_weights = None
+             post_attn_layernorm_weights = None
+
          fused_decoder = LowBitLlamaMultiDecoderlayer(
              [1, 1, num_heads * head_dim],
              input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/minicpm.py
@@ -301,7 +301,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,

  def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                            temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                           layernorm_const, mode="decode",
+                           const_parameter, mode="decode",
                            keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -333,12 +333,12 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      else:
          input_len = kv_len
          decoder_name = "decoder_layer_prefill"
-         layernorm_const = False
+         const_parameter = False

      single_decoder = LowBitMinicpmMultiDecoderlayer(
          [1, input_len, num_heads * head_dim],
-         input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-         post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
+         input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+         post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
          cached_cos=cached_cos,
          cached_sin=cached_sin,
          num_heads=num_heads,
@@ -364,7 +364,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      os.remove(os.path.join(temp_dir, decoder_name + ".bin"))

      if mode == "decode":
-         if layernorm_const:
+         if const_parameter:
              st_idx = 5
          else:
              input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
@@ -394,7 +394,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

  def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                 layernorm_const, mode="decode",
+                                 const_parameter, mode="decode",
                                  keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -461,6 +461,10 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
          else:  # FP16 Linear
              np_dtype = np.float16

+         if not const_parameter:
+             input_layer_norm_weights = None
+             post_attn_layernorm_weights = None
+
          fused_decoder = LowBitMinicpmMultiDecoderlayer(
              [1, 1, num_heads * head_dim],
              input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -18,13 +18,14 @@
  import torch
  import numpy as np
  import os
- from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-     obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+ from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+     obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+     obtain_embedding_from_model
  from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead


  def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                   convert_model=False, group_size=0,
+                                   convert_model=False, group_size=0, max_prompt_len=1,
                                    keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      head_dim = model.model.layers[0].self_attn.head_dim
@@ -107,30 +108,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
          bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
          weight.tofile(bin_file)

-     embedding_layer = model.model.embed_tokens
-     new_embedding = LLMEmbedding(
-         vocab_size=model.config.vocab_size,
-         embedding_dim=model.config.hidden_size,
-         embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-         padding_idx=model.config.pad_token_id,
-         dtype=np.float16,
-         input_length=1,
-     )
-     if convert_model:
-         bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-         embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-         first_blob_path = True
-     else:
-         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                              temp_dir, keep_ir=keep_ir,
-                                                              compile_blob=compile_blob)
-         os.remove(os.path.join(temp_dir, "embedding.bin"))
+     first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                   temp_dir, weight_dir,
+                                                   max_prompt_len,
+                                                   keep_ir, compile_blob)
      return first_blob_path, last_blob_path


  def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode",
+                        const_parameter, mode="decode",
                         keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      mlp_layer = curr_layer.mlp
      weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
      q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+     if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+     else:
+         # transformers >= 4.45.0
+         cached_cos = None
+         cached_sin = None
      layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
      layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)

@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
      if mode == "decode":
          input_len = 1
          decoder_name = f"decoder_layer_{layer_idx}"
+         keep_position_ids = True
          npu_dpu_groups = None
      else:
          input_len = kv_len
          decoder_name = "decoder_layer_prefill"
+         keep_position_ids = False
          npu_dpu_groups = 6

      single_decoder = LowBitQwenMultiDecoderlayer(
@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
          n_splits_linear=n_splits_linear,
          n_splits_down_proj=n_splits_down_proj,
          group_size=group_size,
+         cos_len=input_len,
+         keep_position_ids=keep_position_ids,
          asym=asym
      )
      rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

      # 0, 1, 2 are input_embed/attention_mask/position_id
      if mode == "decode":
-         if layernorm_const:
-             st_idx = 3
+         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+             if const_parameter:
+                 st_idx = 3
+             else:
+                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                 post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                 layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                 layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                 st_idx = 5
          else:
-             input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-             post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-             layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-             layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-             st_idx = 5
+             # transformers >= 4.45.0
+             if const_parameter:
+                 st_idx = 4
+             else:
+                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                 post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                 layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                 layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                 st_idx = 6
          q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
          k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
          v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -234,7 +241,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

  def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode",
+                              const_parameter, mode="decode",
                               keep_ir=False, compile_blob=True):
      num_heads = model.model.layers[0].self_attn.num_heads
      num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
          attn_layer = curr_layer.self_attn
          mlp_layer = curr_layer.mlp
          weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+             cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+             cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+         else:
+             # transformers >= 4.45.0
+             cached_cos = None
+             cached_sin = None
          layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
          layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)

@@ -313,6 +325,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
          else:  # FP16 Linear
              np_dtype = np.float16

+         if not const_parameter:
+             input_layer_norm_weights = None
+             post_attn_layernorm_weights = None
+             q_biases = None
+             k_biases = None
+             v_biases = None
+
          fused_decoder = LowBitQwenMultiDecoderlayer(
              [1, 1, num_heads * head_dim],
              input_layernorm_weights=input_layer_norm_weights,
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250206
+ Version: 2.2.0b20250208
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250208 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
- Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp-arl'
- Requires-Dist: setuptools ; extra == 'cpp-arl'
- Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
  Provides-Extra: llama-index
  Requires-Dist: py-cpuinfo ; extra == 'llama-index'
  Requires-Dist: protobuf ; extra == 'llama-index'
@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250208 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250208 ; extra == 'xpu-2-6'
  Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
  Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
  Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'