ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +0 -1
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
- ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
- ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py

@@ -18,108 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob,
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,

@@ -197,69 +97,17 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
 
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                        const_parameter, mode="decode",
                         keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -297,14 +145,14 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-
+        const_parameter = False
         keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if
-        post_attn_layernorm_weights=[layer_norm_1] if
+        input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+        post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
         cached_cos=cached_cos,
         cached_sin=cached_sin,
         num_heads=num_heads,

@@ -334,7 +182,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
             # llama-2-7B & llama-3-8B
-            if
+            if const_parameter:
                 st_idx = 5
             else:
                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")

@@ -344,7 +192,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                 st_idx = 7
         else:
             # llama-3.2-3B & llama-3.2-1B
-            if
+            if const_parameter:
                 st_idx = 6
             else:
                 input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")

@@ -375,7 +223,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                              const_parameter, mode="decode",
                               keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -446,6 +294,10 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+
         fused_decoder = LowBitLlamaMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/minicpm.py

@@ -301,7 +301,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                          const_parameter, mode="decode",
                           keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -333,12 +333,12 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-
+        const_parameter = False
 
     single_decoder = LowBitMinicpmMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if
-        post_attn_layernorm_weights=[layer_norm_1] if
+        input_layernorm_weights=[layer_norm_0] if const_parameter else None,
+        post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
        cached_cos=cached_cos,
        cached_sin=cached_sin,
        num_heads=num_heads,

@@ -364,7 +364,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
-        if
+        if const_parameter:
             st_idx = 5
         else:
             input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")

@@ -394,7 +394,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                const_parameter, mode="decode",
                                 keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -461,6 +461,10 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+
         fused_decoder = LowBitMinicpmMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
ipex_llm/transformers/npu_pipeline_model/qwen.py

@@ -18,13 +18,14 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob,
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0,
+                                  convert_model=False, group_size=0, max_prompt_len=1,
                                   keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim

@@ -107,30 +108,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, keep_ir=keep_ir,
-                                                             compile_blob=compile_blob)
-        os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                       const_parameter, mode="decode",
                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-
-
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(

@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,

@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if const_parameter:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-
-
-
-
-
+            # transformers >= 4.45.0
+            if const_parameter:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")

@@ -234,7 +241,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                             const_parameter, mode="decode",
                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads

@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
         weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+            cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+        else:
+            # transformers >= 4.45.0
+            cached_cos = None
+            cached_sin = None
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
         layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

@@ -313,6 +325,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
             else:  # FP16 Linear
                 np_dtype = np.float16
 
+        if not const_parameter:
+            input_layer_norm_weights = None
+            post_attn_layernorm_weights = None
+            q_biases = None
+            k_biases = None
+            v_biases = None
+
         fused_decoder = LowBitQwenMultiDecoderlayer(
             [1, 1, num_heads * head_dim],
             input_layernorm_weights=input_layer_norm_weights,
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.
+Version: 2.2.0b20250208
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors

@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250208 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Provides-Extra: llama-index
 Requires-Dist: py-cpuinfo ; extra == 'llama-index'
 Requires-Dist: protobuf ; extra == 'llama-index'

@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250208 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'

@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'

@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250208 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'

@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250208 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'

@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'

@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'

@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250208 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'