ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/low_bit_linear.py +5 -4
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +17 -5
- ipex_llm/transformers/npu_models/convert.py +6 -2
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
- ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
- ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
- ipex_llm/transformers/qlora.py +2 -2
- ipex_llm/transformers/utils.py +19 -6
- ipex_llm/transformers/xpu_customize_fwd.py +6 -4
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py
CHANGED
@@ -18,112 +18,13 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +76,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -195,62 +97,18 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, True, False)
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
+
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +175,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +223,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +317,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
             update_names_of_IR_and_export_blob(fused_decoder,
                                                f"decoder_layer_{i}",
                                                save_dir,
-
-
+                                               keep_ir=keep_ir,
+                                               compile_blob=compile_blob)
+            os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
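Note: the new keep_ir / compile_blob keyword arguments added throughout this file default to keep_ir=False, compile_blob=True. The sketch below only illustrates how a caller might thread them through, based purely on the signatures visible in this diff; the loop driver and placeholder values are ours, and the exact meaning of the flags (keep the exported IR on disk, compile it into an NPU blob) is inferred from their names rather than confirmed here.

# Hypothetical usage sketch -- argument values are placeholders, not tested defaults.
from ipex_llm.transformers.npu_pipeline_model.llama import (
    convert_lm_head_and_embedding,
    convert_llama_layer,
)

def export_llama_blobs(model, temp_dir, weight_dir, n_splits_linear=1,
                       n_splits_down_proj=1, kv_len=1024, group_size=0):
    # Export lm_head + embedding with the same flag pair used for every blob.
    first_blob, last_blob = convert_lm_head_and_embedding(
        model, n_splits_linear, temp_dir, weight_dir,
        convert_model=True, max_prompt_len=512,
        keep_ir=False, compile_blob=True)

    # Export each decoder layer, forwarding keep_ir / compile_blob unchanged.
    for layer_idx in range(model.config.num_hidden_layers):
        convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                            temp_dir, weight_dir, transpose_value_cache=True,
                            kv_len=kv_len, group_size=group_size,
                            layernorm_const=True, mode="decode",
                            keep_ir=False, compile_blob=True)
    return first_blob, last_blob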
ipex_llm/transformers/npu_pipeline_model/minicpm.py
CHANGED
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                               dtype=np.float16,
                                               scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir,
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill,
                                            "embedding_post_prefill",
-                                           temp_dir,
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir,
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
             update_names_of_IR_and_export_blob(fused_decoder,
                                                f"decoder_layer_{i}",
                                                save_dir,
-                                               compile_blob=
-
+                                               keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
ipex_llm/transformers/npu_pipeline_model/qwen.py
CHANGED
@@ -18,13 +18,15 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +86,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -104,28 +108,17 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-
-
-
-
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -139,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-
-
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -152,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -179,23 +179,38 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-
-
-
-
-
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -226,7 +241,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -252,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
         weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-
-
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+            cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+        else:
+            # transformers >= 4.45.0
+            cached_cos = None
+            cached_sin = None
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
         layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -330,6 +351,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
             update_names_of_IR_and_export_blob(fused_decoder,
                                                f"decoder_layer_{i}",
                                                save_dir,
-                                               compile_blob=
-
+                                               keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
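The cos_cached probe added in several hunks above guards against newer transformers releases that no longer cache the rotary cos/sin tables. Written as a standalone helper purely for illustration (the function name is ours, not part of the package), the pattern is:

import torch

def fetch_cached_rotary_tables(rotary_emb):
    # Mirrors the checks added in this diff: transformers < 4.45.0 caches
    # cos/sin tables on the rotary embedding module; newer releases do not.
    if hasattr(rotary_emb, "cos_cached"):
        return (rotary_emb.cos_cached.to(torch.float16),
                rotary_emb.sin_cached.to(torch.float16))
    return None, None  # transformers >= 4.45.0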
ipex_llm/transformers/qlora.py
CHANGED
@@ -109,7 +109,7 @@ class LoraLowBitLinear(Module, LoraLayer):
         self.qa_pool = torch.nn.Identity()
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
@@ -177,7 +177,7 @@ class LoraBF16Linear(Module, LoraLayer):
         self.is_target_conv_1d_layer = is_target_conv_1d_layer
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
ipex_llm/transformers/utils.py
CHANGED
@@ -138,26 +138,39 @@ def fix_key(key):
     return key
 
 
-def
+def is_autocast_enabled(device_type: str):
     if torch.__version__ >= '2.3':
-
-
+        return torch.is_autocast_enabled(device_type)
+    else:
+        if device_type == "xpu":
+            return torch.xpu.is_autocast_xpu_enabled()
+        elif device_type == "cpu":
+            return torch.is_autocast_cpu_enabled()
+        else:
+            invalidInputError(False,
+                              f"Device type {device_type} is not supported.")
+
+
+def get_autocast_dtype(device_type: str):
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(device_type):
+            return torch.get_autocast_dtype(device_type)
         else:
             return None
     else:
-        if
+        if device_type == "xpu":
             if torch.xpu.is_autocast_xpu_enabled():
                 return torch.xpu.get_autocast_xpu_dtype()
             else:
                 return None
-        elif
+        elif device_type == "cpu":
             if torch.is_autocast_cpu_enabled():
                 return torch.get_autocast_cpu_dtype()
             else:
                 return None
         else:
             invalidInputError(False,
-                              f"Device {
+                              f"Device type {device_type} is not supported.")
 
 
 def get_xpu_device_name(device: torch.device):
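As the qlora.py hunks above show, call sites now pass a device-type string rather than a tensor. A minimal usage sketch of the reworked helpers, assuming this wheel is installed and a CPU-capable PyTorch build (the tensor and dtype below are arbitrary examples):

import torch
from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype

x = torch.randn(4, 4)  # a CPU tensor; an XPU tensor would report "xpu" instead

# Outside any autocast region the helpers report disabled / no dtype.
print(is_autocast_enabled(x.device.type))   # False
print(get_autocast_dtype(x.device.type))    # None

# Inside an autocast region they report the active dtype for that device type.
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    print(is_autocast_enabled("cpu"))       # True
    print(get_autocast_dtype("cpu"))        # torch.bfloat16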
ipex_llm/transformers/xpu_customize_fwd.py
CHANGED
@@ -107,6 +107,8 @@ except ModuleNotFoundError:
     np = None  # type: ignore[assignment]
 from typing import Any
 
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+
 
 def _cast(value, dtype):
     if isinstance(value, torch.Tensor):
@@ -155,12 +157,12 @@ def custom_fwd(fwd=None, *, cast_inputs=None):
 
     @functools.wraps(fwd)
     def decorate_fwd(*args, **kwargs):
-        args[0]._dtype =
+        args[0]._dtype = get_autocast_dtype("xpu")
         if cast_inputs is None:
-            args[0]._fwd_used_autocast =
+            args[0]._fwd_used_autocast = is_autocast_enabled("xpu")
             return fwd(*args, **kwargs)
         else:
-            autocast_context =
+            autocast_context = is_autocast_enabled("xpu")
             args[0]._fwd_used_autocast = False
             if autocast_context:
                 with torch.xpu.autocast(enabled=False):
@@ -184,7 +186,7 @@ def custom_bwd(bwd):
 
     @functools.wraps(bwd)
     def decorate_bwd(*args, **kwargs):
-        with torch.
+        with torch.autocast("xpu", enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
             return bwd(*args, **kwargs)
 
     return decorate_bwd
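These decorators play the same role as torch.cuda.amp.custom_fwd / custom_bwd, but for XPU autocast. A hedged sketch of how a custom torch.autograd.Function might apply them, assuming an XPU-enabled PyTorch build (the matmul body and tensor shapes are illustrative only):

import torch
from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd


class MatMulFn(torch.autograd.Function):
    # Illustrative Function; only the decorator usage matters here.

    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)
    def forward(ctx, x, weight):
        ctx.save_for_backward(x, weight)
        return x @ weight.t()

    @staticmethod
    @custom_bwd  # re-enters autocast("xpu") with the dtype recorded in forward
    def backward(ctx, grad_out):
        x, weight = ctx.saved_tensors
        return grad_out @ weight, grad_out.t() @ x


if torch.xpu.is_available():  # only meaningful on an XPU build of PyTorch
    x = torch.randn(8, 16, device="xpu", requires_grad=True)
    w = torch.randn(32, 16, device="xpu", requires_grad=True)
    with torch.autocast("xpu", dtype=torch.float16):
        y = MatMulFn.apply(x, w)
    y.sum().backward()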