ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250208__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +0 -1
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +30 -23
- ipex_llm/transformers/npu_pipeline_model/llama.py +17 -165
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +10 -6
- ipex_llm/transformers/npu_pipeline_model/qwen.py +53 -34
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/RECORD +45 -45
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250208.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250208.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file
ipex_llm/libs/bloom.dll
CHANGED
Binary file
ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file
ipex_llm/libs/gptneox.dll
CHANGED
Binary file
ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file
ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file
ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file
ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file
ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file
ipex_llm/libs/llama-api.dll
CHANGED
Binary file
ipex_llm/libs/llama.dll
CHANGED
Binary file
ipex_llm/libs/main-bloom.exe
CHANGED
Binary file
ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file
ipex_llm/libs/main-llama.exe
CHANGED
Binary file
ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file
ipex_llm/libs/pipeline.dll
CHANGED
Binary file
ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file
ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file
ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file
ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file
ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file
ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file
ipex_llm/libs/starcoder.dll
CHANGED
Binary file
ipex_llm/transformers/model.py
CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]

-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode

ipex_llm/transformers/npu_model.py
CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
ipex_llm/transformers/npu_models/qwen2_mp.py
CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers

-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

-
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)

         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input

         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],
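With this change, the Qwen2 decoder graph can take rotary cos/sin as runtime inputs whenever no cached table is baked into the graph. A minimal sketch, assuming the branch structure shown above, of which runtime inputs the rebuilt graph would then expect; the helper below is illustrative only and not part of the package:

# Illustrative sketch only: mirrors the cached_cos / mode / keep_position_ids branches
# added in LowBitQwenMultiDecoderlayer; the function and list below are ours, not package API.
def decoder_runtime_inputs(cached_cos, mode, keep_position_ids=True):
    inputs = ["hidden_states", "attention_mask"]  # always created as graph inputs
    if cached_cos is None:
        # no precomputed rotary table: cos/sin are fed per call as float32 tensors of
        # shape (batch_size, cos_len, head_dim) and converted to fp16 inside the graph
        if mode == "prefill" and keep_position_ids:
            inputs.append("position_ids")
        inputs += ["cos", "sin"]
    else:
        # rotary table folded in as constants; only position_ids is fed at runtime
        inputs.append("position_ids")
    return inputs

# e.g. decoder_runtime_inputs(None, "decode") -> ["hidden_states", "attention_mask", "cos", "sin"]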
ipex_llm/transformers/npu_pipeline_model/common.py
CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()


+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
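The new Llama32Embedding / Llama32PostEmbedding factories build the transformers >= 4.45-style rotary embedding (inv_freq multiplied by position_ids, duplicated along the last axis and scaled by attention_scaling) directly into the exported NPU graph. For reference, the same math in plain NumPy, assuming inv_freq has shape (head_dim // 2,); this sketch is illustrative and not part of the package:

import numpy as np

def rotary_cos_sin(position_ids, inv_freq, attention_scaling):
    # position_ids: int array of shape (1, input_len); inv_freq: (head_dim // 2,)
    freqs = inv_freq.reshape(1, -1, 1).astype(np.float32) \
        * position_ids.reshape(1, 1, -1).astype(np.float32)
    freqs = np.transpose(freqs, (0, 2, 1))        # (1, input_len, head_dim // 2)
    emb = np.concatenate([freqs, freqs], axis=2)  # (1, input_len, head_dim)
    cos = np.cos(emb) * attention_scaling
    sin = np.sin(emb) * attention_scaling
    return cos.astype(np.float32), sin.astype(np.float32)

# e.g. rotary_cos_sin(np.arange(8).reshape(1, 8), np.ones(32), 1.0)[0].shape == (1, 8, 64)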
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers


 def generate(
@@ -200,7 +201,7 @@ def convert_llm(model: torch.nn.Module,
                 keep_ir: bool=False,
                 compile_blob: bool=True):
     # whether to set layernorm weight as const
-
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype in ["sym_int8_rtn", "asym_int4_rtn"]:
@@ -239,7 +240,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_llama_layer, param_list)

@@ -266,7 +267,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline(model_type, kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"),
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -283,7 +284,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_baichuan_layer, param_list)

@@ -307,7 +308,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("baichuan", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"),
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -324,7 +325,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_minicpm_layer, param_list)

@@ -347,12 +348,12 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("minicpm", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"),
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-
+        const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
@@ -370,7 +371,7 @@ def convert_llm(model: torch.nn.Module,
             for layer_idx in range(0, layer_num):
                 param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
                                    temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-
+                                   const_parameter))
             with Pool() as pool:
                 result = pool.starmap(convert_qwen_layer, param_list)

@@ -395,7 +396,7 @@ def convert_llm(model: torch.nn.Module,
                            "head_dim": model.head_dim,
                            "transpose_value_cache": transpose_value_cache,
                            "max_prompt_len": max_prompt_len,
-                           "
+                           "const_parameter": const_parameter,
                            "group_size": group_size}
             model.config.update(update_dict)
             model.config.save_pretrained(save_directory)
@@ -404,7 +405,7 @@ def convert_llm(model: torch.nn.Module,
             res = InitLLMPipeline("qwen", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
                                   first_blob_path, last_blob_path,
-                                  os.path.join(temp_dir, "decoder_layer"),
+                                  os.path.join(temp_dir, "decoder_layer"), const_parameter)
         except:
             invalidInputError(False,
                               "False to InitLLMPipeline.")
@@ -440,7 +441,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
     weight_dir = os.path.join(save_directory, "model_weights")
     if not os.path.exists(weight_dir):
         os.mkdir(weight_dir)
-
+    const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
+    if keep_ir:
+        const_parameter = False

     lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn")
     if hasattr(model, "lm_head") and not isinstance(model.lm_head, SlicedLMHead):
@@ -456,6 +459,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)

     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -469,13 +474,15 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": True,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -485,16 +492,16 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size,
+                                 group_size, const_parameter, "decode",
                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size,
+                           group_size, const_parameter, "prefill",
                            keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
@@ -530,7 +537,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -554,12 +561,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size,
+                                  group_size, const_parameter, "decode",
                                   keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size,
+                            group_size, const_parameter, "prefill",
                             keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
@@ -571,7 +578,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
                        "transpose_value_cache": transpose_value_cache,
                        "max_prompt_len": max_prompt_len,
-                       "
+                       "const_parameter": const_parameter,
                        "group_size": group_size,
                        "fused_layers": fused_layers,
                        "qkv_bias": False,
@@ -589,12 +596,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size,
+                                    group_size, const_parameter, "decode",
                                     keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size,
+                              group_size, const_parameter, "prefill",
                               keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
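Across the conversion paths above, whether layernorm and related weights are folded into the compiled blobs as constants is now controlled by the IPEX_LLM_NPU_CONST_PARAMETER environment variable (default "1" in convert_llm and convert_llm_for_deploy, "0" for the qwen2 convert_llm branch, and forced off when keep_ir=True). A minimal usage sketch, assuming the usual ipex-llm NPU conversion entry point is invoked afterwards (not shown here):

import os

# Set before running the NPU model conversion; "0" keeps layernorm weights as
# runtime parameters instead of baking them into the exported blobs as constants.
os.environ["IPEX_LLM_NPU_CONST_PARAMETER"] = "0"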
|