ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/low_bit_linear.py +5 -4
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +17 -5
- ipex_llm/transformers/npu_models/convert.py +6 -2
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
- ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
- ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
- ipex_llm/transformers/qlora.py +2 -2
- ipex_llm/transformers/utils.py +19 -6
- ipex_llm/transformers/xpu_customize_fwd.py +6 -4
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file

ipex_llm/libs/bloom.dll
CHANGED
Binary file

ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file

ipex_llm/libs/gptneox.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file

ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file

ipex_llm/libs/llama-api.dll
CHANGED
Binary file

ipex_llm/libs/llama.dll
CHANGED
Binary file

ipex_llm/libs/main-bloom.exe
CHANGED
Binary file

ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/main-llama.exe
CHANGED
Binary file

ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/pipeline.dll
CHANGED
Binary file

ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file

ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file

ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file

ipex_llm/libs/starcoder.dll
CHANGED
Binary file

ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -51,7 +51,8 @@ from torch import Tensor, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

 T = TypeVar("T", bound="torch.nn.Module")
@@ -527,8 +528,8 @@ class MatMulLowBit(torch.autograd.Function):
         A, weight = ctx.tensors
         grad_A, grad_weight = None, None
         if req_gradA:
-            if
-                grad_output = grad_output.to(
+            if is_autocast_enabled("xpu"):
+                grad_output = grad_output.to(get_autocast_dtype("xpu"))
             if weight.qtype == NF4:
                 dequant_weight = xe_linear.dequant(A,
                                                    weight.data.view(torch.uint8),
@@ -615,7 +616,7 @@ class LowBitLinear(nn.Linear):
         is_training = self.training and not torch.is_inference_mode_enabled()
         if is_training:
             # below logic is only for training
-            autocast_dtype = get_autocast_dtype(x)
+            autocast_dtype = get_autocast_dtype(x.device.type)
             if self.compute_dtype is not None and x.device.type == "xpu":
                 x = x.to(self.compute_dtype)  # solve GC issue for unlora module
             elif autocast_dtype is not None:

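The import change above moves low_bit_linear.py to device-aware autocast helpers: is_autocast_enabled and get_autocast_dtype now take a device-type string such as "xpu". Their actual bodies live in ipex_llm/transformers/utils.py, which this release also changes but whose diff is not shown here; the following is only a hedged sketch of the assumed calling convention built on PyTorch's autocast API.

# Hypothetical sketch of device-aware autocast helpers like the ones imported above.
# The real implementations are in ipex_llm/transformers/utils.py (changed in this
# release but not shown in this diff); this only illustrates the calling convention
# the new code in low_bit_linear.py relies on: pass a device-type string ("xpu", "cpu").
import torch

def is_autocast_enabled(device_type: str) -> bool:
    # torch >= 2.4 exposes a device-aware query; older builds fall back to the
    # device-specific flags (the xpu variants require intel_extension_for_pytorch).
    if hasattr(torch, "is_autocast_enabled"):
        try:
            return torch.is_autocast_enabled(device_type)
        except TypeError:
            pass  # older torch: is_autocast_enabled() takes no device argument
    if device_type == "cpu":
        return torch.is_autocast_cpu_enabled()
    return torch.xpu.is_autocast_xpu_enabled()

def get_autocast_dtype(device_type: str):
    # Returns the autocast dtype for the device, or None when autocast is off,
    # matching how the diff uses it ("elif autocast_dtype is not None:").
    if not is_autocast_enabled(device_type):
        return None
    if hasattr(torch, "get_autocast_dtype"):
        return torch.get_autocast_dtype(device_type)
    if device_type == "cpu":
        return torch.get_autocast_cpu_dtype()
    return torch.xpu.get_autocast_xpu_dtype()
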
ipex_llm/transformers/model.py
CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
         optimize_model = False
         kwargs["modules_to_not_convert"] = ["lm_head"]

-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode

ipex_llm/transformers/npu_model.py
CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
@@ -139,8 +138,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop(
-        imatrix_file = kwargs.pop(
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)

         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +237,8 @@ class _BaseAutoModelClass:
                 "fuse_layers": fuse_layers,
                 "imatrix_data": imatrix_data,
                 "skip_npu_logic": mock_device == "dummy",
+                "keep_ir": keep_ir,
+                "compile_blob": compile_blob,
             }
             # Dummy will skip npu related logic and save the quantized model
             if mock_device == "dummy":
@@ -280,9 +283,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
+
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save blob or save IR either.")

         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +331,9 @@ class _BaseAutoModelClass:
                         qtype=qtype,
                         save_directory=save_directory,
                         fuse_layers=fuse_layers,
-                        has_llm=hasattr(model, "llm")
+                        has_llm=hasattr(model, "llm"),
+                        keep_ir=keep_ir,
+                        compile_blob=compile_blob
                     )
                 else:
                     optimize_llm(
@@ -346,7 +356,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 convert_model=convert_model,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")

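The hunks above thread two new keyword arguments, keep_ir and compile_blob, through the NPU from_pretrained and save_low_bit paths, and reject the combination where both are disabled. A hedged usage sketch follows; the checkpoint id, output folder, and the surrounding argument combination are placeholders based on the usual ipex-llm NPU flow rather than on this diff.

# Hypothetical usage of the new keep_ir / compile_blob options added above.
# The checkpoint id and save_directory are placeholders; the other arguments
# follow common ipex-llm NPU examples and are not taken from this diff.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",   # placeholder checkpoint
    load_in_low_bit="sym_int4",
    optimize_model=True,
    convert_model=True,                   # export artifacts for NPU deployment
    save_directory="./llama32-npu",       # placeholder output folder
    keep_ir=True,                         # new: keep the intermediate IR files
    compile_blob=True,                    # new: compile the blobs (default True)
)
# Passing keep_ir=False together with compile_blob=False is rejected by
# invalidInputError(keep_ir or compile_blob, ...), since nothing would be saved.
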
ipex_llm/transformers/npu_models/convert.py
CHANGED
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
     qtype: str,
     save_directory: str,
     fuse_layers: int=None,
-    has_llm: bool=False
+    has_llm: bool=False,
+    keep_ir: bool=False,
+    compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len

ipex_llm/transformers/npu_models/qwen2_mp.py
CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers

-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

-
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)

         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input

         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
            hidden_states, new_key_states, new_value_states = self.build_decoder(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                input_layernorm_weight=input_layernorm_weights[i],
                post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                q_bias=q_biases[i],

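In the graph factory above, position_ids is only forwarded into build_decoder when it was actually declared as an input. The small paraphrase below restates the new cos_condition check; it is illustrative only, not library code.

# Paraphrase of cos_condition (illustrative, not part of qwen2_mp.py):
# position_ids is wired into the decoder when cached cos/sin tables exist
# (legacy rope path), or for a prefill graph that keeps position_ids
# alongside the new runtime cos/sin inputs.
def forwards_position_ids(cached_cos, mode, keep_position_ids=True):
    return cached_cos is not None or (mode == "prefill" and keep_position_ids)

assert forwards_position_ids(cached_cos=object(), mode="decode") is True
assert forwards_position_ids(cached_cos=None, mode="prefill") is True
assert forwards_position_ids(cached_cos=None, mode="decode") is False
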
ipex_llm/transformers/npu_pipeline_model/common.py
CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()


+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path

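The new Llama32Embedding and Llama32PostEmbedding graphs compute rotary cos/sin on the NPU from inv_freq and position_ids and scale them by attention_scaling (the transformers >= 4.45 rope path used by Llama 3.2). Below is a NumPy sketch of the same math to make the shapes concrete; head_dim, the rope base, and input_len are illustrative values, not taken from the diff.

# NumPy sketch mirroring the rotary math encoded by Llama32PostEmbedding above.
# The ops (reshape, multiply, transpose, concat, cos/sin, scale) follow the diff;
# head_dim, the rope base, input_len and attention_scaling are illustrative only.
import numpy as np

head_dim, input_len, attention_scaling = 64, 8, 1.0
inv_freq = 1.0 / (500000.0 ** (np.arange(0, head_dim, 2) / head_dim))   # (head_dim/2,)
position_ids = np.arange(input_len, dtype=np.int64).reshape(1, input_len)

freqs = inv_freq.reshape(1, -1, 1) * position_ids.reshape(1, 1, input_len)  # (1, hd/2, len)
freqs = np.transpose(freqs, (0, 2, 1))                                      # (1, len, hd/2)
emb = np.concatenate([freqs, freqs], axis=2)                                # (1, len, hd)
cos = np.cos(emb) * attention_scaling
sin = np.sin(emb) * attention_scaling
print(cos.shape, sin.shape)  # (1, 8, 64): one cos/sin row of head_dim per position
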
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers


 def generate(
@@ -196,7 +197,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +223,9 @@ def convert_llm(model: torch.nn.Module,
                               n_splits_down_proj,
                               group_size,
                               save_directory,
-                              fuse_layers=fuse_layers
+                              fuse_layers=fuse_layers,
+                              keep_ir=keep_ir,
+                              compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +433,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -450,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)

     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -470,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -479,14 +490,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode"
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill"
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +554,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode"
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill"
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +594,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode"
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill"
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)

     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)