ipex-llm 2.2.0b20250105__py3-none-win_amd64.whl → 2.2.0b20250105.post0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +17 -132
- ipex_llm/transformers/lookup.py +2 -2
- ipex_llm/transformers/low_bit_linear.py +8 -8
- ipex_llm/transformers/models/chatglm2.py +1 -192
- ipex_llm/transformers/models/minicpmv.py +2 -2
- ipex_llm/transformers/models/sd.py +2 -2
- ipex_llm/transformers/models/utils.py +14 -89
- ipex_llm/transformers/npu_model.py +80 -50
- ipex_llm/transformers/npu_models/convert_mp.py +1 -1
- ipex_llm/transformers/npu_models/linear.py +15 -3
- ipex_llm/transformers/npu_models/lm_head.py +1 -90
- ipex_llm/transformers/npu_models/lm_head_linear.py +106 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
- ipex_llm/transformers/utils.py +5 -20
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/METADATA +40 -19
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/RECORD +51 -53
- ipex_llm/transformers/models/cohere.py +0 -589
- ipex_llm/transformers/models/falcon.py +0 -829
- ipex_llm/transformers/models/mixtral.py +0 -576
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/utils.py

@@ -19,7 +19,7 @@ import torch
 import warnings
 from ipex_llm.utils.common import invalidInputError
 from ipex_llm.ggml.quantize import ggml_tensor_qtype
-from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_type
+from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
 from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
     FP6, ASYM_INT4

@@ -85,16 +85,14 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
+    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+        return False
     else:
-
-
-
-
-
-def kv_cache_device_check(x: torch.Tensor, kv_group: int) -> bool:
-    return (get_xpu_device_type(x) in ["mtl", "lnl"] and kv_group <= 1) or \
-        ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and
-            1 < x.size(0) and x.size(0) <= 8)
+        device_name = get_xpu_device_name(x.device)
+        return (
+            device_name in ["mtl", "lnl", "arl"] and kv_group == 1
+            or device_name in ["arc", "bmg"] and x.size(0) > 1
+        )


 def init_fp8_kv_cache(batch_size, num_heads, current_length, head_dim, device):
@@ -226,57 +224,6 @@ def is_enough_kv_cache_room_4_31(past_key_value, seq_len=1):
         (past_key_value[0].size(2) + seq_len) * past_key_value[0].size(3)


-def use_flash_attention(query, key, attention_mask=None):
-    # here we support query's shape is always [batch_size, head_num, q_len, head_dim],
-    # key's shape is always [batch_size, head_num, k_len, head_dim]
-    invalidInputError(query.dim() == 4,
-                      "Here query input of use_flash_attention should be [batch_size, "
-                      "head_num, q_len, head_dim]")
-    invalidInputError(key.dim() == 4,
-                      "Here key input of use_flash_attention should be [batch_size, "
-                      "head_num, k_len, head_dim]")
-    bsz, _, q_len, _ = query.size()
-    k_len = key.size()[2]
-    # check whether ipex flash attention can be used
-    if q_len != k_len:
-        # now only use flash attention for first token
-        # as it seems have no performance benifit for rest token now
-        return False
-    if query.device.type != "xpu":
-        # ipex flash attention only support for xpu
-        return False
-    ipex_version = get_ipex_version()
-    if ipex_version <= "2.0.110+xpu":
-        # ipex flash attention is supported from ipex 2.1
-        return False
-    if not torch.xpu.has_xetla():
-        # ipex flash attention is only supported for xetla
-        # may update this later
-        return False
-    elif get_xpu_device_type(query) != "pvc":
-        return False
-    if query.dtype not in [torch.float32, torch.float16]:
-        # only use flash attention for fp32/fp16 input
-        return False
-    if bsz > 1:
-        # as flash attention doesn't support attn_mask in ipex 2.1,
-        # so it will cause output error for padded batch input
-        if attention_mask is None:
-            return True
-        else:
-            # TODO: below logic may change for different model
-            # attention mask shape : [bsz, 1, q_len, k_len]
-            if attention_mask[0].squeeze()[0, 0].item() != 0:
-                # first batch contains padding
-                # otherwise we suppose it should be a upper triangular matrix
-                # at the same time, the diagonal is also 0
-                return False
-            elif not attention_mask.equal(attention_mask[0].repeat(bsz, 1, 1, 1)):
-                # check whether mask of every batch is the same
-                return False
-    return True
-
-
 def use_sdp(q_len, kv_len, head_dim, query_states):
     return (
         query_states.device.type == "xpu"
@@ -315,38 +262,16 @@ def mlp_fusion_check(x, qtype, training):
     if training or x.requires_grad:
         return False
     if qtype == FP6:
-        device = get_xpu_device_type(x)
-        if device in ["mtl", "lnl"]:
+        device = get_xpu_device_name(x.device)
+        if device in ["mtl", "lnl", "arl"]:
             return False
     return True


-def use_decoding_fast_path(proj,
-                           use_fuse_rope,
-                           enough_kv_room,
-                           bs,
-                           qtype_check=decoding_fast_path_qtype_check):
-    if proj is None:
-        return False
-    device = get_xpu_device_type(proj.weight)
-    if not qtype_check(proj):
-        return False
-    if not use_fuse_rope:
-        return False
-    if not enough_kv_room:
-        return False
-    if bs != 1:
-        return False
-
-    if device in ["uhd"]:
-        return False
-    return True
-
-
 def use_xmx(x: torch.Tensor, qtype: int):
-    device = get_xpu_device_type(x)
+    device = get_xpu_device_name(x.device)
     return (
-        device in ["arc", "
+        device in ["arc", "pvc"]
         and qtype in [SYM_INT4, SYM_INT8, FP8E4, FP8E5]
         and (
             (device == "pvc" and 1 < x.size(0) <= 16)
@@ -370,7 +295,7 @@ def fp16_fusion_check(proj, x, training):
         return False
     if x.requires_grad:
         return False
-    device_type = get_xpu_device_type(x)
+    device_type = get_xpu_device_name(x.device)
     if device_type != "pvc":
         return False
     return True
@@ -439,7 +364,7 @@ def should_use_compresskv(x: torch.Tensor, prompt_len: int):
     else:
         if use_compress_kv is None:
             return (
-                get_xpu_device_type(x) in ["mtl", "lnl"]
+                get_xpu_device_name(x.device) in ["mtl", "lnl", "arl"]
                 and prompt_len >= 1800
                 and prompt_len <= 4500
             )

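The net effect of the models/utils.py hunks is that KV-cache quantization is now gated on the new get_xpu_device_name() helper (which takes a torch.device rather than a tensor) and is disabled outright for fp16/bf16 linears. Below is a minimal, self-contained sketch of the new decision logic, written against plain strings and os.environ instead of the real ipex_llm helpers; the function name and parameters are illustrative only, not part of the package.

    import os

    def should_quantize_kv_cache(qtype: str, device_name: str, batch_size: int, kv_group: int) -> bool:
        # explicit environment overrides still win, as in the original function
        if os.environ.get("IPEX_LLM_QUANTIZE_KV_CACHE") is not None:
            return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
        if os.environ.get("IPEX_LLM_LOW_MEM") is not None:
            return os.environ["IPEX_LLM_LOW_MEM"] == "1"
        # new in this release: fp16/bf16 linears never use a quantized KV cache
        if qtype in ("fp16", "bf16"):
            return False
        # device-name gating replaces the removed kv_cache_device_check() helper
        return (device_name in ["mtl", "lnl", "arl"] and kv_group == 1) or \
            (device_name in ["arc", "bmg"] and batch_size > 1)

    print(should_quantize_kv_cache("sym_int4", "arc", batch_size=4, kv_group=1))   # True
    print(should_quantize_kv_cache("fp16", "arc", batch_size=4, kv_group=1))       # False
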
ipex_llm/transformers/npu_model.py

@@ -27,7 +27,7 @@ from transformers.configuration_utils import PretrainedConfig

 from ipex_llm.utils.common.log4Error import invalidInputError
 from ipex_llm.transformers.utils import logger, load_imatrix_data
-from ipex_llm.transformers.npu_models.convert import optimize_llm
+from ipex_llm.transformers.npu_models.convert import optimize_llm


 def patch_flash_attn_import(filename: str) -> List[str]:
@@ -207,8 +207,6 @@ class _BaseAutoModelClass:
                 model = model.eval()
                 logger.info(f"Finish to convert model")
             else:
-                from intel_npu_acceleration_library.compiler import create_npu_kernels
-
                 if optimize_model:
                     invalidInputError(
                         max_prompt_len < max_context_len,
@@ -232,11 +230,14 @@ class _BaseAutoModelClass:
                         "convert_model": convert_model,
                         "save_directory": save_directory,
                         "fuse_layers": fuse_layers,
-                        "imatrix_data": imatrix_data
+                        "imatrix_data": imatrix_data,
+                        "skip_npu_logic": mock_device == "dummy",
                     }
+                    # Dummy will skip npu related logic and save the quantized model
+                    if mock_device == "dummy":
+                        model.save_low_bit = types.MethodType(save_low_bit, model)
                     model = cls.optimize_npu_model(*args, **optimize_kwargs)
                 else:
-                    from ipex_llm.transformers.npu_models.convert import optimize_llm
                     optimize_llm(model)
                     with torch.no_grad():
                         cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
@@ -258,7 +259,6 @@ class _BaseAutoModelClass:
     def optimize_npu_model(cls, *args, **kwargs):

         from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre, optimize_llm
-        from intel_npu_acceleration_library.compiler import create_npu_kernels

         model = kwargs.pop("model")
         qtype = kwargs.pop("qtype", "sym_int4_rtn")
@@ -275,6 +275,7 @@ class _BaseAutoModelClass:
         save_directory = kwargs.pop('save_directory', None)
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
+        skip_npu_logic = kwargs.pop("skip_npu_logic", False)
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
@@ -294,51 +295,58 @@ class _BaseAutoModelClass:
         cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                          quantization_group_size, imatrix_data,
                          *args, **kwargs)
-
+        if not skip_npu_logic:
+            from intel_npu_acceleration_library.compiler import create_npu_kernels
+            create_npu_kernels(llm)
         model = model.eval()
         logger.info(f"Finish to convert model")
         model.config.update({"bigdl_transformers_low_bit": qtype})
-        model.share_memory()

-        if
-
-            from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
-            optimize_llm_single_process(
-                llm,
-                kv_len=max_context_len,
-                max_prompt_len=max_prompt_len,
-                transpose_value_cache=transpose_value_cache,
-                group_size=quantization_group_size,
-                qtype=qtype,
-                save_directory=save_directory,
-                fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
-            )
-        else:
-            optimize_llm(
-                llm,
-                max_context_len=max_context_len,
-                max_prompt_len=max_prompt_len,
-                inter_pp=inter_pp,
-                intra_pp=intra_pp,
-                transpose_value_cache=transpose_value_cache,
-                group_size=quantization_group_size
-            )
+        if skip_npu_logic:
+            model.save_low_bit(model_dir=save_directory)
         else:
-
-
-
+            model.share_memory()
+
+            if not pipeline:
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                    from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                    optimize_llm_single_process(
+                        llm,
                         kv_len=max_context_len,
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size,
                         qtype=qtype,
-                        convert_model=convert_model,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers
-
-
-
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
+                    )
+                else:
+                    optimize_llm(
+                        llm,
+                        max_context_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        inter_pp=inter_pp,
+                        intra_pp=intra_pp,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size
+                    )
+            else:
+                from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                    import convert_llm
+                convert_llm(llm,
+                            kv_len=max_context_len,
+                            max_prompt_len=max_prompt_len,
+                            transpose_value_cache=transpose_value_cache,
+                            group_size=quantization_group_size,
+                            qtype=qtype,
+                            convert_model=convert_model,
+                            save_directory=save_directory,
+                            fuse_layers=fuse_layers)
+            model.save_low_bit = types.MethodType(save_low_bit, model)
+            model.save_low_bit(save_directory)
+            logger.info(f"Converted model has already saved to {save_directory}.")
+
         return model

     @classmethod
@@ -379,6 +387,7 @@ class _BaseAutoModelClass:
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+        save_directory = kwargs.pop('save_directory', None)

         from transformers.models.auto.configuration_auto import AutoConfig
         from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -650,16 +659,37 @@ class _BaseAutoModelClass:
                 param.requires_grad_(False)

         if optimize_model and not pipeline:
-
-
-
-
-
-
-
-
-
-
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                if save_directory is None:
+                    invalidInputError(False,
+                                      "Please specify the save_directory, the path of folder " +
+                                      "to save the compiled NPU model. If path not exists, " +
+                                      "the compiled NPU model will be saved there. " +
+                                      "Else, program will exit.")
+
+                optimize_llm_single_process(
+                    llm,
+                    kv_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size,
+                    qtype=qtype,
+                    save_directory=save_directory,
+                    fuse_layers=None,
+                    has_llm=hasattr(model, "llm")
+                )
+            else:
+                from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
+                optimize_llm(
+                    llm,
+                    max_context_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    inter_pp=inter_pp,
+                    intra_pp=intra_pp,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size
+                )
         elif optimize_model and pipeline:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm

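The npu_model.py hunks thread a new skip_npu_logic flag (driven by mock_device == "dummy") through optimize_npu_model, and register save_directory as a recognized keyword on the load path. Here is a self-contained sketch of the resulting branching; choose_npu_path is a hypothetical helper written only to make the control flow explicit, not an ipex-llm function.

    def choose_npu_path(mock_device=None, pipeline=False, model_type="qwen2"):
        # "dummy" skips every intel_npu_acceleration_library call and only saves low-bit weights
        skip_npu_logic = mock_device == "dummy"
        if skip_npu_logic:
            return "save_low_bit only (no NPU compilation)"
        if not pipeline:
            if model_type in ["qwen2", "llama", "minicpm"]:
                return "optimize_llm_single_process, then save_low_bit"
            return "optimize_llm (multi-process path), then save_low_bit"
        return "convert_llm pipeline conversion, then save_low_bit"

    print(choose_npu_path(mock_device="dummy"))
    print(choose_npu_path(model_type="minicpm"))
    print(choose_npu_path(pipeline=True))
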
ipex_llm/transformers/npu_models/convert_mp.py

@@ -18,7 +18,7 @@ import torch
 import importlib
 import numpy as np
 from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params
-from ipex_llm.transformers.npu_models.lm_head import
+from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from ipex_llm.utils.common.log4Error import invalidInputError


ipex_llm/transformers/npu_models/linear.py

@@ -21,16 +21,25 @@
 # SPDX-License-Identifier: Apache 2.0
 #

-from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
-from intel_npu_acceleration_library.dtypes import NPUDtype
+
 import os
 import torch
 from torch.nn import Parameter
 import uuid
 import math
-from intel_npu_acceleration_library.backend import run_matmul
 from typing import Optional, Union
 from ipex_llm.utils.common import invalidInputError
+import importlib
+
+
+def is_acclib_available():
+    return importlib.util.find_spec("intel_npu_acceleration_library") is not None
+
+
+if is_acclib_available():
+    from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+    from intel_npu_acceleration_library.dtypes import NPUDtype
+    from intel_npu_acceleration_library.backend import run_matmul


 class Linear(torch.nn.Module):
@@ -63,6 +72,7 @@ class Linear(torch.nn.Module):
         if self.training:
             out = self._mm(x, self.weight, None)
         else:
+            from intel_npu_acceleration_library.backend import run_matmul
             out = run_matmul(x, self.weight, None, self.op_id)

         if self.bias is None:
@@ -105,6 +115,8 @@ class Linear(torch.nn.Module):
         Returns:
             Union[Linear, QuantizedLinear]: A NPU linear layer
         """
+        from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+        from intel_npu_acceleration_library.dtypes import NPUDtype
         if dtype.is_floating_point:
             if bias is None:
                 return Linear(weight.to(dtype), None)

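The linear.py hunks move every intel_npu_acceleration_library import behind an is_acclib_available() probe or into the call sites that need it, so importing the module no longer hard-requires the NPU library. A small, runnable sketch of the same optional-dependency pattern follows; it probes numpy purely as a stand-in package.

    import importlib.util

    def is_available(pkg: str) -> bool:
        # same probe the diff uses: look the package up without importing it
        return importlib.util.find_spec(pkg) is not None

    if is_available("numpy"):            # the diff probes "intel_npu_acceleration_library"
        import numpy as np
        print("accelerated path:", np.arange(3))
    else:
        print("fallback path: optional dependency not installed")
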
ipex_llm/transformers/npu_models/lm_head.py

@@ -16,96 +16,6 @@
 import torch
 from torch import nn
 import numpy as np
-from filelock import FileLock
-from intel_npu_acceleration_library.backend import NNFactory
-from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
-
-
-class LMHeadLinear(NNFactory):
-    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
-    with weights prefetching."""
-
-    def __init__(
-        self,
-        inC: int,
-        outC: int,
-        batch: int,
-        split_num: int = 2,
-        profile: bool = False,
-        device: str = "NPU",
-        dtype: np.dtype = np.int8,
-        use_split: bool = False,
-        group_size: int = 0,
-        asym: bool = False,
-    ):
-        """Initialize the LMHeadLinear class.
-
-        Args:
-            inC (int): input channels
-            outC (int): output channels
-            batch (int): batch
-            split_num (int): split in_features of lm_head to how many parts
-            profile (bool): Enable/Disable profiling. Defaults to False.
-            device (str): Target device, default to "NPU".
-            dtype (np.dtype): weights datatype. Defaults to np.int8.
-
-        """
-        super().__init__(profile, device)
-        self.inC, self.outC = inC, outC
-        self.batch = batch
-
-        self.split_num = split_num
-        if use_split:
-            input = self.parameter((1, self.batch, self.inC))
-            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
-                                       scale_factor=(group_size == 0), asym=asym)
-        else:
-            input = self.parameter((self.batch, self.inC))
-            split_size = self.inC // split_num // 2 * 2
-
-            for i in range(self.split_num):
-                start_idx = i * split_size
-                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
-                input_slice = self.slice(input, begin=[0, start_idx],
-                                         end=[self.batch, end_idx])
-                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
-                                           wt_dtype=dtype, asym=asym)
-                if i == 0:
-                    res = linear_slice
-                else:
-                    res += linear_slice
-
-        print("start compiling lm_head")
-        self.compile()
-        print("end compiling lm_head")
-
-    def set_weights(self, op_id, weights):
-        self.set_weights_async(op_id, weights)
-        with FileLock(f"lmhead_run.lock"):
-            backend_lib.run(self._mm)
-
-    def set_weights_async(self, op_id, weights):
-        self.setWeights(1, op_id, *weights)
-
-    def run(
-        self, X: np.ndarray
-    ) -> np.ndarray:
-        """Run the layer: $X * (W * S)^T$ .
-
-        Args:
-            X (np.ndarray): activation
-
-        Raises:
-            RuntimeError: Input, weights or scale shape mismatch
-
-        Returns:
-            np.ndarray: result
-        """
-        self.set_input_tensor(X, 0)
-        self.elapsed = backend_lib.run(self._mm)
-        if len(self.out) == 1:
-            return self.out[0]
-        return self.out


 class SlicedLMHead(nn.Module):
@@ -160,6 +70,7 @@ class SlicedLMHead(nn.Module):
         return self.lm_heads[0].weight.dtype

     def get_fused_lm_head(self):
+        from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
         np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
         self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
                                           False, "NPU", dtype=np_dtype, use_split=self.use_split,

ipex_llm/transformers/npu_models/lm_head_linear.py

@@ -0,0 +1,106 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from filelock import FileLock
+from intel_npu_acceleration_library.backend import NNFactory
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+
+
+class LMHeadLinear(NNFactory):
+    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
+    with weights prefetching."""
+
+    def __init__(
+        self,
+        inC: int,
+        outC: int,
+        batch: int,
+        split_num: int = 2,
+        profile: bool = False,
+        device: str = "NPU",
+        dtype: np.dtype = np.int8,
+        use_split: bool = False,
+        group_size: int = 0,
+        asym: bool = False,
+    ):
+        """Initialize the LMHeadLinear class.
+
+        Args:
+            inC (int): input channels
+            outC (int): output channels
+            batch (int): batch
+            split_num (int): split in_features of lm_head to how many parts
+            profile (bool): Enable/Disable profiling. Defaults to False.
+            device (str): Target device, default to "NPU".
+            dtype (np.dtype): weights datatype. Defaults to np.int8.
+
+        """
+        super().__init__(profile, device)
+        self.inC, self.outC = inC, outC
+        self.batch = batch
+
+        self.split_num = split_num
+        if use_split:
+            input = self.parameter((1, self.batch, self.inC))
+            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
+                                       scale_factor=(group_size == 0), asym=asym)
+        else:
+            input = self.parameter((self.batch, self.inC))
+            split_size = self.inC // split_num // 2 * 2
+
+            for i in range(self.split_num):
+                start_idx = i * split_size
+                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
+                input_slice = self.slice(input, begin=[0, start_idx],
+                                         end=[self.batch, end_idx])
+                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
+                                           wt_dtype=dtype, asym=asym)
+                if i == 0:
+                    res = linear_slice
+                else:
+                    res += linear_slice
+
+        print("start compiling lm_head")
+        self.compile()
+        print("end compiling lm_head")
+
+    def set_weights(self, op_id, weights):
+        self.set_weights_async(op_id, weights)
+        with FileLock(f"lmhead_run.lock"):
+            backend_lib.run(self._mm)
+
+    def set_weights_async(self, op_id, weights):
+        self.setWeights(1, op_id, *weights)
+
+    def run(
+        self, X: np.ndarray
+    ) -> np.ndarray:
+        """Run the layer: $X * (W * S)^T$ .
+
+        Args:
+            X (np.ndarray): activation
+
+        Raises:
+            RuntimeError: Input, weights or scale shape mismatch
+
+        Returns:
+            np.ndarray: result
+        """
+        self.set_input_tensor(X, 0)
+        self.elapsed = backend_lib.run(self._mm)
+        if len(self.out) == 1:
+            return self.out[0]
+        return self.out

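LMHeadLinear, now hosted in its own lm_head_linear.py module, builds the lm_head as split_num partial linears over slices of the input features and sums the results. The following hedged NumPy sketch shows why the sliced computation matches the full matmul; shapes and names are illustrative and do not model the NPU graph itself.

    import numpy as np

    rng = np.random.default_rng(0)
    batch, inC, outC, split_num = 1, 8, 6, 2
    x = rng.standard_normal((batch, inC))
    w = rng.standard_normal((outC, inC))            # lm_head weight laid out as [outC, inC]

    split_size = inC // split_num // 2 * 2          # same even-sized split rule as __init__
    res = np.zeros((batch, outC))
    for i in range(split_num):
        start = i * split_size
        end = (i + 1) * split_size if i < split_num - 1 else inC
        res += x[:, start:end] @ w[:, start:end].T  # one sliced linear per iteration

    assert np.allclose(res, x @ w.T)                # identical to the unsliced lm_head
    print("sliced and full matmul agree")
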
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py

@@ -473,10 +473,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)
-        if model.can_generate():
-            model.generation_config.save_pretrained(save_directory)

         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
@@ -537,8 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)

         from .llama import convert_llama_layer, convert_fused_llama_layer
         from .llama import convert_lm_head_and_embedding
@@ -577,8 +571,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)

         from .minicpm import convert_minicpm_layer, convert_fused_minicpm_layer
         from .minicpm import convert_lm_head_and_embedding
@@ -595,3 +587,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir,
                             convert_model=True,
                             max_prompt_len=max_prompt_len)
+
+    model.config.update(update_dict)
+    model.config.save_pretrained(save_directory)
+    if model.can_generate():
+        model.generation_config.save_pretrained(save_directory)
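The convert_pipeline.py hunks stop writing the updated config inside each model-type branch and instead apply update_dict and save config/generation_config once, after conversion finishes. A minimal sketch of that reordering is below; ConfigSketch is a stand-in class, not transformers' PretrainedConfig.

    class ConfigSketch(dict):
        def save_pretrained(self, path):
            print(f"writing {dict(self)} to {path}")

    def convert_for_deploy_sketch(model_type: str, save_directory: str):
        update_dict = {"model_type": model_type, "lm_head_low_bit": "sym_int4_rtn"}
        # ... per-model-type layer conversion happens here, possibly failing early ...
        config = ConfigSketch()
        config.update(update_dict)            # applied once, only after conversion succeeds
        config.save_pretrained(save_directory)

    convert_for_deploy_sketch("qwen2", "./npu_compiled_model")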