ipex-llm 2.2.0b20250105__py3-none-win_amd64.whl → 2.2.0b20250105.post0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +17 -132
  31. ipex_llm/transformers/lookup.py +2 -2
  32. ipex_llm/transformers/low_bit_linear.py +8 -8
  33. ipex_llm/transformers/models/chatglm2.py +1 -192
  34. ipex_llm/transformers/models/minicpmv.py +2 -2
  35. ipex_llm/transformers/models/sd.py +2 -2
  36. ipex_llm/transformers/models/utils.py +14 -89
  37. ipex_llm/transformers/npu_model.py +80 -50
  38. ipex_llm/transformers/npu_models/convert_mp.py +1 -1
  39. ipex_llm/transformers/npu_models/linear.py +15 -3
  40. ipex_llm/transformers/npu_models/lm_head.py +1 -90
  41. ipex_llm/transformers/npu_models/lm_head_linear.py +106 -0
  42. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
  43. ipex_llm/transformers/utils.py +5 -20
  44. {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/METADATA +40 -19
  45. {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/RECORD +51 -53
  46. ipex_llm/transformers/models/cohere.py +0 -589
  47. ipex_llm/transformers/models/falcon.py +0 -829
  48. ipex_llm/transformers/models/mixtral.py +0 -576
  49. {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/ipex-llm-init.bat +0 -0
  50. {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-chat.ps1 +0 -0
  51. {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-cli.ps1 +0 -0
  52. {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/WHEEL +0 -0
  53. {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/entry_points.txt +0 -0
  54. {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/utils.py
@@ -19,7 +19,7 @@ import torch
  import warnings
  from ipex_llm.utils.common import invalidInputError
  from ipex_llm.ggml.quantize import ggml_tensor_qtype
- from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_type
+ from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
  from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
      FP6, ASYM_INT4
 
@@ -85,16 +85,14 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
          return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
      elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
          return os.environ["IPEX_LLM_LOW_MEM"] == "1"
+     elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+         return False
      else:
-         return x.device.type == 'xpu' and kv_cache_device_check(x, kv_group) \
-             and hasattr(linear, "qtype") and \
-             linear.qtype != ggml_tensor_qtype["fp16"] and linear.qtype != ggml_tensor_qtype["bf16"]
-
-
- def kv_cache_device_check(x: torch.Tensor, kv_group: int) -> bool:
-     return (get_xpu_device_type(x) in ["mtl", "lnl"] and kv_group <= 1) or \
-         ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and
-          1 < x.size(0) and x.size(0) <= 8)
+         device_name = get_xpu_device_name(x.device)
+         return (
+             device_name in ["mtl", "lnl", "arl"] and kv_group == 1
+             or device_name in ["arc", "bmg"] and x.size(0) > 1
+         )
 
 
  def init_fp8_kv_cache(batch_size, num_heads, current_length, head_dim, device):
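
The rewritten use_quantize_kv_cache above drops the separate kv_cache_device_check helper and decides purely from the device name and batch size, after the env-var overrides and an early False for fp16/bf16 projections. A minimal sketch of that decision flow, not taken from the package; quantize_kv_decision and its arguments are illustrative only, and the device strings are assumed to match what get_xpu_device_name returns:

import os

def quantize_kv_decision(device_name: str, kv_group: int, batch_size: int) -> bool:
    # Environment overrides win, exactly as in the hunk above.
    if os.environ.get("IPEX_LLM_QUANTIZE_KV_CACHE") is not None:
        return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
    if os.environ.get("IPEX_LLM_LOW_MEM") is not None:
        return os.environ["IPEX_LLM_LOW_MEM"] == "1"
    # The real function also returns False when the projection itself runs in fp16/bf16.
    # Integrated devices (mtl/lnl/arl) quantize only with ungrouped KV heads;
    # discrete devices (arc/bmg) quantize for batched inputs.
    return (device_name in ["mtl", "lnl", "arl"] and kv_group == 1) or \
        (device_name in ["arc", "bmg"] and batch_size > 1)
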
@@ -226,57 +224,6 @@ def is_enough_kv_cache_room_4_31(past_key_value, seq_len=1):
          (past_key_value[0].size(2) + seq_len) * past_key_value[0].size(3)
 
 
- def use_flash_attention(query, key, attention_mask=None):
-     # here we support query's shape is always [batch_size, head_num, q_len, head_dim],
-     # key's shape is always [batch_size, head_num, k_len, head_dim]
-     invalidInputError(query.dim() == 4,
-                       "Here query input of use_flash_attention should be [batch_size, "
-                       "head_num, q_len, head_dim]")
-     invalidInputError(key.dim() == 4,
-                       "Here key input of use_flash_attention should be [batch_size, "
-                       "head_num, k_len, head_dim]")
-     bsz, _, q_len, _ = query.size()
-     k_len = key.size()[2]
-     # check whether ipex flash attention can be used
-     if q_len != k_len:
-         # now only use flash attention for first token
-         # as it seems have no performance benifit for rest token now
-         return False
-     if query.device.type != "xpu":
-         # ipex flash attention only support for xpu
-         return False
-     ipex_version = get_ipex_version()
-     if ipex_version <= "2.0.110+xpu":
-         # ipex flash attention is supported from ipex 2.1
-         return False
-     if not torch.xpu.has_xetla():
-         # ipex flash attention is only supported for xetla
-         # may update this later
-         return False
-     elif get_xpu_device_type(query) != "pvc":
-         return False
-     if query.dtype not in [torch.float32, torch.float16]:
-         # only use flash attention for fp32/fp16 input
-         return False
-     if bsz > 1:
-         # as flash attention doesn't support attn_mask in ipex 2.1,
-         # so it will cause output error for padded batch input
-         if attention_mask is None:
-             return True
-         else:
-             # TODO: below logic may change for different model
-             # attention mask shape : [bsz, 1, q_len, k_len]
-             if attention_mask[0].squeeze()[0, 0].item() != 0:
-                 # first batch contains padding
-                 # otherwise we suppose it should be a upper triangular matrix
-                 # at the same time, the diagonal is also 0
-                 return False
-             elif not attention_mask.equal(attention_mask[0].repeat(bsz, 1, 1, 1)):
-                 # check whether mask of every batch is the same
-                 return False
-     return True
-
-
  def use_sdp(q_len, kv_len, head_dim, query_states):
      return (
          query_states.device.type == "xpu"
@@ -315,38 +262,16 @@ def mlp_fusion_check(x, qtype, training):
      if training or x.requires_grad:
          return False
      if qtype == FP6:
-         device = get_xpu_device_type(x)
-         if device in ["mtl", "lnl"]:
+         device = get_xpu_device_name(x.device)
+         if device in ["mtl", "lnl", "arl"]:
              return False
      return True
 
 
- def use_decoding_fast_path(proj,
-                            use_fuse_rope,
-                            enough_kv_room,
-                            bs,
-                            qtype_check=decoding_fast_path_qtype_check):
-     if proj is None:
-         return False
-     device = get_xpu_device_type(proj.weight)
-     if not qtype_check(proj):
-         return False
-     if not use_fuse_rope:
-         return False
-     if not enough_kv_room:
-         return False
-     if bs != 1:
-         return False
-
-     if device in ["uhd"]:
-         return False
-     return True
-
-
  def use_xmx(x: torch.Tensor, qtype: int):
-     device = get_xpu_device_type(x)
+     device = get_xpu_device_name(x.device)
      return (
-         device in ["arc", "flex", "pvc"]
+         device in ["arc", "pvc"]
          and qtype in [SYM_INT4, SYM_INT8, FP8E4, FP8E5]
          and (
              (device == "pvc" and 1 < x.size(0) <= 16)
@@ -370,7 +295,7 @@ def fp16_fusion_check(proj, x, training):
          return False
      if x.requires_grad:
          return False
-     device_type = get_xpu_device_type(x)
+     device_type = get_xpu_device_name(x.device)
      if device_type != "pvc":
          return False
      return True
@@ -439,7 +364,7 @@ def should_use_compresskv(x: torch.Tensor, prompt_len: int):
      else:
          if use_compress_kv is None:
              return (
-                 get_xpu_device_type(x) in ["mtl", "lnl"]
+                 get_xpu_device_name(x.device) in ["mtl", "lnl", "arl"]
                  and prompt_len >= 1800
                  and prompt_len <= 4500
              )
ipex_llm/transformers/npu_model.py
@@ -27,7 +27,7 @@ from transformers.configuration_utils import PretrainedConfig
 
  from ipex_llm.utils.common.log4Error import invalidInputError
  from ipex_llm.transformers.utils import logger, load_imatrix_data
- from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post
+ from ipex_llm.transformers.npu_models.convert import optimize_llm
 
 
  def patch_flash_attn_import(filename: str) -> List[str]:
@@ -207,8 +207,6 @@ class _BaseAutoModelClass:
              model = model.eval()
              logger.info(f"Finish to convert model")
          else:
-             from intel_npu_acceleration_library.compiler import create_npu_kernels
-
              if optimize_model:
                  invalidInputError(
                      max_prompt_len < max_context_len,
@@ -232,11 +230,14 @@ class _BaseAutoModelClass:
                      "convert_model": convert_model,
                      "save_directory": save_directory,
                      "fuse_layers": fuse_layers,
-                     "imatrix_data": imatrix_data
+                     "imatrix_data": imatrix_data,
+                     "skip_npu_logic": mock_device == "dummy",
                  }
+                 # Dummy will skip npu related logic and save the quantized model
+                 if mock_device == "dummy":
+                     model.save_low_bit = types.MethodType(save_low_bit, model)
                  model = cls.optimize_npu_model(*args, **optimize_kwargs)
              else:
-                 from ipex_llm.transformers.npu_models.convert import optimize_llm
                  optimize_llm(model)
                  with torch.no_grad():
                      cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
@@ -258,7 +259,6 @@ class _BaseAutoModelClass:
      def optimize_npu_model(cls, *args, **kwargs):
 
          from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre, optimize_llm
-         from intel_npu_acceleration_library.compiler import create_npu_kernels
 
          model = kwargs.pop("model")
          qtype = kwargs.pop("qtype", "sym_int4_rtn")
@@ -275,6 +275,7 @@ class _BaseAutoModelClass:
          save_directory = kwargs.pop('save_directory', None)
          fuse_layers = kwargs.pop('fuse_layers', None)
          imatrix_data = kwargs.pop('imatrix_data', None)
+         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
          invalidInputError(save_directory is not None,
                            "Please provide the path to save converted model "
                            "through `save_directory`.")
@@ -294,51 +295,58 @@ class _BaseAutoModelClass:
              cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                               quantization_group_size, imatrix_data,
                               *args, **kwargs)
-             create_npu_kernels(llm)
+             if not skip_npu_logic:
+                 from intel_npu_acceleration_library.compiler import create_npu_kernels
+                 create_npu_kernels(llm)
          model = model.eval()
          logger.info(f"Finish to convert model")
          model.config.update({"bigdl_transformers_low_bit": qtype})
-         model.share_memory()
 
-         if not pipeline:
-             if model.config.model_type in ["qwen2", "llama", "minicpm"]:
-                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
-                 optimize_llm_single_process(
-                     llm,
-                     kv_len=max_context_len,
-                     max_prompt_len=max_prompt_len,
-                     transpose_value_cache=transpose_value_cache,
-                     group_size=quantization_group_size,
-                     qtype=qtype,
-                     save_directory=save_directory,
-                     fuse_layers=fuse_layers,
-                     has_llm=hasattr(model, "llm")
-                 )
-             else:
-                 optimize_llm(
-                     llm,
-                     max_context_len=max_context_len,
-                     max_prompt_len=max_prompt_len,
-                     inter_pp=inter_pp,
-                     intra_pp=intra_pp,
-                     transpose_value_cache=transpose_value_cache,
-                     group_size=quantization_group_size
-                 )
+         if skip_npu_logic:
+             model.save_low_bit(model_dir=save_directory)
          else:
-             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
-                 import convert_llm
-             convert_llm(llm,
+             model.share_memory()
+
+             if not pipeline:
+                 if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                     from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                     optimize_llm_single_process(
+                         llm,
                          kv_len=max_context_len,
                          max_prompt_len=max_prompt_len,
                          transpose_value_cache=transpose_value_cache,
                          group_size=quantization_group_size,
                          qtype=qtype,
-                         convert_model=convert_model,
                          save_directory=save_directory,
-                         fuse_layers=fuse_layers)
-         model.save_low_bit = types.MethodType(save_low_bit, model)
-         model.save_low_bit(save_directory)
-         logger.info(f"Converted model has already saved to {save_directory}.")
+                         fuse_layers=fuse_layers,
+                         has_llm=hasattr(model, "llm")
+                     )
+                 else:
+                     optimize_llm(
+                         llm,
+                         max_context_len=max_context_len,
+                         max_prompt_len=max_prompt_len,
+                         inter_pp=inter_pp,
+                         intra_pp=intra_pp,
+                         transpose_value_cache=transpose_value_cache,
+                         group_size=quantization_group_size
+                     )
+             else:
+                 from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                     import convert_llm
+                 convert_llm(llm,
+                             kv_len=max_context_len,
+                             max_prompt_len=max_prompt_len,
+                             transpose_value_cache=transpose_value_cache,
+                             group_size=quantization_group_size,
+                             qtype=qtype,
+                             convert_model=convert_model,
+                             save_directory=save_directory,
+                             fuse_layers=fuse_layers)
+             model.save_low_bit = types.MethodType(save_low_bit, model)
+             model.save_low_bit(save_directory)
+             logger.info(f"Converted model has already saved to {save_directory}.")
+
          return model
 
      @classmethod
@@ -379,6 +387,7 @@ class _BaseAutoModelClass:
          intra_pp = kwargs.pop("intra_pp", None)
          transpose_value_cache = kwargs.pop("transpose_value_cache", True)
          modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+         save_directory = kwargs.pop('save_directory', None)
 
          from transformers.models.auto.configuration_auto import AutoConfig
          from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -650,16 +659,37 @@ class _BaseAutoModelClass:
              param.requires_grad_(False)
 
          if optimize_model and not pipeline:
-             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
-             optimize_llm(
-                 llm,
-                 max_context_len=max_context_len,
-                 max_prompt_len=max_prompt_len,
-                 inter_pp=inter_pp,
-                 intra_pp=intra_pp,
-                 transpose_value_cache=transpose_value_cache,
-                 group_size=quantization_group_size
-             )
+             if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                 if save_directory is None:
+                     invalidInputError(False,
+                                       "Please specify the save_directory, the path of folder " +
+                                       "to save the compiled NPU model. If path not exists, " +
+                                       "the compiled NPU model will be saved there. " +
+                                       "Else, program will exit.")
+
+                 optimize_llm_single_process(
+                     llm,
+                     kv_len=max_context_len,
+                     max_prompt_len=max_prompt_len,
+                     transpose_value_cache=transpose_value_cache,
+                     group_size=quantization_group_size,
+                     qtype=qtype,
+                     save_directory=save_directory,
+                     fuse_layers=None,
+                     has_llm=hasattr(model, "llm")
+                 )
+             else:
+                 from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
+                 optimize_llm(
+                     llm,
+                     max_context_len=max_context_len,
+                     max_prompt_len=max_prompt_len,
+                     inter_pp=inter_pp,
+                     intra_pp=intra_pp,
+                     transpose_value_cache=transpose_value_cache,
+                     group_size=quantization_group_size
+                 )
          elif optimize_model and pipeline:
              from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                  import convert_llm
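
Loading a previously saved low-bit qwen2/llama/minicpm model now routes through optimize_llm_single_process, so the load path needs a save_directory for the compiled NPU artifacts (the invalidInputError above enforces this). A hedged usage sketch; the entry point and the example paths are assumptions, only the keyword names come from this diff:

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed entry point

model = AutoModelForCausalLM.load_low_bit(
    "./qwen2-7b-npu-int4",                      # hypothetical path to a saved low-bit model
    max_context_len=1024,
    max_prompt_len=512,
    transpose_value_cache=True,
    save_directory="./qwen2-7b-npu-compiled",   # now required for qwen2/llama/minicpm
)
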
ipex_llm/transformers/npu_models/convert_mp.py
@@ -18,7 +18,7 @@ import torch
  import importlib
  import numpy as np
  from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params
- from ipex_llm.transformers.npu_models.lm_head import LMHeadLinear, SlicedLMHead
+ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
  from ipex_llm.utils.common.log4Error import invalidInputError
 
 
ipex_llm/transformers/npu_models/linear.py
@@ -21,16 +21,25 @@
  # SPDX-License-Identifier: Apache 2.0
  #
 
- from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
- from intel_npu_acceleration_library.dtypes import NPUDtype
+
  import os
  import torch
  from torch.nn import Parameter
  import uuid
  import math
- from intel_npu_acceleration_library.backend import run_matmul
  from typing import Optional, Union
  from ipex_llm.utils.common import invalidInputError
+ import importlib
+
+
+ def is_acclib_available():
+     return importlib.util.find_spec("intel_npu_acceleration_library") is not None
+
+
+ if is_acclib_available():
+     from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+     from intel_npu_acceleration_library.dtypes import NPUDtype
+     from intel_npu_acceleration_library.backend import run_matmul
 
 
  class Linear(torch.nn.Module):
@@ -63,6 +72,7 @@ class Linear(torch.nn.Module):
          if self.training:
              out = self._mm(x, self.weight, None)
          else:
+             from intel_npu_acceleration_library.backend import run_matmul
              out = run_matmul(x, self.weight, None, self.op_id)
 
          if self.bias is None:
@@ -105,6 +115,8 @@ class Linear(torch.nn.Module):
          Returns:
              Union[Linear, QuantizedLinear]: A NPU linear layer
          """
+         from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+         from intel_npu_acceleration_library.dtypes import NPUDtype
          if dtype.is_floating_point:
              if bias is None:
                  return Linear(weight.to(dtype), None)
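
With these changes the hard top-level dependency on intel_npu_acceleration_library is replaced by the is_acclib_available() probe plus call-time imports inside Linear. A rough sketch of the same deferred-import pattern; matmul_or_fallback is a hypothetical helper, only is_acclib_available and the import paths come from the hunks above:

import importlib.util
import torch

def is_acclib_available() -> bool:
    # Probe for the optional dependency without importing it.
    return importlib.util.find_spec("intel_npu_acceleration_library") is not None

def matmul_or_fallback(x: torch.Tensor, weight: torch.Tensor, op_id: str) -> torch.Tensor:
    # Hypothetical helper: defer the heavy import to call time, as Linear.forward now does.
    if is_acclib_available():
        from intel_npu_acceleration_library.backend import run_matmul
        return run_matmul(x, weight, None, op_id)
    return x @ weight.T  # plain PyTorch fallback when the NPU library is absent
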
ipex_llm/transformers/npu_models/lm_head.py
@@ -16,96 +16,6 @@
  import torch
  from torch import nn
  import numpy as np
- from filelock import FileLock
- from intel_npu_acceleration_library.backend import NNFactory
- from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
-
-
- class LMHeadLinear(NNFactory):
-     """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
-     with weights prefetching."""
-
-     def __init__(
-         self,
-         inC: int,
-         outC: int,
-         batch: int,
-         split_num: int = 2,
-         profile: bool = False,
-         device: str = "NPU",
-         dtype: np.dtype = np.int8,
-         use_split: bool = False,
-         group_size: int = 0,
-         asym: bool = False,
-     ):
-         """Initialize the LMHeadLinear class.
-
-         Args:
-             inC (int): input channels
-             outC (int): output channels
-             batch (int): batch
-             split_num (int): split in_features of lm_head to how many parts
-             profile (bool): Enable/Disable profiling. Defaults to False.
-             device (str): Target device, default to "NPU".
-             dtype (np.dtype): weights datatype. Defaults to np.int8.
-
-         """
-         super().__init__(profile, device)
-         self.inC, self.outC = inC, outC
-         self.batch = batch
-
-         self.split_num = split_num
-         if use_split:
-             input = self.parameter((1, self.batch, self.inC))
-             res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
-                                        scale_factor=(group_size == 0), asym=asym)
-         else:
-             input = self.parameter((self.batch, self.inC))
-             split_size = self.inC // split_num // 2 * 2
-
-             for i in range(self.split_num):
-                 start_idx = i * split_size
-                 end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
-                 input_slice = self.slice(input, begin=[0, start_idx],
-                                          end=[self.batch, end_idx])
-                 linear_slice = self.linear(input_slice, outC, split_size, bias=False,
-                                            wt_dtype=dtype, asym=asym)
-                 if i == 0:
-                     res = linear_slice
-                 else:
-                     res += linear_slice
-
-         print("start compiling lm_head")
-         self.compile()
-         print("end compiling lm_head")
-
-     def set_weights(self, op_id, weights):
-         self.set_weights_async(op_id, weights)
-         with FileLock(f"lmhead_run.lock"):
-             backend_lib.run(self._mm)
-
-     def set_weights_async(self, op_id, weights):
-         self.setWeights(1, op_id, *weights)
-
-     def run(
-         self, X: np.ndarray
-     ) -> np.ndarray:
-         """Run the layer: $X * (W * S)^T$ .
-
-         Args:
-             X (np.ndarray): activation
-
-         Raises:
-             RuntimeError: Input, weights or scale shape mismatch
-
-         Returns:
-             np.ndarray: result
-         """
-         self.set_input_tensor(X, 0)
-         self.elapsed = backend_lib.run(self._mm)
-         if len(self.out) == 1:
-             return self.out[0]
-         return self.out
 
 
  class SlicedLMHead(nn.Module):
@@ -160,6 +70,7 @@ class SlicedLMHead(nn.Module):
          return self.lm_heads[0].weight.dtype
 
      def get_fused_lm_head(self):
+         from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
          np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
          self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
                                            False, "NPU", dtype=np_dtype, use_split=self.use_split,
ipex_llm/transformers/npu_models/lm_head_linear.py (new file)
@@ -0,0 +1,106 @@
+ #
+ # Copyright 2016 The BigDL Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ from filelock import FileLock
+ from intel_npu_acceleration_library.backend import NNFactory
+ from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+
+
+ class LMHeadLinear(NNFactory):
+     """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
+     with weights prefetching."""
+
+     def __init__(
+         self,
+         inC: int,
+         outC: int,
+         batch: int,
+         split_num: int = 2,
+         profile: bool = False,
+         device: str = "NPU",
+         dtype: np.dtype = np.int8,
+         use_split: bool = False,
+         group_size: int = 0,
+         asym: bool = False,
+     ):
+         """Initialize the LMHeadLinear class.
+
+         Args:
+             inC (int): input channels
+             outC (int): output channels
+             batch (int): batch
+             split_num (int): split in_features of lm_head to how many parts
+             profile (bool): Enable/Disable profiling. Defaults to False.
+             device (str): Target device, default to "NPU".
+             dtype (np.dtype): weights datatype. Defaults to np.int8.
+
+         """
+         super().__init__(profile, device)
+         self.inC, self.outC = inC, outC
+         self.batch = batch
+
+         self.split_num = split_num
+         if use_split:
+             input = self.parameter((1, self.batch, self.inC))
+             res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
+                                        scale_factor=(group_size == 0), asym=asym)
+         else:
+             input = self.parameter((self.batch, self.inC))
+             split_size = self.inC // split_num // 2 * 2
+
+             for i in range(self.split_num):
+                 start_idx = i * split_size
+                 end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
+                 input_slice = self.slice(input, begin=[0, start_idx],
+                                          end=[self.batch, end_idx])
+                 linear_slice = self.linear(input_slice, outC, split_size, bias=False,
+                                            wt_dtype=dtype, asym=asym)
+                 if i == 0:
+                     res = linear_slice
+                 else:
+                     res += linear_slice
+
+         print("start compiling lm_head")
+         self.compile()
+         print("end compiling lm_head")
+
+     def set_weights(self, op_id, weights):
+         self.set_weights_async(op_id, weights)
+         with FileLock(f"lmhead_run.lock"):
+             backend_lib.run(self._mm)
+
+     def set_weights_async(self, op_id, weights):
+         self.setWeights(1, op_id, *weights)
+
+     def run(
+         self, X: np.ndarray
+     ) -> np.ndarray:
+         """Run the layer: $X * (W * S)^T$ .
+
+         Args:
+             X (np.ndarray): activation
+
+         Raises:
+             RuntimeError: Input, weights or scale shape mismatch
+
+         Returns:
+             np.ndarray: result
+         """
+         self.set_input_tensor(X, 0)
+         self.elapsed = backend_lib.run(self._mm)
+         if len(self.out) == 1:
+             return self.out[0]
+         return self.out
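
The class body is unchanged from the version removed from lm_head.py; only its home moved, so lm_head.py no longer imports the NPU backend at module load. A condensed construction sketch mirroring SlicedLMHead.get_fused_lm_head, with placeholder sizes (the real values come from the sliced lm_head, and an NPU plus the acceleration library are needed for the compile to actually run):

import numpy as np
from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear

inC, outC, split_num = 3584, 152064, 14        # placeholder shapes for illustration
fused_lm_head = LMHeadLinear(inC, outC, 1, split_num,
                             False, "NPU", dtype=np.int8,
                             use_split=True, group_size=0, asym=False)
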
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -473,10 +473,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-         model.config.update(update_dict)
-         model.config.save_pretrained(save_directory)
-         if model.can_generate():
-             model.generation_config.save_pretrained(save_directory)
 
          from .qwen import convert_qwen_layer, convert_fused_qwen_layer
          from .qwen import convert_lm_head_and_embedding
@@ -537,8 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-         model.config.update(update_dict)
-         model.config.save_pretrained(save_directory)
 
          from .llama import convert_llama_layer, convert_fused_llama_layer
          from .llama import convert_lm_head_and_embedding
@@ -577,8 +571,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-         model.config.update(update_dict)
-         model.config.save_pretrained(save_directory)
 
          from .minicpm import convert_minicpm_layer, convert_fused_minicpm_layer
          from .minicpm import convert_lm_head_and_embedding
@@ -595,3 +587,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                          save_directory, weight_dir,
                                          convert_model=True,
                                          max_prompt_len=max_prompt_len)
+
+     model.config.update(update_dict)
+     model.config.save_pretrained(save_directory)
+     if model.can_generate():
+         model.generation_config.save_pretrained(save_directory)