bigdl-core-npu 2.6.0b20241204-cp311-cp311-win_amd64.whl → 2.6.0b20241206-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,6 +64,7 @@ struct npu_model_params {
      std::string embedding_post_blob_name;
      std::string config;
      std::string low_bit;
+     std::string lm_head_low_bit;
      bool layernorm_const;
      std::string model_type;
      bool transpose_value_cache;
Binary file changed (contents not shown).
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bigdl-core-npu
- Version: 2.6.0b20241204
+ Version: 2.6.0b20241206
  Summary: Intel® NPU Acceleration Library
  Home-page: https://github.com/intel/intel-npu-acceleration-library
  Author: Alessandro Palla
@@ -1,10 +1,10 @@
  bigdl-core-npu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- bigdl-core-npu/npu_llm.dll,sha256=sS7bFzJL0gMfsA-96ilPw3WXkLzQoAPDkVTNCh0hYNE,3902464
+ bigdl-core-npu/npu_llm.dll,sha256=RXbFgySS5bR_9uOC16C35-tXKH5f9zsqk95jgnJR02U,3918336
  bigdl-core-npu/npu_llm.lib,sha256=V8WB7fKSKDwBopflxjFk0460SyoAS3LNbkYgFJrgqMQ,43846
- bigdl-core-npu/include/common.h,sha256=2qMhyPMYet0BzIQ5m_wRXye_A8ileRpnsxp55TvcThE,2632
+ bigdl-core-npu/include/common.h,sha256=rQ7aSNAthGaImWtvs2ZetTZoZHN8iayiuc02M5QP-Pw,2666
  bigdl-core-npu/include/npu_llm.h,sha256=dS7_Esxw9Nxz3x07zTei2GEhTgxhS5IGBnyOVf3OS0k,2171
  intel_npu_acceleration_library/__init__.py,sha256=ZKTIhGMDjF7P6pF-yX8KWcSXbeHWRk24AO_orsa18f8,536
- intel_npu_acceleration_library/_version.py,sha256=FmzuI9iV663E0W8P6HV8M3tioaDeV5flFtmEkXQPwfc,112
+ intel_npu_acceleration_library/_version.py,sha256=n-zEmitSgrCJTtvRD4v5hIEhQtBI4j9eS-kJZTKXXyc,112
  intel_npu_acceleration_library/compiler.py,sha256=3IdgqjamSC8MLexDBJypIeZRiWIcTFnvQSU1LPXUr7Y,6225
  intel_npu_acceleration_library/device.py,sha256=9bn8eVXJa5cXIqgfLsQAdkMVtVUQABb8z0-mQik5jRg,7424
  intel_npu_acceleration_library/dtypes.py,sha256=gdd06Wsc9zIZFHlauUEx4xcK9WGTn1Mu6GkuYDJeA-E,4683
@@ -12,18 +12,18 @@ intel_npu_acceleration_library/modelling.py,sha256=vSiQOWGJ0l6wGV7zWQtZEkHpnMQIM
  intel_npu_acceleration_library/optimizations.py,sha256=9NY8QoDFbs2LY12jbx6As8g2v0oInX4YzvkjnqViA70,5469
  intel_npu_acceleration_library/quantization.py,sha256=6N_04h1KX6TNbw-ceANV0Pmk4_lQ2Y9C7Pwn5x-zQzo,5566
  intel_npu_acceleration_library/backend/__init__.py,sha256=2NP6Ypr1dGUNXmLGW5GD9xrh0U9KJgqxTd_c7su1RUY,857
- intel_npu_acceleration_library/backend/base.py,sha256=hbHqxSOfWH5BaA5PY6_zaf1Zdg5NrQK6WOfe-hr279k,8605
- intel_npu_acceleration_library/backend/bindings.py,sha256=6APMmDZnYBwsjcZIO5bgFz8IwfhR4CH1evWWc2IylwM,10005
+ intel_npu_acceleration_library/backend/base.py,sha256=0EXHZTMrelebJ6HOSe74zE1mhy9tghXrkYnRQDLzwk4,9492
+ intel_npu_acceleration_library/backend/bindings.py,sha256=mu7EJ60X1cAFa1y17yA4r3n5lwFBSzsq5u-6Nj-OaJ0,10352
  intel_npu_acceleration_library/backend/compression.py,sha256=Avz_zm2s_ELy5peVQ8zFGn8njBfh9nEGR16mflotBic,630
  intel_npu_acceleration_library/backend/convolution.py,sha256=cN3k78X3Y4Cbf7er-MFq0sJ4OwIvquj8PajpdEDmCo4,2018
- intel_npu_acceleration_library/backend/factory.py,sha256=5NQyfBxjc0lkFrzmzwDSy6q7K-W78uTpGoWWfUDJGjg,40953
+ intel_npu_acceleration_library/backend/factory.py,sha256=RuVxbG-jHRhkkUU2q2kITVov3r-gBb6SGslWS-eL-Sk,41035
  intel_npu_acceleration_library/backend/linear.py,sha256=RiLUh5FOSxRWHB5kYx7mOPOOrS_vxIeBJ5t3yC6wOiQ,1908
  intel_npu_acceleration_library/backend/matmul.py,sha256=mfGi73-mIbUcXp4kyvCGW0Y9kb4Xp1ppbGNpdJFohuA,1819
  intel_npu_acceleration_library/backend/mlp.py,sha256=BuKVwSI726v3nHQQvtMBbXyWxRTq-WoLZtTxeSeWaaY,2330
  intel_npu_acceleration_library/backend/ops.py,sha256=3yS-f-VPErHFt_oWZrgplNmSWnRrjm_wdxXNsqEBN7M,5070
- intel_npu_acceleration_library/backend/qlinear.py,sha256=oeawOjRBA_kQRqfQ1Vn_e3aJQa2b4pQ8y0gMwvHBTzk,2362
+ intel_npu_acceleration_library/backend/qlinear.py,sha256=4k0QGq8beo20BgXSmKFlCo6XaI3ZqAMtJR4_VCmv0rU,2565
  intel_npu_acceleration_library/backend/qmatmul.py,sha256=pJkFJaBxZk3Oh5w_f6ywRNeGOfloEOCj0mCGnvim9Ew,2250
- intel_npu_acceleration_library/backend/runtime.py,sha256=COsTslfHknjrix4kETlnKvEmLO2lyBtXpAzEQTZJSvk,7195
+ intel_npu_acceleration_library/backend/runtime.py,sha256=anHg8F2mfTYm9aqli_AOibXW-klWg1A7hEAJiQa_Azk,7557
  intel_npu_acceleration_library/backend/sdpa.py,sha256=HNlL9jEA9OH3KnZqOkLcaKwt8tfCe5apUQxlWw0UhlA,3818
  intel_npu_acceleration_library/backend/tensor.py,sha256=swTymMVcXLPFXOlo6b_H3VGO1xf76Fz30RKyJPrVhPY,35468
  intel_npu_acceleration_library/backend/utils.py,sha256=WJ2agtqYxBuDd21ngE55io9VX-MOGg-AjnE63UHpCiU,2174
@@ -198,7 +198,7 @@ intel_npu_acceleration_library/external/openvino/torch/__init__.py,sha256=RXLzsf
  intel_npu_acceleration_library/functional/__init__.py,sha256=WWKwKOh6Sgovv7mKctA872TbLP98Pg5m5-MREvUmlAA,204
  intel_npu_acceleration_library/functional/scaled_dot_product_attention.py,sha256=yGUcg4tDQOLuUnP1g74cl-ec8TRr2SuAMcNLlN6qLvE,1620
  intel_npu_acceleration_library/lib/Release/cache.json,sha256=CyrSqZUWo0Ec4_7ydOiuKIC0Gm8AybrGdozUqUuHxBw,8840377
- intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=up79D00rpNM9RN-1_WPFlt6elodCe4ZNWSzlxRRHDoI,333312
+ intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=NUGlrx547TD6JBldoNOzotF6bJfKSvszGE0v2S1xeZ0,340480
  intel_npu_acceleration_library/lib/Release/openvino.dll,sha256=m7M119p3JBq2YYJJ2zzCaBDz6XivKK3nNykb8L1cvDU,13244768
  intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll,sha256=2v_I9P3Qo0St1bQZMEZscnFOUVvgZQQ0HvQlG3HtTd0,203104
  intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll,sha256=e3Aj9CDRHN30dBEdPSk7OCWe52tWfhI4xeXgyFjuDHg,475488
@@ -228,7 +228,7 @@ intel_npu_acceleration_library/nn/functional.py,sha256=UfAKBc0u6RtyaMo14ldH2GpEn
  intel_npu_acceleration_library/nn/linear.py,sha256=Q06SoGQeLaI86nA_ky2GnFC6H2Fw1zyMDILKnpYC2eo,5739
  intel_npu_acceleration_library/nn/llm.py,sha256=P6dz36Yf6BHtzWcftaghC6QaMI_WeRfQwrCbO7fD6hk,15002
  intel_npu_acceleration_library/nn/module.py,sha256=EYxoTq6I_YgBDgTF76GPDxHrT8SupOTDGMzQaomBeq8,12667
- bigdl_core_npu-2.6.0b20241204.dist-info/METADATA,sha256=sR39NzrsfEWz7zEY1XkRrc6RP82ukWI5R2B5ypyk1uw,1541
- bigdl_core_npu-2.6.0b20241204.dist-info/WHEEL,sha256=nkBcd8Ko0v5sEcSagm2-x_RVrb8gBSkTa8VFFZ0Mr1o,101
- bigdl_core_npu-2.6.0b20241204.dist-info/top_level.txt,sha256=iMQZlTsFPJjlD-Y0MqZEP_9ifI0LlbNCJIOTaMoGMjk,46
- bigdl_core_npu-2.6.0b20241204.dist-info/RECORD,,
+ bigdl_core_npu-2.6.0b20241206.dist-info/METADATA,sha256=VIzHA4u_YDpz5WLHKj_A4JFaayA9j6KscxxJ0gyJhbI,1541
+ bigdl_core_npu-2.6.0b20241206.dist-info/WHEEL,sha256=nkBcd8Ko0v5sEcSagm2-x_RVrb8gBSkTa8VFFZ0Mr1o,101
+ bigdl_core_npu-2.6.0b20241206.dist-info/top_level.txt,sha256=iMQZlTsFPJjlD-Y0MqZEP_9ifI0LlbNCJIOTaMoGMjk,46
+ bigdl_core_npu-2.6.0b20241206.dist-info/RECORD,,
@@ -3,4 +3,4 @@
  # SPDX-License-Identifier: Apache 2.0
  #

- __version__ = "2.6.0b20241204"
+ __version__ = "2.6.0b20241206"
@@ -116,7 +116,12 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
  for weight in weights:
      if isinstance(weight, (list, tuple)):
          # int8: data and scale
-         data, scale = weight
+         if len(weight) == 2:
+             data, scale = weight
+             zero = None
+         elif len(weight) == 3:
+             # for asym int4
+             data, scale, zero = weight
          if data.dtype not in [np.int8, np.uint8]:
              raise RuntimeError(
                  "Quantized weights needs to be in int8 or uint8 format"
@@ -133,7 +138,20 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
                  *shape,
                  *shape_scale,
              )
+         elif data.dtype == np.uint8 and zero is not None:
+             # asym_int4
+             adapted_weights_zero, shape_zero = adapt_weight(zero)
+             backend_lib.addAsymInt4Parameter(
+                 param,
+                 adapted_weights,
+                 adapted_weights_scale,
+                 adapted_weights_zero,
+                 *shape,
+                 *shape_scale,
+                 *shape_zero
+             )
          else:
+             # sym_int4
              backend_lib.addInt4Parameter(
                  param,
                  adapted_weights,
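
The two hunks above (from intel_npu_acceleration_library/backend/base.py, per the RECORD changes) extend the weight-prefetch path so a quantized weight can arrive either as a (data, scale) pair for symmetric int8/int4 or as a (data, scale, zero) triple for asymmetric int4, with the latter routed to the new addAsymInt4Parameter binding. A minimal sketch of that tuple dispatch, using stand-in NumPy arrays instead of the real backend_lib calls (the array shapes are illustrative assumptions, not values from this diff):

import numpy as np

def split_quantized_weight(weight):
    # Mirrors the new unpacking: 2 elements = symmetric (no zero point),
    # 3 elements = asymmetric int4 with an explicit zero-point array.
    if len(weight) == 2:
        data, scale = weight
        zero = None
    elif len(weight) == 3:
        data, scale, zero = weight
    else:
        raise RuntimeError("Expected (data, scale) or (data, scale, zero)")
    if data.dtype not in [np.int8, np.uint8]:
        raise RuntimeError("Quantized weights needs to be in int8 or uint8 format")
    return data, scale, zero

# Symmetric int8 weight: data + per-channel scale
sym = (np.zeros((128, 64), dtype=np.int8), np.ones((128, 1), dtype=np.float16))
# Asymmetric int4 weight: packed uint8 data + scale + zero point
asym = (
    np.zeros((128, 32), dtype=np.uint8),
    np.ones((128, 1), dtype=np.float16),
    np.full((128, 1), 8.0, dtype=np.float16),
)
print(split_quantized_weight(sym)[2])          # None
print(split_quantized_weight(asym)[2].shape)   # (128, 1)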
@@ -183,6 +183,7 @@ def init_network_factory(lib: ctypes.CDLL):
      ctypes.c_char_p,
      ctypes.c_char_p,
      ctypes.c_bool,
+     ctypes.c_bool,
  ]
  lib.linear.restype = handler

@@ -265,6 +266,7 @@ def init_network_factory(lib: ctypes.CDLL):
      ctypes.c_char_p,
      ctypes.c_char_p,
      ctypes.c_bool,
+     ctypes.c_bool,
  ]
  lib.dq_split_linear.restype = handler

@@ -278,6 +280,7 @@ def init_network_factory(lib: ctypes.CDLL):
      ctypes.c_char_p,
      ctypes.c_char_p,
      ctypes.c_bool,
+     ctypes.c_bool,
  ]
  lib.dq_split_linear_prefill.restype = handler

@@ -332,6 +335,19 @@ def init_parameters(lib: ctypes.CDLL):
      ctypes.c_int,
  ]

+ lib.addAsymInt4Parameter.argtypes = [
+     handler,
+     c_u8_array,
+     c_fp16_array,
+     c_fp16_array,
+     ctypes.c_int,
+     ctypes.c_int,
+     ctypes.c_int,
+     ctypes.c_int,
+     ctypes.c_int,
+     ctypes.c_int,
+ ]
+
  lib.addIntParameterConversion.argtypes = [
      handler,
      c_i8_array,
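
The new addAsymInt4Parameter argtypes above take a handler, one uint8 buffer (the packed 4-bit weights), two fp16 buffers (scale and zero point), and six ints, which lines up with the *shape, *shape_scale, *shape_zero unpacking at the call site in base.py (two dimensions per array). For intuition only, here is a NumPy sketch of the dequantization such an asymmetric int4 layout implies, w ≈ (q - zero) * scale; the actual nibble packing order inside npu_llm.dll is not visible in this diff, so the interleave below is an assumption:

import numpy as np

def dequantize_asym_int4(packed: np.ndarray, scale: np.ndarray, zero: np.ndarray) -> np.ndarray:
    # Reference math only: each uint8 byte carries two 4-bit codes,
    # reconstructed as w ~= (q - zero) * scale per output channel.
    low = packed & 0x0F
    high = packed >> 4
    q = np.empty((packed.shape[0], packed.shape[1] * 2), dtype=np.float16)
    q[:, 0::2] = low     # assumed nibble order (low nibble first)
    q[:, 1::2] = high
    return (q - zero) * scale

packed = np.random.randint(0, 256, size=(8, 16), dtype=np.uint8)
scale = np.full((8, 1), 0.05, dtype=np.float16)
zero = np.full((8, 1), 8.0, dtype=np.float16)
print(dequantize_asym_int4(packed, scale, zero).shape)  # (8, 32)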
@@ -378,6 +378,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
      act_dtype: npt.DTypeLike = np.float16,
      wt_dtype: npt.DTypeLike = np.float16,
      scale_factor: bool = True,
+     asym: bool=False,
  ) -> ctypes._Pointer:
      """Generate a linear layer.

@@ -400,7 +401,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
          bias,
          self.get_backend_dtype(act_dtype),
          self.get_backend_dtype(wt_dtype),
-         scale_factor
+         scale_factor,
+         asym
      )

  @return_tensor
@@ -412,6 +414,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
      scale_factor: bool = True,
      is_prefill: bool = False,
      use_dq: bool = True,
+     asym: bool = False,
  ) -> ctypes._Pointer:
      """Generate a linear layer for dynamic quantization linear layer.

@@ -424,7 +427,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
      act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
      wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
      scale_factor (bool, optional): enable/disable mul scale factor. Default to True,
-     is_prefill (bool, optional): enable/disable prefill linear optimization. Default to True.
+     is_prefill (bool, optional): enable/disable prefill linear optimization. Default to False.

  Returns:
      ctypes._Pointer: output node
@@ -437,7 +440,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
          input_channels, outout_channels, bias,
          self.get_backend_dtype(act_dtype),
          self.get_backend_dtype(wt_dtype),
-         scale_factor)
+         scale_factor, asym)

  @return_tensor
  def reshape(
@@ -18,6 +18,7 @@ class QLinear(NNFactory):
      profile: bool = False,
      device: str = "NPU",
      dtype: np.dtype = np.int8,
+     asym: bool = False
  ):
      """Initialize the QLinear class.

@@ -33,13 +34,14 @@
      super().__init__(profile, device)
      self.inC, self.outC = inC, outC
      self.batch = batch
+     self.asym = asym

      input = self.parameter((self.batch, self.inC))
-     _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
+     _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
      self.compile()

  def run(
-     self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str
+     self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, zero: np.ndarray=None, op_id: str=None
  ) -> np.ndarray:
      """Run the layer: $X * (W * S)^T$ .

@@ -67,5 +69,7 @@ class QLinear(NNFactory):
      raise RuntimeError(
          f"Scale shape {W.shape} different from expected one {(self.outC, 1)}"
      )
-
- return super().run(X, (W, scale), op_id=op_id)
+ if not self.asym:
+     return super().run(X, (W, scale), op_id=op_id)
+ else:
+     return super().run(X, (W, scale, zero), op_id=op_id)
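
Note that the updated QLinear.run inserts zero before op_id, so callers that previously passed op_id positionally should switch to a keyword argument. A hypothetical mock with the same signature illustrates how the weight tuple grows in the asymmetric case; FakeQLinear is not part of the library, and the real class compiles and runs an NPU graph rather than returning a string:

import numpy as np

class FakeQLinear:
    # Stand-in that only mimics the new run() signature and tuple selection.
    def __init__(self, asym: bool = False):
        self.asym = asym

    def run(self, X, W, scale, zero=None, op_id=None):
        weights = (W, scale) if not self.asym else (W, scale, zero)
        return f"op_id={op_id}: weight tuple with {len(weights)} arrays"

X = np.zeros((1, 64), dtype=np.float16)
W = np.zeros((32, 32), dtype=np.uint8)
s = np.ones((32, 1), dtype=np.float16)
z = np.zeros((32, 1), dtype=np.float16)

print(FakeQLinear(asym=False).run(X, W, s, op_id="layer0"))    # (W, scale)
print(FakeQLinear(asym=True).run(X, W, s, z, op_id="layer0"))  # (W, scale, zero)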
@@ -27,6 +27,7 @@ def run_matmul(
      x: torch.Tensor,
      weights: torch.Tensor,
      scale: Optional[torch.Tensor] = None,
+     zero: Optional[torch.Tensor] = None,
      op_id: Optional[str] = None,
  ) -> torch.Tensor:
      """Run a matmul operation. Depending on the datatype of the weights it runs a float or quantized operation.
@@ -35,6 +36,7 @@ def run_matmul(
      x (torch.Tensor): Activation tensor. Its dtype must be torch.float16
      weights (torch.Tensor): Weights tensor. Its dtype can be torch.float16 or torch.int8
      scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None.
+     zero (Optional[torch.Tensor], optional): Quantization zero point for asym_int4. If weights.dtype == torch.uint8 and asym_int4 is used then it must be set. Defaults to None.
      op_id (Optional[str], optional): Operation ID. Defaults to None.

  Raises:
@@ -68,12 +70,15 @@ def run_matmul(
      op_class = QLinear if op_id is not None else QMatMul
      op_class_name = op_class.__name__
      np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
-     create_op = partial(op_class, dtype=np_dtype)
+     create_op = partial(op_class, dtype=np_dtype, asym=(zero is not None))
      if scale is None:
          raise RuntimeError(
              f"Quantized matmul (weights dtype == {weights.dtype}) requires scale (scale = {scale})"
          )
-     op_args = [weights.numpy(), scale.numpy()]
+     if zero is None:
+         op_args = [weights.numpy(), scale.numpy()]
+     else:
+         op_args = [weights.numpy(), scale.numpy(), zero.numpy()]
  else:
      raise RuntimeError(f"Unsupported dtype for weights {weights.dtype}")
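
In the hunk above, run_matmul derives the asymmetric flag from the presence of the zero tensor (asym=(zero is not None)) and appends zero.numpy() to the op arguments. A small sketch of that dispatch with the backend call omitted; pick_quantized_op_args is a hypothetical helper for illustration, not a library function:

import numpy as np
import torch

def pick_quantized_op_args(weights: torch.Tensor, scale: torch.Tensor, zero=None):
    # Reproduces only the argument selection added in this diff.
    np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
    asym = zero is not None
    if scale is None:
        raise RuntimeError("Quantized matmul requires a scale tensor")
    if zero is None:
        op_args = [weights.numpy(), scale.numpy()]
    else:
        op_args = [weights.numpy(), scale.numpy(), zero.numpy()]
    return np_dtype, asym, op_args

w = torch.zeros(32, 32, dtype=torch.uint8)   # packed asym-int4 weights arrive as uint8
s = torch.ones(32, 1, dtype=torch.float16)
z = torch.zeros(32, 1, dtype=torch.float16)

dtype, asym, args = pick_quantized_op_args(w, s)
print(dtype, asym, len(args))     # uint8, False, 2 -> symmetric path
dtype, asym, args = pick_quantized_op_args(w, s, z)
print(dtype, asym, len(args))     # uint8, True, 3 -> asymmetric path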