bigdl-core-npu 2.6.0b20241203__cp310-cp310-win_amd64.whl → 2.6.0b20241206__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl-core-npu/include/common.h +1 -0
- bigdl-core-npu/npu_llm.dll +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/METADATA +1 -1
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/RECORD +13 -13
- intel_npu_acceleration_library/_version.py +1 -1
- intel_npu_acceleration_library/backend/base.py +19 -1
- intel_npu_acceleration_library/backend/bindings.py +16 -0
- intel_npu_acceleration_library/backend/factory.py +6 -3
- intel_npu_acceleration_library/backend/qlinear.py +8 -4
- intel_npu_acceleration_library/backend/runtime.py +7 -2
- intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/WHEEL +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/top_level.txt +0 -0
bigdl-core-npu/include/common.h
CHANGED

bigdl-core-npu/npu_llm.dll
CHANGED
Binary file

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/RECORD
RENAMED
@@ -1,10 +1,10 @@
 bigdl-core-npu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bigdl-core-npu/npu_llm.dll,sha256=
+bigdl-core-npu/npu_llm.dll,sha256=viVJrn2M4UOoBncCLrsj78HlS00-PXBOYnpfF2tag2c,3918336
 bigdl-core-npu/npu_llm.lib,sha256=V8WB7fKSKDwBopflxjFk0460SyoAS3LNbkYgFJrgqMQ,43846
-bigdl-core-npu/include/common.h,sha256=
+bigdl-core-npu/include/common.h,sha256=rQ7aSNAthGaImWtvs2ZetTZoZHN8iayiuc02M5QP-Pw,2666
 bigdl-core-npu/include/npu_llm.h,sha256=dS7_Esxw9Nxz3x07zTei2GEhTgxhS5IGBnyOVf3OS0k,2171
 intel_npu_acceleration_library/__init__.py,sha256=ZKTIhGMDjF7P6pF-yX8KWcSXbeHWRk24AO_orsa18f8,536
-intel_npu_acceleration_library/_version.py,sha256=
+intel_npu_acceleration_library/_version.py,sha256=n-zEmitSgrCJTtvRD4v5hIEhQtBI4j9eS-kJZTKXXyc,112
 intel_npu_acceleration_library/compiler.py,sha256=3IdgqjamSC8MLexDBJypIeZRiWIcTFnvQSU1LPXUr7Y,6225
 intel_npu_acceleration_library/device.py,sha256=9bn8eVXJa5cXIqgfLsQAdkMVtVUQABb8z0-mQik5jRg,7424
 intel_npu_acceleration_library/dtypes.py,sha256=gdd06Wsc9zIZFHlauUEx4xcK9WGTn1Mu6GkuYDJeA-E,4683
@@ -12,18 +12,18 @@ intel_npu_acceleration_library/modelling.py,sha256=vSiQOWGJ0l6wGV7zWQtZEkHpnMQIM
 intel_npu_acceleration_library/optimizations.py,sha256=9NY8QoDFbs2LY12jbx6As8g2v0oInX4YzvkjnqViA70,5469
 intel_npu_acceleration_library/quantization.py,sha256=6N_04h1KX6TNbw-ceANV0Pmk4_lQ2Y9C7Pwn5x-zQzo,5566
 intel_npu_acceleration_library/backend/__init__.py,sha256=2NP6Ypr1dGUNXmLGW5GD9xrh0U9KJgqxTd_c7su1RUY,857
-intel_npu_acceleration_library/backend/base.py,sha256=
-intel_npu_acceleration_library/backend/bindings.py,sha256=
+intel_npu_acceleration_library/backend/base.py,sha256=0EXHZTMrelebJ6HOSe74zE1mhy9tghXrkYnRQDLzwk4,9492
+intel_npu_acceleration_library/backend/bindings.py,sha256=mu7EJ60X1cAFa1y17yA4r3n5lwFBSzsq5u-6Nj-OaJ0,10352
 intel_npu_acceleration_library/backend/compression.py,sha256=Avz_zm2s_ELy5peVQ8zFGn8njBfh9nEGR16mflotBic,630
 intel_npu_acceleration_library/backend/convolution.py,sha256=cN3k78X3Y4Cbf7er-MFq0sJ4OwIvquj8PajpdEDmCo4,2018
-intel_npu_acceleration_library/backend/factory.py,sha256=
+intel_npu_acceleration_library/backend/factory.py,sha256=RuVxbG-jHRhkkUU2q2kITVov3r-gBb6SGslWS-eL-Sk,41035
 intel_npu_acceleration_library/backend/linear.py,sha256=RiLUh5FOSxRWHB5kYx7mOPOOrS_vxIeBJ5t3yC6wOiQ,1908
 intel_npu_acceleration_library/backend/matmul.py,sha256=mfGi73-mIbUcXp4kyvCGW0Y9kb4Xp1ppbGNpdJFohuA,1819
 intel_npu_acceleration_library/backend/mlp.py,sha256=BuKVwSI726v3nHQQvtMBbXyWxRTq-WoLZtTxeSeWaaY,2330
 intel_npu_acceleration_library/backend/ops.py,sha256=3yS-f-VPErHFt_oWZrgplNmSWnRrjm_wdxXNsqEBN7M,5070
-intel_npu_acceleration_library/backend/qlinear.py,sha256=
+intel_npu_acceleration_library/backend/qlinear.py,sha256=4k0QGq8beo20BgXSmKFlCo6XaI3ZqAMtJR4_VCmv0rU,2565
 intel_npu_acceleration_library/backend/qmatmul.py,sha256=pJkFJaBxZk3Oh5w_f6ywRNeGOfloEOCj0mCGnvim9Ew,2250
-intel_npu_acceleration_library/backend/runtime.py,sha256=
+intel_npu_acceleration_library/backend/runtime.py,sha256=anHg8F2mfTYm9aqli_AOibXW-klWg1A7hEAJiQa_Azk,7557
 intel_npu_acceleration_library/backend/sdpa.py,sha256=HNlL9jEA9OH3KnZqOkLcaKwt8tfCe5apUQxlWw0UhlA,3818
 intel_npu_acceleration_library/backend/tensor.py,sha256=swTymMVcXLPFXOlo6b_H3VGO1xf76Fz30RKyJPrVhPY,35468
 intel_npu_acceleration_library/backend/utils.py,sha256=WJ2agtqYxBuDd21ngE55io9VX-MOGg-AjnE63UHpCiU,2174
@@ -198,7 +198,7 @@ intel_npu_acceleration_library/external/openvino/torch/__init__.py,sha256=RXLzsf
 intel_npu_acceleration_library/functional/__init__.py,sha256=WWKwKOh6Sgovv7mKctA872TbLP98Pg5m5-MREvUmlAA,204
 intel_npu_acceleration_library/functional/scaled_dot_product_attention.py,sha256=yGUcg4tDQOLuUnP1g74cl-ec8TRr2SuAMcNLlN6qLvE,1620
 intel_npu_acceleration_library/lib/Release/cache.json,sha256=CyrSqZUWo0Ec4_7ydOiuKIC0Gm8AybrGdozUqUuHxBw,8840377
-intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=
+intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=o1V9NO0l3hYO1sjwCWuinyalVBQmHMXbWmOB6XLlvOo,340480
 intel_npu_acceleration_library/lib/Release/openvino.dll,sha256=m7M119p3JBq2YYJJ2zzCaBDz6XivKK3nNykb8L1cvDU,13244768
 intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll,sha256=2v_I9P3Qo0St1bQZMEZscnFOUVvgZQQ0HvQlG3HtTd0,203104
 intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll,sha256=e3Aj9CDRHN30dBEdPSk7OCWe52tWfhI4xeXgyFjuDHg,475488
@@ -228,7 +228,7 @@ intel_npu_acceleration_library/nn/functional.py,sha256=UfAKBc0u6RtyaMo14ldH2GpEn
 intel_npu_acceleration_library/nn/linear.py,sha256=Q06SoGQeLaI86nA_ky2GnFC6H2Fw1zyMDILKnpYC2eo,5739
 intel_npu_acceleration_library/nn/llm.py,sha256=P6dz36Yf6BHtzWcftaghC6QaMI_WeRfQwrCbO7fD6hk,15002
 intel_npu_acceleration_library/nn/module.py,sha256=EYxoTq6I_YgBDgTF76GPDxHrT8SupOTDGMzQaomBeq8,12667
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
+bigdl_core_npu-2.6.0b20241206.dist-info/METADATA,sha256=VIzHA4u_YDpz5WLHKj_A4JFaayA9j6KscxxJ0gyJhbI,1541
+bigdl_core_npu-2.6.0b20241206.dist-info/WHEEL,sha256=tcd-HDpskugT8GYYKyyid0lOlzoZtZdWwcrj5ormtfo,101
+bigdl_core_npu-2.6.0b20241206.dist-info/top_level.txt,sha256=iMQZlTsFPJjlD-Y0MqZEP_9ifI0LlbNCJIOTaMoGMjk,46
+bigdl_core_npu-2.6.0b20241206.dist-info/RECORD,,
intel_npu_acceleration_library/backend/base.py
CHANGED
@@ -116,7 +116,12 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
         for weight in weights:
             if isinstance(weight, (list, tuple)):
                 # int8: data and scale
-                data, scale = weight
+                if len(weight) == 2:
+                    data, scale = weight
+                    zero = None
+                elif len(weight) == 3:
+                    # for asym int4
+                    data, scale, zero = weight
                 if data.dtype not in [np.int8, np.uint8]:
                     raise RuntimeError(
                         "Quantized weights needs to be in int8 or uint8 format"
@@ -133,7 +138,20 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
                     *shape,
                     *shape_scale,
                 )
+                elif data.dtype == np.uint8 and zero is not None:
+                    # asym_int4
+                    adapted_weights_zero, shape_zero = adapt_weight(zero)
+                    backend_lib.addAsymInt4Parameter(
+                        param,
+                        adapted_weights,
+                        adapted_weights_scale,
+                        adapted_weights_zero,
+                        *shape,
+                        *shape_scale,
+                        *shape_zero
+                    )
                 else:
+                    # sym_int4
                     backend_lib.addInt4Parameter(
                         param,
                         adapted_weights,
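Note: taken together, these two base.py hunks let a quantized weight arrive either as a 2-tuple (data, scale) for symmetric quantization or as a 3-tuple (data, scale, zero) for asymmetric int4. A minimal numpy sketch of quantizers that produce those tuple layouts; illustrative only, the function names are hypothetical, and the package may pack two 4-bit codes per uint8 byte rather than one as here:

import numpy as np

def quantize_sym_int8(w: np.ndarray):
    # Symmetric: per-output-channel fp16 scale, signed int8 codes, no zero point.
    scale = np.abs(w).max(axis=1, keepdims=True) / 127.0
    data = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return data, scale.astype(np.float16)  # unpacked by the len(weight) == 2 branch

def quantize_asym_uint4(w: np.ndarray):
    # Asymmetric: per-channel scale and zero point, unsigned 4-bit codes in [0, 15].
    lo = w.min(axis=1, keepdims=True)
    hi = w.max(axis=1, keepdims=True)
    scale = (hi - lo) / 15.0
    zero = np.round(-lo / scale)
    data = np.clip(np.round(w / scale) + zero, 0, 15).astype(np.uint8)
    return data, scale.astype(np.float16), zero.astype(np.float16)  # len(weight) == 3 branch

w = np.random.randn(4, 8).astype(np.float32)
for weight in (quantize_sym_int8(w), quantize_asym_uint4(w)):
    print(len(weight), weight[0].dtype)  # "2 int8", then "3 uint8"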
intel_npu_acceleration_library/backend/bindings.py
CHANGED
@@ -183,6 +183,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.linear.restype = handler
 
@@ -265,6 +266,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.dq_split_linear.restype = handler
 
@@ -278,6 +280,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.dq_split_linear_prefill.restype = handler
 
@@ -332,6 +335,19 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addAsymInt4Parameter.argtypes = [
+        handler,
+        c_u8_array,
+        c_fp16_array,
+        c_fp16_array,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
     lib.addIntParameterConversion.argtypes = [
         handler,
         c_i8_array,
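Note: the new addAsymInt4Parameter entry point takes a network handle, one uint8 data array, fp16 scale and zero-point arrays, and then three 2-D shapes (six c_int values). A hedged sketch of how such a binding is typically declared with numpy.ctypeslib; c_u8_array, c_fp16_array, and handler below are assumed stand-ins for the module's own helper types, which this diff does not show:

import ctypes
import numpy as np
from numpy.ctypeslib import ndpointer

c_u8_array = ndpointer(dtype=np.uint8, flags=["CONTIGUOUS"])      # assumption
c_fp16_array = ndpointer(dtype=np.float16, flags=["CONTIGUOUS"])  # assumption
handler = ctypes.POINTER(ctypes.c_char)                           # assumed opaque handle

def declare_add_asym_int4(lib: ctypes.CDLL) -> None:
    # Mirrors the argtypes block added above: handle, data, scale, zero,
    # then rows/cols for each of the three arrays.
    lib.addAsymInt4Parameter.argtypes = [
        handler,
        c_u8_array,    # quantized weight data
        c_fp16_array,  # per-channel scales
        c_fp16_array,  # per-channel zero points
        ctypes.c_int, ctypes.c_int,  # weight shape
        ctypes.c_int, ctypes.c_int,  # scale shape
        ctypes.c_int, ctypes.c_int,  # zero-point shape
    ]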
intel_npu_acceleration_library/backend/factory.py
CHANGED
@@ -378,6 +378,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         act_dtype: npt.DTypeLike = np.float16,
         wt_dtype: npt.DTypeLike = np.float16,
         scale_factor: bool = True,
+        asym: bool=False,
     ) -> ctypes._Pointer:
         """Generate a linear layer.
 
@@ -400,7 +401,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
-            scale_factor
+            scale_factor,
+            asym
         )
 
     @return_tensor
@@ -412,6 +414,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         scale_factor: bool = True,
         is_prefill: bool = False,
         use_dq: bool = True,
+        asym: bool = False,
     ) -> ctypes._Pointer:
         """Generate a linear layer for dynamic quantization linear layer.
 
@@ -424,7 +427,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
             wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
             scale_factor (bool, optional): enable/disable mul scale factor. Default to True,
-            is_prefill (bool, optional): enable/disable prefill linear optimization. Default to
+            is_prefill (bool, optional): enable/disable prefill linear optimization. Default to False.
 
         Returns:
             ctypes._Pointer: output node
@@ -437,7 +440,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             input_channels, outout_channels, bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
-            scale_factor)
+            scale_factor, asym)
 
     @return_tensor
     def reshape(
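Note: the factory.py changes thread one extra trailing boolean (asym) through linear and the dq_split_linear variants down to the native entry points. A hedged usage sketch; the shapes are illustrative, the import path is assumed from the package layout, and actually building the graph requires an Intel NPU with the bundled native library:

import numpy as np
from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()             # defaults: profile=False, device="NPU"
x = factory.parameter((1, 4096))  # activation placeholder
# uint8 weights plus asym=True select the asymmetric int4 path
y = factory.linear(x, 11008, 4096, bias=False, wt_dtype=np.uint8, asym=True)
factory.compile()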
intel_npu_acceleration_library/backend/qlinear.py
CHANGED
@@ -18,6 +18,7 @@ class QLinear(NNFactory):
         profile: bool = False,
         device: str = "NPU",
         dtype: np.dtype = np.int8,
+        asym: bool = False
     ):
         """Initialize the QLinear class.
 
@@ -33,13 +34,14 @@ class QLinear(NNFactory):
         super().__init__(profile, device)
         self.inC, self.outC = inC, outC
         self.batch = batch
+        self.asym = asym
 
         input = self.parameter((self.batch, self.inC))
-        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
+        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
         self.compile()
 
     def run(
-        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str
+        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, zero: np.ndarray=None, op_id: str=None
     ) -> np.ndarray:
         """Run the layer: $X * (W * S)^T$ .
 
@@ -67,5 +69,7 @@ class QLinear(NNFactory):
             raise RuntimeError(
                 f"Scale shape {W.shape} different from expected one {(self.outC, 1)}"
             )
-
-        return super().run(X, (W, scale), op_id=op_id)
+        if not self.asym:
+            return super().run(X, (W, scale), op_id=op_id)
+        else:
+            return super().run(X, (W, scale, zero), op_id=op_id)
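Note: with asym=True the layer expects a zero point at run time and forwards (W, scale, zero) to the base backend. A hedged usage sketch; shapes are illustrative (real uint8 weights come from the package's quantizer and may be nibble-packed) and execution requires NPU hardware:

import numpy as np
from intel_npu_acceleration_library.backend import QLinear

inC, outC, batch = 128, 128, 1
layer = QLinear(inC, outC, batch, dtype=np.uint8, asym=True)

X = np.random.rand(batch, inC).astype(np.float16)
W = np.random.randint(0, 16, (outC, inC), dtype=np.uint8)
scale = np.random.rand(outC, 1).astype(np.float16)
zero = np.random.rand(outC, 1).astype(np.float16)

out = layer.run(X, W, scale, zero=zero, op_id="0")  # asym path: (W, scale, zero)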
intel_npu_acceleration_library/backend/runtime.py
CHANGED
@@ -27,6 +27,7 @@ def run_matmul(
     x: torch.Tensor,
     weights: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
+    zero: Optional[torch.Tensor] = None,
     op_id: Optional[str] = None,
 ) -> torch.Tensor:
     """Run a matmul operation. Depending on the datatype of the weights it runs a float or quantized operation.
@@ -35,6 +36,7 @@ def run_matmul(
         x (torch.Tensor): Activation tensor. Its dtype must be torch.float16
         weights (torch.Tensor): Weights tensor. Its dtype can be torch.float16 or torch.int8
         scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None.
+        zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4. If weights.dtype == torch.uint8 and use asym_int4 then it must be set and asym Defaults to None.
         op_id (Optional[str], optional): Operation ID. Defaults to None.
 
     Raises:
@@ -68,12 +70,15 @@ def run_matmul(
         op_class = QLinear if op_id is not None else QMatMul
         op_class_name = op_class.__name__
         np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
-        create_op = partial(op_class, dtype=np_dtype)
+        create_op = partial(op_class, dtype=np_dtype, asym=(zero is not None))
         if scale is None:
             raise RuntimeError(
                 f"Quantized matmul (weights dtype == {weights.dtype}) requires scale (scale = {scale})"
             )
-        op_args = [weights.numpy(), scale.numpy()]
+        if zero is None:
+            op_args = [weights.numpy(), scale.numpy()]
+        else:
+            op_args = [weights.numpy(), scale.numpy(), zero.numpy()]
     else:
         raise RuntimeError(f"Unsupported dtype for weights {weights.dtype}")
 
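Note: at the torch level, supplying a zero tensor is what turns on the asymmetric path (asym=(zero is not None)) and extends op_args with zero.numpy(). A hedged usage sketch with illustrative shapes; running it requires NPU hardware:

import torch
from intel_npu_acceleration_library.backend.runtime import run_matmul

x = torch.rand(1, 128, dtype=torch.float16)
w = torch.randint(0, 16, (128, 128), dtype=torch.uint8)  # asym int4 codes
scale = torch.rand(128, 1, dtype=torch.float16)
zero = torch.rand(128, 1, dtype=torch.float16)

# zero omitted -> op_args = [W, scale]; zero given -> [W, scale, zero] with asym=True
y = run_matmul(x, w, scale=scale, zero=zero, op_id="blk0.qkv")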
intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll
CHANGED
Binary file

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/WHEEL
RENAMED
File without changes

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/top_level.txt
RENAMED
File without changes