bigdl-core-npu 2.5.0__cp310-cp310-win_amd64.whl → 2.6.0b20241101__cp310-cp310-win_amd64.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
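At a glance: the version moves from 2.5.0 to 2.6.0b20241101 and the wheel is rebuilt with setuptools 75.3.0. The substantive changes are in the backend: int4 (uint8-packed) weight parameters, a multi-input concat op, dynamically quantized split linear layers with separate decode and prefill entry points, a scaled-dot-product-attention op, a new scale_factor flag on linear, and an npu_dpu_groups argument to compile. The bundled intel_npu_acceleration_library.dll grows from 281,600 to 304,640 bytes accordingly.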
--- a/bigdl_core_npu-2.5.0.dist-info/METADATA
+++ b/bigdl_core_npu-2.6.0b20241101.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bigdl-core-npu
-Version: 2.5.0
+Version: 2.6.0b20241101
 Summary: Intel® NPU Acceleration Library
 Home-page: https://github.com/intel/intel-npu-acceleration-library
 Author: Alessandro Palla
--- a/bigdl_core_npu-2.5.0.dist-info/RECORD
+++ b/bigdl_core_npu-2.6.0b20241101.dist-info/RECORD
@@ -1,5 +1,5 @@
 intel_npu_acceleration_library/__init__.py,sha256=ZKTIhGMDjF7P6pF-yX8KWcSXbeHWRk24AO_orsa18f8,536
-intel_npu_acceleration_library/_version.py,sha256=-yyXJHoPI8Uu4p1coZDeAWH6XHHdLed8GM4ogYbrieE,103
+intel_npu_acceleration_library/_version.py,sha256=jcHRT5PZ4LFo80UDBbT1tq0P4zoXqHJw1NiKteCpiQw,112
 intel_npu_acceleration_library/compiler.py,sha256=3IdgqjamSC8MLexDBJypIeZRiWIcTFnvQSU1LPXUr7Y,6225
 intel_npu_acceleration_library/device.py,sha256=TbG4cJ197qo7PJQ5zz9zfxbuXB5OTWJlKNaKL4TAlms,7395
 intel_npu_acceleration_library/dtypes.py,sha256=1CV4FIuvlmLsTCS1nCCEwq4EzZmD3thj1_92v5vajpw,3539
@@ -7,11 +7,11 @@ intel_npu_acceleration_library/modelling.py,sha256=vSiQOWGJ0l6wGV7zWQtZEkHpnMQIM
 intel_npu_acceleration_library/optimizations.py,sha256=9NY8QoDFbs2LY12jbx6As8g2v0oInX4YzvkjnqViA70,5469
 intel_npu_acceleration_library/quantization.py,sha256=6N_04h1KX6TNbw-ceANV0Pmk4_lQ2Y9C7Pwn5x-zQzo,5566
 intel_npu_acceleration_library/backend/__init__.py,sha256=2NP6Ypr1dGUNXmLGW5GD9xrh0U9KJgqxTd_c7su1RUY,857
-intel_npu_acceleration_library/backend/base.py,sha256=7L1SE-8HKSB5efP8ACQ5tKa89NBkQlf2IxXrSUxGvjs,8317
-intel_npu_acceleration_library/backend/bindings.py,sha256=zoF6etBvQWwAsQmA-woyivZAmZk1RfJaWNn0QShaPjs,7925
+intel_npu_acceleration_library/backend/base.py,sha256=hbHqxSOfWH5BaA5PY6_zaf1Zdg5NrQK6WOfe-hr279k,8605
+intel_npu_acceleration_library/backend/bindings.py,sha256=cla6JRX7pqUDuRmsXN6K9cAKklHz_mb6butatR2Eu9I,8901
 intel_npu_acceleration_library/backend/compression.py,sha256=Avz_zm2s_ELy5peVQ8zFGn8njBfh9nEGR16mflotBic,630
 intel_npu_acceleration_library/backend/convolution.py,sha256=cN3k78X3Y4Cbf7er-MFq0sJ4OwIvquj8PajpdEDmCo4,2018
-intel_npu_acceleration_library/backend/factory.py,sha256=9RyDBzJJYKiFOd0IxMZl5dr6K_pDvfehhrGsE7xTTAw,32773
+intel_npu_acceleration_library/backend/factory.py,sha256=n63KE8X9eOuv2m2MiQFASjzgnkIM9deGtDC-qSHRMMw,38847
 intel_npu_acceleration_library/backend/linear.py,sha256=RiLUh5FOSxRWHB5kYx7mOPOOrS_vxIeBJ5t3yC6wOiQ,1908
 intel_npu_acceleration_library/backend/matmul.py,sha256=mfGi73-mIbUcXp4kyvCGW0Y9kb4Xp1ppbGNpdJFohuA,1819
 intel_npu_acceleration_library/backend/mlp.py,sha256=BuKVwSI726v3nHQQvtMBbXyWxRTq-WoLZtTxeSeWaaY,2330
@@ -187,7 +187,7 @@ intel_npu_acceleration_library/external/openvino/torch/__init__.py,sha256=RXLzsf
 intel_npu_acceleration_library/functional/__init__.py,sha256=WWKwKOh6Sgovv7mKctA872TbLP98Pg5m5-MREvUmlAA,204
 intel_npu_acceleration_library/functional/scaled_dot_product_attention.py,sha256=yGUcg4tDQOLuUnP1g74cl-ec8TRr2SuAMcNLlN6qLvE,1620
 intel_npu_acceleration_library/lib/Release/cache.json,sha256=CyrSqZUWo0Ec4_7ydOiuKIC0Gm8AybrGdozUqUuHxBw,8840377
-intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=B6ahqIBpVO62GjmyWmgmAfRq9IroDPK4H4Y6cAkbNDM,281600
+intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=wmiXqhTSSogchjl8WNQnNYwjHiMtyYOvPvATeOUQkt8,304640
 intel_npu_acceleration_library/lib/Release/openvino.dll,sha256=_ifEwHwM-7LuKMhAnlqNuJ2GxsLXbG47easxl5E4shU,12624904
 intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll,sha256=hXFvu4oLvfNhCODn5eNYOmkxBb0LEKYXHA0sZLccOXc,195080
 intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll,sha256=nh_iDxejjHlkes-KT0IwBzEd4Ec0L3bXQFCl0Dqerf8,472072
@@ -217,7 +217,7 @@ intel_npu_acceleration_library/nn/functional.py,sha256=UfAKBc0u6RtyaMo14ldH2GpEn
 intel_npu_acceleration_library/nn/linear.py,sha256=Q06SoGQeLaI86nA_ky2GnFC6H2Fw1zyMDILKnpYC2eo,5739
 intel_npu_acceleration_library/nn/llm.py,sha256=P6dz36Yf6BHtzWcftaghC6QaMI_WeRfQwrCbO7fD6hk,15002
 intel_npu_acceleration_library/nn/module.py,sha256=klVK4A0O-7fLzEIhGhE6_eVgvyVK_NakAqpDq08Ju1Y,12637
-bigdl_core_npu-2.5.0.dist-info/METADATA,sha256=NhXfzEaj8jWORFpNU4y5qcnnZVv8sjOMWfnhKcER2cE,1534
-bigdl_core_npu-2.5.0.dist-info/WHEEL,sha256=fsW6--WFfuzX2scefE6JfcSZ5dXg5h59u8lqlpL5uuo,101
-bigdl_core_npu-2.5.0.dist-info/top_level.txt,sha256=CH3qQoleRBC1eThu8mCEMxYNKdzJuXCtmeCXRKskt7A,31
-bigdl_core_npu-2.5.0.dist-info/RECORD,,
+bigdl_core_npu-2.6.0b20241101.dist-info/METADATA,sha256=oOcjv-wWArv5l6x58K0TH2hhhvYI11BjUXyMYRFRcGc,1543
+bigdl_core_npu-2.6.0b20241101.dist-info/WHEEL,sha256=09_eAv2LFHDbyhcOULd5e3WJrC_F5q7AlLDftiw-PyE,101
+bigdl_core_npu-2.6.0b20241101.dist-info/top_level.txt,sha256=CH3qQoleRBC1eThu8mCEMxYNKdzJuXCtmeCXRKskt7A,31
+bigdl_core_npu-2.6.0b20241101.dist-info/RECORD,,
--- a/bigdl_core_npu-2.5.0.dist-info/WHEEL
+++ b/bigdl_core_npu-2.6.0b20241101.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (72.1.0)
+Generator: setuptools (75.3.0)
 Root-Is-Purelib: false
 Tag: cp310-cp310-win_amd64
 
--- a/intel_npu_acceleration_library/_version.py
+++ b/intel_npu_acceleration_library/_version.py
@@ -3,4 +3,4 @@
 # SPDX-License-Identifier: Apache 2.0
 #
 
-__version__ = "2.5.0"
+__version__ = "2.6.0b20241101"
--- a/intel_npu_acceleration_library/backend/base.py
+++ b/intel_npu_acceleration_library/backend/base.py
@@ -153,7 +153,14 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
                 raise ValueError(f"Invalid dtype for scale: {scale.dtype}")
             else:
                 adapted_weights, shape = adapt_weight(weight)
-                backend_lib.addFloatParameter(param, adapted_weights, *shape)
+                if weight.dtype == np.uint8:
+                    backend_lib.addInt4WeightParameter(
+                        param,
+                        adapted_weights,
+                        *shape,
+                    )
+                else:
+                    backend_lib.addFloatParameter(param, adapted_weights, *shape)
         elif isinstance(weights, np.ndarray):
             adapted_weights, shape = adapt_weight(weights)
             backend_lib.addFloatParameter(param, adapted_weights, *shape)
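The new branch dispatches on dtype: weights that arrive as np.uint8 are treated as packed int4 and routed to the new addInt4WeightParameter binding (declared in the bindings.py hunk below). Here is a minimal sketch of one plausible packing scheme, two 4-bit values per byte; the actual layout the DLL expects is not visible in this diff, so pack_int4 and the [-8, 7] nibble convention are assumptions:

    # Illustrative only: pack_int4 and the nibble layout are assumptions,
    # not taken from this package's source.
    import numpy as np

    def pack_int4(w: np.ndarray) -> np.ndarray:
        """Pack an even-length array of int4 values in [-8, 7] into uint8, two per byte."""
        w = w.astype(np.int8).reshape(-1, 2)
        low = w[:, 0].astype(np.uint8) & 0x0F           # first value in the low nibble
        high = (w[:, 1].astype(np.uint8) & 0x0F) << 4   # second value in the high nibble
        return low | high

    packed = pack_int4(np.array([-8, 7, 1, -1], dtype=np.int8))
    assert packed.dtype == np.uint8  # a uint8 array is what selects the int4 branch above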
--- a/intel_npu_acceleration_library/backend/bindings.py
+++ b/intel_npu_acceleration_library/backend/bindings.py
@@ -143,7 +143,7 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.slice.restype = handler
 
-    lib.compile.argtypes = [handler]
+    lib.compile.argtypes = [handler, ctypes.c_int]
     lib.compile.restype = handler
 
     lib.get_output_tensor_shape_size.argtypes = [handler, ctypes.c_int]
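The native compile entry point now takes a second C int, matching the npu_dpu_groups parameter added to NNFactory.compile further down. For readers unfamiliar with the pattern, declaring argtypes/restype is how ctypes validates and converts arguments at call time; a self-contained sketch against the C runtime (not this package's DLL):

    import ctypes, ctypes.util, sys

    # Sketch of the argtypes/restype pattern, using libc/msvcrt rather than the NPU DLL.
    libc = ctypes.CDLL("msvcrt") if sys.platform == "win32" else ctypes.CDLL(ctypes.util.find_library("c"))
    libc.abs.argtypes = [ctypes.c_int]  # ctypes now checks and converts the argument
    libc.abs.restype = ctypes.c_int
    assert libc.abs(-5) == 5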
@@ -160,6 +160,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_bool,
         ctypes.c_char_p,
         ctypes.c_char_p,
+        ctypes.c_bool,
     ]
     lib.linear.restype = handler
 
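The extra trailing ctypes.c_bool on lib.linear lines up with the new scale_factor keyword that factory.py's linear() now forwards; see the factory.py hunks below.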
@@ -214,6 +215,41 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.max_pooling.restype = handler
 
+
+    lib.multi_concat.argtypes = [
+        handler,
+        ctypes.POINTER(handler),
+        ctypes.c_uint64,
+        ctypes.c_int64,
+    ]
+    lib.multi_concat.restype = handler
+
+    lib.dq_split_linear.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear.restype = handler
+
+    lib.dq_split_linear_prefill.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear_prefill.restype = handler
+
     for op in get_supported_ops():
         fn = getattr(lib, op.name)
         fn.argtypes = [handler] * (op.inputs + 1) + list(op.parameters)
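multi_concat takes a C array of node handles plus its length and the concat axis. The factory.py hunk below builds that array with the (POINTER(...) * n)(*nodes) idiom; a self-contained illustration of the same idiom with plain c_int pointers:

    import ctypes

    values = [ctypes.c_int(v) for v in (10, 20, 30)]
    ptrs = [ctypes.pointer(v) for v in values]
    arr = (ctypes.POINTER(ctypes.c_int) * len(ptrs))(*ptrs)  # contiguous array of pointers
    assert arr[2].contents.value == 30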
@@ -260,6 +296,13 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addInt4WeightParameter.argtypes = [
+        handler,
+        c_u8_array,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
 
 def initialize_bindings() -> ctypes.CDLL:
     """Load the Intel® NPU Acceleration Library runtime library, and initialize all c++ <-> python bindings.
--- a/intel_npu_acceleration_library/backend/factory.py
+++ b/intel_npu_acceleration_library/backend/factory.py
@@ -95,6 +95,75 @@ class NNFactory(BaseNPUBackendWithPrefetch):
 
         return cast(F, wrapper)
 
+    def return_tensor_for_list_inputs(fn: F) -> F:  # type: ignore
+        """Wrap the output of a function in a Tensor object.
+        This new wrapper adds support for list-of-Tensor inputs.
+
+        Args:
+            fn (function): Function
+
+        Returns:
+            function: A function that wraps the output in a Tensor object
+        """
+
+        def wrapper(self, *args: Any, **kwargs: Any) -> Tensor:
+            """Wrap the output of a function in a Tensor object.
+
+            Args:
+                args (Any): Variable length argument list
+                kwargs (Any): Arbitrary keyword arguments
+
+            Returns:
+                Tensor: Tensor object
+            """
+            # Convert Tensor objects to their underlying node
+            # args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+            new_args = []
+            for arg in args:
+                if isinstance(arg, Tensor):
+                    new_args.append(arg.node)
+                elif isinstance(arg, (tuple, list)):
+                    # for item in arg:
+                    for i in range(len(arg)):
+                        if isinstance(arg[i], Tensor):
+                            arg[i] = arg[i].node
+                    new_args.append(arg)
+                else:
+                    new_args.append(arg)
+            args = tuple(new_args)
+            kwargs = {
+                k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
+            }
+
+            # input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
+            #     v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
+            # ]
+            input_nodes = []
+            for arg in args:
+                if isinstance(arg, ctypes._Pointer):
+                    input_nodes.append(arg)
+                elif isinstance(arg, (tuple, list)):
+                    for item in arg:
+                        if isinstance(item, ctypes._Pointer):
+                            input_nodes.append(item)
+            input_nodes += [v for v in kwargs.values() if isinstance(v, ctypes._Pointer)]
+
+            # Call the function
+            node = fn(self, *args, **kwargs)
+
+            # remove input nodes from output_nodes
+            self.output_nodes = [
+                node for node in self.output_nodes if node not in input_nodes
+            ]
+            # add output node to output_nodes
+            if fn.__name__ != "constant":
+                self.output_nodes.append(node)
+
+            # Wrap the node in a Tensor object
+            return Tensor(factory=self, node=node)
+
+        return cast(F, wrapper)
+
     @return_tensor
     def _call_backend_op(self, op_name: str, *parameters: Any) -> Any:
         """Dynamically call a backend operation.
@@ -319,6 +388,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         bias: Optional[bool] = False,
         act_dtype: npt.DTypeLike = np.float16,
         wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
     ) -> ctypes._Pointer:
         """Generate a linear layer.
 
@@ -341,7 +411,40 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
+            scale_factor
         )
+
+    @return_tensor
+    def dq_split_linear(
+        self, input_node: ctypes._Pointer, n_splits: int,
+        outout_channels: int, input_channels: int, bias: bool = False,
+        act_dtype: npt.DTypeLike = np.float16,
+        wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
+        is_prefill: bool = False,
+    ) -> ctypes._Pointer:
+        """Generate a dynamically quantized, split linear layer.
+
+        Args:
+            input_node (ctypes._Pointer): layer input node
+            n_splits (int): number of parts the linear layer is split into
+            outout_channels (int): number of output channels
+            input_channels (int): number of input channels
+            bias (bool, optional): enable/disable bias. Defaults to False.
+            act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+            wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
+            scale_factor (bool, optional): enable/disable the mul scale factor. Defaults to True.
+            is_prefill (bool, optional): enable/disable the prefill linear optimization. Defaults to False.
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        func = backend_lib.dq_split_linear_prefill if is_prefill else backend_lib.dq_split_linear
+        return func(self._mm, input_node, n_splits,
+                    input_channels, outout_channels, bias,
+                    self.get_backend_dtype(act_dtype),
+                    self.get_backend_dtype(wt_dtype),
+                    scale_factor)
 
     @return_tensor
     def reshape(
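A hedged usage sketch (requires the installed wheel and an NPU runtime; the shapes are illustrative). Note the parameter really is spelled outout_channels in the source; is_prefill selects the dq_split_linear_prefill binding instead of dq_split_linear:

    import numpy as np
    from intel_npu_acceleration_library.backend import NNFactory

    factory = NNFactory()
    x = factory.parameter((1, 4096), dtype=np.float16)
    # 4 splits, 11008 output channels, 4096 input channels; decode-path variant
    y = factory.dq_split_linear(x, 4, 11008, 4096, bias=False, is_prefill=False)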
@@ -474,6 +577,27 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         axis = np.int64(axis)
         return backend_lib.concat(self._mm, input_node_1, input_node_2, axis)
 
+    @return_tensor_for_list_inputs
+    def sequence_concat(
+        self, input_nodes: List[ctypes._Pointer], axis: int
+    ) -> ctypes._Pointer:
+        """Generate a concatenation layer over a sequence of inputs.
+
+        Args:
+            input_nodes (List[ctypes._Pointer]): sequence of layer input nodes
+            axis (int): axis
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if axis < 0:
+            shape_size = backend_lib.op_shape_size(input_nodes[0])
+            axis = (axis + shape_size) % shape_size
+        axis = np.int64(axis)
+
+        input_ptr = (ctypes.POINTER(ctypes.c_char) * len(input_nodes))(*input_nodes)
+        return backend_lib.multi_concat(self._mm, input_ptr, len(input_nodes), axis)
+
     @return_tensor
     def reduce_max(
         self,
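Because sequence_concat is decorated with return_tensor_for_list_inputs rather than return_tensor, it accepts a plain Python list of tensors directly and still returns a single Tensor. A hedged sketch, with the same caveats as the example above:

    import numpy as np
    from intel_npu_acceleration_library.backend import NNFactory

    factory = NNFactory()
    a = factory.parameter((1, 16), dtype=np.float16)
    b = factory.parameter((1, 16), dtype=np.float16)
    c = factory.parameter((1, 16), dtype=np.float16)
    out = factory.sequence_concat([a, b, c], axis=-1)  # negative axis is normalized above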
@@ -777,6 +901,27 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             auto_pad, # auto_pad
         )
 
+    @return_tensor
+    def scaled_dot_product_attention(
+        self, query: ctypes._Pointer, key: ctypes._Pointer,
+        value: ctypes._Pointer, attn_mask: ctypes._Pointer,
+        is_causal: bool
+    ) -> ctypes._Pointer:
+        """Construct a ScaledDotProductAttention operation.
+        Args:
+            query (ctypes._Pointer): query
+            key (ctypes._Pointer): key
+            value (ctypes._Pointer): value
+            attn_mask (ctypes._Pointer): attention mask
+            is_causal (bool): causal/not causal
+        Returns:
+            ctypes._Pointer: output node
+        """
+        return backend_lib.scaled_dot_product_attention(self._mm,
+                                                        query, key,
+                                                        value, attn_mask,
+                                                        is_causal)
+
     def get_tensor_shape(self, node):
         """Get tensor shape.
 
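The signature mirrors torch.nn.functional.scaled_dot_product_attention, which this library already wraps in functional/scaled_dot_product_attention.py. A hedged sketch with illustrative shapes (batch, heads, sequence, head_dim):

    import numpy as np
    from intel_npu_acceleration_library.backend import NNFactory

    factory = NNFactory()
    q = factory.parameter((1, 8, 128, 64), dtype=np.float16)
    k = factory.parameter((1, 8, 128, 64), dtype=np.float16)
    v = factory.parameter((1, 8, 128, 64), dtype=np.float16)
    mask = factory.parameter((1, 1, 128, 128), dtype=np.float16)
    attn = factory.scaled_dot_product_attention(q, k, v, mask, is_causal=False)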
@@ -826,7 +971,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         else:
             raise RuntimeError("Unsupported dtype")
 
-    def compile(self):
+    def compile(self, npu_dpu_groups=4):
         """Finalize and compile a model."""
         self.out = []
         self.torch_out = []
@@ -834,7 +979,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             backend_lib.result(self._mm, node)
 
         # Compile the model
-        backend_lib.compile(self._mm)
+        backend_lib.compile(self._mm, npu_dpu_groups)
 
         for idx, node in enumerate(self.output_nodes):
             output_shape = self.get_tensor_shape(node)
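The new knob is passed straight through to the native compiler; the default of 4 presumably reflects the DPU-group count of current Intel NPUs, though the diff does not say so. Hedged usage, continuing the factory sketches above:

    # After building the graph on `factory`:
    factory.compile(npu_dpu_groups=4)  # explicit default; the meaning of the knob is inferred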