bigdl-core-npu 2.5.0-cp311-cp311-win_amd64.whl → 2.6.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/common.lib +0 -0
  3. bigdl-core-npu/ggml.dll +0 -0
  4. bigdl-core-npu/ggml.lib +0 -0
  5. bigdl-core-npu/include/llamacpp/arg.h +77 -0
  6. bigdl-core-npu/include/llamacpp/common.h +563 -0
  7. bigdl-core-npu/include/llamacpp/ggml-alloc.h +76 -0
  8. bigdl-core-npu/include/llamacpp/ggml-backend.h +241 -0
  9. bigdl-core-npu/include/llamacpp/ggml.h +2679 -0
  10. bigdl-core-npu/include/llamacpp/llama.h +1234 -0
  11. bigdl-core-npu/include/llamacpp/log.h +92 -0
  12. bigdl-core-npu/include/npu/npu_common.h +119 -0
  13. bigdl-core-npu/include/npu/npu_llm.h +77 -0
  14. bigdl-core-npu/llama-cli-npu.exe +0 -0
  15. bigdl-core-npu/llama.dll +0 -0
  16. bigdl-core-npu/llama.lib +0 -0
  17. bigdl-core-npu/llm-cli.exe +0 -0
  18. bigdl-core-npu/npu_llm.dll +0 -0
  19. bigdl-core-npu/npu_llm.lib +0 -0
  20. bigdl-core-npu/zlib1.dll +0 -0
  21. bigdl_core_npu-2.6.0.data/scripts/init-llama-cpp.bat +29 -0
  22. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/METADATA +12 -3
  23. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/RECORD +146 -96
  24. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/WHEEL +1 -1
  25. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/top_level.txt +1 -0
  26. intel_npu_acceleration_library/_version.py +1 -1
  27. intel_npu_acceleration_library/backend/base.py +39 -4
  28. intel_npu_acceleration_library/backend/bindings.py +109 -5
  29. intel_npu_acceleration_library/backend/factory.py +264 -47
  30. intel_npu_acceleration_library/backend/ops.py +2 -1
  31. intel_npu_acceleration_library/backend/qlinear.py +8 -4
  32. intel_npu_acceleration_library/backend/runtime.py +7 -2
  33. intel_npu_acceleration_library/backend/tensor.py +73 -3
  34. intel_npu_acceleration_library/bigdl-core-npu/cache.json +113732 -0
  35. intel_npu_acceleration_library/bigdl-core-npu/openvino.dll +0 -0
  36. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_batch_plugin.dll +0 -0
  37. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_plugin.dll +0 -0
  38. intel_npu_acceleration_library/bigdl-core-npu/openvino_c.dll +0 -0
  39. intel_npu_acceleration_library/bigdl-core-npu/openvino_hetero_plugin.dll +0 -0
  40. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_cpu_plugin.dll +0 -0
  41. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_gpu_plugin.dll +0 -0
  42. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_npu_plugin.dll +0 -0
  43. intel_npu_acceleration_library/bigdl-core-npu/openvino_ir_frontend.dll +0 -0
  44. intel_npu_acceleration_library/bigdl-core-npu/openvino_onnx_frontend.dll +0 -0
  45. intel_npu_acceleration_library/bigdl-core-npu/openvino_paddle_frontend.dll +0 -0
  46. intel_npu_acceleration_library/bigdl-core-npu/openvino_pytorch_frontend.dll +0 -0
  47. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_frontend.dll +0 -0
  48. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_lite_frontend.dll +0 -0
  49. intel_npu_acceleration_library/bigdl-core-npu/tbb12.dll +0 -0
  50. intel_npu_acceleration_library/bigdl-core-npu/tbb12_debug.dll +0 -0
  51. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5.dll +0 -0
  52. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5_debug.dll +0 -0
  53. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc.dll +0 -0
  54. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_debug.dll +0 -0
  55. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy.dll +0 -0
  56. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy_debug.dll +0 -0
  57. intel_npu_acceleration_library/device.py +2 -2
  58. intel_npu_acceleration_library/dtypes.py +34 -1
  59. intel_npu_acceleration_library/external/openvino/__init__.py +1 -0
  60. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +1 -0
  61. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  62. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +37 -19
  82. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +47 -6
  83. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +28 -8
  84. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  85. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  86. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  87. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  88. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  89. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +17 -5
  90. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +1 -0
  91. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +55 -47
  92. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +95 -63
  93. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +12 -10
  94. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  95. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  96. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  97. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  98. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  99. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +31 -10
  100. intel_npu_acceleration_library/external/openvino/helpers/packing.py +4 -4
  101. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +2 -0
  102. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +1 -0
  103. intel_npu_acceleration_library/external/openvino/properties/__init__.py +1 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +1 -1
  105. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +1 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +2 -1
  107. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +5 -6
  108. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +7 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +193 -2
  110. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +69 -43
  111. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +4 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +2 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +21 -3
  114. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +88 -2
  115. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +9 -9
  116. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +16 -2
  117. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +5 -0
  118. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  119. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +68 -16
  120. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +69 -60
  121. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +90 -3
  122. intel_npu_acceleration_library/external/openvino/utils.py +17 -0
  123. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  124. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  125. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  126. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  127. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  128. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  129. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  130. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  131. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  132. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  133. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  134. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  135. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  136. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  137. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  138. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  139. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  140. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  141. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  142. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  143. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  144. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  145. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  146. intel_npu_acceleration_library/nn/module.py +17 -17
intel_npu_acceleration_library/backend/bindings.py

@@ -67,8 +67,9 @@ def init_common(lib: ctypes.CDLL):
     Args:
         lib (ctypes.CDLL): Intel® NPU Acceleration Library runtime library
     """
-    lib.saveModel.argtypes = [handler, ctypes.c_char_p]
+    lib.saveModel.argtypes = [handler, ctypes.c_char_p, ctypes.c_bool]
     lib.saveCompiledModel.argtypes = [handler, ctypes.c_char_p]
+    lib.serializeModel.argtypes = [handler, ctypes.c_char_p, ctypes.c_char_p]
 
     # Set input activations
     lib.set_activation.argtypes = [handler, ctypes.c_void_p, ctypes.c_int]
@@ -91,6 +92,16 @@ def init_common(lib: ctypes.CDLL):
 
     lib.compressToI4.argtypes = [c_i8_array, c_u8_array, ctypes.c_int]
 
+    # Remote tensors
+    lib.to_npu.argtypes = [ctypes.c_int, c_u32_array, ctypes.c_char_p, ctypes.c_void_p]
+    lib.to_npu.restype = handler
+
+    lib.remote_tensor_data.argtypes = [handler]
+    lib.remote_tensor_data.restype = ctypes.c_void_p
+
+    lib.del_remote_tensor.argtypes = [handler]
+
+
 
 def init_network_factory(lib: ctypes.CDLL):
     """Initialize Network factory bindings.
@@ -106,15 +117,18 @@ def init_network_factory(lib: ctypes.CDLL):
 
     lib.setNNFactoryWeights.argtypes = [handler, ctypes.c_int, handler, ctypes.c_bool]
 
-    lib.op_shape_size.argtypes = [handler]
+    lib.op_shape_size.argtypes = [handler, ctypes.c_int]
     lib.op_shape_size.restype = ctypes.c_int
 
-    lib.op_shape.argtypes = [handler, ctypes.c_int]
+    lib.op_shape.argtypes = [handler, ctypes.c_int, ctypes.c_int]
     lib.op_shape.restype = ctypes.c_int
 
-    lib.op_dtype.argtypes = [handler]
+    lib.op_dtype.argtypes = [handler, ctypes.c_int]
     lib.op_dtype.restype = ctypes.c_int
 
+    lib.op_output_size.argtypes = [handler]
+    lib.op_output_size.restype = ctypes.c_int
+
     lib.parameter.argtypes = [handler, ctypes.c_int, c_u32_array, ctypes.c_char_p]
     lib.parameter.restype = handler
 
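`op_shape_size`, `op_shape`, and `op_dtype` now take an output index, and the new `op_output_size` reports how many outputs a node has, which is what lets multi-output ops such as `variadic_split` (registered below) be inspected per output. A hedged sketch of the query pattern, assuming `node` is a handler returned by one of these builders:

```python
from intel_npu_acceleration_library.backend.bindings import lib as backend_lib

def describe_outputs(node) -> None:
    """Print the shape of every output of a (possibly multi-output) node."""
    for out_idx in range(backend_lib.op_output_size(node)):
        rank = backend_lib.op_shape_size(node, out_idx)
        dims = [backend_lib.op_shape(node, d, out_idx) for d in range(rank)]
        print(f"output {out_idx}: shape={tuple(dims)}")
```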
@@ -143,7 +157,16 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.slice.restype = handler
 
-    lib.compile.argtypes = [handler]
+    lib.simple_slice.argtypes = [
+        handler,
+        handler,
+        handler,
+        handler,
+        handler
+    ]
+    lib.simple_slice.restype = handler
+
+    lib.compile.argtypes = [handler, ctypes.c_int]
     lib.compile.restype = handler
 
     lib.get_output_tensor_shape_size.argtypes = [handler, ctypes.c_int]
@@ -160,6 +183,8 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_bool,
         ctypes.c_char_p,
         ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.linear.restype = handler
 
@@ -214,6 +239,65 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.max_pooling.restype = handler
 
+
+    lib.multi_concat.argtypes = [
+        handler,
+        ctypes.POINTER(handler),
+        ctypes.c_uint64,
+        ctypes.c_int64,
+    ]
+    lib.multi_concat.restype = handler
+
+    lib.variadic_split.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        c_u32_array,
+        ctypes.c_int,
+    ]
+    lib.variadic_split.restype = handler
+
+    lib.dq_split_linear.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear.restype = handler
+
+    lib.dq_split_linear_prefill.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear_prefill.restype = handler
+
+    lib.gw_linear_prefill.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+    ]
+    lib.gw_linear_prefill.restype = handler
+
     for op in get_supported_ops():
         fn = getattr(lib, op.name)
         fn.argtypes = [handler] * (op.inputs + 1) + list(op.parameters)
@@ -252,6 +336,19 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addAsymInt4Parameter.argtypes = [
+        handler,
+        c_u8_array,
+        c_fp16_array,
+        c_fp16_array,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
     lib.addIntParameterConversion.argtypes = [
         handler,
         c_i8_array,
@@ -260,6 +357,13 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addInt4WeightParameter.argtypes = [
+        handler,
+        c_u8_array,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
 
 def initialize_bindings() -> ctypes.CDLL:
     """Load the Intel® NPU Acceleration Library runtime library, and initialize all c++ <-> python bindings.
intel_npu_acceleration_library/backend/factory.py

@@ -7,7 +7,7 @@ from intel_npu_acceleration_library.backend.base import BaseNPUBackendWithPrefetch
 from intel_npu_acceleration_library.backend.ops import get_supported_ops
 from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
 from intel_npu_acceleration_library.backend.tensor import Tensor
-from intel_npu_acceleration_library.dtypes import int4, bfloat16
+from intel_npu_acceleration_library.dtypes import int4, bfloat16, get_backend_dtype
 from typing import Optional, Tuple, Any, Union, Sequence, TypeVar, Callable, cast, List
 from functools import partial
 import numpy.typing as npt
@@ -71,17 +71,99 @@ class NNFactory(BaseNPUBackendWithPrefetch):
                 Tensor: Tensor object
             """
             # Convert Tensor objects to their underlying node
-            args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
             kwargs = {
                 k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
             }
 
+            if fn.__qualname__ == 'NNFactory.reshape':
+                output_idx = args[0].output_idx
+                kwargs["output_idx"] = output_idx
+            args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+
+
             input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
                 v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
             ]
             # Call the function
             node = fn(self, *args, **kwargs)
 
+            output_len = backend_lib.op_output_size(node)
+
+            # remove input nodes from output_nodes
+            self.output_nodes = [
+                node for node in self.output_nodes if node not in input_nodes
+            ]
+            # add output node to output_nodes
+            if fn.__name__ != "constant":
+                self.output_nodes.append(node)
+
+            # Wrap the node in a Tensor object
+            if output_len == 1:
+                return Tensor(factory=self, node=node, output_idx=0)
+            else:
+                output_tensor_list = []
+                for i in range(output_len):
+                    output_tensor_list.append(Tensor(factory=self, node=node, output_idx=i))
+                return output_tensor_list
+
+        return cast(F, wrapper)
+
+    def return_tensor_for_list_inputs(fn: F) -> F:  # type: ignore
+        """Wrap the output of a function in a Tensor object.
+
+        This new wrapper adds support for list-of-Tensor inputs.
+
+        Args:
+            fn (function): Function
+
+        Returns:
+            function: A function that wraps the output in a Tensor object
+        """
+
+        def wrapper(self, *args: Any, **kwargs: Any) -> Tensor:
+            """Wrap the output of a function in a Tensor object.
+
+            Args:
+                args (Any): Variable length argument list
+                kwargs (Any): Arbitrary keyword arguments
+
+            Returns:
+                Tensor: Tensor object
+            """
+            # Convert Tensor objects to their underlying node
+            # args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+            new_args = []
+            for arg in args:
+                if isinstance(arg, Tensor):
+                    new_args.append(arg.node)
+                elif isinstance(arg, (tuple, list)):
+                    # for item in arg:
+                    for i in range(len(arg)):
+                        if isinstance(arg[i], Tensor):
+                            arg[i] = arg[i].node
+                    new_args.append(arg)
+                else:
+                    new_args.append(arg)
+            args = tuple(new_args)
+            kwargs = {
+                k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
+            }
+
+            # input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
+            #     v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
+            # ]
+            input_nodes = []
+            for arg in args:
+                if isinstance(arg, ctypes._Pointer):
+                    input_nodes.append(arg)
+                elif isinstance(arg, (tuple, list)):
+                    for item in arg:
+                        if isinstance(item, ctypes._Pointer):
+                            input_nodes.append(item)
+            input_nodes += [v for v in kwargs.values() if isinstance(v, ctypes._Pointer)]
+
+            # Call the function
+            node = fn(self, *args, **kwargs)
+
             # remove input nodes from output_nodes
             self.output_nodes = [
                 node for node in self.output_nodes if node not in input_nodes
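The reworked `return_tensor` wrapper now asks `op_output_size` how many outputs the freshly built node has: single-output ops still come back as one `Tensor` (with `output_idx=0`), while multi-output ops come back as a list of `Tensor` views over the same node. A sketch using the `variadic_split` builder added later in this diff; shapes are illustrative:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 96), dtype=np.float16)   # a single Tensor

# variadic_split produces three outputs, so the wrapper returns a list of
# three Tensors sharing one node, distinguished by output_idx 0, 1, 2.
parts = factory.variadic_split(x, axis=1, split_lengths=[32, 32, 32])
```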
@@ -115,34 +197,10 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Args:
             dtype: numpy dtype
 
-        Raises:
-            RuntimeError: Unsupported datatype
-
         Returns:
             ctypes.c_char_p: string representation of the dtype
         """
-        if dtype in [np.int8, torch.int8]:
-            str_dtype = "int8"
-        elif dtype == np.uint8 or dtype == int4:
-            # u8 represents packed i4 dtypes
-            str_dtype = "int4"
-        elif dtype in [np.int16, torch.int16]:
-            str_dtype = "int16"
-        elif dtype in [np.int32, torch.int32]:
-            str_dtype = "int32"
-        elif dtype in [np.int64, torch.int64]:
-            str_dtype = "int64"
-        elif dtype in [np.float16, torch.float16]:
-            str_dtype = "float16"
-        elif dtype in [np.float32, torch.float32]:
-            str_dtype = "float32"
-        elif dtype in [np.float64, torch.float64]:
-            str_dtype = "float64"
-        elif dtype in [bfloat16, torch.bfloat16]:
-            str_dtype = "bfloat16"
-        else:
-            raise RuntimeError(f"DType is not supported {dtype}")
-        return ctypes.c_char_p(str_dtype.encode())
+        return get_backend_dtype(dtype)
 
     @return_tensor
     def parameter(
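The inline dtype ladder is gone: the mapping now lives in `intel_npu_acceleration_library.dtypes.get_backend_dtype` (hence the new import in the first factory.py hunk). Assuming the mapping itself is unchanged, the contract stays a `ctypes.c_char_p` naming the backend dtype:

```python
import numpy as np

from intel_npu_acceleration_library.dtypes import get_backend_dtype

assert get_backend_dtype(np.float16).value == b"float16"
assert get_backend_dtype(np.int8).value == b"int8"
```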
@@ -319,6 +377,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         bias: Optional[bool] = False,
         act_dtype: npt.DTypeLike = np.float16,
         wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
+        asym: bool = False,
     ) -> ctypes._Pointer:
         """Generate a linear layer.
 
@@ -341,11 +401,52 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
+            scale_factor,
+            asym
         )
+
+    @return_tensor
+    def dq_split_linear(
+        self, input_node: ctypes._Pointer, n_splits: int,
+        outout_channels: int, input_channels: int, bias: bool = False,
+        act_dtype: npt.DTypeLike = np.float16,
+        wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
+        is_prefill: bool = False,
+        use_dq: bool = True,
+        asym: bool = False,
+    ) -> ctypes._Pointer:
+        """Generate a dynamic-quantization split linear layer.
+
+        Args:
+            input_node (ctypes._Pointer): layer input node
+            n_splits (int): number of parts the linear layer is split into
+            output_channels (int): number of output channels
+            input_channels (int): number of input channels
+            bias (bool, optional): enable/disable bias. Defaults to False.
+            act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+            wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
+            scale_factor (bool, optional): enable/disable mul scale factor. Defaults to True.
+            is_prefill (bool, optional): enable/disable prefill linear optimization. Defaults to False.
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if is_prefill:
+            func = backend_lib.dq_split_linear_prefill if use_dq else backend_lib.gw_linear_prefill
+        else:
+            func = backend_lib.dq_split_linear
+        return func(self._mm, input_node, n_splits,
+                    input_channels, outout_channels, bias,
+                    self.get_backend_dtype(act_dtype),
+                    self.get_backend_dtype(wt_dtype),
+                    scale_factor, asym)
 
     @return_tensor
     def reshape(
-        self, input_node: ctypes._Pointer, shape: Sequence[int]
+        self, input_node: ctypes._Pointer, shape: Sequence[int],
+        special_zero: bool = True,
+        output_idx: int = 0
     ) -> ctypes._Pointer:
         """Generate a reshape layer.
 
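A hedged sketch of calling the new builder: `n_splits` controls how many groups the weight matrix is divided into, while `is_prefill` and `use_dq` select between the three C entry points registered in bindings.py above (`dq_split_linear`, `dq_split_linear_prefill`, `gw_linear_prefill`). Sizes and dtypes are illustrative; `outout_channels` is spelled as in the shipped signature:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 4096), dtype=np.float16)
y = factory.dq_split_linear(
    x, n_splits=4, outout_channels=11008, input_channels=4096,
    bias=False, act_dtype=np.float16,
    wt_dtype=np.uint8,        # uint8 stands in for packed int4 weights
    scale_factor=True, is_prefill=False, use_dq=True, asym=False,
)
```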
@@ -357,7 +458,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         shape_node = self.constant(shape).node  # type: ignore
-        return backend_lib.reshape(self._mm, input_node, shape_node)
+        return backend_lib.reshape(self._mm, input_node, shape_node,
+                                   special_zero, output_idx)
 
     @return_tensor
     def broadcast(
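`special_zero` and `output_idx` are forwarded to the C `reshape` binding, whose `SupportedOp` entry in ops.py gains matching `c_bool`/`c_int` parameters below. Under OpenVINO Reshape semantics, which this appears to follow, `special_zero=True` makes a 0 in the target shape copy the corresponding input dimension, and `output_idx` selects which output of a multi-output producer feeds the reshape. An illustrative sketch:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((2, 64, 16), dtype=np.float16)
# 0 keeps dim 0 (= 2); -1 infers the remainder -> expected shape (2, 1024)
y = factory.reshape(x, [0, -1], special_zero=True)
```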
@@ -453,6 +555,46 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             end_mask_ptr.size,
             end_mask_ptr,
         )
+
+    @return_tensor
+    def simple_slice(
+        self,
+        input_node: ctypes._Pointer,
+        begin: Sequence[int],
+        end: Sequence[int],
+        step: Optional[Sequence[int]] = None,
+    ) -> ctypes._Pointer:
+        """Generate a slice layer.
+
+        Args:
+            input_node (ctypes._Pointer): layer input node
+            begin (Sequence[int]): begin
+            end (Sequence[int]): end
+            step (Optional[Sequence[int]]): step
+
+        Raises:
+            ValueError: begin and end must have the same length
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if len(begin) != len(end):
+            raise ValueError("begin and end must have the same length")
+
+        if step is None:
+            step = [1] * len(begin)
+
+        begin = self.constant(begin).node  # type: ignore
+        end = self.constant(end).node  # type: ignore
+        step = self.constant(step).node  # type: ignore
+
+        return backend_lib.simple_slice(
+            self._mm,
+            input_node,
+            begin,
+            end,
+            step
+        )
 
     @return_tensor
     def concat(
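`simple_slice` wraps begin/end/step as constant nodes and hands five handlers to the C binding registered above, a lighter-weight alternative to the masked `slice` op. Illustrative use:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 128, 64), dtype=np.float16)
# Keep the first 64 rows of axis 1; step defaults to ones.
y = factory.simple_slice(x, begin=[0, 0, 0], end=[1, 64, 64])
```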
@@ -469,11 +611,32 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if axis < 0:
-            shape_size = backend_lib.op_shape_size(input_node_1)
+            shape_size = backend_lib.op_shape_size(input_node_1, 0)
             axis = (axis + shape_size) % shape_size
         axis = np.int64(axis)
         return backend_lib.concat(self._mm, input_node_1, input_node_2, axis)
 
+    @return_tensor_for_list_inputs
+    def sequence_concat(
+        self, input_nodes: List[ctypes._Pointer], axis: int
+    ) -> ctypes._Pointer:
+        """Generate a concatenation layer.
+
+        Args:
+            input_nodes (List[ctypes._Pointer]): sequence of layer input nodes
+            axis (int): axis
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if axis < 0:
+            shape_size = backend_lib.op_shape_size(input_nodes[0], 0)
+            axis = (axis + shape_size) % shape_size
+        axis = np.int64(axis)
+
+        input_ptr = (ctypes.POINTER(ctypes.c_char) * len(input_nodes))(*input_nodes)
+        return backend_lib.multi_concat(self._mm, input_ptr, len(input_nodes), axis)
+
     @return_tensor
     def reduce_max(
         self,
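Because `sequence_concat` is decorated with the new `return_tensor_for_list_inputs`, a plain Python list of `Tensor`s can be handed straight through to the variadic `multi_concat` binding. Illustrative use:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
chunks = [factory.parameter((1, 32), dtype=np.float16) for _ in range(3)]
merged = factory.sequence_concat(chunks, axis=-1)   # expected shape (1, 96)
```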
@@ -492,7 +655,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_max(self._mm, input_node, axis_node, keep_dims)
@@ -515,7 +678,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_mean(self._mm, input_node, axis_node, keep_dims)
@@ -538,7 +701,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_min(self._mm, input_node, axis_node, keep_dims)
@@ -561,7 +724,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_prod(self._mm, input_node, axis_node, keep_dims)
@@ -584,7 +747,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_sum(self._mm, input_node, axis_node, keep_dims)
@@ -604,7 +767,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if axis < 0:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             axis = (axis + shape_size) % shape_size
         axis_node = self.constant(axis).node  # type: ignore
         return backend_lib.normL2(self._mm, input_node, axis_node, eps)
@@ -627,14 +790,14 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             ctypes._Pointer: output node
         """
-        input_shape_size = backend_lib.op_shape_size(input_node)
+        input_shape_size = backend_lib.op_shape_size(input_node, 0)
         input_shape = [
-            backend_lib.op_shape(input_node, i) for i in range(input_shape_size)
+            backend_lib.op_shape(input_node, i, 0) for i in range(input_shape_size)
         ]
         if isinstance(exponent, ctypes._Pointer):
-            exponent_shape_size = backend_lib.op_shape_size(input_node)
+            exponent_shape_size = backend_lib.op_shape_size(input_node, 0)
             exponent_shape = [
-                backend_lib.op_shape(exponent, i) for i in range(exponent_shape_size)
+                backend_lib.op_shape(exponent, i, 0) for i in range(exponent_shape_size)
             ]
         else:
             exponent_shape = list(exponent.shape)
@@ -643,6 +806,39 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         # raise ValueError("Input tensor shapes are not equal")
 
         return backend_lib.power(self._mm, input_node, exponent)
+
+    @return_tensor
+    def variadic_split(
+        self,
+        input: ctypes._Pointer,
+        axis: int,
+        split_lengths: Sequence[int],
+    ) -> ctypes._Pointer:
+        """Generate a variadic split layer.
+
+        Args:
+            input (ctypes._Pointer): layer input node
+            axis (int): split axis
+            split_lengths (Sequence[int]): A list containing the sizes of each output tensor
+                along the split "axis". Size of "split_lengths" should be equal to the number of
+                outputs. The sum of split_lengths must match data.shape[axis].
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+
+        split_lens_ptr = np.array(split_lengths, dtype=np.uint32)
+
+        return backend_lib.variadic_split(
+            self._mm,
+            input,
+            axis,
+            split_lens_ptr,
+            split_lens_ptr.size,
+        )
 
     @return_tensor
     def avg_pooling(
@@ -777,7 +973,28 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             auto_pad,  # auto_pad
         )
 
-    def get_tensor_shape(self, node):
+    @return_tensor
+    def scaled_dot_product_attention(
+        self, query: ctypes._Pointer, key: ctypes._Pointer,
+        value: ctypes._Pointer, attn_mask: ctypes._Pointer,
+        is_causal: bool
+    ) -> ctypes._Pointer:
+        """Construct a ScaledDotProductAttention operation.
+
+        Args:
+            query (ctypes._Pointer): query
+            key (ctypes._Pointer): key
+            value (ctypes._Pointer): value
+            attn_mask (ctypes._Pointer): attention mask
+            is_causal (bool): causal/not causal
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        return backend_lib.scaled_dot_product_attention(self._mm,
+                                                        query, key,
+                                                        value, attn_mask,
+                                                        is_causal)
+
+    def get_tensor_shape(self, node, output_idx=0):
         """Get tensor shape.
 
         Args:
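A sketch of building an attention block with the new method; the (batch, heads, sequence, head_dim) layout and fp16 dtype are assumptions for illustration:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
q = factory.parameter((1, 32, 1, 128), dtype=np.float16)      # decode-step query
k = factory.parameter((1, 32, 1024, 128), dtype=np.float16)   # cached keys
v = factory.parameter((1, 32, 1024, 128), dtype=np.float16)   # cached values
mask = factory.parameter((1, 1, 1, 1024), dtype=np.float16)   # additive mask
attn = factory.scaled_dot_product_attention(q, k, v, mask, is_causal=False)
```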
@@ -786,10 +1003,10 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             tuple[int]: tensor shape
         """
-        size = backend_lib.op_shape_size(node)
-        return tuple([backend_lib.op_shape(node, idx) for idx in range(size)])
+        size = backend_lib.op_shape_size(node, output_idx)
+        return tuple([backend_lib.op_shape(node, idx, output_idx) for idx in range(size)])
 
-    def get_tensor_dtype(self, node):
+    def get_tensor_dtype(self, node, output_idx=0):
         """Get tensor dtype.
 
         Args:
@@ -801,7 +1018,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             str: tensor dtype
         """
-        dtype_int = backend_lib.op_dtype(node)
+        dtype_int = backend_lib.op_dtype(node, output_idx)
 
         if dtype_int == 2:
             return np.bool
@@ -826,7 +1043,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         else:
             raise RuntimeError("Unsupported dtype")
 
-    def compile(self):
+    def compile(self, npu_dpu_groups=4):
         """Finalize and compile a model."""
         self.out = []
         self.torch_out = []
@@ -834,7 +1051,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             backend_lib.result(self._mm, node)
 
         # Compile the model
-        backend_lib.compile(self._mm)
+        backend_lib.compile(self._mm, npu_dpu_groups)
 
         for idx, node in enumerate(self.output_nodes):
             output_shape = self.get_tensor_shape(node)
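`compile` now threads an `npu_dpu_groups` hint (default 4) through to the C `compile` entry point, whose signature gained the matching `ctypes.c_int` in bindings.py above. A minimal build-and-compile sketch with illustrative sizes:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 256), dtype=np.float16)
_ = factory.linear(x, 256, 256, bias=False)
factory.compile(npu_dpu_groups=4)   # pass a different value to tune DPU grouping
```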
intel_npu_acceleration_library/backend/ops.py

@@ -98,7 +98,7 @@ def get_supported_ops() -> List[SupportedOp]:
             inputs=3,
             parameters=[ctypes.c_int],
         ),
-        SupportedOp(name="reshape", inputs=2),
+        SupportedOp(name="reshape", inputs=2, parameters=[ctypes.c_bool, ctypes.c_int]),
         SupportedOp(name="transpose", inputs=2),
         SupportedOp(name="squeeze", inputs=1),
        SupportedOp(name="unsqueeze", inputs=2),
@@ -137,5 +137,6 @@ def get_supported_ops() -> List[SupportedOp]:
         SupportedOp(name="power", inputs=2),
         SupportedOp(name="broadcast", inputs=2),
         SupportedOp(name="log_softmax", inputs=1, parameters=[ctypes.c_int64]),
+        SupportedOp(name="rotate_half", inputs=1),
     ]
     return supported_ops
intel_npu_acceleration_library/backend/qlinear.py

@@ -18,6 +18,7 @@ class QLinear(NNFactory):
         profile: bool = False,
         device: str = "NPU",
         dtype: np.dtype = np.int8,
+        asym: bool = False
     ):
         """Initialize the QLinear class.
 
@@ -33,13 +34,14 @@ class QLinear(NNFactory):
         super().__init__(profile, device)
         self.inC, self.outC = inC, outC
         self.batch = batch
+        self.asym = asym
 
         input = self.parameter((self.batch, self.inC))
-        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
+        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
         self.compile()
 
     def run(
-        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str
+        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, zero: np.ndarray = None, op_id: str = None
     ) -> np.ndarray:
         """Run the layer: $X * (W * S)^T$ .
 
@@ -67,5 +69,7 @@ class QLinear(NNFactory):
             raise RuntimeError(
                 f"Scale shape {W.shape} different from expected one {(self.outC, 1)}"
             )
-
-        return super().run(X, (W, scale), op_id=op_id)
+        if not self.asym:
+            return super().run(X, (W, scale), op_id=op_id)
+        else:
+            return super().run(X, (W, scale, zero), op_id=op_id)
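With `asym=True`, `run` forwards a third weight component, the zero point, alongside the packed weights and scale. A hedged end-to-end sketch; the shapes follow the checks in `run` (weights `(outC, inC)`, scale `(outC, 1)`), while the zero-point layout and dtypes are assumptions consistent with the asymmetric int4 bindings above:

```python
import numpy as np

from intel_npu_acceleration_library.backend import QLinear

inC, outC, batch = 128, 128, 1
layer = QLinear(inC, outC, batch, dtype=np.int8, asym=True)

X = np.random.rand(batch, inC).astype(np.float16)
W = np.random.randint(-128, 127, (outC, inC), dtype=np.int8)
scale = np.random.rand(outC, 1).astype(np.float16)
zero = np.zeros((outC, 1), dtype=np.float16)   # assumed zero-point layout

out = layer.run(X, W, scale, zero=zero, op_id="layer0")
```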