bigdl-core-npu 2.6.0b20250114__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/include/common.h +96 -0
  3. bigdl-core-npu/include/npu_llm.h +74 -0
  4. bigdl-core-npu/npu_llm.dll +0 -0
  5. bigdl-core-npu/npu_llm.lib +0 -0
  6. bigdl_core_npu-2.6.0b20250114.dist-info/METADATA +44 -0
  7. bigdl_core_npu-2.6.0b20250114.dist-info/RECORD +234 -0
  8. bigdl_core_npu-2.6.0b20250114.dist-info/WHEEL +5 -0
  9. bigdl_core_npu-2.6.0b20250114.dist-info/top_level.txt +2 -0
  10. intel_npu_acceleration_library/__init__.py +24 -0
  11. intel_npu_acceleration_library/_version.py +6 -0
  12. intel_npu_acceleration_library/backend/__init__.py +37 -0
  13. intel_npu_acceleration_library/backend/base.py +250 -0
  14. intel_npu_acceleration_library/backend/bindings.py +383 -0
  15. intel_npu_acceleration_library/backend/compression.py +24 -0
  16. intel_npu_acceleration_library/backend/convolution.py +58 -0
  17. intel_npu_acceleration_library/backend/factory.py +1161 -0
  18. intel_npu_acceleration_library/backend/linear.py +60 -0
  19. intel_npu_acceleration_library/backend/matmul.py +59 -0
  20. intel_npu_acceleration_library/backend/mlp.py +58 -0
  21. intel_npu_acceleration_library/backend/ops.py +142 -0
  22. intel_npu_acceleration_library/backend/qlinear.py +75 -0
  23. intel_npu_acceleration_library/backend/qmatmul.py +66 -0
  24. intel_npu_acceleration_library/backend/runtime.py +215 -0
  25. intel_npu_acceleration_library/backend/sdpa.py +107 -0
  26. intel_npu_acceleration_library/backend/tensor.py +1120 -0
  27. intel_npu_acceleration_library/backend/utils.py +70 -0
  28. intel_npu_acceleration_library/compiler.py +194 -0
  29. intel_npu_acceleration_library/device.py +230 -0
  30. intel_npu_acceleration_library/dtypes.py +155 -0
  31. intel_npu_acceleration_library/external/openvino/__init__.py +72 -0
  32. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +21 -0
  33. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  34. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  35. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  36. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  37. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  38. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  39. intel_npu_acceleration_library/external/openvino/frontend/__init__.py +34 -0
  40. intel_npu_acceleration_library/external/openvino/frontend/frontend.py +44 -0
  41. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  42. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  43. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  44. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  45. intel_npu_acceleration_library/external/openvino/frontend/onnx/__init__.py +15 -0
  46. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  47. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  48. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  49. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  50. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  51. intel_npu_acceleration_library/external/openvino/frontend/paddle/__init__.py +15 -0
  52. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  53. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  54. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  55. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  56. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  57. intel_npu_acceleration_library/external/openvino/frontend/pytorch/__init__.py +19 -0
  58. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +370 -0
  59. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +180 -0
  60. intel_npu_acceleration_library/external/openvino/frontend/pytorch/module_extension.py +39 -0
  61. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +118 -0
  62. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +131 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend_utils.py +85 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/compile.py +141 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/decompositions.py +116 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/execute.py +189 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +290 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +126 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +568 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +258 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/__init__.py +16 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/graph_iterator.py +116 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/node_decoder.py +219 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  82. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  83. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  84. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +481 -0
  85. intel_npu_acceleration_library/external/openvino/helpers/__init__.py +6 -0
  86. intel_npu_acceleration_library/external/openvino/helpers/packing.py +87 -0
  87. intel_npu_acceleration_library/external/openvino/preprocess/README.md +60 -0
  88. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +28 -0
  89. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/__init__.py +15 -0
  90. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/preprocess_converter.py +47 -0
  91. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +5 -0
  92. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/torchvision_preprocessing.py +347 -0
  93. intel_npu_acceleration_library/external/openvino/properties/__init__.py +22 -0
  94. intel_npu_acceleration_library/external/openvino/properties/_properties.py +55 -0
  95. intel_npu_acceleration_library/external/openvino/properties/device/__init__.py +14 -0
  96. intel_npu_acceleration_library/external/openvino/properties/hint/__init__.py +15 -0
  97. intel_npu_acceleration_library/external/openvino/properties/intel_auto/__init__.py +12 -0
  98. intel_npu_acceleration_library/external/openvino/properties/intel_cpu/__init__.py +8 -0
  99. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/__init__.py +12 -0
  100. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/hint/__init__.py +11 -0
  101. intel_npu_acceleration_library/external/openvino/properties/log/__init__.py +11 -0
  102. intel_npu_acceleration_library/external/openvino/properties/streams/__init__.py +11 -0
  103. intel_npu_acceleration_library/external/openvino/runtime/__init__.py +85 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/exceptions.py +17 -0
  105. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +631 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +19 -0
  107. intel_npu_acceleration_library/external/openvino/runtime/op/util/__init__.py +22 -0
  108. intel_npu_acceleration_library/external/openvino/runtime/opset1/__init__.py +112 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +3068 -0
  110. intel_npu_acceleration_library/external/openvino/runtime/opset10/__init__.py +179 -0
  111. intel_npu_acceleration_library/external/openvino/runtime/opset10/ops.py +173 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/opset11/__init__.py +179 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/opset11/ops.py +107 -0
  114. intel_npu_acceleration_library/external/openvino/runtime/opset12/__init__.py +180 -0
  115. intel_npu_acceleration_library/external/openvino/runtime/opset12/ops.py +120 -0
  116. intel_npu_acceleration_library/external/openvino/runtime/opset13/__init__.py +188 -0
  117. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +398 -0
  118. intel_npu_acceleration_library/external/openvino/runtime/opset14/__init__.py +190 -0
  119. intel_npu_acceleration_library/external/openvino/runtime/opset14/ops.py +171 -0
  120. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +17 -0
  121. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +276 -0
  122. intel_npu_acceleration_library/external/openvino/runtime/opset2/__init__.py +118 -0
  123. intel_npu_acceleration_library/external/openvino/runtime/opset2/ops.py +216 -0
  124. intel_npu_acceleration_library/external/openvino/runtime/opset3/__init__.py +134 -0
  125. intel_npu_acceleration_library/external/openvino/runtime/opset3/ops.py +638 -0
  126. intel_npu_acceleration_library/external/openvino/runtime/opset4/__init__.py +145 -0
  127. intel_npu_acceleration_library/external/openvino/runtime/opset4/ops.py +464 -0
  128. intel_npu_acceleration_library/external/openvino/runtime/opset5/__init__.py +152 -0
  129. intel_npu_acceleration_library/external/openvino/runtime/opset5/ops.py +372 -0
  130. intel_npu_acceleration_library/external/openvino/runtime/opset6/__init__.py +154 -0
  131. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +215 -0
  132. intel_npu_acceleration_library/external/openvino/runtime/opset7/__init__.py +158 -0
  133. intel_npu_acceleration_library/external/openvino/runtime/opset7/ops.py +169 -0
  134. intel_npu_acceleration_library/external/openvino/runtime/opset8/__init__.py +169 -0
  135. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +787 -0
  136. intel_npu_acceleration_library/external/openvino/runtime/opset9/__init__.py +175 -0
  137. intel_npu_acceleration_library/external/openvino/runtime/opset9/ops.py +341 -0
  138. intel_npu_acceleration_library/external/openvino/runtime/opset_utils.py +22 -0
  139. intel_npu_acceleration_library/external/openvino/runtime/passes/__init__.py +19 -0
  140. intel_npu_acceleration_library/external/openvino/runtime/passes/graph_rewrite.py +33 -0
  141. intel_npu_acceleration_library/external/openvino/runtime/passes/manager.py +26 -0
  142. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +40 -0
  143. intel_npu_acceleration_library/external/openvino/runtime/properties/hint/__init__.py +25 -0
  144. intel_npu_acceleration_library/external/openvino/runtime/utils/__init__.py +7 -0
  145. intel_npu_acceleration_library/external/openvino/runtime/utils/broadcasting.py +44 -0
  146. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/__init__.py +8 -0
  147. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +447 -0
  148. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/wrappers.py +148 -0
  149. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +156 -0
  150. intel_npu_acceleration_library/external/openvino/runtime/utils/input_validation.py +133 -0
  151. intel_npu_acceleration_library/external/openvino/runtime/utils/node_factory.py +127 -0
  152. intel_npu_acceleration_library/external/openvino/runtime/utils/reduction.py +25 -0
  153. intel_npu_acceleration_library/external/openvino/runtime/utils/types.py +175 -0
  154. intel_npu_acceleration_library/external/openvino/tools/__init__.py +4 -0
  155. intel_npu_acceleration_library/external/openvino/tools/benchmark/__init__.py +3 -0
  156. intel_npu_acceleration_library/external/openvino/tools/benchmark/benchmark.py +186 -0
  157. intel_npu_acceleration_library/external/openvino/tools/benchmark/main.py +695 -0
  158. intel_npu_acceleration_library/external/openvino/tools/benchmark/parameters.py +199 -0
  159. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/__init__.py +3 -0
  160. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/constants.py +26 -0
  161. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +482 -0
  162. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/logging.py +8 -0
  163. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/statistics_report.py +296 -0
  164. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/utils.py +836 -0
  165. intel_npu_acceleration_library/external/openvino/tools/ovc/__init__.py +20 -0
  166. intel_npu_acceleration_library/external/openvino/tools/ovc/__main__.py +10 -0
  167. intel_npu_acceleration_library/external/openvino/tools/ovc/cli_parser.py +633 -0
  168. intel_npu_acceleration_library/external/openvino/tools/ovc/convert.py +102 -0
  169. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_data_type.py +82 -0
  170. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +550 -0
  171. intel_npu_acceleration_library/external/openvino/tools/ovc/environment_setup_utils.py +50 -0
  172. intel_npu_acceleration_library/external/openvino/tools/ovc/error.py +49 -0
  173. intel_npu_acceleration_library/external/openvino/tools/ovc/get_ov_update_message.py +16 -0
  174. intel_npu_acceleration_library/external/openvino/tools/ovc/help.py +45 -0
  175. intel_npu_acceleration_library/external/openvino/tools/ovc/logger.py +91 -0
  176. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +40 -0
  177. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/__init__.py +2 -0
  178. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/analysis.py +46 -0
  179. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/check_config.py +57 -0
  180. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/extractor.py +447 -0
  181. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  182. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/layout_utils.py +73 -0
  183. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/moc_emit_ir.py +32 -0
  184. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/offline_transformations.py +107 -0
  185. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/paddle_frontend_utils.py +83 -0
  186. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +298 -0
  187. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/preprocessing.py +220 -0
  188. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +214 -0
  189. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/shape_utils.py +109 -0
  190. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/type_utils.py +82 -0
  191. intel_npu_acceleration_library/external/openvino/tools/ovc/ovc.py +13 -0
  192. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_params.py +6 -0
  193. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_stub.py +28 -0
  194. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_utils.py +118 -0
  195. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +196 -0
  196. intel_npu_acceleration_library/external/openvino/tools/ovc/version.py +80 -0
  197. intel_npu_acceleration_library/external/openvino/torch/__init__.py +5 -0
  198. intel_npu_acceleration_library/external/openvino/utils.py +115 -0
  199. intel_npu_acceleration_library/functional/__init__.py +8 -0
  200. intel_npu_acceleration_library/functional/scaled_dot_product_attention.py +47 -0
  201. intel_npu_acceleration_library/lib/Release/cache.json +113732 -0
  202. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  203. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  204. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  205. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  206. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  207. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  208. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  209. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  210. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  211. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  212. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  213. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  214. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  215. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  216. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  217. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  218. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  219. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  220. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  221. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  222. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  223. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  224. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  225. intel_npu_acceleration_library/modelling.py +150 -0
  226. intel_npu_acceleration_library/nn/__init__.py +20 -0
  227. intel_npu_acceleration_library/nn/autograd.py +68 -0
  228. intel_npu_acceleration_library/nn/conv.py +257 -0
  229. intel_npu_acceleration_library/nn/functional.py +1207 -0
  230. intel_npu_acceleration_library/nn/linear.py +162 -0
  231. intel_npu_acceleration_library/nn/llm.py +417 -0
  232. intel_npu_acceleration_library/nn/module.py +393 -0
  233. intel_npu_acceleration_library/optimizations.py +157 -0
  234. intel_npu_acceleration_library/quantization.py +174 -0
intel_npu_acceleration_library/backend/factory.py
@@ -0,0 +1,1161 @@
+ #
+ # Copyright © 2024 Intel Corporation
+ # SPDX-License-Identifier: Apache 2.0
+ #
+
+ from intel_npu_acceleration_library.backend.base import BaseNPUBackendWithPrefetch
+ from intel_npu_acceleration_library.backend.ops import get_supported_ops
+ from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+ from intel_npu_acceleration_library.backend.tensor import Tensor
+ from intel_npu_acceleration_library.dtypes import int4, bfloat16, get_backend_dtype
+ from typing import Optional, Tuple, Any, Union, Sequence, TypeVar, Callable, cast, List
+ from functools import partial
+ import numpy.typing as npt
+ import numpy as np
+ import ctypes
+ import torch
+
+
+ F = TypeVar("F", bound=Callable[..., Any])
+
+
+ class NNFactory(BaseNPUBackendWithPrefetch):
+     """Neural network factory, used to build, compile and run models on the NPU with weight prefetching."""
+
+     def __init__(
+         self,
+         profile: bool = False,
+         device: str = "NPU",
+     ):
+         """Initialize the NNFactory class.
+
+         Args:
+             profile (bool, optional): Enable/Disable profiling. Defaults to False.
+             device (str): Target device. Defaults to "NPU".
+         """
+         super().__init__(profile)
+         self.device = device
+         self._mm = backend_lib.createNNFactory(
+             ctypes.c_char_p(self.device.encode()),
+             profile,
+         )
+         self.elapsed = None
+         self.output_nodes: Sequence[ctypes._Pointer] = []
+
+         # Expose every supported backend op as a method (e.g. self.relu),
+         # unless an explicit implementation already exists on the class
+         for op in get_supported_ops():
+             if not hasattr(self, op.name.replace("_act", "")):
+                 setattr(
+                     self,
+                     op.name.replace("_act", ""),
+                     partial(self._call_backend_op, op.name),
+                 )
+
+     def return_tensor(fn: F) -> F:  # type: ignore
+         """Wrap the output of a function in a Tensor object.
+
+         Args:
+             fn (function): Function
+
+         Returns:
+             function: A function that wraps the output in a Tensor object
+         """
+
+         def wrapper(self, *args: Any, **kwargs: Any) -> Union[Tensor, List[Tensor]]:
+             """Wrap the output of a function in a Tensor object.
+
+             Args:
+                 args (Any): Variable length argument list
+                 kwargs (Any): Arbitrary keyword arguments
+
+             Returns:
+                 Union[Tensor, List[Tensor]]: Tensor object, or a list of Tensors for multi-output ops
+             """
+             # Convert Tensor objects to their underlying node
+             kwargs = {
+                 k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
+             }
+
+             if fn.__qualname__ == "NNFactory.reshape":
+                 output_idx = args[0].output_idx
+                 kwargs["output_idx"] = output_idx
+             args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+
+             input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
+                 v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
+             ]
+             # Call the function
+             node = fn(self, *args, **kwargs)
+
+             output_len = backend_lib.op_output_size(node)
+
+             # remove input nodes from output_nodes
+             self.output_nodes = [
+                 node for node in self.output_nodes if node not in input_nodes
+             ]
+             # add output node to output_nodes
+             if fn.__name__ != "constant":
+                 self.output_nodes.append(node)
+
+             # Wrap the node in a Tensor object, one Tensor per op output
+             if output_len == 1:
+                 return Tensor(factory=self, node=node, output_idx=0)
+             else:
+                 output_tensor_list = []
+                 for i in range(output_len):
+                     output_tensor_list.append(Tensor(factory=self, node=node, output_idx=i))
+                 return output_tensor_list
+
+         return cast(F, wrapper)
+
+     def return_tensor_for_list_inputs(fn: F) -> F:  # type: ignore
+         """Wrap the output of a function in a Tensor object.
+
+         Unlike `return_tensor`, this wrapper also supports lists of Tensor inputs.
+
+         Args:
+             fn (function): Function
+
+         Returns:
+             function: A function that wraps the output in a Tensor object
+         """
+
+         def wrapper(self, *args: Any, **kwargs: Any) -> Tensor:
+             """Wrap the output of a function in a Tensor object.
+
+             Args:
+                 args (Any): Variable length argument list
+                 kwargs (Any): Arbitrary keyword arguments
+
+             Returns:
+                 Tensor: Tensor object
+             """
+             # Convert Tensor objects, including those nested in lists or
+             # tuples, to their underlying node
+             new_args = []
+             for arg in args:
+                 if isinstance(arg, Tensor):
+                     new_args.append(arg.node)
+                 elif isinstance(arg, (tuple, list)):
+                     # rebuild the sequence as a new list so immutable tuples are handled too
+                     new_args.append(
+                         [item.node if isinstance(item, Tensor) else item for item in arg]
+                     )
+                 else:
+                     new_args.append(arg)
+             args = tuple(new_args)
+             kwargs = {
+                 k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
+             }
+
+             input_nodes = []
+             for arg in args:
+                 if isinstance(arg, ctypes._Pointer):
+                     input_nodes.append(arg)
+                 elif isinstance(arg, (tuple, list)):
+                     for item in arg:
+                         if isinstance(item, ctypes._Pointer):
+                             input_nodes.append(item)
+             input_nodes += [v for v in kwargs.values() if isinstance(v, ctypes._Pointer)]
+
+             # Call the function
+             node = fn(self, *args, **kwargs)
+
+             # remove input nodes from output_nodes
+             self.output_nodes = [
+                 node for node in self.output_nodes if node not in input_nodes
+             ]
+             # add output node to output_nodes
+             if fn.__name__ != "constant":
+                 self.output_nodes.append(node)
+
+             # Wrap the node in a Tensor object
+             return Tensor(factory=self, node=node)
+
+         return cast(F, wrapper)
+
+     @return_tensor
+     def _call_backend_op(self, op_name: str, *parameters: Any) -> Any:
+         """Dynamically call a backend operation.
+
+         Args:
+             op_name (str): operation name
+             parameters (Any): variable list of operation parameters
+
+         Returns:
+             Any: Operation
+         """
+         fn = getattr(backend_lib, op_name)
+         return fn(self._mm, *parameters)
+
+     def get_backend_dtype(self, dtype) -> ctypes.c_char_p:
+         """Get the string representation of the dtype.
+
+         Args:
+             dtype: numpy dtype
+
+         Returns:
+             ctypes.c_char_p: string representation of the dtype
+         """
+         return get_backend_dtype(dtype)
+
+     @return_tensor
+     def parameter(
+         self, shape: Sequence[int], dtype: npt.DTypeLike = np.float16
+     ) -> ctypes._Pointer:
+         """Generate a model input parameter.
+
+         Args:
+             shape (Sequence[int]): Parameter shape
+             dtype (np.dtype, optional): parameter type; np.int8, np.uint8 and np.float16 are supported. Defaults to np.float16. uint8 represents packed i4 dtypes.
+
+         Returns:
+             ctypes._Pointer: a pointer to a parameter object
+         """
+         shape_ptr = np.array(shape, dtype=np.uint32)
+         return backend_lib.parameter(
+             self._mm, shape_ptr.size, shape_ptr, self.get_backend_dtype(dtype)
+         )
+
+     @return_tensor
+     def to(self, tensor: ctypes._Pointer, dtype: npt.DTypeLike) -> ctypes._Pointer:
+         """Convert a tensor to a different dtype.
+
+         Args:
+             tensor (ctypes._Pointer): input tensor
+             dtype (npt.DTypeLike): target dtype
+
+         Returns:
+             ctypes._Pointer: output tensor
+         """
+         dtype_ptr = self.get_backend_dtype(dtype)
+         return backend_lib.to(self._mm, tensor, dtype_ptr)
+
+     @return_tensor
+     def constant(
+         self,
+         data: Union[np.ndarray, Sequence[int], Sequence[float], int, float, torch.Tensor],
+     ) -> ctypes._Pointer:
+         """Generate a model input constant.
+
+         Args:
+             data (Union[np.ndarray, Sequence[int], Sequence[float], int, float, torch.Tensor]): constant data
+
+         Returns:
+             ctypes._Pointer: a pointer to a constant object
+         """
+         if isinstance(data, (list, tuple)):
+             if all(isinstance(i, int) for i in data):
+                 data = np.array(data, dtype=np.int64)
+             else:
+                 data = np.array(data, dtype=np.float32)
+         elif isinstance(data, int):
+             data = np.array([data], dtype=np.int64)
+         elif isinstance(data, float):
+             data = np.array([data], dtype=np.float32)
+         elif isinstance(data, torch.Tensor):
+             data = data.detach().numpy()
+         elif data is None:
+             return ctypes.cast(ctypes.c_void_p(0), ctypes.POINTER(ctypes.c_char))
+
+         dst = data.ctypes.data_as(ctypes.c_void_p)
+         backend_dtype = self.get_backend_dtype(data.dtype)
+         if data.dtype == np.uint8 or data.dtype == int4:
+             # uint8/int4 data packs two i4 values per byte, so the last
+             # dimension of the logical shape is twice the stored one
+             shape = list(data.shape)
+             shape[-1] = shape[-1] * 2
+             shape_ptr = np.array(shape, dtype=np.uint32)
+         else:
+             shape_ptr = np.array(data.shape, dtype=np.uint32)
+         return backend_lib.constant(
+             self._mm, shape_ptr.size, shape_ptr, backend_dtype, dst
+         )
+
+     @return_tensor
+     def matmul(
+         self,
+         input_node: ctypes._Pointer,
+         weights_node: ctypes._Pointer,
+         trA: bool = False,
+         trB: bool = True,
+     ) -> ctypes._Pointer:
+         """Generate a matrix multiplication layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             weights_node (ctypes._Pointer): weights node
+             trA (bool): transpose input node
+             trB (bool): transpose weights node
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         return backend_lib.matmul(self._mm, input_node, weights_node, trA, trB)
+
+     @return_tensor
+     def convolution(
+         self,
+         input_node: ctypes._Pointer,
+         weights_node: ctypes._Pointer,
+         bias: Optional[ctypes._Pointer] = None,
+         strides: Union[int, Sequence[int]] = 1,
+         padding: Union[int, Sequence[int]] = 0,
+         dilation: Union[int, Sequence[int]] = 1,
+         groups: int = 1,
+         act_dtype: npt.DTypeLike = np.float16,
+         n_spatial_dims: int = 2,
+     ) -> ctypes._Pointer:
+         """Generate a convolution layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             weights_node (ctypes._Pointer): weights node
+             bias (Optional[ctypes._Pointer]): bias node
+             strides (Sequence[int]): strides
+             padding (Sequence[int]): padding
+             dilation (Sequence[int]): dilation
+             groups (int): groups
+             act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+             n_spatial_dims (int): number of spatial dimensions
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if isinstance(strides, int):
+             strides = [strides] * n_spatial_dims
+
+         if isinstance(padding, int):
+             padding_begins = [padding] * n_spatial_dims
+             padding_ends = [padding] * n_spatial_dims
+         else:
+             padding_begins = list(padding)
+             padding_ends = list(padding)
+
+         if isinstance(dilation, int):
+             dilation = [dilation] * n_spatial_dims
+
+         strides_ptr = np.array(strides, dtype=np.uint32)
+         padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
+         padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
+         dilation_ptr = np.array(dilation, dtype=np.uint32)
+
+         if bias is not None:
+             bias_node = bias
+         else:
+             # null pointer when no bias is provided
+             bias_node = ctypes.cast(ctypes.c_void_p(0), ctypes.POINTER(ctypes.c_char))
+
+         return backend_lib.convolution(
+             self._mm,
+             input_node,
+             weights_node,
+             bias_node,
+             strides_ptr.size,
+             strides_ptr,
+             padding_begins_ptr.size,
+             padding_begins_ptr,
+             padding_ends_ptr.size,
+             padding_ends_ptr,
+             dilation_ptr.size,
+             dilation_ptr,
+             groups,
+             self.get_backend_dtype(act_dtype),
+         )
+
+     @return_tensor
+     def linear(
+         self,
+         input_node: ctypes._Pointer,
+         output_channels: int,
+         input_channels: int,
+         bias: bool = False,
+         act_dtype: npt.DTypeLike = np.float16,
+         wt_dtype: npt.DTypeLike = np.float16,
+         scale_factor: bool = True,
+         asym: bool = False,
+     ) -> ctypes._Pointer:
+         """Generate a linear layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             output_channels (int): number of output channels
+             input_channels (int): number of input channels
+             bias (bool, optional): enable/disable bias. Defaults to False.
+             act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+             wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
+             scale_factor (bool, optional): enable/disable mul scale factor. Defaults to True.
+             asym (bool, optional): enable/disable asymmetric quantization. Defaults to False.
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         return backend_lib.linear(
+             self._mm,
+             input_node,
+             output_channels,
+             input_channels,
+             bias,
+             self.get_backend_dtype(act_dtype),
+             self.get_backend_dtype(wt_dtype),
+             scale_factor,
+             asym,
+         )
+
+     @return_tensor
+     def dq_split_linear(
+         self, input_node: ctypes._Pointer, n_splits: int,
+         output_channels: int, input_channels: int, bias: bool = False,
+         act_dtype: npt.DTypeLike = np.float16,
+         wt_dtype: npt.DTypeLike = np.float16,
+         scale_factor: bool = True,
+         is_prefill: bool = False,
+         use_dq: bool = True,
+         asym: bool = False,
+     ) -> ctypes._Pointer:
+         """Generate a dynamically quantized, split linear layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             n_splits (int): number of parts the linear layer is split into
+             output_channels (int): number of output channels
+             input_channels (int): number of input channels
+             bias (bool, optional): enable/disable bias. Defaults to False.
+             act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+             wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
+             scale_factor (bool, optional): enable/disable mul scale factor. Defaults to True.
+             is_prefill (bool, optional): enable/disable prefill linear optimization. Defaults to False.
+             use_dq (bool, optional): use dynamic quantization in the prefill path. Defaults to True.
+             asym (bool, optional): enable/disable asymmetric quantization. Defaults to False.
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if is_prefill:
+             func = backend_lib.dq_split_linear_prefill if use_dq else backend_lib.gw_linear_prefill
+         else:
+             func = backend_lib.dq_split_linear
+         return func(self._mm, input_node, n_splits,
+                     input_channels, output_channels, bias,
+                     self.get_backend_dtype(act_dtype),
+                     self.get_backend_dtype(wt_dtype),
+                     scale_factor, asym)
+
+     @return_tensor
+     def reshape(
+         self, input_node: ctypes._Pointer, shape: Sequence[int],
+         special_zero: bool = True,
+         output_idx: int = 0
+     ) -> ctypes._Pointer:
+         """Generate a reshape layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             shape (Sequence[int]): shape
+             special_zero (bool, optional): if True, a zero in shape keeps the corresponding input dimension. Defaults to True.
+             output_idx (int, optional): index of the input node output to reshape. Defaults to 0.
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         shape_node = self.constant(shape).node  # type: ignore
+         return backend_lib.reshape(self._mm, input_node, shape_node,
+                                    special_zero, output_idx)
+
+     @return_tensor
+     def broadcast(
+         self, input_node: ctypes._Pointer, shape: Sequence[int]
+     ) -> ctypes._Pointer:
+         """Broadcast.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             shape (Sequence[int]): shape
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         shape_node = self.constant(shape).node  # type: ignore
+         return backend_lib.broadcast(self._mm, input_node, shape_node)
+
+     @return_tensor
+     def transpose(
+         self, input_node: ctypes._Pointer, input_order: Sequence[int]
+     ) -> ctypes._Pointer:
+         """Generate a transpose layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             input_order (Sequence[int]): input order
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         input_order_node = self.constant(input_order).node  # type: ignore
+         return backend_lib.transpose(self._mm, input_node, input_order_node)
+
+     @return_tensor
+     def unsqueeze(
+         self, input_node: ctypes._Pointer, axis: Sequence[int]
+     ) -> ctypes._Pointer:
+         """Generate an unsqueeze layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             axis (Sequence[int]): axis
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         axis_node = self.constant(axis).node  # type: ignore
+         return backend_lib.unsqueeze(self._mm, input_node, axis_node)
+
+     @return_tensor
+     def slice(
+         self,
+         input_node: ctypes._Pointer,
+         begin: Sequence[int],
+         end: Sequence[int],
+         stride: Optional[Sequence[int]] = None,
+     ) -> ctypes._Pointer:
+         """Generate a strided slice layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             begin (Sequence[int]): begin
+             end (Sequence[int]): end
+             stride (Optional[Sequence[int]]): stride
+
+         Raises:
+             ValueError: begin and end must have the same length
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if len(begin) != len(end):
+             raise ValueError("begin and end must have the same length")
+
+         if stride is None:
+             stride = [1] * len(begin)
+
+         begin_mask_ptr = np.zeros([len(begin)], dtype=np.uint32)
+         end_mask_ptr = np.zeros([len(begin)], dtype=np.uint32)
+
+         begin = self.constant(begin).node  # type: ignore
+         end = self.constant(end).node  # type: ignore
+         stride = self.constant(stride).node  # type: ignore
+
+         return backend_lib.slice(
+             self._mm,
+             input_node,
+             begin,
+             end,
+             stride,
+             begin_mask_ptr.size,
+             begin_mask_ptr,
+             end_mask_ptr.size,
+             end_mask_ptr,
+         )
+
+     @return_tensor
+     def simple_slice(
+         self,
+         input_node: ctypes._Pointer,
+         begin: Sequence[int],
+         end: Sequence[int],
+         step: Optional[Sequence[int]] = None,
+     ) -> ctypes._Pointer:
+         """Generate a slice layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             begin (Sequence[int]): begin
+             end (Sequence[int]): end
+             step (Optional[Sequence[int]]): step
+
+         Raises:
+             ValueError: begin and end must have the same length
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if len(begin) != len(end):
+             raise ValueError("begin and end must have the same length")
+
+         if step is None:
+             step = [1] * len(begin)
+
+         begin = self.constant(begin).node  # type: ignore
+         end = self.constant(end).node  # type: ignore
+         step = self.constant(step).node  # type: ignore
+
+         return backend_lib.simple_slice(
+             self._mm,
+             input_node,
+             begin,
+             end,
+             step,
+         )
+
+     @return_tensor
+     def concat(
+         self, input_node_1: ctypes._Pointer, input_node_2: ctypes._Pointer, axis: int
+     ) -> ctypes._Pointer:
+         """Generate a concatenation layer.
+
+         Args:
+             input_node_1 (ctypes._Pointer): first layer input node
+             input_node_2 (ctypes._Pointer): second layer input node
+             axis (int): axis
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if axis < 0:
+             shape_size = backend_lib.op_shape_size(input_node_1, 0)
+             axis = (axis + shape_size) % shape_size
+         axis = np.int64(axis)
+         return backend_lib.concat(self._mm, input_node_1, input_node_2, axis)
+
+     @return_tensor_for_list_inputs
+     def sequence_concat(
+         self, input_nodes: List[ctypes._Pointer], axis: int
+     ) -> ctypes._Pointer:
+         """Generate a concatenation layer over a sequence of inputs.
+
+         Args:
+             input_nodes (List[ctypes._Pointer]): sequence of layer input nodes
+             axis (int): axis
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if axis < 0:
+             shape_size = backend_lib.op_shape_size(input_nodes[0], 0)
+             axis = (axis + shape_size) % shape_size
+         axis = np.int64(axis)
+
+         # pack the python list into a C array of node pointers
+         input_ptr = (ctypes.POINTER(ctypes.c_char) * len(input_nodes))(*input_nodes)
+         return backend_lib.multi_concat(self._mm, input_ptr, len(input_nodes), axis)
+
+     @return_tensor
+     def reduce_max(
+         self,
+         input_node: ctypes._Pointer,
+         reduction_axes: Optional[Union[int, Sequence[int]]] = None,
+         keep_dims: Optional[bool] = False,
+     ) -> ctypes._Pointer:
+         """Generate a reduce max layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
+             keep_dims (Optional[bool]): if True, the reduced axes are kept with size 1. Defaults to False
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if reduction_axes is None:
+             # default to reducing over all axes
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             reduction_axes = list(range(shape_size - 1, -1, -1))
+         axis_node = self.constant(reduction_axes).node  # type: ignore
+         return backend_lib.reduce_max(self._mm, input_node, axis_node, keep_dims)
+
+     @return_tensor
+     def reduce_mean(
+         self,
+         input_node: ctypes._Pointer,
+         reduction_axes: Optional[Union[int, Sequence[int]]] = None,
+         keep_dims: Optional[bool] = False,
+     ) -> ctypes._Pointer:
+         """Generate a reduce mean layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
+             keep_dims (Optional[bool]): if True, the reduced axes are kept with size 1. Defaults to False
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if reduction_axes is None:
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             reduction_axes = list(range(shape_size - 1, -1, -1))
+         axis_node = self.constant(reduction_axes).node  # type: ignore
+         return backend_lib.reduce_mean(self._mm, input_node, axis_node, keep_dims)
+
+     @return_tensor
+     def reduce_min(
+         self,
+         input_node: ctypes._Pointer,
+         reduction_axes: Optional[Union[int, Sequence[int]]] = None,
+         keep_dims: Optional[bool] = False,
+     ) -> ctypes._Pointer:
+         """Generate a reduce min layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
+             keep_dims (Optional[bool]): if True, the reduced axes are kept with size 1. Defaults to False
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if reduction_axes is None:
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             reduction_axes = list(range(shape_size - 1, -1, -1))
+         axis_node = self.constant(reduction_axes).node  # type: ignore
+         return backend_lib.reduce_min(self._mm, input_node, axis_node, keep_dims)
+
+     @return_tensor
+     def reduce_prod(
+         self,
+         input_node: ctypes._Pointer,
+         reduction_axes: Optional[Union[int, Sequence[int]]] = None,
+         keep_dims: Optional[bool] = False,
+     ) -> ctypes._Pointer:
+         """Generate a reduce product layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
+             keep_dims (Optional[bool]): if True, the reduced axes are kept with size 1. Defaults to False
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if reduction_axes is None:
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             reduction_axes = list(range(shape_size - 1, -1, -1))
+         axis_node = self.constant(reduction_axes).node  # type: ignore
+         return backend_lib.reduce_prod(self._mm, input_node, axis_node, keep_dims)
+
+     @return_tensor
+     def reduce_sum(
+         self,
+         input_node: ctypes._Pointer,
+         reduction_axes: Optional[Union[int, Sequence[int]]] = None,
+         keep_dims: Optional[bool] = False,
+     ) -> ctypes._Pointer:
+         """Generate a reduce sum layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
+             keep_dims (Optional[bool]): if True, the reduced axes are kept with size 1. Defaults to False
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if reduction_axes is None:
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             reduction_axes = list(range(shape_size - 1, -1, -1))
+         axis_node = self.constant(reduction_axes).node  # type: ignore
+         return backend_lib.reduce_sum(self._mm, input_node, axis_node, keep_dims)
+
+     @return_tensor
+     def normL2(
+         self, input_node: ctypes._Pointer, axis: int, eps: Optional[float] = 1e-12
+     ) -> ctypes._Pointer:
+         """Generate an L2 normalization layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             axis (int): axis
+             eps (float, optional): epsilon added to L2 norm. Defaults to 1e-12
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if axis < 0:
+             shape_size = backend_lib.op_shape_size(input_node, 0)
+             axis = (axis + shape_size) % shape_size
+         axis_node = self.constant(axis).node  # type: ignore
+         return backend_lib.normL2(self._mm, input_node, axis_node, eps)
+
+     @return_tensor
+     def power(
+         self,
+         input_node: ctypes._Pointer,
+         exponent: Union[ctypes._Pointer, torch.Tensor],
+     ) -> ctypes._Pointer:
+         """Generate a power layer.
+
+         Args:
+             input_node (ctypes._Pointer): layer input node
+             exponent (Union[ctypes._Pointer, torch.Tensor]): the exponent value
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         input_shape_size = backend_lib.op_shape_size(input_node, 0)
+         input_shape = [
+             backend_lib.op_shape(input_node, i, 0) for i in range(input_shape_size)
+         ]
+         if isinstance(exponent, ctypes._Pointer):
+             exponent_shape_size = backend_lib.op_shape_size(exponent, 0)
+             exponent_shape = [
+                 backend_lib.op_shape(exponent, i, 0) for i in range(exponent_shape_size)
+             ]
+         else:
+             exponent_shape = list(exponent.shape)
+             exponent = self.constant(exponent).node  # type: ignore
+         # shape equality check intentionally disabled:
+         # if exponent_shape != input_shape:
+         #     raise ValueError("Input tensor shapes are not equal")
+
+         return backend_lib.power(self._mm, input_node, exponent)
+
+     @return_tensor
+     def variadic_split(
+         self,
+         input: ctypes._Pointer,
+         axis: int,
+         split_lengths: Sequence[int],
+     ) -> ctypes._Pointer:
+         """Generate a variadic split layer.
+
+         Args:
+             input (ctypes._Pointer): layer input node
+             axis (int): split axis
+             split_lengths (Sequence[int]): A list containing the sizes of each output tensor
+                 along the split "axis". The size of "split_lengths" should equal the number of
+                 outputs, and the sum of split_lengths must match data.shape[axis].
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         split_lens_ptr = np.array(split_lengths, dtype=np.uint32)
+
+         return backend_lib.variadic_split(
+             self._mm,
+             input,
+             axis,
+             split_lens_ptr,
+             split_lens_ptr.size,
+         )
+
+     @return_tensor
+     def avg_pooling(
+         self,
+         input: ctypes._Pointer,
+         kernel_size: Union[int, Sequence[int]],
+         strides: Optional[Union[int, Sequence[int]]] = None,
+         padding: int = 0,
+         ceil_mode: bool = False,
+         count_include_pad: bool = True,
+         divisor_override: Optional[int] = None,
+         n_spatial_dims: int = 2,
+     ) -> ctypes._Pointer:
+         """Generate an average pooling layer.
+
+         Args:
+             input (ctypes._Pointer): layer input node
+             kernel_size (Sequence[int]): kernel size
+             strides (Sequence[int]): strides
+             padding (int): padding
+             ceil_mode (bool): ceil mode
+             count_include_pad (bool): count include pad
+             divisor_override (int): divisor override
+             n_spatial_dims (int): number of spatial dimensions
+
+         Raises:
+             NotImplementedError: divisor_override is not supported
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if isinstance(kernel_size, int):
+             kernel_size = [kernel_size] * n_spatial_dims
+
+         if strides is None:
+             strides = kernel_size
+         elif isinstance(strides, int):
+             strides = [strides] * n_spatial_dims
+
+         if isinstance(padding, int):
+             padding_begins = [padding] * n_spatial_dims
+             padding_ends = [padding] * n_spatial_dims
+         else:
+             padding_begins = list(padding)
+             padding_ends = list(padding)
+
+         strides_ptr = np.array(strides, dtype=np.uint32)
+         padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
+         padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
+         kernel_size_ptr = np.array(kernel_size, dtype=np.uint32)
+
+         rounding_type = 1 if ceil_mode else 0
+         auto_pad = 0  # Hardcoded to explicit padding
+
+         if divisor_override:
+             raise NotImplementedError("divisor_override is not supported")
+
+         return backend_lib.avg_pooling(
+             self._mm,
+             input,
+             strides_ptr.size,
+             strides_ptr,
+             padding_begins_ptr.size,
+             padding_begins_ptr,
+             padding_ends_ptr.size,
+             padding_ends_ptr,
+             kernel_size_ptr.size,
+             kernel_size_ptr,
+             not count_include_pad,  # exclude_pad
+             rounding_type,  # rounding_type
+             auto_pad,  # auto_pad
+         )
+
+     @return_tensor
+     def max_pooling(
+         self,
+         input: ctypes._Pointer,
+         kernel_size: Union[int, Sequence[int]],
+         strides: Optional[Union[int, Sequence[int]]] = None,
+         padding: int = 0,
+         ceil_mode: bool = False,
+         n_spatial_dims: int = 2,
+     ) -> ctypes._Pointer:
+         """Generate a max pooling layer.
+
+         Args:
+             input (ctypes._Pointer): layer input node
+             kernel_size (Sequence[int]): kernel size
+             strides (Sequence[int]): strides
+             padding (int): padding
+             ceil_mode (bool): ceil mode
+             n_spatial_dims (int): number of spatial dimensions
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         if isinstance(kernel_size, int):
+             kernel_size = [kernel_size] * n_spatial_dims
+
+         if strides is None:
+             strides = kernel_size
+         elif isinstance(strides, int):
+             strides = [strides] * n_spatial_dims
+
+         if isinstance(padding, int):
+             padding_begins = [padding] * n_spatial_dims
+             padding_ends = [padding] * n_spatial_dims
+         else:
+             padding_begins = list(padding)
+             padding_ends = list(padding)
+
+         strides_ptr = np.array(strides, dtype=np.uint32)
+         padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
+         padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
+         kernel_size_ptr = np.array(kernel_size, dtype=np.uint32)
+
+         rounding_type = 1 if ceil_mode else 0
+         auto_pad = 0  # Hardcoded to explicit padding
+
+         return backend_lib.max_pooling(
+             self._mm,
+             input,
+             strides_ptr.size,
+             strides_ptr,
+             padding_begins_ptr.size,
+             padding_begins_ptr,
+             padding_ends_ptr.size,
+             padding_ends_ptr,
+             kernel_size_ptr.size,
+             kernel_size_ptr,
+             rounding_type,  # rounding_type
+             auto_pad,  # auto_pad
+         )
+
+     @return_tensor
+     def scaled_dot_product_attention(
+         self, query: ctypes._Pointer, key: ctypes._Pointer,
+         value: ctypes._Pointer, attn_mask: ctypes._Pointer,
+         is_causal: bool
+     ) -> ctypes._Pointer:
+         """Construct a ScaledDotProductAttention operation.
+
+         Args:
+             query (ctypes._Pointer): query
+             key (ctypes._Pointer): key
+             value (ctypes._Pointer): value
+             attn_mask (ctypes._Pointer): attention mask
+             is_causal (bool): causal/not causal
+
+         Returns:
+             ctypes._Pointer: output node
+         """
+         return backend_lib.scaled_dot_product_attention(self._mm,
+                                                         query, key,
+                                                         value, attn_mask,
+                                                         is_causal)
+
+     def get_tensor_shape(self, node, output_idx=0):
+         """Get tensor shape.
+
+         Args:
+             node: network node
+             output_idx (int, optional): node output index. Defaults to 0.
+
+         Returns:
+             tuple[int]: tensor shape
+         """
+         size = backend_lib.op_shape_size(node, output_idx)
+         return tuple([backend_lib.op_shape(node, idx, output_idx) for idx in range(size)])
+
+     def get_tensor_dtype(self, node, output_idx=0):
+         """Get tensor dtype.
+
+         Args:
+             node: network node
+             output_idx (int, optional): node output index. Defaults to 0.
+
+         Raises:
+             RuntimeError: Unsupported dtype
+
+         Returns:
+             str: tensor dtype
+         """
+         dtype_int = backend_lib.op_dtype(node, output_idx)
+
+         # Map the backend dtype enum to the corresponding numpy dtype
+         if dtype_int == 2:
+             return np.bool_  # np.bool was removed in NumPy >= 1.24
+         # elif dtype_int == 3:
+         #     return bfloat16
+         elif dtype_int == 4:
+             return np.float16
+         elif dtype_int == 5:
+             return np.float32
+         elif dtype_int == 6:
+             return np.float64
+         # elif dtype_int == 7:
+         #     return int4
+         elif dtype_int == 8:
+             return np.int8
+         elif dtype_int == 9:
+             return np.int16
+         elif dtype_int == 10:
+             return np.int32
+         elif dtype_int == 11:
+             return np.int64
+         else:
+             raise RuntimeError("Unsupported dtype")
+
+     def compile(self, npu_dpu_groups=4):
+         """Finalize and compile a model."""
+         self.out = []
+         self.torch_out = []
+         for node in self.output_nodes:
+             backend_lib.result(self._mm, node)
+
+         # Compile the model
+         backend_lib.compile(self._mm, npu_dpu_groups)
+
+         # Allocate one output buffer per output node
+         for idx, node in enumerate(self.output_nodes):
+             output_shape = self.get_tensor_shape(node)
+             output_dtype = self.get_tensor_dtype(node)
+
+             tensor = np.empty(output_shape, dtype=output_dtype)
+             ptr = tensor.ctypes.data_as(ctypes.c_void_p)
+             backend_lib.set_output(self._mm, ptr, idx)
+             self.out.append(tensor)
+             self.torch_out.append(torch.from_numpy(tensor))
+
+     def set_input_tensor(self, tensor: np.ndarray, idx: int):
+         """Set input tensor.
+
+         Args:
+             tensor (np.ndarray): Input tensor
+             idx (int): tensor index
+         """
+         backend_lib.set_activation(
+             self._mm, tensor.ctypes.data_as(ctypes.c_void_p), idx
+         )
+
+     def get_tensor_recursively(self, args: Sequence[Any]) -> List[np.ndarray]:
+         """Get tensors recursively from a list of arguments.
+
+         Args:
+             args (Sequence[Any]): Sequence of tensors, tuples of tensors and additional arguments
+
+         Returns:
+             List[np.ndarray]: Sequence of tensors
+         """
+         tensors = []
+         for t in args:
+             if isinstance(t, (list, tuple)):
+                 tensors.extend(self.get_tensor_recursively(t))
+             elif isinstance(t, np.ndarray):
+                 tensors.append(t)
+
+         return tensors
+
+     def run(
+         self,
+         X: List[np.ndarray],
+         *weights: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]],
+         **kwargs: Any,
+     ) -> np.ndarray:
+         """Run the model: X * W^T.
+
+         Args:
+             X (List[np.ndarray]): lhs operators (a single np.ndarray is also accepted)
+             weights (Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]): rhs operators
+             kwargs (Any): additional arguments
+
+         Returns:
+             np.ndarray: result
+         """
+         if isinstance(X, np.ndarray):
+             X = [X]
+
+         op_id = kwargs.get("op_id", None)
+         verify_size = kwargs.get("verify_size", False)
+         if op_id is None:
+             # No op id: bind the weights directly as activations after the inputs
+             ww = self.get_tensor_recursively(weights)
+             for idx, weight in enumerate(ww):
+                 self.set_input_tensor(weight, idx + 1)
+             prefetch = False
+         else:
+             prefetch = self.setWeights(len(X), op_id, *weights, verify_size=verify_size)
+
+         for idx, elem in enumerate(X):
+             self.set_input_tensor(elem, idx)
+
+         self.elapsed = backend_lib.run(self._mm)
+
+         if prefetch:
+             self.prefetchWeights(len(X), verify_size=verify_size)
+
+         if len(self.out) == 1:
+             return self.out[0]
+         return self.out
+
+     def __call__(self, *args: Any, **kwargs: Any) -> Union[torch.Tensor, List[torch.Tensor]]:
+         """Run the model using the factory.
+
+         Args:
+             args (Any): The positional arguments.
+             kwargs (Any): The keyword arguments.
+
+         Returns:
+             Union[torch.Tensor, List[torch.Tensor]]: The output tensor(s).
+         """
+         # Detach any torch tensors into numpy arrays before running
+         args = tuple(
+             arg.detach().numpy() if isinstance(arg, torch.Tensor) else arg
+             for arg in args
+         )
+         kwargs = {
+             k: arg.detach().numpy() if isinstance(arg, torch.Tensor) else arg
+             for k, arg in kwargs.items()
+         }
+
+         out = self.run(*args, **kwargs)
+         if isinstance(out, list):
+             return [torch.tensor(o, device=torch.device("npu")) for o in out]
+         return torch.tensor(out, device=torch.device("npu"))
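
Usage note: the diff above adds the NNFactory graph builder. As a minimal, hypothetical sketch of how the pieces fit together, inferred only from the parameter, matmul, compile and run methods shown in the diff (the shapes and dtypes are illustrative, and actually executing it assumes an Intel NPU plus the bundled driver and runtime DLLs):

import numpy as np
from intel_npu_acceleration_library.backend.factory import NNFactory

# Build out = X @ W^T with X: (1, 128) and W: (64, 128), both float16
factory = NNFactory(device="NPU")
x = factory.parameter((1, 128), dtype=np.float16)   # activation input (index 0)
w = factory.parameter((64, 128), dtype=np.float16)  # weight input (index 1)
_ = factory.matmul(x, w, trA=False, trB=True)       # trB=True computes X @ W^T

factory.compile()  # marks outputs, compiles, allocates output buffers

X = np.random.rand(1, 128).astype(np.float16)
W = np.random.rand(64, 128).astype(np.float16)
out = factory.run(X, W)  # np.ndarray of shape (1, 64), dtype float16

Here run binds X to activation index 0 and each weight to the following indices, executes the compiled model, and returns the preallocated output buffer (a list when the model has several outputs).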