bigdl-core-npu 2.5.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223) hide show
  1. bigdl_core_npu-2.5.0.dist-info/METADATA +35 -0
  2. bigdl_core_npu-2.5.0.dist-info/RECORD +223 -0
  3. bigdl_core_npu-2.5.0.dist-info/WHEEL +5 -0
  4. bigdl_core_npu-2.5.0.dist-info/top_level.txt +1 -0
  5. intel_npu_acceleration_library/__init__.py +24 -0
  6. intel_npu_acceleration_library/_version.py +6 -0
  7. intel_npu_acceleration_library/backend/__init__.py +37 -0
  8. intel_npu_acceleration_library/backend/base.py +215 -0
  9. intel_npu_acceleration_library/backend/bindings.py +279 -0
  10. intel_npu_acceleration_library/backend/compression.py +24 -0
  11. intel_npu_acceleration_library/backend/convolution.py +58 -0
  12. intel_npu_acceleration_library/backend/factory.py +944 -0
  13. intel_npu_acceleration_library/backend/linear.py +60 -0
  14. intel_npu_acceleration_library/backend/matmul.py +59 -0
  15. intel_npu_acceleration_library/backend/mlp.py +58 -0
  16. intel_npu_acceleration_library/backend/ops.py +141 -0
  17. intel_npu_acceleration_library/backend/qlinear.py +71 -0
  18. intel_npu_acceleration_library/backend/qmatmul.py +66 -0
  19. intel_npu_acceleration_library/backend/runtime.py +210 -0
  20. intel_npu_acceleration_library/backend/sdpa.py +107 -0
  21. intel_npu_acceleration_library/backend/tensor.py +1050 -0
  22. intel_npu_acceleration_library/backend/utils.py +70 -0
  23. intel_npu_acceleration_library/compiler.py +194 -0
  24. intel_npu_acceleration_library/device.py +230 -0
  25. intel_npu_acceleration_library/dtypes.py +122 -0
  26. intel_npu_acceleration_library/external/openvino/__init__.py +71 -0
  27. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +20 -0
  28. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  29. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  30. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  31. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  32. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  33. intel_npu_acceleration_library/external/openvino/frontend/__init__.py +34 -0
  34. intel_npu_acceleration_library/external/openvino/frontend/frontend.py +44 -0
  35. intel_npu_acceleration_library/external/openvino/frontend/onnx/__init__.py +15 -0
  36. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  37. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  38. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  39. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  40. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  41. intel_npu_acceleration_library/external/openvino/frontend/paddle/__init__.py +15 -0
  42. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  43. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  44. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  45. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  46. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  47. intel_npu_acceleration_library/external/openvino/frontend/pytorch/__init__.py +19 -0
  48. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +352 -0
  49. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +139 -0
  50. intel_npu_acceleration_library/external/openvino/frontend/pytorch/module_extension.py +39 -0
  51. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +98 -0
  52. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  53. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  54. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  55. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  56. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  57. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +119 -0
  58. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend_utils.py +85 -0
  59. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/compile.py +141 -0
  60. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/decompositions.py +116 -0
  61. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/execute.py +189 -0
  62. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +289 -0
  63. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +118 -0
  64. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +536 -0
  65. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +256 -0
  66. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/__init__.py +16 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/graph_iterator.py +116 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/node_decoder.py +219 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +460 -0
  75. intel_npu_acceleration_library/external/openvino/helpers/__init__.py +6 -0
  76. intel_npu_acceleration_library/external/openvino/helpers/packing.py +87 -0
  77. intel_npu_acceleration_library/external/openvino/preprocess/README.md +60 -0
  78. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +26 -0
  79. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/__init__.py +15 -0
  80. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/preprocess_converter.py +47 -0
  81. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +4 -0
  82. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/torchvision_preprocessing.py +347 -0
  83. intel_npu_acceleration_library/external/openvino/properties/__init__.py +21 -0
  84. intel_npu_acceleration_library/external/openvino/properties/_properties.py +55 -0
  85. intel_npu_acceleration_library/external/openvino/properties/device/__init__.py +14 -0
  86. intel_npu_acceleration_library/external/openvino/properties/hint/__init__.py +15 -0
  87. intel_npu_acceleration_library/external/openvino/properties/intel_auto/__init__.py +12 -0
  88. intel_npu_acceleration_library/external/openvino/properties/intel_cpu/__init__.py +8 -0
  89. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/__init__.py +12 -0
  90. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/hint/__init__.py +11 -0
  91. intel_npu_acceleration_library/external/openvino/properties/log/__init__.py +11 -0
  92. intel_npu_acceleration_library/external/openvino/properties/streams/__init__.py +11 -0
  93. intel_npu_acceleration_library/external/openvino/runtime/__init__.py +85 -0
  94. intel_npu_acceleration_library/external/openvino/runtime/exceptions.py +17 -0
  95. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +631 -0
  96. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +18 -0
  97. intel_npu_acceleration_library/external/openvino/runtime/op/util/__init__.py +22 -0
  98. intel_npu_acceleration_library/external/openvino/runtime/opset1/__init__.py +112 -0
  99. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +3067 -0
  100. intel_npu_acceleration_library/external/openvino/runtime/opset10/__init__.py +179 -0
  101. intel_npu_acceleration_library/external/openvino/runtime/opset10/ops.py +173 -0
  102. intel_npu_acceleration_library/external/openvino/runtime/opset11/__init__.py +179 -0
  103. intel_npu_acceleration_library/external/openvino/runtime/opset11/ops.py +107 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/opset12/__init__.py +180 -0
  105. intel_npu_acceleration_library/external/openvino/runtime/opset12/ops.py +120 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/opset13/__init__.py +188 -0
  107. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +399 -0
  108. intel_npu_acceleration_library/external/openvino/runtime/opset14/__init__.py +190 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset14/ops.py +171 -0
  110. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +10 -0
  111. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +85 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/opset2/__init__.py +118 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/opset2/ops.py +216 -0
  114. intel_npu_acceleration_library/external/openvino/runtime/opset3/__init__.py +134 -0
  115. intel_npu_acceleration_library/external/openvino/runtime/opset3/ops.py +638 -0
  116. intel_npu_acceleration_library/external/openvino/runtime/opset4/__init__.py +145 -0
  117. intel_npu_acceleration_library/external/openvino/runtime/opset4/ops.py +464 -0
  118. intel_npu_acceleration_library/external/openvino/runtime/opset5/__init__.py +152 -0
  119. intel_npu_acceleration_library/external/openvino/runtime/opset5/ops.py +372 -0
  120. intel_npu_acceleration_library/external/openvino/runtime/opset6/__init__.py +154 -0
  121. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +189 -0
  122. intel_npu_acceleration_library/external/openvino/runtime/opset7/__init__.py +158 -0
  123. intel_npu_acceleration_library/external/openvino/runtime/opset7/ops.py +169 -0
  124. intel_npu_acceleration_library/external/openvino/runtime/opset8/__init__.py +169 -0
  125. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +783 -0
  126. intel_npu_acceleration_library/external/openvino/runtime/opset9/__init__.py +175 -0
  127. intel_npu_acceleration_library/external/openvino/runtime/opset9/ops.py +341 -0
  128. intel_npu_acceleration_library/external/openvino/runtime/opset_utils.py +22 -0
  129. intel_npu_acceleration_library/external/openvino/runtime/passes/__init__.py +19 -0
  130. intel_npu_acceleration_library/external/openvino/runtime/passes/graph_rewrite.py +33 -0
  131. intel_npu_acceleration_library/external/openvino/runtime/passes/manager.py +26 -0
  132. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +38 -0
  133. intel_npu_acceleration_library/external/openvino/runtime/properties/hint/__init__.py +25 -0
  134. intel_npu_acceleration_library/external/openvino/runtime/utils/__init__.py +7 -0
  135. intel_npu_acceleration_library/external/openvino/runtime/utils/broadcasting.py +44 -0
  136. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/__init__.py +8 -0
  137. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +429 -0
  138. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/wrappers.py +148 -0
  139. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +70 -0
  140. intel_npu_acceleration_library/external/openvino/runtime/utils/input_validation.py +133 -0
  141. intel_npu_acceleration_library/external/openvino/runtime/utils/node_factory.py +127 -0
  142. intel_npu_acceleration_library/external/openvino/runtime/utils/reduction.py +25 -0
  143. intel_npu_acceleration_library/external/openvino/runtime/utils/types.py +175 -0
  144. intel_npu_acceleration_library/external/openvino/tools/__init__.py +4 -0
  145. intel_npu_acceleration_library/external/openvino/tools/benchmark/__init__.py +3 -0
  146. intel_npu_acceleration_library/external/openvino/tools/benchmark/benchmark.py +186 -0
  147. intel_npu_acceleration_library/external/openvino/tools/benchmark/main.py +695 -0
  148. intel_npu_acceleration_library/external/openvino/tools/benchmark/parameters.py +199 -0
  149. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/__init__.py +3 -0
  150. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/constants.py +26 -0
  151. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +482 -0
  152. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/logging.py +8 -0
  153. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/statistics_report.py +296 -0
  154. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/utils.py +836 -0
  155. intel_npu_acceleration_library/external/openvino/tools/ovc/__init__.py +20 -0
  156. intel_npu_acceleration_library/external/openvino/tools/ovc/__main__.py +10 -0
  157. intel_npu_acceleration_library/external/openvino/tools/ovc/cli_parser.py +633 -0
  158. intel_npu_acceleration_library/external/openvino/tools/ovc/convert.py +102 -0
  159. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_data_type.py +82 -0
  160. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +536 -0
  161. intel_npu_acceleration_library/external/openvino/tools/ovc/environment_setup_utils.py +50 -0
  162. intel_npu_acceleration_library/external/openvino/tools/ovc/error.py +49 -0
  163. intel_npu_acceleration_library/external/openvino/tools/ovc/get_ov_update_message.py +16 -0
  164. intel_npu_acceleration_library/external/openvino/tools/ovc/help.py +45 -0
  165. intel_npu_acceleration_library/external/openvino/tools/ovc/logger.py +91 -0
  166. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +35 -0
  167. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/__init__.py +2 -0
  168. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/analysis.py +46 -0
  169. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/check_config.py +57 -0
  170. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/extractor.py +447 -0
  171. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/layout_utils.py +73 -0
  172. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/moc_emit_ir.py +32 -0
  173. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/offline_transformations.py +107 -0
  174. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/paddle_frontend_utils.py +83 -0
  175. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +246 -0
  176. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/preprocessing.py +220 -0
  177. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +205 -0
  178. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/shape_utils.py +109 -0
  179. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/type_utils.py +82 -0
  180. intel_npu_acceleration_library/external/openvino/tools/ovc/ovc.py +13 -0
  181. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_params.py +6 -0
  182. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_stub.py +28 -0
  183. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_utils.py +118 -0
  184. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +109 -0
  185. intel_npu_acceleration_library/external/openvino/tools/ovc/version.py +80 -0
  186. intel_npu_acceleration_library/external/openvino/torch/__init__.py +5 -0
  187. intel_npu_acceleration_library/external/openvino/utils.py +98 -0
  188. intel_npu_acceleration_library/functional/__init__.py +8 -0
  189. intel_npu_acceleration_library/functional/scaled_dot_product_attention.py +47 -0
  190. intel_npu_acceleration_library/lib/Release/cache.json +113732 -0
  191. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  192. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  193. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  194. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  195. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  196. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  197. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  198. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  199. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  200. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  201. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  202. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  203. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  204. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  205. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  206. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  207. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  208. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  209. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  210. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  211. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  212. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  213. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  214. intel_npu_acceleration_library/modelling.py +150 -0
  215. intel_npu_acceleration_library/nn/__init__.py +20 -0
  216. intel_npu_acceleration_library/nn/autograd.py +68 -0
  217. intel_npu_acceleration_library/nn/conv.py +257 -0
  218. intel_npu_acceleration_library/nn/functional.py +1207 -0
  219. intel_npu_acceleration_library/nn/linear.py +162 -0
  220. intel_npu_acceleration_library/nn/llm.py +417 -0
  221. intel_npu_acceleration_library/nn/module.py +393 -0
  222. intel_npu_acceleration_library/optimizations.py +157 -0
  223. intel_npu_acceleration_library/quantization.py +174 -0
@@ -0,0 +1,944 @@
1
+ #
2
+ # Copyright © 2024 Intel Corporation
3
+ # SPDX-License-Identifier: Apache 2.0
4
+ #
5
+
6
+ from intel_npu_acceleration_library.backend.base import BaseNPUBackendWithPrefetch
7
+ from intel_npu_acceleration_library.backend.ops import get_supported_ops
8
+ from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
9
+ from intel_npu_acceleration_library.backend.tensor import Tensor
10
+ from intel_npu_acceleration_library.dtypes import int4, bfloat16
11
+ from typing import Optional, Tuple, Any, Union, Sequence, TypeVar, Callable, cast, List
12
+ from functools import partial
13
+ import numpy.typing as npt
14
+ import numpy as np
15
+ import ctypes
16
+ import torch
17
+
18
+
19
+ F = TypeVar("F", bound=Callable[..., Any])
20
+
21
+
22
+ class NNFactory(BaseNPUBackendWithPrefetch):
23
+ """Linear class, computing a matrix matrix multiplication with weights prefetching."""
24
+
25
    def __init__(
        self,
        profile: bool = False,
        device: str = "NPU",
    ):
        """Initialize the NNFactory.

        Args:
            profile (Optional[bool], optional): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
        """
        super().__init__(profile)
        self.device = device
        # Create the native factory handle; the device name is passed as a C string.
        self._mm = backend_lib.createNNFactory(
            ctypes.c_char_p(self.device.encode()),
            profile,
        )
        # Populated elsewhere (presumably by the profiling machinery); None until then.
        self.elapsed = None
        # Nodes currently considered graph outputs; maintained by the
        # return_tensor wrapper around every op-producing method.
        self.output_nodes: Sequence[ctypes._Pointer] = []

        # Expose every supported backend op as a bound method (stripping the
        # "_act" suffix, e.g. "relu_act" -> self.relu), without clobbering any
        # method that is explicitly defined on this class.
        for op in get_supported_ops():
            if not hasattr(self, op.name.replace("_act", "")):
                setattr(
                    self,
                    op.name.replace("_act", ""),
                    partial(self._call_backend_op, op.name),
                )
52
+
53
    def return_tensor(fn: F) -> F:  # type: ignore
        """Wrap the output of a function in a Tensor object.

        Also maintains ``self.output_nodes``: nodes consumed as inputs by the
        wrapped op are removed from the output set, and the newly produced node
        is added (constants excluded).

        Args:
            fn (function): Function

        Returns:
            function: A function that wraps the output in a Tensor object
        """

        def wrapper(self, *args: Any, **kwargs: Any) -> Tensor:
            """Wrap the output of a function in a Tensor object.

            Args:
                args (Any): Variable length argument list
                kwargs (Any): Arbitrary keyword arguments

            Returns:
                Tensor: Tensor object
            """
            # Convert Tensor objects to their underlying node
            args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
            kwargs = {
                k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
            }

            # Collect the raw node pointers that this op consumes as inputs.
            input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
                v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
            ]
            # Call the function
            node = fn(self, *args, **kwargs)

            # remove input nodes from output_nodes: once consumed by another
            # op they are intermediate values, not graph outputs
            self.output_nodes = [
                node for node in self.output_nodes if node not in input_nodes
            ]
            # add output node to output_nodes (constants are excluded —
            # presumably they are never treated as graph outputs)
            if fn.__name__ != "constant":
                self.output_nodes.append(node)

            # Wrap the node in a Tensor object
            return Tensor(factory=self, node=node)

        return cast(F, wrapper)
97
+
98
+ @return_tensor
99
+ def _call_backend_op(self, op_name: str, *parameters: Any) -> Any:
100
+ """Dynamically call a backend operation.
101
+
102
+ Args:
103
+ op_name (str): operation name
104
+ parameters (Any): variable list of operation parameters
105
+
106
+ Returns:
107
+ Any: Operation
108
+ """
109
+ fn = getattr(backend_lib, op_name)
110
+ return fn(self._mm, *parameters)
111
+
112
+ def get_backend_dtype(self, dtype) -> ctypes.c_char_p:
113
+ """Get the string representation of the dtype.
114
+
115
+ Args:
116
+ dtype: numpy dtype
117
+
118
+ Raises:
119
+ RuntimeError: Unsupported datatype
120
+
121
+ Returns:
122
+ ctypes.c_char_p: string representation of the dtype
123
+ """
124
+ if dtype in [np.int8, torch.int8]:
125
+ str_dtype = "int8"
126
+ elif dtype == np.uint8 or dtype == int4:
127
+ # u8 represents packed i4 dtypes
128
+ str_dtype = "int4"
129
+ elif dtype in [np.int16, torch.int16]:
130
+ str_dtype = "int16"
131
+ elif dtype in [np.int32, torch.int32]:
132
+ str_dtype = "int32"
133
+ elif dtype in [np.int64, torch.int64]:
134
+ str_dtype = "int64"
135
+ elif dtype in [np.float16, torch.float16]:
136
+ str_dtype = "float16"
137
+ elif dtype in [np.float32, torch.float32]:
138
+ str_dtype = "float32"
139
+ elif dtype in [np.float64, torch.float64]:
140
+ str_dtype = "float64"
141
+ elif dtype in [bfloat16, torch.bfloat16]:
142
+ str_dtype = "bfloat16"
143
+ else:
144
+ raise RuntimeError(f"DType is not supported {dtype}")
145
+ return ctypes.c_char_p(str_dtype.encode())
146
+
147
+ @return_tensor
148
+ def parameter(
149
+ self, shape: Sequence[int], dtype: npt.DTypeLike = np.float16
150
+ ) -> ctypes._Pointer:
151
+ """Generate a model input parameter.
152
+
153
+ Args:
154
+ shape (Sequence[int]): Parameter shape
155
+ dtype (np.dtype, optional): parameter type np.int8, np.uint8 and np.float16 supported. Defaults to np.float16. Unit8 represents packed i4 dtypes
156
+
157
+ Returns:
158
+ ctypes._Pointer: an instance to a parameter object
159
+
160
+ """
161
+ shape_ptr = np.array(shape, dtype=np.uint32)
162
+ return backend_lib.parameter(
163
+ self._mm, shape_ptr.size, shape_ptr, self.get_backend_dtype(dtype)
164
+ )
165
+
166
+ @return_tensor
167
+ def to(self, tensor: ctypes._Pointer, dtype: npt.DTypeLike) -> ctypes._Pointer:
168
+ """Convert a tensor to a different dtype.
169
+
170
+ Args:
171
+ tensor (ctypes._Pointer): input tensor
172
+ dtype (npt.DTypeLike): target dtype
173
+
174
+ Returns:
175
+ ctypes._Pointer: output tensor
176
+ """
177
+ dtype_ptr = self.get_backend_dtype(dtype)
178
+ return backend_lib.to(self._mm, tensor, dtype_ptr)
179
+
180
    @return_tensor
    def constant(
        self,
        data: Union[np.array, Sequence[int], Sequence[float], int, float, torch.Tensor],
    ) -> ctypes._Pointer:
        """Generate a model input constant.

        Python scalars, sequences and torch tensors are first normalized to
        numpy arrays; a ``None`` input yields a NULL pointer.

        Args:
            data (Union[np.array, Sequence[int], Sequence[float], int, float, torch.Tensor]): constant data

        Returns:
            ctypes._Pointer: an instance to a constant object

        """
        if isinstance(data, (list, tuple)):
            # All-int sequences become int64, anything else float32.
            if all(isinstance(i, int) for i in data):
                data = np.array(data, dtype=np.int64)
            else:
                data = np.array(data, dtype=np.float32)
        elif isinstance(data, int):
            data = np.array([data], dtype=np.int64)
        elif isinstance(data, float):
            data = np.array([data], dtype=np.float32)
        elif isinstance(data, torch.Tensor):
            data = data.detach().numpy()
        elif data is None:
            # A NULL char pointer signals "no constant" to the native backend.
            return ctypes.cast(ctypes.c_void_p(0), ctypes.POINTER(ctypes.c_char))

        # Pass the raw buffer address; the numpy array must stay alive for the
        # duration of this call.
        dst = data.ctypes.data_as(ctypes.c_void_p)
        backend_dtype = self.get_backend_dtype(data.dtype)
        if data.dtype == np.uint8 or data.dtype == int4:
            # uint8 buffers carry packed int4 values: each byte stores two
            # elements, so the logical last dimension is twice the stored one.
            shape = []
            for size in data.shape:
                shape.append(size)

            shape[-1] = shape[-1] * 2
            shape_ptr = np.array(shape, dtype=np.uint32)
        else:
            shape_ptr = np.array(data.shape, dtype=np.uint32)
        return backend_lib.constant(
            self._mm, shape_ptr.size, shape_ptr, backend_dtype, dst
        )
222
+
223
+ @return_tensor
224
+ def matmul(
225
+ self,
226
+ input_node: ctypes._Pointer,
227
+ weights_node: ctypes._Pointer,
228
+ trA: bool = False,
229
+ trB: bool = True,
230
+ ) -> ctypes._Pointer:
231
+ """Generate a matrix multiplication layer.
232
+
233
+ Args:
234
+ input_node (ctypes._Pointer): layer input node
235
+ weights_node (ctypes._Pointer): weights node
236
+ trA (bool): transpose input node
237
+ trB (bool): transpose weights node
238
+
239
+ Returns:
240
+ ctypes._Pointer: output node
241
+ """
242
+ return backend_lib.matmul(self._mm, input_node, weights_node, trA, trB)
243
+
244
+ @return_tensor
245
+ def convolution(
246
+ self,
247
+ input_node: ctypes._Pointer,
248
+ weights_node: ctypes._Pointer,
249
+ bias: Optional[ctypes._Pointer] = None,
250
+ strides: Union[int, Sequence[int]] = 1,
251
+ padding: Union[int, Sequence[int]] = 0,
252
+ dilation: Union[int, Sequence[int]] = 1,
253
+ groups: int = 1,
254
+ act_dtype: npt.DTypeLike = np.float16,
255
+ n_spatial_dims: int = 2,
256
+ ) -> ctypes._Pointer:
257
+ """Generate a convolution layer.
258
+
259
+ Args:
260
+ input_node (ctypes._Pointer): layer input node
261
+ weights_node (ctypes._Pointer): weights node
262
+ bias (Optional[ctypes._Pointer}): bias node
263
+ strides (Sequence[int]): strides
264
+ padding (Sequence[int]): padding
265
+ dilation (Sequence[int]): dilation
266
+ groups (int): groups
267
+ act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
268
+ n_spatial_dims (int): number of spatial dimensions
269
+
270
+ Returns:
271
+ ctypes._Pointer: output node
272
+ """
273
+ if isinstance(strides, int):
274
+ strides = [strides] * n_spatial_dims
275
+
276
+ if isinstance(padding, int):
277
+ padding_begins = [padding] * n_spatial_dims
278
+ padding_ends = [padding] * n_spatial_dims
279
+ else:
280
+ padding_begins = list(padding)
281
+ padding_ends = list(padding)
282
+
283
+ if isinstance(dilation, int):
284
+ dilation = [dilation] * n_spatial_dims
285
+
286
+ strides_ptr = np.array(strides, dtype=np.uint32)
287
+ padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
288
+ padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
289
+ dilation_ptr = np.array(dilation, dtype=np.uint32)
290
+
291
+ if bias is not None:
292
+ bias_node = bias
293
+ else:
294
+ bias_node = ctypes.cast(ctypes.c_void_p(0), ctypes.POINTER(ctypes.c_char))
295
+
296
+ return backend_lib.convolution(
297
+ self._mm,
298
+ input_node,
299
+ weights_node,
300
+ bias_node,
301
+ strides_ptr.size,
302
+ strides_ptr,
303
+ padding_begins_ptr.size,
304
+ padding_begins_ptr,
305
+ padding_ends_ptr.size,
306
+ padding_ends_ptr,
307
+ dilation_ptr.size,
308
+ dilation_ptr,
309
+ groups,
310
+ self.get_backend_dtype(act_dtype),
311
+ )
312
+
313
+ @return_tensor
314
+ def linear(
315
+ self,
316
+ input_node: ctypes._Pointer,
317
+ output_channels: int,
318
+ input_channels: int,
319
+ bias: Optional[bool] = False,
320
+ act_dtype: npt.DTypeLike = np.float16,
321
+ wt_dtype: npt.DTypeLike = np.float16,
322
+ ) -> ctypes._Pointer:
323
+ """Generate a linear layer.
324
+
325
+ Args:
326
+ input_node (ctypes._Pointer): layer input node
327
+ output_channels (int): number of output channels
328
+ input_channels (int): number of input channels
329
+ bias (bool, optional): enable/disable bias. Defaults to False.
330
+ act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
331
+ wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
332
+
333
+ Returns:
334
+ ctypes._Pointer: output node
335
+ """
336
+ return backend_lib.linear(
337
+ self._mm,
338
+ input_node,
339
+ output_channels,
340
+ input_channels,
341
+ bias,
342
+ self.get_backend_dtype(act_dtype),
343
+ self.get_backend_dtype(wt_dtype),
344
+ )
345
+
346
+ @return_tensor
347
+ def reshape(
348
+ self, input_node: ctypes._Pointer, shape: Sequence[int]
349
+ ) -> ctypes._Pointer:
350
+ """Generate a reshape layer.
351
+
352
+ Args:
353
+ input_node (ctypes._Pointer): layer input node
354
+ shape (Sequence[int]): shape
355
+
356
+ Returns:
357
+ ctypes._Pointer: output node
358
+ """
359
+ shape_node = self.constant(shape).node # type: ignore
360
+ return backend_lib.reshape(self._mm, input_node, shape_node)
361
+
362
+ @return_tensor
363
+ def broadcast(
364
+ self, input_node: ctypes._Pointer, shape: Sequence[int]
365
+ ) -> ctypes._Pointer:
366
+ """Broadcast.
367
+
368
+ Args:
369
+ input_node (ctypes._Pointer): layer input node
370
+ shape (Sequence[int]): shape
371
+
372
+ Returns:
373
+ ctypes._Pointer: output node
374
+ """
375
+ shape_node = self.constant(shape).node # type: ignore
376
+ return backend_lib.broadcast(self._mm, input_node, shape_node)
377
+
378
+ @return_tensor
379
+ def transpose(
380
+ self, input_node: ctypes._Pointer, input_order: Sequence[int]
381
+ ) -> ctypes._Pointer:
382
+ """Generate a transpose layer.
383
+
384
+ Args:
385
+ input_node (ctypes._Pointer): layer input node
386
+ input_order (Sequence[int]): input order
387
+
388
+ Returns:
389
+ ctypes._Pointer: output node
390
+ """
391
+ input_order_node = self.constant(input_order).node # type: ignore
392
+ return backend_lib.transpose(self._mm, input_node, input_order_node)
393
+
394
+ @return_tensor
395
+ def unsqueeze(
396
+ self, input_node: ctypes._Pointer, axis: Sequence[int]
397
+ ) -> ctypes._Pointer:
398
+ """Generate an unsqueeze layer.
399
+
400
+ Args:
401
+ input_node (ctypes._Pointer): layer input node
402
+ axis (Sequence[int]): axis
403
+
404
+ Returns:
405
+ ctypes._Pointer: output node
406
+ """
407
+ axis_node = self.constant(axis).node # type: ignore
408
+ return backend_lib.unsqueeze(self._mm, input_node, axis_node)
409
+
410
+ @return_tensor
411
+ def slice(
412
+ self,
413
+ input_node: ctypes._Pointer,
414
+ begin: Sequence[int],
415
+ end: Sequence[int],
416
+ stride: Optional[Sequence[int]] = None,
417
+ ) -> ctypes._Pointer:
418
+ """Generate an unsqueeze layer.
419
+
420
+ Args:
421
+ input_node (ctypes._Pointer): layer input node
422
+ begin (Sequence[int]): begin
423
+ end (Sequence[int]): end
424
+ stride (Optional[Sequence[int]]): stride
425
+
426
+ Raises:
427
+ ValueError: begin and end must have the same length
428
+
429
+ Returns:
430
+ ctypes._Pointer: output node
431
+ """
432
+ if len(begin) != len(end):
433
+ raise ValueError("begin and end must have the same length")
434
+
435
+ if stride is None:
436
+ stride = [1] * len(begin)
437
+
438
+ begin_mask_ptr = np.zeros([len(begin)], dtype=np.uint32)
439
+ end_mask_ptr = np.zeros([len(begin)], dtype=np.uint32)
440
+
441
+ begin = self.constant(begin).node # type: ignore
442
+ end = self.constant(end).node # type: ignore
443
+ stride = self.constant(stride).node # type: ignore
444
+
445
+ return backend_lib.slice(
446
+ self._mm,
447
+ input_node,
448
+ begin,
449
+ end,
450
+ stride,
451
+ begin_mask_ptr.size,
452
+ begin_mask_ptr,
453
+ end_mask_ptr.size,
454
+ end_mask_ptr,
455
+ )
456
+
457
+ @return_tensor
458
+ def concat(
459
+ self, input_node_1: ctypes._Pointer, input_node_2: ctypes._Pointer, axis: int
460
+ ) -> ctypes._Pointer:
461
+ """Generate a concatenation layer.
462
+
463
+ Args:
464
+ input_node_1 (ctypes._Pointer): first layer input node
465
+ input_node_2 (ctypes._Pointer): second layer input node
466
+ axis (int): axis
467
+
468
+ Returns:
469
+ ctypes._Pointer: output node
470
+ """
471
+ if axis < 0:
472
+ shape_size = backend_lib.op_shape_size(input_node_1)
473
+ axis = (axis + shape_size) % shape_size
474
+ axis = np.int64(axis)
475
+ return backend_lib.concat(self._mm, input_node_1, input_node_2, axis)
476
+
477
+ @return_tensor
478
+ def reduce_max(
479
+ self,
480
+ input_node: ctypes._Pointer,
481
+ reduction_axes: Optional[Union[int, Sequence[int]]] = None,
482
+ keep_dims: Optional[bool] = False,
483
+ ) -> ctypes._Pointer:
484
+ """Generate a reduce max layer.
485
+
486
+ Args:
487
+ input_node (ctypes._Pointer): layer input node
488
+ reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
489
+ keep_dims (Optional[bool]): if set to 1 it holds axes that are used for reduction. Defaults to False
490
+
491
+ Returns:
492
+ ctypes._Pointer: output node
493
+ """
494
+ if reduction_axes is None:
495
+ shape_size = backend_lib.op_shape_size(input_node)
496
+ reduction_axes = list(range(shape_size - 1, -1, -1))
497
+ axis_node = self.constant(reduction_axes).node # type: ignore
498
+ return backend_lib.reduce_max(self._mm, input_node, axis_node, keep_dims)
499
+
500
+ @return_tensor
501
+ def reduce_mean(
502
+ self,
503
+ input_node: ctypes._Pointer,
504
+ reduction_axes: Optional[Union[int, Sequence[int]]] = None,
505
+ keep_dims: Optional[bool] = False,
506
+ ) -> ctypes._Pointer:
507
+ """Generate a reduce mean layer.
508
+
509
+ Args:
510
+ input_node (ctypes._Pointer): layer input node
511
+ reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
512
+ keep_dims (Optional[bool] ): if set to 1 it holds axes that are used for reduction. Defaults to False
513
+
514
+ Returns:
515
+ ctypes._Pointer: output node
516
+ """
517
+ if reduction_axes is None:
518
+ shape_size = backend_lib.op_shape_size(input_node)
519
+ reduction_axes = list(range(shape_size - 1, -1, -1))
520
+ axis_node = self.constant(reduction_axes).node # type: ignore
521
+ return backend_lib.reduce_mean(self._mm, input_node, axis_node, keep_dims)
522
+
523
+ @return_tensor
524
+ def reduce_min(
525
+ self,
526
+ input_node: ctypes._Pointer,
527
+ reduction_axes: Optional[Union[int, Sequence[int]]] = None,
528
+ keep_dims: Optional[bool] = False,
529
+ ) -> ctypes._Pointer:
530
+ """Generate a reduce min layer.
531
+
532
+ Args:
533
+ input_node (ctypes._Pointer): layer input node
534
+ reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
535
+ keep_dims (Optional[bool] ): if set to 1 it holds axes that are used for reduction. Defaults to False
536
+
537
+ Returns:
538
+ ctypes._Pointer: output node
539
+ """
540
+ if reduction_axes is None:
541
+ shape_size = backend_lib.op_shape_size(input_node)
542
+ reduction_axes = list(range(shape_size - 1, -1, -1))
543
+ axis_node = self.constant(reduction_axes).node # type: ignore
544
+ return backend_lib.reduce_min(self._mm, input_node, axis_node, keep_dims)
545
+
546
+ @return_tensor
547
+ def reduce_prod(
548
+ self,
549
+ input_node: ctypes._Pointer,
550
+ reduction_axes: Optional[Union[int, Sequence[int]]] = None,
551
+ keep_dims: Optional[bool] = False,
552
+ ) -> ctypes._Pointer:
553
+ """Generate a reduce product layer.
554
+
555
+ Args:
556
+ input_node (ctypes._Pointer): layer input node
557
+ reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
558
+ keep_dims (Optional[bool] ): if set to 1 it holds axes that are used for reduction. Defaults to False
559
+
560
+ Returns:
561
+ ctypes._Pointer: output node
562
+ """
563
+ if reduction_axes is None:
564
+ shape_size = backend_lib.op_shape_size(input_node)
565
+ reduction_axes = list(range(shape_size - 1, -1, -1))
566
+ axis_node = self.constant(reduction_axes).node # type: ignore
567
+ return backend_lib.reduce_prod(self._mm, input_node, axis_node, keep_dims)
568
+
569
+ @return_tensor
570
+ def reduce_sum(
571
+ self,
572
+ input_node: ctypes._Pointer,
573
+ reduction_axes: Optional[Union[int, Sequence[int]]] = None,
574
+ keep_dims: Optional[bool] = False,
575
+ ) -> ctypes._Pointer:
576
+ """Generate a reduce sum layer.
577
+
578
+ Args:
579
+ input_node (ctypes._Pointer): layer input node
580
+ reduction_axes (Optional[Union[int, Sequence[int]]]): the axis positions to be reduced
581
+ keep_dims (Optional[bool] ): if set to 1 it holds axes that are used for reduction. Defaults to False
582
+
583
+ Returns:
584
+ ctypes._Pointer: output node
585
+ """
586
+ if reduction_axes is None:
587
+ shape_size = backend_lib.op_shape_size(input_node)
588
+ reduction_axes = list(range(shape_size - 1, -1, -1))
589
+ axis_node = self.constant(reduction_axes).node # type: ignore
590
+ return backend_lib.reduce_sum(self._mm, input_node, axis_node, keep_dims)
591
+
592
+ @return_tensor
593
+ def normL2(
594
+ self, input_node: ctypes._Pointer, axis: int, eps: Optional[float] = 1e-12
595
+ ) -> ctypes._Pointer:
596
+ """Generate an L2 normalization layer.
597
+
598
+ Args:
599
+ input_node (ctypes._Pointer): layer input node
600
+ axis (int): axis
601
+ eps (float, optional): epsilon added to L2 norm. Defaults to 1e-12
602
+
603
+ Returns:
604
+ ctypes._Pointer: output node
605
+ """
606
+ if axis < 0:
607
+ shape_size = backend_lib.op_shape_size(input_node)
608
+ axis = (axis + shape_size) % shape_size
609
+ axis_node = self.constant(axis).node # type: ignore
610
+ return backend_lib.normL2(self._mm, input_node, axis_node, eps)
611
+
612
+ @return_tensor
613
+ def power(
614
+ self,
615
+ input_node: ctypes._Pointer,
616
+ exponent: Union[ctypes._Pointer, torch.Tensor],
617
+ ) -> ctypes._Pointer:
618
+ """Generate a power layer.
619
+
620
+ Args:
621
+ input_node (ctypes._Pointer): layer input node
622
+ exponent (Union[ctypes._Pointer, torch.Tensor]): the exponent value
623
+
624
+ Raises:
625
+ ValueError: Input tensor shapes are not equal
626
+
627
+ Returns:
628
+ ctypes._Pointer: output node
629
+ """
630
+ input_shape_size = backend_lib.op_shape_size(input_node)
631
+ input_shape = [
632
+ backend_lib.op_shape(input_node, i) for i in range(input_shape_size)
633
+ ]
634
+ if isinstance(exponent, ctypes._Pointer):
635
+ exponent_shape_size = backend_lib.op_shape_size(input_node)
636
+ exponent_shape = [
637
+ backend_lib.op_shape(exponent, i) for i in range(exponent_shape_size)
638
+ ]
639
+ else:
640
+ exponent_shape = list(exponent.shape)
641
+ exponent = self.constant(exponent).node # type: ignore
642
+ # if exponent_shape != input_shape:
643
+ # raise ValueError("Input tensor shapes are not equal")
644
+
645
+ return backend_lib.power(self._mm, input_node, exponent)
646
+
647
+ @return_tensor
648
+ def avg_pooling(
649
+ self,
650
+ input: ctypes._Pointer,
651
+ kernel_size: Union[int, Sequence[int]],
652
+ strides: Optional[Union[int, Sequence[int]]] = None,
653
+ padding: int = 0,
654
+ ceil_mode: bool = False,
655
+ count_include_pad: bool = True,
656
+ divisor_override: Optional[int] = None,
657
+ n_spatial_dims: int = 2,
658
+ ) -> ctypes._Pointer:
659
+ """Generate an average pooling layer.
660
+
661
+ Args:
662
+ input (ctypes._Pointer): layer input node
663
+ kernel_size (Sequence[int]): kernel size
664
+ strides (Sequence[int]): strides
665
+ padding (int): padding
666
+ ceil_mode (bool): ceil mode
667
+ count_include_pad (bool): count include pad
668
+ divisor_override (int): divisor override
669
+ n_spatial_dims (int): number of spatial dimensions
670
+
671
+ Raises:
672
+ NotImplementedError: divisor_override is not supported
673
+
674
+ Returns:
675
+ ctypes._Pointer: output node
676
+ """
677
+ if isinstance(kernel_size, int):
678
+ kernel_size = [kernel_size] * n_spatial_dims
679
+
680
+ if strides is None:
681
+ strides = kernel_size
682
+ elif isinstance(strides, int):
683
+ strides = [strides] * n_spatial_dims
684
+
685
+ if isinstance(padding, int):
686
+ padding_begins = [padding] * n_spatial_dims
687
+ padding_ends = [padding] * n_spatial_dims
688
+ else:
689
+ padding_begins = list(padding)
690
+ padding_ends = list(padding)
691
+
692
+ strides_ptr = np.array(strides, dtype=np.uint32)
693
+ padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
694
+ padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
695
+ kernel_size_ptr = np.array(kernel_size, dtype=np.uint32)
696
+
697
+ rounding_type = 1 if ceil_mode else 0
698
+ auto_pad = 0 # Hardcoded to explicit padding
699
+
700
+ if divisor_override:
701
+ raise NotImplementedError("divisor_override is not supported")
702
+
703
+ return backend_lib.avg_pooling(
704
+ self._mm,
705
+ input,
706
+ strides_ptr.size,
707
+ strides_ptr,
708
+ padding_begins_ptr.size,
709
+ padding_begins_ptr,
710
+ padding_ends_ptr.size,
711
+ padding_ends_ptr,
712
+ kernel_size_ptr.size,
713
+ kernel_size_ptr,
714
+ not count_include_pad, # exclude_pad
715
+ rounding_type, # rounding_type
716
+ auto_pad, # auto_pad
717
+ )
718
+
719
+ @return_tensor
720
+ def max_pooling(
721
+ self,
722
+ input: ctypes._Pointer,
723
+ kernel_size: Union[int, Sequence[int]],
724
+ strides: Optional[Union[int, Sequence[int]]] = None,
725
+ padding: int = 0,
726
+ ceil_mode: bool = False,
727
+ n_spatial_dims: int = 2,
728
+ ) -> ctypes._Pointer:
729
+ """Generate an average pooling layer.
730
+
731
+ Args:
732
+ input (ctypes._Pointer): layer input node
733
+ kernel_size (Sequence[int]): kernel size
734
+ strides (Sequence[int]): strides
735
+ padding (int): padding
736
+ ceil_mode (bool): ceil mode
737
+ n_spatial_dims (int): number of spatial dimensions
738
+
739
+ Returns:
740
+ ctypes._Pointer: output node
741
+ """
742
+ if isinstance(kernel_size, int):
743
+ kernel_size = [kernel_size] * n_spatial_dims
744
+
745
+ if strides is None:
746
+ strides = kernel_size
747
+ elif isinstance(strides, int):
748
+ strides = [strides] * n_spatial_dims
749
+
750
+ if isinstance(padding, int):
751
+ padding_begins = [padding] * n_spatial_dims
752
+ padding_ends = [padding] * n_spatial_dims
753
+ else:
754
+ padding_begins = list(padding)
755
+ padding_ends = list(padding)
756
+
757
+ strides_ptr = np.array(strides, dtype=np.uint32)
758
+ padding_begins_ptr = np.array(padding_begins, dtype=np.uint32)
759
+ padding_ends_ptr = np.array(padding_ends, dtype=np.uint32)
760
+ kernel_size_ptr = np.array(kernel_size, dtype=np.uint32)
761
+
762
+ rounding_type = 1 if ceil_mode else 0
763
+ auto_pad = 0 # Hardcoded to explicit padding
764
+
765
+ return backend_lib.max_pooling(
766
+ self._mm,
767
+ input,
768
+ strides_ptr.size,
769
+ strides_ptr,
770
+ padding_begins_ptr.size,
771
+ padding_begins_ptr,
772
+ padding_ends_ptr.size,
773
+ padding_ends_ptr,
774
+ kernel_size_ptr.size,
775
+ kernel_size_ptr,
776
+ rounding_type, # rounding_type
777
+ auto_pad, # auto_pad
778
+ )
779
+
780
+ def get_tensor_shape(self, node):
781
+ """Get tensor shape.
782
+
783
+ Args:
784
+ node: network node
785
+
786
+ Returns:
787
+ tuple[int]: tensor shape
788
+ """
789
+ size = backend_lib.op_shape_size(node)
790
+ return tuple([backend_lib.op_shape(node, idx) for idx in range(size)])
791
+
792
+ def get_tensor_dtype(self, node):
793
+ """Get tensor dtype.
794
+
795
+ Args:
796
+ node: network node
797
+
798
+ Raises:
799
+ RuntimeError: Unsupported dtype
800
+
801
+ Returns:
802
+ str: tensor dtype
803
+ """
804
+ dtype_int = backend_lib.op_dtype(node)
805
+
806
+ if dtype_int == 2:
807
+ return np.bool
808
+ # elif dtype_int == 3:
809
+ # return bfloat16
810
+ elif dtype_int == 4:
811
+ return np.float16
812
+ elif dtype_int == 5:
813
+ return np.float32
814
+ elif dtype_int == 6:
815
+ return np.float64
816
+ # elif dtype_int == 7:
817
+ # return int4
818
+ elif dtype_int == 8:
819
+ return np.int8
820
+ elif dtype_int == 9:
821
+ return np.int16
822
+ elif dtype_int == 10:
823
+ return np.int32
824
+ elif dtype_int == 11:
825
+ return np.int64
826
+ else:
827
+ raise RuntimeError("Unsupported dtype")
828
+
829
+ def compile(self):
830
+ """Finalize and compile a model."""
831
+ self.out = []
832
+ self.torch_out = []
833
+ for node in self.output_nodes:
834
+ backend_lib.result(self._mm, node)
835
+
836
+ # Compile the model
837
+ backend_lib.compile(self._mm)
838
+
839
+ for idx, node in enumerate(self.output_nodes):
840
+ output_shape = self.get_tensor_shape(node)
841
+ output_dtype = self.get_tensor_dtype(node)
842
+
843
+ tensor = np.empty(output_shape, dtype=output_dtype)
844
+ ptr = tensor.ctypes.data_as(ctypes.c_void_p)
845
+ backend_lib.set_output(self._mm, ptr, idx)
846
+ self.out.append(tensor)
847
+ self.torch_out.append(torch.from_numpy(tensor))
848
+
849
+ def set_input_tensor(self, tensor: np.ndarray, idx: int):
850
+ """Set input tensor.
851
+
852
+ Args:
853
+ tensor (np.ndarray): Input tensor
854
+ idx (int): tensor index
855
+ """
856
+ backend_lib.set_activation(
857
+ self._mm, tensor.ctypes.data_as(ctypes.c_void_p), idx
858
+ )
859
+
860
+ def get_tensor_recursively(self, args: Sequence[Any]) -> List[np.ndarray]:
861
+ """Get tensor recursively for a list of arguments.
862
+
863
+ Args:
864
+ args (Sequence[Any]): Sequence of tensors, tuple of tensors and additional arguments
865
+
866
+ Returns:
867
+ List[np.ndarray]: Sequence of tensors
868
+ """
869
+ tensors = []
870
+ for t in args:
871
+ if isinstance(t, (list, tuple)):
872
+ tensors.extend(self.get_tensor_recursively(t))
873
+ elif isinstance(t, np.ndarray):
874
+ tensors.append(t)
875
+
876
+ return tensors
877
+
878
+ def run(
879
+ self,
880
+ X: List[np.ndarray],
881
+ *weights: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]],
882
+ **kwargs: Any,
883
+ ) -> np.ndarray:
884
+ """Run the layer: X * W^T.
885
+
886
+ Args:
887
+ X (np.ndarray): lhs operator
888
+ weights (Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]): rhs operators
889
+ kwargs (Any): additional arguments
890
+
891
+ Returns:
892
+ np.ndarray: result
893
+ """
894
+
895
+ if isinstance(X, np.ndarray):
896
+ X = [X]
897
+
898
+ op_id = kwargs.get("op_id", None)
899
+ verify_size = kwargs.get("verify_size", False)
900
+ if op_id is None:
901
+ ww = self.get_tensor_recursively(weights)
902
+ for idx, weight in enumerate(ww):
903
+ self.set_input_tensor(weight, idx + 1)
904
+ prefetch = False
905
+ else:
906
+ prefetch = self.setWeights(len(X), kwargs.get("op_id", None), *weights, verify_size=verify_size)
907
+
908
+ for idx, elem in enumerate(X):
909
+ self.set_input_tensor(elem, idx)
910
+
911
+ self.elapsed = backend_lib.run(self._mm)
912
+
913
+ if prefetch:
914
+ self.prefetchWeights(len(X), verify_size=verify_size)
915
+
916
+ if len(self.out) == 1:
917
+ return self.out[0]
918
+ return self.out
919
+
920
+ def __call__(self, *args: Any, **kwargs: Any) -> np.ndarray:
921
+ """Run the model using the factory.
922
+
923
+ Args:
924
+ args (Any): The positional arguments.
925
+ kwargs (Any): The keyword arguments.
926
+
927
+ Returns:
928
+ np.ndarray: The output tensor.
929
+ """
930
+ args = tuple(
931
+ [
932
+ arg.detach().numpy() if isinstance(arg, torch.Tensor) else arg
933
+ for arg in args
934
+ ]
935
+ )
936
+ kwargs = {
937
+ k: arg.detach().numpy() if isinstance(arg, torch.Tensor) else arg
938
+ for k, arg in kwargs.items()
939
+ }
940
+
941
+ out = self.run(*args, **kwargs)
942
+ if isinstance(out, list):
943
+ return [torch.tensor(o, device=torch.device("npu")) for o in out]
944
+ return torch.tensor(out, device=torch.device("npu"))