bigdl_core_npu-2.6.0b20250114-cp311-cp311-win_amd64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/include/common.h +96 -0
  3. bigdl-core-npu/include/npu_llm.h +74 -0
  4. bigdl-core-npu/npu_llm.dll +0 -0
  5. bigdl-core-npu/npu_llm.lib +0 -0
  6. bigdl_core_npu-2.6.0b20250114.dist-info/METADATA +44 -0
  7. bigdl_core_npu-2.6.0b20250114.dist-info/RECORD +234 -0
  8. bigdl_core_npu-2.6.0b20250114.dist-info/WHEEL +5 -0
  9. bigdl_core_npu-2.6.0b20250114.dist-info/top_level.txt +2 -0
  10. intel_npu_acceleration_library/__init__.py +24 -0
  11. intel_npu_acceleration_library/_version.py +6 -0
  12. intel_npu_acceleration_library/backend/__init__.py +37 -0
  13. intel_npu_acceleration_library/backend/base.py +250 -0
  14. intel_npu_acceleration_library/backend/bindings.py +383 -0
  15. intel_npu_acceleration_library/backend/compression.py +24 -0
  16. intel_npu_acceleration_library/backend/convolution.py +58 -0
  17. intel_npu_acceleration_library/backend/factory.py +1161 -0
  18. intel_npu_acceleration_library/backend/linear.py +60 -0
  19. intel_npu_acceleration_library/backend/matmul.py +59 -0
  20. intel_npu_acceleration_library/backend/mlp.py +58 -0
  21. intel_npu_acceleration_library/backend/ops.py +142 -0
  22. intel_npu_acceleration_library/backend/qlinear.py +75 -0
  23. intel_npu_acceleration_library/backend/qmatmul.py +66 -0
  24. intel_npu_acceleration_library/backend/runtime.py +215 -0
  25. intel_npu_acceleration_library/backend/sdpa.py +107 -0
  26. intel_npu_acceleration_library/backend/tensor.py +1120 -0
  27. intel_npu_acceleration_library/backend/utils.py +70 -0
  28. intel_npu_acceleration_library/compiler.py +194 -0
  29. intel_npu_acceleration_library/device.py +230 -0
  30. intel_npu_acceleration_library/dtypes.py +155 -0
  31. intel_npu_acceleration_library/external/openvino/__init__.py +72 -0
  32. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +21 -0
  33. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  34. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  35. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  36. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  37. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  38. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  39. intel_npu_acceleration_library/external/openvino/frontend/__init__.py +34 -0
  40. intel_npu_acceleration_library/external/openvino/frontend/frontend.py +44 -0
  41. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  42. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  43. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  44. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  45. intel_npu_acceleration_library/external/openvino/frontend/onnx/__init__.py +15 -0
  46. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  47. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  48. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  49. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  50. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  51. intel_npu_acceleration_library/external/openvino/frontend/paddle/__init__.py +15 -0
  52. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  53. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  54. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  55. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  56. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  57. intel_npu_acceleration_library/external/openvino/frontend/pytorch/__init__.py +19 -0
  58. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +370 -0
  59. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +180 -0
  60. intel_npu_acceleration_library/external/openvino/frontend/pytorch/module_extension.py +39 -0
  61. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +118 -0
  62. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +131 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend_utils.py +85 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/compile.py +141 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/decompositions.py +116 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/execute.py +189 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +290 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +126 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +568 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +258 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/__init__.py +16 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/graph_iterator.py +116 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/node_decoder.py +219 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  82. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  83. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  84. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +481 -0
  85. intel_npu_acceleration_library/external/openvino/helpers/__init__.py +6 -0
  86. intel_npu_acceleration_library/external/openvino/helpers/packing.py +87 -0
  87. intel_npu_acceleration_library/external/openvino/preprocess/README.md +60 -0
  88. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +28 -0
  89. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/__init__.py +15 -0
  90. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/preprocess_converter.py +47 -0
  91. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +5 -0
  92. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/torchvision_preprocessing.py +347 -0
  93. intel_npu_acceleration_library/external/openvino/properties/__init__.py +22 -0
  94. intel_npu_acceleration_library/external/openvino/properties/_properties.py +55 -0
  95. intel_npu_acceleration_library/external/openvino/properties/device/__init__.py +14 -0
  96. intel_npu_acceleration_library/external/openvino/properties/hint/__init__.py +15 -0
  97. intel_npu_acceleration_library/external/openvino/properties/intel_auto/__init__.py +12 -0
  98. intel_npu_acceleration_library/external/openvino/properties/intel_cpu/__init__.py +8 -0
  99. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/__init__.py +12 -0
  100. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/hint/__init__.py +11 -0
  101. intel_npu_acceleration_library/external/openvino/properties/log/__init__.py +11 -0
  102. intel_npu_acceleration_library/external/openvino/properties/streams/__init__.py +11 -0
  103. intel_npu_acceleration_library/external/openvino/runtime/__init__.py +85 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/exceptions.py +17 -0
  105. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +631 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +19 -0
  107. intel_npu_acceleration_library/external/openvino/runtime/op/util/__init__.py +22 -0
  108. intel_npu_acceleration_library/external/openvino/runtime/opset1/__init__.py +112 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +3068 -0
  110. intel_npu_acceleration_library/external/openvino/runtime/opset10/__init__.py +179 -0
  111. intel_npu_acceleration_library/external/openvino/runtime/opset10/ops.py +173 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/opset11/__init__.py +179 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/opset11/ops.py +107 -0
  114. intel_npu_acceleration_library/external/openvino/runtime/opset12/__init__.py +180 -0
  115. intel_npu_acceleration_library/external/openvino/runtime/opset12/ops.py +120 -0
  116. intel_npu_acceleration_library/external/openvino/runtime/opset13/__init__.py +188 -0
  117. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +398 -0
  118. intel_npu_acceleration_library/external/openvino/runtime/opset14/__init__.py +190 -0
  119. intel_npu_acceleration_library/external/openvino/runtime/opset14/ops.py +171 -0
  120. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +17 -0
  121. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +276 -0
  122. intel_npu_acceleration_library/external/openvino/runtime/opset2/__init__.py +118 -0
  123. intel_npu_acceleration_library/external/openvino/runtime/opset2/ops.py +216 -0
  124. intel_npu_acceleration_library/external/openvino/runtime/opset3/__init__.py +134 -0
  125. intel_npu_acceleration_library/external/openvino/runtime/opset3/ops.py +638 -0
  126. intel_npu_acceleration_library/external/openvino/runtime/opset4/__init__.py +145 -0
  127. intel_npu_acceleration_library/external/openvino/runtime/opset4/ops.py +464 -0
  128. intel_npu_acceleration_library/external/openvino/runtime/opset5/__init__.py +152 -0
  129. intel_npu_acceleration_library/external/openvino/runtime/opset5/ops.py +372 -0
  130. intel_npu_acceleration_library/external/openvino/runtime/opset6/__init__.py +154 -0
  131. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +215 -0
  132. intel_npu_acceleration_library/external/openvino/runtime/opset7/__init__.py +158 -0
  133. intel_npu_acceleration_library/external/openvino/runtime/opset7/ops.py +169 -0
  134. intel_npu_acceleration_library/external/openvino/runtime/opset8/__init__.py +169 -0
  135. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +787 -0
  136. intel_npu_acceleration_library/external/openvino/runtime/opset9/__init__.py +175 -0
  137. intel_npu_acceleration_library/external/openvino/runtime/opset9/ops.py +341 -0
  138. intel_npu_acceleration_library/external/openvino/runtime/opset_utils.py +22 -0
  139. intel_npu_acceleration_library/external/openvino/runtime/passes/__init__.py +19 -0
  140. intel_npu_acceleration_library/external/openvino/runtime/passes/graph_rewrite.py +33 -0
  141. intel_npu_acceleration_library/external/openvino/runtime/passes/manager.py +26 -0
  142. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +40 -0
  143. intel_npu_acceleration_library/external/openvino/runtime/properties/hint/__init__.py +25 -0
  144. intel_npu_acceleration_library/external/openvino/runtime/utils/__init__.py +7 -0
  145. intel_npu_acceleration_library/external/openvino/runtime/utils/broadcasting.py +44 -0
  146. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/__init__.py +8 -0
  147. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +447 -0
  148. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/wrappers.py +148 -0
  149. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +156 -0
  150. intel_npu_acceleration_library/external/openvino/runtime/utils/input_validation.py +133 -0
  151. intel_npu_acceleration_library/external/openvino/runtime/utils/node_factory.py +127 -0
  152. intel_npu_acceleration_library/external/openvino/runtime/utils/reduction.py +25 -0
  153. intel_npu_acceleration_library/external/openvino/runtime/utils/types.py +175 -0
  154. intel_npu_acceleration_library/external/openvino/tools/__init__.py +4 -0
  155. intel_npu_acceleration_library/external/openvino/tools/benchmark/__init__.py +3 -0
  156. intel_npu_acceleration_library/external/openvino/tools/benchmark/benchmark.py +186 -0
  157. intel_npu_acceleration_library/external/openvino/tools/benchmark/main.py +695 -0
  158. intel_npu_acceleration_library/external/openvino/tools/benchmark/parameters.py +199 -0
  159. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/__init__.py +3 -0
  160. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/constants.py +26 -0
  161. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +482 -0
  162. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/logging.py +8 -0
  163. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/statistics_report.py +296 -0
  164. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/utils.py +836 -0
  165. intel_npu_acceleration_library/external/openvino/tools/ovc/__init__.py +20 -0
  166. intel_npu_acceleration_library/external/openvino/tools/ovc/__main__.py +10 -0
  167. intel_npu_acceleration_library/external/openvino/tools/ovc/cli_parser.py +633 -0
  168. intel_npu_acceleration_library/external/openvino/tools/ovc/convert.py +102 -0
  169. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_data_type.py +82 -0
  170. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +550 -0
  171. intel_npu_acceleration_library/external/openvino/tools/ovc/environment_setup_utils.py +50 -0
  172. intel_npu_acceleration_library/external/openvino/tools/ovc/error.py +49 -0
  173. intel_npu_acceleration_library/external/openvino/tools/ovc/get_ov_update_message.py +16 -0
  174. intel_npu_acceleration_library/external/openvino/tools/ovc/help.py +45 -0
  175. intel_npu_acceleration_library/external/openvino/tools/ovc/logger.py +91 -0
  176. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +40 -0
  177. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/__init__.py +2 -0
  178. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/analysis.py +46 -0
  179. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/check_config.py +57 -0
  180. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/extractor.py +447 -0
  181. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  182. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/layout_utils.py +73 -0
  183. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/moc_emit_ir.py +32 -0
  184. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/offline_transformations.py +107 -0
  185. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/paddle_frontend_utils.py +83 -0
  186. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +298 -0
  187. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/preprocessing.py +220 -0
  188. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +214 -0
  189. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/shape_utils.py +109 -0
  190. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/type_utils.py +82 -0
  191. intel_npu_acceleration_library/external/openvino/tools/ovc/ovc.py +13 -0
  192. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_params.py +6 -0
  193. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_stub.py +28 -0
  194. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_utils.py +118 -0
  195. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +196 -0
  196. intel_npu_acceleration_library/external/openvino/tools/ovc/version.py +80 -0
  197. intel_npu_acceleration_library/external/openvino/torch/__init__.py +5 -0
  198. intel_npu_acceleration_library/external/openvino/utils.py +115 -0
  199. intel_npu_acceleration_library/functional/__init__.py +8 -0
  200. intel_npu_acceleration_library/functional/scaled_dot_product_attention.py +47 -0
  201. intel_npu_acceleration_library/lib/Release/cache.json +113732 -0
  202. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  203. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  204. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  205. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  206. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  207. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  208. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  209. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  210. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  211. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  212. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  213. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  214. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  215. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  216. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  217. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  218. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  219. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  220. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  221. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  222. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  223. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  224. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  225. intel_npu_acceleration_library/modelling.py +150 -0
  226. intel_npu_acceleration_library/nn/__init__.py +20 -0
  227. intel_npu_acceleration_library/nn/autograd.py +68 -0
  228. intel_npu_acceleration_library/nn/conv.py +257 -0
  229. intel_npu_acceleration_library/nn/functional.py +1207 -0
  230. intel_npu_acceleration_library/nn/linear.py +162 -0
  231. intel_npu_acceleration_library/nn/llm.py +417 -0
  232. intel_npu_acceleration_library/nn/module.py +393 -0
  233. intel_npu_acceleration_library/optimizations.py +157 -0
  234. intel_npu_acceleration_library/quantization.py +174 -0
@@ -0,0 +1,162 @@
+ #
+ # Copyright © 2024 Intel Corporation
+ # SPDX-License-Identifier: Apache 2.0
+ #
+
+ from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+ from intel_npu_acceleration_library.nn.autograd import AutogradMatMul
+ from intel_npu_acceleration_library.backend import run_matmul
+ from intel_npu_acceleration_library.dtypes import NPUDtype
+ from typing import Optional, Union
+ import torch
+ import uuid
+ import math
+
+
+ class Linear(torch.nn.Module):
+     """Torch Linear operation NPU backend."""
+
+     def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+         """Initialize the Linear class.
+
+         Args:
+             weight (torch.Tensor): Linear operation weight
+             bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None.
+         """
+         super().__init__()
+
+         self.weight = torch.nn.Parameter(weight)
+         self.bias = torch.nn.Parameter(bias) if isinstance(bias, torch.Tensor) else None
+         self.outC, self.inC = self.weight.shape
+         self.op_id = str(uuid.uuid4())
+         # assert self.weight.dtype == torch.float16
+         self._mm = AutogradMatMul.apply
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Torch module forward method.
+
+         Args:
+             x (torch.Tensor): Input tensor
+
+         Returns:
+             torch.Tensor: result
+         """
+         if self.training:
+             out = self._mm(x, self.weight, None)
+         else:
+             out = run_matmul(x, self.weight, None, self.op_id)
+
+         if self.bias is None:
+             return out
+         return out + self.bias
+
+     @staticmethod
+     def fromTorch(
+         layer: torch.nn.Linear, dtype: torch.dtype = torch.float16
+     ) -> Union["Linear", "QuantizedLinear"]:
+         """Generate an NPU Linear layer from a torch one.
+
+         Layers with a dimension larger than 2**17 are returned unchanged.
+
+         Args:
+             layer (torch.nn.Linear): the original torch.nn.Linear model to run on the NPU
+             dtype (torch.dtype): the desired datatype
+
+         Returns:
+             Union[Linear, QuantizedLinear]: An NPU linear layer
+         """
+         if any(dim > 2**17 for dim in layer.weight.shape):
+             return layer
+         return Linear.fromTensor(layer.weight, getattr(layer, "bias", None), dtype)
+
+     @staticmethod
+     def fromTensor(
+         weight: torch.Tensor,
+         bias: Optional[torch.Tensor],
+         dtype: torch.dtype = torch.float16,
+     ) -> Union["Linear", "QuantizedLinear"]:
+         """Generate an NPU Linear layer from a weight and bias tensor.
+
+         Args:
+             weight (torch.Tensor): the original weight tensor
+             bias (Optional[torch.Tensor]): the original bias tensor
+             dtype (torch.dtype): the desired datatype
+
+         Raises:
+             RuntimeError: dtype not supported
+
+         Returns:
+             Union[Linear, QuantizedLinear]: An NPU linear layer
+         """
+         if dtype.is_floating_point:
+             if bias is None:
+                 return Linear(weight.to(dtype), None)
+             return Linear(weight.to(dtype), bias.to(dtype))
+         elif isinstance(dtype, NPUDtype):
+             weights_quant, scale = quantize_tensor(weight, (dtype.min, dtype.max))
+             if dtype.bits == 4:
+                 weights_quant = compress_to_i4(weights_quant)
+             return QuantizedLinear(weights_quant, scale, bias)
+         elif dtype == torch.int8:
+             weights_quant, scale = quantize_tensor(weight)
+             return QuantizedLinear(weights_quant, scale, bias)
+         else:
+             raise RuntimeError(
+                 f"intel-npu-acceleration-library does not yet support the requested datatype: {dtype}"
+             )
+
+
+ class QuantizedLinear(torch.nn.Module):
+     """Torch Quantized Linear operation NPU backend."""
+
+     def __init__(
+         self,
+         weight: torch.Tensor,
+         scale: torch.Tensor,
+         bias: Optional[torch.Tensor] = None,
+     ):
+         """Initialize the QuantizedLinear class.
+
+         Args:
+             weight (torch.Tensor): Linear operation weight
+             scale (torch.Tensor): Quantization scale
+             bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None.
+
+         Raises:
+             RuntimeError: Quantized weight must be in torch.int8 or torch.uint8 format
+         """
+         super().__init__()
+
+         self.weight = weight
+         if self.weight.dtype not in (torch.int8, torch.uint8):
+             raise RuntimeError(
+                 f"Quantized weight must be in torch.(u)int8 dtype instead of {self.weight.dtype}"
+             )
+         self.outC, self.inC = self.weight.shape
+         if self.weight.dtype == torch.uint8:
+             # Int4 weights are compressed two per byte, so double the input channels
+             self.inC *= 2
+         self.scale = scale * math.sqrt(self.inC)
+         self.bias = bias
+         self.op_id = str(uuid.uuid4())
+         self._mm = AutogradMatMul.apply
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Torch module forward method.
+
+         Args:
+             x (torch.Tensor): Input tensor
+
+         Raises:
+             RuntimeError: Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only
+
+         Returns:
+             torch.Tensor: result
+         """
+         if self.training:
+             raise RuntimeError(
+                 "Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only"
+             )
+         out = run_matmul(x, self.weight, self.scale, self.op_id)
+
+         if self.bias is None:
+             return out
+         return out + self.bias
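
A minimal usage sketch of the Linear layer added above, assuming the package is installed and an Intel NPU is available for run_matmul to target (layer shapes are illustrative):

import torch
from intel_npu_acceleration_library.nn import Linear

# A plain torch layer to convert (shapes are illustrative)
torch_layer = torch.nn.Linear(in_features=256, out_features=512, bias=True)

# fromTorch returns a Linear for floating-point dtypes and a
# QuantizedLinear for NPUDtype / torch.int8 dtypes.
npu_layer = Linear.fromTorch(torch_layer, dtype=torch.float16)

# Inference path: run_matmul offloads the matmul to the NPU backend.
npu_layer.eval()
with torch.no_grad():
    y = npu_layer(torch.randn(1, 256, dtype=torch.float16))
print(y.shape)  # torch.Size([1, 512])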
@@ -0,0 +1,417 @@
+ #
+ # Copyright © 2024 Intel Corporation
+ # SPDX-License-Identifier: Apache 2.0
+ #
+
+ from transformers.models.llama.modeling_llama import (
+     apply_rotary_pos_emb,
+     repeat_kv,
+     LlamaConfig,
+ )
+ from transformers import AutoTokenizer
+ from intel_npu_acceleration_library.nn import Linear
+ from intel_npu_acceleration_library.backend import run_factory, MLP
+ from functools import partial
+ from typing import Optional, List, Generator
+ from transformers.cache_utils import Cache
+ import torch
+ import uuid
+
+
+ class PhiMLP(torch.nn.Module):
+     """Phi-2 MLP operation NPU backend."""
+
+     def __init__(
+         self,
+         parameters: List[torch.Tensor],
+     ):
+         """Initialize the Phi-2 MLP operation.
+
+         Args:
+             parameters (List[torch.Tensor]): model weights
+         """
+         super().__init__()
+         self.op_parameters = parameters
+         self.op_id = str(uuid.uuid4())
+         intermediate_size, _ = parameters[0].shape
+         self.backend_cls = partial(
+             MLP,
+             intermediate_size=intermediate_size,
+             activation="gelu",
+             bias=True,
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Torch module forward method.
+
+         Args:
+             x (torch.Tensor): Input tensor
+
+         Returns:
+             torch.Tensor: result
+         """
+         return run_factory(x, self.op_parameters, self.backend_cls, self.op_id)
+
+     @staticmethod
+     def fromTorch(
+         layer: torch.nn.Module, dtype: torch.dtype = torch.float16
+     ) -> "PhiMLP":
+         """Generate an NPU PhiMLP layer from a transformers one.
+
+         Args:
+             layer (torch.nn.Module): the original PhiMLP model to run on the NPU
+             dtype (torch.dtype): the desired datatype
+
+         Returns:
+             PhiMLP: An NPU PhiMLP layer
+         """
+         new_layer = PhiMLP(
+             parameters=[weight.to(dtype) for weight in layer.parameters()],
+         )
+
+         return new_layer
+
+
+ class FusedLlamaMLP(torch.nn.Module):
+     """LLAMA MLP operation NPU backend."""
+
+     def __init__(
+         self,
+         parameters: List[torch.Tensor],
+     ):
+         """Initialize the LLAMA MLP operation.
+
+         Args:
+             parameters (List[torch.Tensor]): model weights
+         """
+         super().__init__()
+         self.op_parameters = parameters
+         self.op_id = str(uuid.uuid4())
+         intermediate_size, _ = parameters[0].shape
+         self.backend_cls = partial(MLP, intermediate_size=intermediate_size)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Torch module forward method.
+
+         Args:
+             x (torch.Tensor): Input tensor
+
+         Returns:
+             torch.Tensor: result
+         """
+         return run_factory(x, self.op_parameters, self.backend_cls, self.op_id)
+
+     @staticmethod
+     def fromTorch(
+         layer: torch.nn.Module, dtype: torch.dtype = torch.float16
+     ) -> "FusedLlamaMLP":
+         """Generate an NPU LlamaMLP layer from a transformers LlamaMLP one.
+
+         Args:
+             layer (torch.nn.Module): the original LlamaMLP model to run on the NPU
+             dtype (torch.dtype): the desired datatype
+
+         Returns:
+             FusedLlamaMLP: An NPU LlamaMLP layer
+         """
+         new_layer = FusedLlamaMLP(
+             parameters=[weight.to(dtype) for weight in layer.parameters()],
+         )
+
+         return new_layer
+
+
+ class LlamaAttention(torch.nn.Module):
+     """LlamaAttention operation NPU backend."""
+
+     def __init__(
+         self,
+         config: LlamaConfig,
+         q_weights: torch.Tensor,
+         kv_weights: torch.Tensor,
+         o_proj: torch.Tensor,
+         rotary_emb: torch.nn.Module,
+         dtype: torch.dtype = torch.float16,
+         layer_idx: Optional[int] = None,
+     ):
+         """Initialize the LlamaAttention class.
+
+         Args:
+             config (LlamaConfig): LlamaAttention configuration
+             q_weights (torch.Tensor): Weights for the query Linear layer
+             kv_weights (torch.Tensor): Concatenation of the weights for the key and value Linear layers
+             o_proj (torch.Tensor): Weights for the output projection Linear layer
+             rotary_emb (torch.nn.Module): Rotary embedding module
+             dtype (torch.dtype): the desired datatype
+             layer_idx (Optional[int], optional): Layer index. Defaults to None.
+         """
+         super().__init__()
+         self.config = config
+         self.rotary_emb = rotary_emb
+         self.kv_proj = Linear.fromTensor(kv_weights, None, dtype)
+         self.q_proj = Linear.fromTensor(q_weights, None, dtype)
+         self.o_proj = Linear.fromTensor(o_proj, None, dtype)
+
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.hidden_size // self.num_heads
+         self.num_key_value_heads = config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.is_causal = True
+         self.layer_idx = layer_idx
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+         cache_position: Optional[torch.LongTensor] = None,
+     ):
+         """Torch module forward method.
+
+         Args:
+             hidden_states (torch.Tensor): input to the layer of shape `(batch, seq_len, embed_dim)`
+             attention_mask (Optional[torch.Tensor], optional): attention mask of shape `(batch_size, sequence_length)`. Defaults to None.
+             position_ids (Optional[torch.Tensor], optional): position_ids of shape `(batch_size, sequence_length)`. Defaults to None.
+             past_key_value (Optional[Cache], optional): Pre-computed hidden-states (key and values in the self-attention blocks). Defaults to None.
+             output_attentions (Optional[bool], optional): Whether or not to return the attentions tensors of all attention layers. Defaults to False.
+             use_cache (Optional[bool], optional): If set to `True`, `past_key_values` key value states are returned. Defaults to False.
+             cache_position (Optional[torch.LongTensor], optional): Cache position, useful for static cache applications. Defaults to None.
+
+         Returns:
+             tuple: attention output, attention weights (always None), and the updated past_key_value
+         """
+         bsz, q_len, _ = hidden_states.size()
+
+         query_states = self.q_proj(hidden_states)
+         kv_states = self.kv_proj(hidden_states)
+
+         key_states = kv_states[..., : self.num_key_value_heads * self.head_dim]
+         value_states = kv_states[..., self.num_key_value_heads * self.head_dim :]
+
+         query_states = query_states.view(
+             bsz, q_len, self.num_heads, self.head_dim
+         ).transpose(1, 2)
+         key_states = key_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim
+         ).transpose(1, 2)
+         value_states = value_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim
+         ).transpose(1, 2)
+
+         cos, sin = self.rotary_emb(value_states, position_ids)
+
+         query_states, key_states = apply_rotary_pos_emb(
+             query_states, key_states, cos, sin, position_ids
+         )
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; position_ids needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(
+                 key_states, value_states, self.layer_idx, cache_kwargs
+             )
+
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         causal_mask = attention_mask
+         if causal_mask is not None:
+             causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+         attn_output = torch.nn.functional.scaled_dot_product_attention(
+             query_states,
+             key_states,
+             value_states,
+             attn_mask=causal_mask,
+             is_causal=self.is_causal and attention_mask is None and q_len > 1,
+         )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+         attn_output = self.o_proj(attn_output)
+
+         return attn_output, None, past_key_value
+
+     @staticmethod
+     def fromTorch(
+         layer: torch.nn.Module, dtype: torch.dtype = torch.float16
+     ) -> "LlamaAttention":
+         """Generate an NPU LlamaAttention layer from a transformers LlamaAttention one.
+
+         Args:
+             layer (torch.nn.Module): the original LlamaAttention model to run on the NPU
+             dtype (torch.dtype): the desired datatype
+
+         Returns:
+             LlamaAttention: An NPU LlamaAttention layer
+         """
+         kv_weights = torch.cat((layer.k_proj.weight, layer.v_proj.weight), dim=0)
+
+         new_layer = LlamaAttention(
+             config=layer.config,
+             q_weights=layer.q_proj.weight,
+             kv_weights=kv_weights,
+             o_proj=layer.o_proj.weight,
+             rotary_emb=layer.rotary_emb,
+             dtype=dtype,
+             layer_idx=layer.layer_idx,
+         )
+
+         return new_layer
+
+
+ def lshift_insert(tensor: torch.Tensor, value: float) -> torch.Tensor:
+     """Shift a tensor left along its last dimension and insert a value at the end.
+
+     Args:
+         tensor (torch.Tensor): input tensor
+         value (float): value to insert
+
+     Returns:
+         torch.Tensor: output tensor
+     """
+     tensor = torch.roll(tensor, shifts=-1, dims=-1)
+     tensor[0, -1] = value
+     return tensor
+
+
+ # Generate function
+ @torch.no_grad()
+ def generate_with_static_shape(
+     model: torch.nn.Module,
+     input_ids: torch.Tensor,
+     max_length: int,
+     attention_mask: Optional[torch.Tensor] = None,
+     use_past: Optional[bool] = True,
+     pad_token_id: Optional[int] = None,
+     **kwargs,
+ ) -> Generator[int, None, None]:
+     """Run the LLM generation routine with static shapes.
+
+     Args:
+         model (torch.nn.Module): LLM model
+         input_ids (torch.Tensor): model input_ids
+         max_length (int): maximum sequence length
+         attention_mask (Optional[torch.Tensor], optional): input attention mask. Defaults to None.
+         use_past (Optional[bool], optional): Enable/disable KV caching. Defaults to True.
+         pad_token_id (Optional[int], optional): Padding token. Defaults to None.
+         kwargs: Additional arguments
+
+     Raises:
+         RuntimeError: pad_token_id is not set and needed for static shape generation
+
+     Yields:
+         Generator[int, None, None]: a generator of new tokens
+     """
+     # Get the sequence length
+     batch, seq_length = input_ids.shape
+
+     if pad_token_id is None:
+         raise RuntimeError(
+             "pad_token_id is not set and needed for static shape generation"
+         )
+
+     # Pad the attention mask
+     if attention_mask is None:
+         attention_mask = torch.ones_like(input_ids, dtype=torch.int32).to(model.device)
+     attention_mask_padding = torch.zeros(
+         (batch, max_length - seq_length), dtype=input_ids.dtype, device=input_ids.device
+     )
+     attention_mask = torch.cat((attention_mask_padding, attention_mask), dim=-1)
+
+     # Pad input_ids with left padding
+     padding_input_ids = pad_token_id * torch.ones(
+         (batch, max_length - seq_length), dtype=input_ids.dtype, device=input_ids.device
+     )
+     input_ids = torch.cat((padding_input_ids, input_ids), dim=-1).to(model.device)
+
+     # Set the proper position ids
+     position_ids = kwargs.get("position_ids", None)
+     if position_ids is None:
+         position_ids = torch.tensor(
+             [[0] * (max_length - seq_length) + list(range(seq_length))],
+             dtype=torch.int32,
+         ).to(model.device)
+     else:
+         raise RuntimeError("Cannot set position_ids in static shape generation")
+
+     # past_key_values for the KV cache
+     past_key_values = None
+
+     for idx in range(seq_length, max_length):
+
+         # Run the inference
+         out = model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+         )
+
+         # Greedy search is used here as an example; in general, this is where you select the next token with the decoding algorithm of your choice
+         logits = out.logits
+         new_token = torch.argmax(logits[0, -1, :]).item()
+
+         yield int(new_token)
+
+         if not use_past:
+             # Shift input and position ids left and insert the new token and idx at the end
+             input_ids = lshift_insert(input_ids, new_token)
+             position_ids = lshift_insert(position_ids, idx)
+         else:
+             # Set input_ids and position_ids to their new values
+             input_ids = torch.tensor([[new_token]], dtype=input_ids.dtype).to(
+                 model.device
+             )
+             position_ids = torch.tensor([[idx]], dtype=input_ids.dtype).to(model.device)
+
+             # Select the proper cached KV entries for the next inference
+             past_key_values = [
+                 [item[:, :, 1:, :] for item in layer_past]
+                 for layer_past in out.past_key_values
+             ]
+
+         # Shift the attention mask left and set the last value to one
+         attention_mask = lshift_insert(attention_mask, 1)
+
+
+ def warm_up_decoder_model(
+     tokenizer: AutoTokenizer,
+     model: torch.nn.Module,
+     model_seq_length: int,
+     use_past: Optional[bool] = True,
+ ):
+     """Warm up the model on the NPU.
+
+     This function JIT-compiles all the layers offloaded to the NPU, then loads and warms them up on the NPU. This is particularly useful for LLM decoders.
+
+     Args:
+         tokenizer (AutoTokenizer): a tokenizer
+         model (torch.nn.Module): a torch Module representing a language model decoder
+         model_seq_length (int): max sequence length for the tokenizer padding
+         use_past (Optional[bool], optional): Enable or disable KV caching. Defaults to True.
+     """
+     input_ids = tokenizer(tokenizer.eos_token, return_tensors="pt")["input_ids"].to(
+         "cpu"
+     )
+
+     results = generate_with_static_shape(
+         model,
+         input_ids=input_ids,
+         max_length=model_seq_length,
+         use_past=use_past,
+         pad_token_id=tokenizer.pad_token_id,
+     )
+     idx = 0
+     # Run only two inference steps
+     for _ in results:
+         if idx < 1:
+             idx += 1
+         else:
+             break
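
A minimal sketch of driving the static-shape generator above, assuming a Hugging Face causal LM and tokenizer that load locally (the model id is a placeholder, and compiling the model for the NPU is a separate step not shown here):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_npu_acceleration_library.nn.llm import generate_with_static_shape

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Static-shape generation requires a pad token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

input_ids = tokenizer("The NPU is", return_tensors="pt")["input_ids"]
tokens = []
for token in generate_with_static_shape(
    model,
    input_ids=input_ids,
    max_length=64,
    pad_token_id=tokenizer.pad_token_id,
    use_past=True,
):
    if token == tokenizer.eos_token_id:
        break
    tokens.append(token)
print(tokenizer.decode(tokens))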