xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. xmos_ai_tools/__init__.py +7 -0
  2. xmos_ai_tools/io_server/__init__.py +151 -0
  3. xmos_ai_tools/runtime/__init__.py +0 -0
  4. xmos_ai_tools/runtime/buildfiles/aitoolslib.cmake +13 -0
  5. xmos_ai_tools/runtime/buildfiles/aitoolslib.make +8 -0
  6. xmos_ai_tools/runtime/include/flash_server.h +74 -0
  7. xmos_ai_tools/runtime/include/flatbuffers/allocator.h +68 -0
  8. xmos_ai_tools/runtime/include/flatbuffers/array.h +243 -0
  9. xmos_ai_tools/runtime/include/flatbuffers/base.h +474 -0
  10. xmos_ai_tools/runtime/include/flatbuffers/bfbs_generator.h +43 -0
  11. xmos_ai_tools/runtime/include/flatbuffers/buffer.h +142 -0
  12. xmos_ai_tools/runtime/include/flatbuffers/buffer_ref.h +53 -0
  13. xmos_ai_tools/runtime/include/flatbuffers/code_generators.h +235 -0
  14. xmos_ai_tools/runtime/include/flatbuffers/default_allocator.h +64 -0
  15. xmos_ai_tools/runtime/include/flatbuffers/detached_buffer.h +114 -0
  16. xmos_ai_tools/runtime/include/flatbuffers/flatbuffer_builder.h +1197 -0
  17. xmos_ai_tools/runtime/include/flatbuffers/flatbuffers.h +270 -0
  18. xmos_ai_tools/runtime/include/flatbuffers/flatc.h +111 -0
  19. xmos_ai_tools/runtime/include/flatbuffers/flexbuffers.h +1897 -0
  20. xmos_ai_tools/runtime/include/flatbuffers/grpc.h +300 -0
  21. xmos_ai_tools/runtime/include/flatbuffers/hash.h +127 -0
  22. xmos_ai_tools/runtime/include/flatbuffers/idl.h +1232 -0
  23. xmos_ai_tools/runtime/include/flatbuffers/minireflect.h +419 -0
  24. xmos_ai_tools/runtime/include/flatbuffers/pch/flatc_pch.h +39 -0
  25. xmos_ai_tools/runtime/include/flatbuffers/pch/pch.h +38 -0
  26. xmos_ai_tools/runtime/include/flatbuffers/reflection.h +502 -0
  27. xmos_ai_tools/runtime/include/flatbuffers/reflection_generated.h +1449 -0
  28. xmos_ai_tools/runtime/include/flatbuffers/registry.h +128 -0
  29. xmos_ai_tools/runtime/include/flatbuffers/stl_emulation.h +509 -0
  30. xmos_ai_tools/runtime/include/flatbuffers/string.h +64 -0
  31. xmos_ai_tools/runtime/include/flatbuffers/struct.h +53 -0
  32. xmos_ai_tools/runtime/include/flatbuffers/table.h +168 -0
  33. xmos_ai_tools/runtime/include/flatbuffers/util.h +690 -0
  34. xmos_ai_tools/runtime/include/flatbuffers/vector.h +370 -0
  35. xmos_ai_tools/runtime/include/flatbuffers/vector_downward.h +271 -0
  36. xmos_ai_tools/runtime/include/flatbuffers/verifier.h +283 -0
  37. xmos_ai_tools/runtime/include/ioserver.h +44 -0
  38. xmos_ai_tools/runtime/include/lib_nn/api/TransposeConv.h +24 -0
  39. xmos_ai_tools/runtime/include/lib_nn/api/add_int16.h +27 -0
  40. xmos_ai_tools/runtime/include/lib_nn/api/add_int16_transform.h +42 -0
  41. xmos_ai_tools/runtime/include/lib_nn/api/dequantize_int16.h +22 -0
  42. xmos_ai_tools/runtime/include/lib_nn/api/dequantize_int16_transform.h +34 -0
  43. xmos_ai_tools/runtime/include/lib_nn/api/expand_8_to_16.h +8 -0
  44. xmos_ai_tools/runtime/include/lib_nn/api/multiply_int16.h +42 -0
  45. xmos_ai_tools/runtime/include/lib_nn/api/multiply_int16_transform.h +71 -0
  46. xmos_ai_tools/runtime/include/lib_nn/api/nn_api.h +15 -0
  47. xmos_ai_tools/runtime/include/lib_nn/api/nn_bin_types.h +14 -0
  48. xmos_ai_tools/runtime/include/lib_nn/api/nn_config.h +287 -0
  49. xmos_ai_tools/runtime/include/lib_nn/api/nn_conv2d_structs.h +72 -0
  50. xmos_ai_tools/runtime/include/lib_nn/api/nn_image.h +26 -0
  51. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +303 -0
  52. xmos_ai_tools/runtime/include/lib_nn/api/nn_op_helper.h +132 -0
  53. xmos_ai_tools/runtime/include/lib_nn/api/nn_op_utils.h +150 -0
  54. xmos_ai_tools/runtime/include/lib_nn/api/nn_operator.h +18 -0
  55. xmos_ai_tools/runtime/include/lib_nn/api/nn_pooling.h +551 -0
  56. xmos_ai_tools/runtime/include/lib_nn/api/nn_types.h +83 -0
  57. xmos_ai_tools/runtime/include/lib_nn/api/nn_window_params.h +55 -0
  58. xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16.h +54 -0
  59. xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16_kernel_transform.h +37 -0
  60. xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16_mappings.h +13 -0
  61. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +82 -0
  62. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  63. xmos_ai_tools/runtime/include/lib_nn/api/quantize_int16.h +22 -0
  64. xmos_ai_tools/runtime/include/lib_nn/api/quantize_int16_transform.h +33 -0
  65. xmos_ai_tools/runtime/include/lib_nn/api/version.h +13 -0
  66. xmos_ai_tools/runtime/include/lib_nn/api/vpu_memmove_word_aligned.h +15 -0
  67. xmos_ai_tools/runtime/include/lib_nn/api/vpu_memset_256.h +55 -0
  68. xmos_ai_tools/runtime/include/lib_nn/api/vpu_sim.h +118 -0
  69. xmos_ai_tools/runtime/include/lib_nn/api/xs3_vpu.h +216 -0
  70. xmos_ai_tools/runtime/include/lib_nn/api/xs3a_registers.h +2869 -0
  71. xmos_ai_tools/runtime/include/lib_nn/src/asm/asm_constants.h +41 -0
  72. xmos_ai_tools/runtime/include/lib_nn/src/asm/window_op_plan.h +25 -0
  73. xmos_ai_tools/runtime/include/lib_tflite_micro/api/fast_flash.h +47 -0
  74. xmos_ai_tools/runtime/include/lib_tflite_micro/api/inference_engine.h +218 -0
  75. xmos_ai_tools/runtime/include/lib_tflite_micro/api/memory_parallel_transport.h +52 -0
  76. xmos_ai_tools/runtime/include/lib_tflite_micro/api/version.h +13 -0
  77. xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_config.h +17 -0
  78. xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_device_memory.h +62 -0
  79. xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_shared_config.h +31 -0
  80. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h +155 -0
  81. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_common.h +19 -0
  82. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_custom_options.h +28 -0
  83. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_error_reporter.h +32 -0
  84. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_interpreter.h +49 -0
  85. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +71 -0
  86. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_profiler.h +49 -0
  87. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_utils.h +160 -0
  88. xmos_ai_tools/runtime/include/lib_tflite_micro/src/thread_call.h +119 -0
  89. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/legacy/usb_defs.h +4 -0
  90. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/legacy/usb_device.h +4 -0
  91. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/legacy/usb_std_descriptors.h +4 -0
  92. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/legacy/usb_std_requests.h +4 -0
  93. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/xud.h +518 -0
  94. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/xud_conf_default.h +11 -0
  95. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/xud_device.h +87 -0
  96. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/xud_std_descriptors.h +191 -0
  97. xmos_ai_tools/runtime/include/lib_xud/lib_xud/api/xud_std_requests.h +120 -0
  98. xmos_ai_tools/runtime/include/lib_xud/lib_xud/src/user/XUD_USB_Defines.h +70 -0
  99. xmos_ai_tools/runtime/include/lib_xud/lib_xud/src/user/class/hid.h +23 -0
  100. xmos_ai_tools/runtime/include/lib_xud/lib_xud/src/user/class/usbaudio10.h +30 -0
  101. xmos_ai_tools/runtime/include/lib_xud/lib_xud/src/user/class/usbaudio20.h +357 -0
  102. xmos_ai_tools/runtime/include/lib_xud/lib_xud/src/user/class/usbaudiocommon.h +168 -0
  103. xmos_ai_tools/runtime/include/signal/micro/kernels/delay_flexbuffers_generated_data.h +25 -0
  104. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  105. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  106. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  107. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  108. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  109. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  110. xmos_ai_tools/runtime/include/signal/micro/kernels/irfft.h +31 -0
  111. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  112. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  113. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  114. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  115. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  116. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  117. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  118. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  119. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  120. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  121. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  122. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  123. xmos_ai_tools/runtime/include/signal/src/filter_bank_square_root.h +34 -0
  124. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  125. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  126. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  127. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  128. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  129. xmos_ai_tools/runtime/include/signal/src/log.h +30 -0
  130. xmos_ai_tools/runtime/include/signal/src/max_abs.h +31 -0
  131. xmos_ai_tools/runtime/include/signal/src/msb.h +32 -0
  132. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  133. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  134. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  135. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  136. xmos_ai_tools/runtime/include/signal/src/window.h +31 -0
  137. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  138. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  139. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_op_data.h +22 -0
  140. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +241 -0
  141. xmos_ai_tools/runtime/include/tensorflow/lite/c/builtin_op_data.h +20 -0
  142. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +26 -0
  143. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +30 -0
  144. xmos_ai_tools/runtime/include/tensorflow/lite/context_util.h +54 -0
  145. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +72 -0
  146. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +440 -0
  147. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/tensor_utils.h +28 -0
  148. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +626 -0
  149. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +178 -0
  150. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +1496 -0
  151. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  152. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/bits.h +102 -0
  153. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/fft.h +50 -0
  154. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/fft_io.h +34 -0
  155. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/fft_util.h +34 -0
  156. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/filterbank.h +63 -0
  157. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h +35 -0
  158. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h +50 -0
  159. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/frontend.h +64 -0
  160. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/frontend_io.h +31 -0
  161. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h +52 -0
  162. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h +48 -0
  163. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h +33 -0
  164. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/log_lut.h +40 -0
  165. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/log_scale.h +39 -0
  166. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h +33 -0
  167. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h +45 -0
  168. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h +46 -0
  169. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h +36 -0
  170. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h +50 -0
  171. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h +47 -0
  172. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h +57 -0
  173. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/window.h +49 -0
  174. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/window_io.h +34 -0
  175. xmos_ai_tools/runtime/include/tensorflow/lite/experimental/microfrontend/lib/window_util.h +45 -0
  176. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +1358 -0
  177. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/compatibility.h +122 -0
  178. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +40 -0
  179. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/max.h +35 -0
  180. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/min.h +35 -0
  181. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/optimized/neon_check.h +20 -0
  182. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +141 -0
  183. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor_utils.h +623 -0
  184. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/quantization_util.h +292 -0
  185. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +561 -0
  186. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add_n.h +86 -0
  187. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/arg_min_max.h +88 -0
  188. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/batch_matmul.h +275 -0
  189. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h +101 -0
  190. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/binary_function.h +91 -0
  191. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/broadcast_args.h +56 -0
  192. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/broadcast_to.h +97 -0
  193. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/ceil.h +37 -0
  194. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +271 -0
  195. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/concatenation.h +141 -0
  196. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +289 -0
  197. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/cumsum.h +175 -0
  198. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/depth_to_space.h +79 -0
  199. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h +100 -0
  200. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h +319 -0
  201. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/dequantize.h +78 -0
  202. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/div.h +247 -0
  203. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/elu.h +37 -0
  204. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/exp.h +38 -0
  205. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/fill.h +38 -0
  206. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/floor.h +39 -0
  207. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/floor_div.h +35 -0
  208. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/floor_mod.h +44 -0
  209. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/fully_connected.h +323 -0
  210. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/hard_swish.h +168 -0
  211. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +250 -0
  212. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +241 -0
  213. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +291 -0
  214. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h +126 -0
  215. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h +67 -0
  216. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h +121 -0
  217. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +18 -0
  218. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +194 -0
  219. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h +264 -0
  220. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +117 -0
  221. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h +224 -0
  222. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/l2normalization.h +90 -0
  223. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/leaky_relu.h +69 -0
  224. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/log_softmax.h +256 -0
  225. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/logistic.h +132 -0
  226. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/lstm_cell.h +422 -0
  227. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/maximum_minimum.h +64 -0
  228. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +267 -0
  229. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/neg.h +37 -0
  230. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/pad.h +169 -0
  231. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/pooling.h +303 -0
  232. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +333 -0
  233. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +244 -0
  234. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/prelu.h +111 -0
  235. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h +140 -0
  236. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/quantize.h +89 -0
  237. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +491 -0
  238. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/requantize.h +70 -0
  239. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +233 -0
  240. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h +102 -0
  241. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/round.h +51 -0
  242. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/select.h +151 -0
  243. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/slice.h +80 -0
  244. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/softmax.h +233 -0
  245. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h +109 -0
  246. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/space_to_depth.h +80 -0
  247. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/strided_slice.h +147 -0
  248. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +465 -0
  249. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/tanh.h +129 -0
  250. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/transpose.h +203 -0
  251. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/transpose_conv.h +225 -0
  252. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +168 -0
  253. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +278 -0
  254. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +42 -0
  255. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +1096 -0
  256. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +341 -0
  257. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +49 -0
  258. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/padding.h +115 -0
  259. xmos_ai_tools/runtime/include/tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h +100 -0
  260. xmos_ai_tools/runtime/include/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.h +104 -0
  261. xmos_ai_tools/runtime/include/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.h +58 -0
  262. xmos_ai_tools/runtime/include/tensorflow/lite/micro/arena_allocator/recording_single_arena_buffer_allocator.h +63 -0
  263. xmos_ai_tools/runtime/include/tensorflow/lite/micro/arena_allocator/single_arena_buffer_allocator.h +144 -0
  264. xmos_ai_tools/runtime/include/tensorflow/lite/micro/benchmarks/micro_benchmark.h +95 -0
  265. xmos_ai_tools/runtime/include/tensorflow/lite/micro/compatibility.h +32 -0
  266. xmos_ai_tools/runtime/include/tensorflow/lite/micro/cortex_m_generic/debug_log_callback.h +49 -0
  267. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +38 -0
  268. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  269. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/network_tester/expected_output_data.h +47 -0
  270. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/network_tester/input_data.h +108 -0
  271. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/network_tester/network_model.h +166 -0
  272. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/person_detection/detection_responder.h +32 -0
  273. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/person_detection/image_provider.h +38 -0
  274. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/person_detection/main_functions.h +37 -0
  275. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/person_detection/model_settings.h +35 -0
  276. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +70 -0
  277. xmos_ai_tools/runtime/include/tensorflow/lite/micro/flatbuffer_utils.h +65 -0
  278. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/activation_utils.h +57 -0
  279. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/activations.h +64 -0
  280. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +78 -0
  281. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/mli_function_specializations.h +141 -0
  282. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/mli_interface.h +75 -0
  283. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h +56 -0
  284. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h +310 -0
  285. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h +145 -0
  286. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h +78 -0
  287. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ceva/ceva_common.h +24 -0
  288. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ceva/ceva_tflm_lib.h +613 -0
  289. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ceva/mcps_macros.h +115 -0
  290. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ceva/types.h +1286 -0
  291. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +45 -0
  292. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer_flexbuffers_generated_data.h +22 -0
  293. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +117 -0
  294. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +94 -0
  295. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +80 -0
  296. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/dequantize.h +38 -0
  297. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/detection_postprocess_flexbuffers_generated_data.h +25 -0
  298. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +28 -0
  299. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +112 -0
  300. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/hard_swish.h +30 -0
  301. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +86 -0
  302. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +150 -0
  303. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/leaky_relu.h +43 -0
  304. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/logical.h +35 -0
  305. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/logistic.h +42 -0
  306. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/lstm_eval.h +541 -0
  307. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/lstm_eval_test.h +817 -0
  308. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/lstm_shared.h +150 -0
  309. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +158 -0
  310. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_tensor_utils.h +56 -0
  311. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +74 -0
  312. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pad.h +27 -0
  313. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +142 -0
  314. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/prelu.h +39 -0
  315. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/quantize.h +37 -0
  316. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +65 -0
  317. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  318. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +67 -0
  319. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  320. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/sub.h +60 -0
  321. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +100 -0
  322. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/testdata/conv_test_data.h +37 -0
  323. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/testdata/lstm_test_data.h +579 -0
  324. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +47 -0
  325. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h +139 -0
  326. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/lstm_eval.h +216 -0
  327. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/lstm_shared.h +78 -0
  328. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +38 -0
  329. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_add.h +48 -0
  330. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +89 -0
  331. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +74 -0
  332. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_fully_connected.h +78 -0
  333. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_pad.h +49 -0
  334. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_pooling.h +76 -0
  335. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reduce.h +47 -0
  336. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +44 -0
  337. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +58 -0
  338. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_svdf.h +39 -0
  339. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_helpers.h +64 -0
  340. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +170 -0
  341. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +53 -0
  342. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/memory_plan_struct.h +73 -0
  343. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +95 -0
  344. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +133 -0
  345. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocation_info.h +138 -0
  346. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +351 -0
  347. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_arena_constants.h +28 -0
  348. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  349. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +176 -0
  350. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +79 -0
  351. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +189 -0
  352. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  353. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  354. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +42 -0
  355. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +708 -0
  356. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +62 -0
  357. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +140 -0
  358. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler_interface.h +38 -0
  359. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_resource_variable.h +89 -0
  360. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_time.h +36 -0
  361. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_utils.h +162 -0
  362. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +60 -0
  363. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  364. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/tflite_size/src/flatbuffer_size.h +30 -0
  365. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/tflite_size/src/flatbuffer_size_wrapper.h +33 -0
  366. xmos_ai_tools/runtime/include/tensorflow/lite/micro/recording_micro_allocator.h +125 -0
  367. xmos_ai_tools/runtime/include/tensorflow/lite/micro/recording_micro_interpreter.h +69 -0
  368. xmos_ai_tools/runtime/include/tensorflow/lite/micro/system_setup.h +27 -0
  369. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +49 -0
  370. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +334 -0
  371. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +267 -0
  372. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/test_conv_model.h +23 -0
  373. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h +45 -0
  374. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tflite_bridge/micro_error_reporter.h +36 -0
  375. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  376. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  377. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  378. xmos_ai_tools/runtime/include/tensorflow/lite/portable_type_to_tflitetype.h +75 -0
  379. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +24644 -0
  380. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_utils.h +33 -0
  381. xmos_ai_tools/runtime/include/tile_ram_server.h +38 -0
  382. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  383. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  384. xmos_ai_tools/xformer/__init__.py +60 -0
  385. xmos_ai_tools/xformer/flash.py +190 -0
  386. xmos_ai_tools/xinterpreters/__init__.py +1 -0
  387. xmos_ai_tools/xinterpreters/exceptions.py +38 -0
  388. xmos_ai_tools/xinterpreters/host_interpreter.py +652 -0
  389. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  390. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  391. xmos_ai_tools-1.3.2.dev80.data/data/bin/xcore-opt +0 -0
  392. xmos_ai_tools-1.3.2.dev80.dist-info/METADATA +33 -0
  393. xmos_ai_tools-1.3.2.dev80.dist-info/RECORD +395 -0
  394. xmos_ai_tools-1.3.2.dev80.dist-info/WHEEL +5 -0
  395. xmos_ai_tools-1.3.2.dev80.dist-info/top_level.txt +1 -0
@@ -0,0 +1,623 @@
1
+ /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
17
+ #define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
18
+
19
+ #include <algorithm>
20
+ #include <cmath>
21
+ #include <cstdint>
22
+
23
+ #include "tensorflow/lite/core/c/builtin_op_data.h"
24
+ #include "tensorflow/lite/core/c/common.h"
25
+
26
+ #if defined(_MSC_VER)
27
+ #define __restrict__ __restrict
28
+ #endif
29
+
30
+ namespace tflite_micro {
31
+
32
+ // Not all backends support CpuBackendContext usage, so forward declare to avoid
33
+ // pulling in its implementation. Use of CpuBackendContext in method
34
+ // implementations is purely optional.
35
+ class CpuBackendContext;
36
+
37
+ namespace tensor_utils {
38
+
39
+ // Multiplies a matrix with a scalar and reduce the result on each row to a
40
+ // scalar.
41
+ // Parameters:
42
+ // - matrix: matrix of size n_row * n_col
43
+ // - scalar: the scalar that is multiplied to each element in the matrix
44
+ // - n_row: the row count of the matrix
45
+ // - n_col: the column count of the matrix
46
+ // - output: the 32bit output
47
+ // Note: We do not need saturation because the int8 * int8 is safe from overflow
48
+ // in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
49
+ // initial output value is not exceptionally large.
50
+ void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
51
+ int32_t n_row, int32_t n_col,
52
+ int32_t* output);
53
+
54
+ // Add another vector for each batch in the batch vector.
55
+ template <typename T>
56
+ void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
57
+ T* batch_vector) {
58
+ for (int b = 0; b < n_batch; b++) {
59
+ for (int i = 0; i < v_size; ++i) {
60
+ batch_vector[i] += vector[i];
61
+ }
62
+ batch_vector += v_size;
63
+ }
64
+ }
65
+
66
+ // Cwise product of two vectors.
67
+ template <typename T>
68
+ inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2,
69
+ int v_size, T* result) {
70
+ for (int v = 0; v < v_size; v++) {
71
+ *result++ = *vector1++ * *vector2++;
72
+ }
73
+ }
74
+
75
+ // Cwise product of a vector and a batch-vector.
76
+ template <typename T>
77
+ inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
78
+ const T* batch_vector, int n_batch,
79
+ T* result) {
80
+ for (int b = 0; b < n_batch; b++) {
81
+ VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
82
+ // Update the pointers.
83
+ result += v_size;
84
+ batch_vector += v_size;
85
+ }
86
+ }
87
+
88
+ // Cwise product and accumulate of two vectors. Since it's a MAC operation, the
89
+ // assumption here is that result array is initialized to valid values.
90
+ template <typename T>
91
+ inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
92
+ const T* __restrict__ vector2,
93
+ int v_size,
94
+ T* __restrict__ result) {
95
+ for (int v = 0; v < v_size; v++) {
96
+ *result++ += *vector1++ * *vector2++;
97
+ }
98
+ }
99
+
100
+ // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
101
+ // operation, the assumption here is that result array is initialized to valid
102
+ // values.
103
+ template <typename T>
104
+ inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
105
+ const T* batch_vector,
106
+ int n_batch, T* result) {
107
+ for (int b = 0; b < n_batch; b++) {
108
+ VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
109
+ // Update the pointers.
110
+ result += v_size;
111
+ batch_vector += v_size;
112
+ }
113
+ }
114
+
115
+ // Batch vector initialization with another vector.
116
+ template <typename T>
117
+ void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
118
+ T* batch_vector) {
119
+ for (int b = 0; b < n_batch; b++) {
120
+ std::copy_n(vector, v_size, batch_vector + b * v_size);
121
+ }
122
+ }
123
+
124
+ // Checks if all entries of vector are zero for float.
125
+ bool IsZeroVector(const float* vector, int v_size);
126
+
127
+ // Checks if all entries of vector are zero for int8.
128
+ bool IsZeroVector(const int8_t* vector, int v_size);
129
+
130
+ // Quantizes a buffer of floating point values using a symmetric quantization
131
+ // (i.e. linear quantization without an offset) to 8-bit signed integers.
132
+ // It also outputs the range (min, max) of the floating point buffer, and the
133
+ // scaling factor used to quantize the values.
134
+ void SymmetricQuantizeFloats(const float* values, const int size,
135
+ int8_t* quantized_values, float* min_value,
136
+ float* max_value, float* scaling_factor);
137
+
138
+ // Quantizes a buffer of floating point values using a symmetric quantization
139
+ // (i.e. linear quantization without an offset) to 8-bit signed integers.
140
+ // It uses the range (min, max) provided to the function to calculate the
141
+ // appropriate scaling factor to quantize the values.
142
+ void SymmetricQuantizeFloats(const float* values, const int size,
143
+ int8_t* quantized_values, float min_value,
144
+ float max_value, float* scaling_factor);
145
+
146
+ void AsymmetricQuantizeFloats(const float* values, const int size,
147
+ int8_t* quantized_values, float* scaling_factor,
148
+ int32_t* offset);
149
+
150
+ // Helper function to quantize floats.
151
+ // float_data_ptr input float vectors
152
+ // n_batch number of input vectors
153
+ // n_data size of a single input vector
154
+ // quantized_data_ptr (out) vector with quantized data
155
+ // scaling_factors (out) scaling factors (one per vector)
156
+ // zero_points (out) zero points (one per vector)
157
+ // do_asymmetric controls if the quantization should be asymmetric.
158
+ inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
159
+ int n_data, int8_t* quantized_data_ptr,
160
+ float* scaling_factors, int32_t* zero_points,
161
+ bool do_asymmetric) {
162
+ for (int b = 0; b < n_batch; ++b) {
163
+ const int offset = b * n_data;
164
+ if (do_asymmetric) {
165
+ tensor_utils::AsymmetricQuantizeFloats(
166
+ float_data_ptr + offset, n_data, quantized_data_ptr + offset,
167
+ &scaling_factors[b], &zero_points[b]);
168
+ } else {
169
+ float unused_min, unused_max;
170
+ tensor_utils::SymmetricQuantizeFloats(
171
+ float_data_ptr + offset, n_data, quantized_data_ptr + offset,
172
+ &unused_min, &unused_max, &scaling_factors[b]);
173
+ }
174
+ }
175
+ }
176
+
177
+ // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
178
+ // dimension composed by input vectors independent from each other). The result
179
+ // of the multiplication is accumulated to the passed result buffer.
180
+ // More specifically, for a matrix M of shape [n, i] and a batched-vector
181
+ // of shape [i, batch] it will first compute the product of shape [n, batch].
182
+ // This product will be accumulated to the result buffer.
183
+ void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
184
+ int m_cols, const float* vector,
185
+ int n_batch, float* result);
186
+
187
+ // Same as the function above, but the matrix is a sparse tensor with block
188
+ // pattern 1x4.
189
+ // This function assumes that m_cols is a multiple of the block size (4 in this
190
+ // case) so that there's no incomplete block.
191
+ void SparseMatrixBatchVectorMultiplyAccumulate1x4(
192
+ const float* __restrict__ matrix, const int32_t* __restrict__ segments,
193
+ const int32_t* __restrict__ indices, int m_rows, int m_cols,
194
+ const float* __restrict__ vector, int n_batch, float* __restrict__ result);
195
+
196
+ // Same as the function above, but the matrix is stored in block compressed
197
+ // sparse row format with block pattern 1x16 which consists of two arrays:
198
+ // 1. A matrix array stores non-zero blocks of the matrix in row major.
199
+ // 2. A ledger array stores nrows groups, one group per row. Each group starts
200
+ // with an integer representing the number of non-zero blocks for the
201
+ // corresponding row and follows with column indexes of the first element
202
+ // of each non-zero block.
203
+ // This function assumes that
204
+ // 1. m_cols is a multiple of 16 so that all blocks are full blocks.
205
+ // 2. m_cols < 254 * 16 so that block index can be represented by uint8.
206
+ void SparseMatrixBatchVectorMultiplyAccumulate(
207
+ const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
208
+ int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
209
+ float* __restrict__ result);
210
+
211
+ // Same as the function above, but for values quantized using symmetric
212
+ // quantization (e.g. by calling SymmetricQuantizeFloats).
213
+ // The passed scaling factors is a buffer of the quantization scaling factors
214
+ // that will be used to dequentize the products into the final result buffer.
215
+ // These scaling factors are the multiplication of the matrix scaling factor
216
+ // by the vector's scaling factor, one per batch (i.e. this allows quantizing
217
+ // each batch in the batch-vector matrix independently).
218
+ void MatrixBatchVectorMultiplyAccumulate(
219
+ const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
220
+ const int8_t* __restrict__ vectors,
221
+ const float* __restrict__ scaling_factors, int n_batch,
222
+ float* __restrict__ result);
223
+
224
+ // Same as the function above except that vector values
225
+ // are quantized with asymmetric quantization per-batch and the matrix
226
+ // is quantized per row.
227
+ void MatrixBatchVectorMultiplyAccumulate(
228
+ const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
229
+ const int8_t* __restrict__ vectors,
230
+ const float* __restrict__ scaling_factors, int n_batch,
231
+ float* __restrict__ result, const float* __restrict__ per_channel_scale,
232
+ const int32_t* __restrict__ input_offset);
233
+
234
+ // Same as the function above, but the matrix is a sparse tensor with block
235
+ // pattern 1x16.
236
+ // This function assumes that m_cols is a multiple of the block size (16 in this
237
+ // case) so that there's no incomplete block. Also, it assumes all offsets of
238
+ // input, output and filter are zero.
239
+ void SparseMatrixBatchVectorMultiplyAccumulate1x16(
240
+ const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
241
+ const int32_t* __restrict__ indices, int m_rows, int m_cols,
242
+ const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
243
+ int n_batch, const int32_t input_offset, const int32_t output_multiplier,
244
+ const int32_t output_shift, const int32_t output_offset,
245
+ const int32_t output_activation_min, const int32_t output_activation_max,
246
+ int8_t* __restrict__ result);
247
+
248
+ // Same as the function above, but the matrix is stored in block compressed
249
+ // sparse row format with block pattern 1x16 which consists of two arrays:
250
+ // 1. A matrix array stores non-zero blocks of the matrix in row major.
251
+ // 2. A ledger array stores nrows groups, one group per row. Each group starts
252
+ // with an integer representing the number of non-zero blocks for the
253
+ // corresponding row followed by column index of the first element of
254
+ // each non-zero block.
255
+ // This function assumes that
256
+ // 1. m_cols is a multiple of 16 so that all blocks are full blocks.
257
+ // 2. m_cols < 254 * 16 so that block index can be represented by uint8.
258
+ void SparseMatrixBatchVectorMultiplyAccumulate(
259
+ const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
260
+ const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
261
+ const float* __restrict__ scaling_factors, int n_batch,
262
+ float* __restrict__ result);
263
+
264
+ // Same as the above 8, 8, 8 integer matmul except for the presence of zero
265
+ // point and non-accumulative.
266
+ // TODO(b/148688698): remove this function by folding zero point calculation in
267
+ // prepare() function.
268
+ void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
269
+ const int8_t* input_to_gate_weights,
270
+ int32_t input_to_gate_effective_scale_a,
271
+ int32_t input_to_gate_effective_scale_b,
272
+ int32_t n_batch, int32_t n_input, int32_t n_cell,
273
+ int8_t* gate_output, int8_t gate_output_zp);
274
+
275
+ // Same as above but has 16 bit and 8 bit input and 8 bit output.
276
+ // Used in projection when hidden is 16bit.
277
+ void MatrixBatchVectorMultiply(const int16_t* hidden,
278
+ const int8_t* hidden_to_output_weights,
279
+ int32_t proj_effective_scale_a,
280
+ int32_t proj_effective_scale_b,
281
+ const int32_t* gate_bias, int32_t n_batch,
282
+ int32_t n_hidden, int32_t n_output,
283
+ int32_t output_zp, int8_t* proj_output);
284
+
285
+ // Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized
286
+ // vector.
287
+ // Parameters:
288
+ // - input: batch vector of size n_batch * n_input; 16 bit.
289
+ // - layer_norm_weights: the quantized layer normalization weights.
290
+ // - bias: the bias for the layer normalization.
291
+ // - layer_norm_scale_a: multiplier for scale factor.
292
+ // - layer_norm_scale_b: shift for scale factor.
293
+ // - variance_limit: the guard to make sure the inverse does not overflow.
294
+ // - n_batch: the number of batches.
295
+ // - n_input: the size for input and output.
296
+ // - output: the 16 bit output
297
+ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
298
+ const int32_t* bias, int32_t layer_norm_scale_a,
299
+ int32_t layer_norm_scale_b, int32_t variance_limit,
300
+ int n_batch, int n_input, int16_t* output);
301
+
302
+ // Same as above but the internal calculation is done in float.
303
+ void ApplyLayerNormFloat(const int16_t* input,
304
+ const int16_t* layer_norm_weights,
305
+ int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
306
+ const int32_t* bias, int n_batch, int n_input,
307
+ int16_t* output);
308
+
309
+ // Apply Sigmoid to a quantized vector.
310
+ // Parameters:
311
+ // - input: batch vector of size n_batch * n_input; 16 bit.
312
+ // - n_batch: the number of batches.
313
+ // - n_input: the size for input and output.
314
+ // - output: the 16 bit output
315
+ // The input is in Q3.12 format and the output is in Q0.15 format.
316
+ void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
317
+ int16_t* output);
318
+
319
+ // Same as above but the internal calcualtion is float.
320
+ void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
321
+ int16_t* output);
322
+
323
+ // Apply Tanh to a quantized vector.
324
+ // Parameters:
325
+ // - integer_bits: the integer bits of the input.
326
+ // Currently supports 0, 1, 2, 3, 4, 5, 6.
327
+ // - input: batch vector of size n_batch * n_input; 16 bit.
328
+ // - n_batch: the number of batches.
329
+ // - n_input: the size for input and output.
330
+ // - output: the 16 bit output
331
+ // The input is in Qm.15-m format and the output is in Q0.15 format.
332
+ void ApplyTanh(int32_t intger_bits, const int16_t* input, int32_t n_batch,
333
+ int32_t n_input, int16_t* output);
334
+
335
+ // Apply Tanh to a quantized vector. Tbe internal calculation is in float.
336
+ // - Input has 2^(integer_bits) as scale.
337
+ // - Output has Q0.15 as scale.
338
+ void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
339
+ int32_t integer_bits, int16_t* output);
340
+
341
+ // Element-wise multiplication of two quantized vectors.
342
+ // Parameters:
343
+ // - input_1: batch vector of size n_batch * n_input; 16 bit.
344
+ // - input_2: batch vector of size n_batch * n_input; 16 bit.
345
+ // - n_batch: the number of batches.
346
+ // - n_input: the size for input and output.
347
+ // - shift: the shift needed to produce the output.
348
+ // - output: the 16 bit output of size n_batch * n_input.
349
+ // Output does not need to be initialized.
350
+ void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
351
+ int n_input, int shift, int16_t* output);
352
+
353
+ // Element-wise multiplication of two quantized vectors.
354
+ // Parameters:
355
+ // - input_1: batch vector of size n_batch * n_input; 16 bit.
356
+ // - input_2: batch vector of size n_batch * n_input; 16 bit.
357
+ // - n_batch: the number of batches.
358
+ // - n_input: the size for input and output.
359
+ // - shift: the shift needed to produce the output.
360
+ // - output: the 8 bit output of size n_batch * n_input.
361
+ // Output does not need to be initialized.
362
+ void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
363
+ int n_input, int shift, int8_t* output);
364
+
365
+ // Element-wise multiplication of two quantized vectors with rescaling.
366
+ // Parameters:
367
+ // - input_1: batch vector of size n_batch * n_input; 16 bit.
368
+ // - input_2: batch vector of size n_batch * n_input; 16 bit.
369
+ // - multiplier: the multiplier part of scale.
370
+ // - shift: the shift part of scale.
371
+ // - n_batch: the number of batches.
372
+ // - n_input: the size for input and output.
373
+ // - output: the 8 bit output of size n_batch * n_input.
374
+ // - output_zp: the zero point of output.
375
+ // Output does not need to be initialized.
376
+ // Multiplier ("m") and shift ("s") are connected to scale ("s") with s = m *
377
+ // 2^(s - 31).
378
+ void CwiseMul(const int16_t* input_1, const int16_t* input_2,
379
+ int32_t multiplier, int32_t shift, int32_t n_batch,
380
+ int32_t n_input, int32_t output_zp, int8_t* output);
381
+
382
+ // Element-wise saturating addition of two quantized vectors without rescaling.
383
+ // Parameters:
384
+ // - input_1: batch vector of size n_batch * n_input; 16 bit.
385
+ // - input_2: batch vector of size n_batch * n_input; 16 bit.
386
+ // - n_batch: the number of batches.
387
+ // - n_input: the size for input and output.
388
+ // - output: the 8 bit output of size n_batch * n_input.
389
+ // Output does not need to be initialized.
390
+ void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
391
+ int n_input, int16_t* output);
392
+
393
+ // Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
394
+ // int8_t. Parameters:
395
+ // - vector: vector of size v_size.
396
+ // - v_size: the size of the vector.
397
+ // - clipping_value: the value used for clipping.
398
+ void CwiseClipping(float* vector, const int v_size, const float clipping_value);
399
+ void CwiseClipping(int16_t* vector, const int v_size,
400
+ const int16_t clipping_value);
401
+ void CwiseClipping(int8_t* vector, const int v_size,
402
+ const int8_t clipping_value);
403
+
404
+ // Dot product of two vectors.
405
+ float VectorVectorDotProduct(const float* vector1, const float* vector2,
406
+ int v_size);
407
+
408
+ // Dot product of two batch vectors of size n_batch * v_size:
409
+ // vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
410
+ // x_2_1, x_2_2, ..., x_2_vsize,
411
+ // ...
412
+ // x_nbatch_1,..., x_nbatch_vsize]
413
+ // vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
414
+ // y_2_1, y_2_2, ..., y_2_vsize,
415
+ // ...
416
+ // y_nbatch_1,..., y_nbatch_vsize]
417
+ // Then result will be a vector of n_batch size starting from 'result':
418
+ // [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
419
+ // x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
420
+ // ...
421
+ // x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
422
+ template <typename T>
423
+ inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
424
+ int v_size, int n_batch,
425
+ T* result) {
426
+ for (int b = 0; b < n_batch; b++) {
427
+ result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
428
+ vector1 += v_size;
429
+ vector2 += v_size;
430
+ }
431
+ }
432
+
433
+ // Same as above but input is 16bit and output is 32bit.
434
+ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
435
+ const int16_t* vector2, int v_size,
436
+ int n_batch, int32_t* result);
437
+
438
+ // Same as above, but inputs are 16bit integer and output is 16bit integer.
439
+ void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
440
+ const int16_t* batch_vector,
441
+ int n_batch, int32_t multiplier,
442
+ int shift, int16_t* result);
443
+
444
+ // Compute "1.0f - elements of vector" (used in CIFG).
445
+ void Sub1Vector(const float* vector, int v_size, float* result);
446
+
447
+ // Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
448
+ // "vector" has range [0, 32767] because it is the output of sigmoid function.
449
+ void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);
450
+
451
+ // Reduce-sum on a float input vector:
+ //   input_vector: float pointer to input vector.
+ //   output_vector: float pointer to output vector.
+ //   output_size: output vector size.
+ //   reduction_size: number of consecutive elements from input vector which are
+ //                   added to get one element of output.
+ void ReductionSumVector(const float* input_vector, float* output_vector,
+                         int output_size, int reduction_size);
+
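The size relationship is easiest to see in a sketch: each output element sums reduction_size consecutive inputs, so input_vector is assumed to hold output_size * reduction_size elements (helper name hypothetical, not part of this header):

// Reference-only reduce-sum: output_vector[o] is the sum of the o-th group of
// reduction_size consecutive elements of input_vector.
inline void ReductionSumVectorReference(const float* input_vector,
                                        float* output_vector, int output_size,
                                        int reduction_size) {
  for (int o = 0; o < output_size; ++o) {
    float sum = 0.0f;
    for (int r = 0; r < reduction_size; ++r) {
      sum += input_vector[o * reduction_size + r];
    }
    output_vector[o] = sum;
  }
}
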
+ // Same as above but input/output is 32 bit integer.
+ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
+                         int output_size, int reduction_size);
+
+ // Same as above but input is 8 bit integer.
+ void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
+                         int output_size, int reduction_size);
+
+ // Multiply all elements of vector by a scalar.
+ void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                           float* result);
+
+ // Layer norm for each batch: normalize each row of v_size elements to zero
+ // mean and unit standard deviation.
+ void MeanStddevNormalization(const float* input_vector, float* output_vector,
+                              int v_size, int n_batch);
+
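A hedged reference sketch of that normalization; the production kernel may guard against a zero standard deviation differently, so the epsilon below is an assumption (helper name hypothetical, not part of this header):

#include <cmath>

// Reference-only per-row normalization: for each batch row of v_size
// elements, subtract the mean and divide by the standard deviation.
inline void MeanStddevNormalizationReference(const float* input_vector,
                                             float* output_vector, int v_size,
                                             int n_batch) {
  for (int b = 0; b < n_batch; ++b) {
    float sum = 0.0f;
    float sum_sq = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      sum += input_vector[i];
      sum_sq += input_vector[i] * input_vector[i];
    }
    const float mean = sum / v_size;
    const float variance = sum_sq / v_size - mean * mean;
    const float inv_stddev = 1.0f / std::sqrt(variance + 1e-8f);  // assumed epsilon
    for (int i = 0; i < v_size; ++i) {
      output_vector[i] = (input_vector[i] - mean) * inv_stddev;
    }
    input_vector += v_size;
    output_vector += v_size;
  }
}
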
+ // Saturating add with rescale on both inputs.
+ void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
+                           const int8_t* recurrent, int8_t recurrent_zp,
+                           int32_t input_effective_scale_a,
+                           int32_t input_effective_scale_b,
+                           int32_t recurrent_effective_scale_a,
+                           int32_t recurrent_effective_scale_b, int32_t n_batch,
+                           int32_t n_cell, int16_t* output);
+
+ // Same as the function above, but provides a scratch buffer for the
+ // int8 x int8 -> int32 accumulation and a CpuBackendContext for the
+ // computation.
+ void MatrixBatchVectorMultiplyAccumulate(
+     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+     const int8_t* __restrict__ vectors,
+     const float* __restrict__ scaling_factors, int n_batch,
+     int32_t* __restrict__ scratch, float* __restrict__ result,
+     CpuBackendContext* __restrict__ context);
+
+ // Same as the function above except that it can make use of cached row sums.
+ void MatrixBatchVectorMultiplyAccumulate(
+     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+     const int8_t* __restrict__ vectors, const float* scaling_factors,
+     int n_batch, float* __restrict__ result, const float* per_channel_scale,
+     const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
+     bool* compute_row_sums, CpuBackendContext* context);
+
+ // Same as the function above, but provides separate scaling factors for the
+ // matrix and the vectors. The two factors are multiplied together into the
+ // scaling_factor_scratch buffer before the call is forwarded.
+ inline void MatrixBatchVectorMultiplyAccumulate(
+     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+     const int8_t* __restrict__ vectors, const float matrix_scaling_factor,
+     const float* vector_scaling_factors, int n_batch,
+     float* __restrict__ result, const float* per_channel_scale,
+     const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
+     bool* compute_row_sums, float* scaling_factor_scratch,
+     CpuBackendContext* context) {
+   for (int b = 0; b < n_batch; ++b) {
+     scaling_factor_scratch[b] =
+         vector_scaling_factors[b] * matrix_scaling_factor;
+   }
+   MatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
+                                       scaling_factor_scratch, n_batch, result,
+                                       per_channel_scale, input_offset, scratch,
+                                       row_sums, compute_row_sums, context);
+ }
+
+ // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
+ // dimension composed of input vectors independent from each other). The
+ // result of the multiplication is accumulated to the passed result buffer.
+ // More specifically, for a matrix M of shape [n, i] and a batched-vector
+ // of shape [i, batch] it will first compute the product of shape [n, batch].
+ // This product will be accumulated to the result buffer.
+ // Parameters:
+ //   - input: batch vector of size n_batch * n_input
+ //   - bias: vector of size n_output
+ //   - input_to_gate_weights: matrix of size n_input * n_output
+ //   - multiplier: scalar
+ //   - shift: scalar
+ //   - n_batch: the batch size
+ //   - n_input: the input size
+ //   - n_output: the output size
+ //   - output_zp: the zero point of the output.
+ //   - scratch: batch vector of size n_batch * n_output
+ //   - output: the 16 bit output
+ // Notes:
+ //   - this is used for gate matmul: for non-cifg it is for input, forget,
+ //     cell, output gates; for cifg, it is for forget, cell, output gates.
+ //   - multiplier and shift combined give the scale.
+ //   - assumes input zero point is 0.
+ //   - scratch is created for optimization purposes only.
+ // TODO(b/152066492): this can be removed if some future optimization
+ // work makes it unnecessary.
+ void MatrixBatchVectorMultiplyAccumulate(
+     const int8_t* input, const int32_t* bias,
+     const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+     int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+     int32_t* scratch, int16_t* output, CpuBackendContext* context);
+
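A reference sketch of what the declaration above computes per batch and output row, under the documented assumptions (input zero point 0, scale = multiplier * 2^(shift - 31), result accumulated into the output buffer). The helper name, the double-based rescale, and the omission of the scratch/context plumbing are simplifications, not the library's implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference-only gate matmul: accumulate int8 input x int8 weights plus bias
// in 32 bits, rescale by multiplier * 2^(shift - 31), add the output zero
// point, accumulate into the existing output, and saturate to int16.
inline void GateMatmulReference(const int8_t* input, const int32_t* bias,
                                const int8_t* input_to_gate_weights,
                                int32_t multiplier, int32_t shift,
                                int32_t n_batch, int32_t n_input,
                                int32_t n_output, int32_t output_zp,
                                int16_t* output) {
  const double scale = static_cast<double>(multiplier) * std::ldexp(1.0, shift - 31);
  for (int b = 0; b < n_batch; ++b) {
    for (int o = 0; o < n_output; ++o) {
      int32_t acc = bias[o];
      for (int i = 0; i < n_input; ++i) {
        acc += static_cast<int32_t>(input[b * n_input + i]) *
               input_to_gate_weights[o * n_input + i];
      }
      int32_t value = static_cast<int32_t>(std::lround(acc * scale)) + output_zp;
      value += output[b * n_output + o];  // accumulate into the result buffer
      value = std::min<int32_t>(std::max<int32_t>(value, -32768), 32767);
      output[b * n_output + o] = static_cast<int16_t>(value);
    }
  }
}
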
+ // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
+ // dimension composed of input vectors independent from each other). The
+ // result of the multiplication is accumulated to the passed result buffer.
+ // More specifically, for a matrix M of shape [n, i] and a batched-vector
+ // of shape [i, batch] it will first compute the product of shape [n, batch].
+ // This product will be accumulated to the result buffer.
+ // Parameters:
+ //   - input: batch vector of size n_batch * n_input
+ //   - bias: vector of size n_output
+ //   - input_to_gate_weights: matrix of size n_input * n_output
+ //   - multiplier: scalar
+ //   - shift: scalar
+ //   - n_batch: the batch size
+ //   - n_input: the input size
+ //   - n_output: the output size
+ //   - output_zp: the zero point of the output.
+ //   - scratch: batch vector of size n_batch * n_output
+ //   - output: the 8 bit output
+ // Notes:
+ //   - this is used for projection matmul.
+ //   - multiplier and shift combined give the scale.
+ //   - assumes input zero point is 0.
+ //   - scratch is created for optimization purposes only.
+ // TODO(b/152066492): this can be removed if some future optimization
+ // work makes it unnecessary.
+ void MatrixBatchVectorMultiplyAccumulate(
+     const int8_t* input, const int32_t* bias,
+     const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+     int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+     int32_t* scratch, int8_t* output, CpuBackendContext* context);
+
+ // Apply Rectified Linear to elements of a vector.
+ void ApplyReluToVector(const float* __restrict__ vector, int v_size,
+                        float* __restrict__ result);
+
+ // Apply Rectified Linear 1 (cap to [-1;1]) to elements of a vector.
+ void ApplyRelu1ToVector(const float* __restrict__ vector, int v_size,
+                         float* __restrict__ result);
+
+ // Apply Rectified Linear 6 (cap to [0;6]) to elements of a vector.
+ void ApplyRelu6ToVector(const float* __restrict__ vector, int v_size,
+                         float* __restrict__ result);
+
+ // Apply signbit to elements of a vector.
+ void ApplySignbitToVector(const float* __restrict__ vector, int v_size,
+                           float* __restrict__ result);
+
+ // Unpack or inflate `src_buffer` by taking each element and splitting it as
+ // two elements into `dst_buffer`.
+ // Parameters:
+ //   src_buffer   : Densely packed buffer containing int4 values.
+ //   num_elements : Number of elements stored in the buffer. Note that this
+ //                  can be smaller by 1 than the number of nibbles in
+ //                  `src_buffer` if it's odd, in which case the last nibble
+ //                  in `src_buffer` is ignored. This should be equal to the
+ //                  size of `dst_buffer`.
+ //   dst_buffer   : Buffer to unpack into. Should be allocated by the caller.
+ //                  Size should be at least `num_elements`.
+ // Notes:
+ //   For example, given `src_buffer = {0x12, 0x34};`, calling this function
+ //   will produce `dst_buffer = {0x02, 0x01, 0x04, 0x03}`.
+ void UnpackDenseInt4IntoInt8(const int8_t* src_buffer, int num_elements,
+                              int8_t* dst_buffer);
+
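Consistent with the example above (low nibble first), a hedged sketch of the unpacking, assuming the nibbles are signed two's-complement int4 values; the sign-extension step and the helper name are assumptions, not part of this header:

#include <cstdint>

// Reference-only int4 -> int8 unpacking: element i comes from byte i/2, low
// nibble for even i and high nibble for odd i, then sign-extended from 4 to
// 8 bits.
inline void UnpackDenseInt4Reference(const int8_t* src_buffer, int num_elements,
                                     int8_t* dst_buffer) {
  for (int i = 0; i < num_elements; ++i) {
    const uint8_t byte = static_cast<uint8_t>(src_buffer[i / 2]);
    const int nibble = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
    dst_buffer[i] = static_cast<int8_t>((nibble ^ 0x08) - 0x08);  // sign-extend
  }
}
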
+ } // namespace tensor_utils
+
+ } // namespace tflite_micro
+
+ #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_