xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
@@ -32,8 +32,8 @@ namespace tflite {
32
32
  return TF_LITE_GLOBAL_STD_PREFIX::std_name(x); \
33
33
  }
34
34
 
35
- DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
36
- DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);
35
+ DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round)
36
+ DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1)
37
37
 
38
38
  } // namespace tflite
39
39
 
@@ -15,6 +15,7 @@ limitations under the License.
15
15
  #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
16
16
  #define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
17
17
 
18
+ #include <cstddef>
18
19
  #include <vector>
19
20
 
20
21
  #include "tensorflow/lite/core/c/common.h"
@@ -23,10 +24,6 @@ limitations under the License.
23
24
 
24
25
  namespace tflite {
25
26
 
26
- inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
27
- return RuntimeShape(data.size(), data.data());
28
- }
29
-
30
27
  // A list of tensors in a format that can be used by kernels like split and
31
28
  // concatenation.
32
29
  template <typename T>
@@ -54,6 +51,26 @@ class VectorOfTensors {
54
51
  all_shape_ptr_.push_back(&all_shape_[i]);
55
52
  }
56
53
  }
54
+
55
+ explicit VectorOfTensors(const std::vector<TfLiteTensor*>& tensors) {
56
+ int num_tensors = tensors.size();
57
+
58
+ all_data_.reserve(num_tensors);
59
+ all_shape_.reserve(num_tensors);
60
+ all_shape_ptr_.reserve(num_tensors);
61
+
62
+ for (auto* t : tensors) {
63
+ all_data_.push_back(GetTensorData<T>(t));
64
+ all_shape_.push_back(GetTensorShape(t));
65
+ }
66
+
67
+ // Taking the pointer from inside a std::vector is only OK if the vector is
68
+ // never modified, so we populate all_shape in the previous loop and then we
69
+ // are free to grab iterators here.
70
+ for (int i = 0; i < num_tensors; ++i) {
71
+ all_shape_ptr_.push_back(&all_shape_[i]);
72
+ }
73
+ }
57
74
  // Return a pointer to the data pointers of all tensors in the list. For
58
75
  // example:
59
76
  // float* const* f = v.data();
@@ -66,6 +83,8 @@ class VectorOfTensors {
66
83
  // dims[1] are the dimensions of the second tensor in the list.
67
84
  const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); }
68
85
 
86
+ size_t size() const { return all_data_.size(); }
87
+
69
88
  private:
70
89
  std::vector<T*> all_data_;
71
90
  std::vector<RuntimeShape> all_shape_;
@@ -16,10 +16,13 @@ limitations under the License.
16
16
  #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
17
17
 
18
18
  #include <algorithm>
19
+ #include <cstddef>
20
+ #include <cstdint>
19
21
  #include <type_traits>
20
22
 
21
23
  #include "fixedpoint/fixedpoint.h"
22
24
  #include "tensorflow/lite/kernels/internal/common.h"
25
+ #include "tensorflow/lite/kernels/internal/compatibility.h"
23
26
 
24
27
  namespace tflite {
25
28
 
@@ -194,21 +197,135 @@ inline void Add(const ArithmeticParams& params,
194
197
  }
195
198
  }
196
199
 
200
+ template <typename T>
201
+ inline void AddBroadcast(const T* input_data, const T* broadcast_data,
202
+ T* output_data, size_t size, T activation_min,
203
+ T activation_max) {
204
+ for (size_t c = 0; c < size; ++c) {
205
+ output_data[c] = ActivationFunctionWithMinMax<T>(
206
+ input_data[c] + broadcast_data[0], activation_min, activation_max);
207
+ }
208
+ }
209
+
210
+ template <>
211
+ inline void AddBroadcast<int32_t>(const int32_t* input_data,
212
+ const int32_t* broadcast_data,
213
+ int32_t* output_data, size_t size,
214
+ int32_t activation_min,
215
+ int32_t activation_max) {
216
+ size_t c = 0;
217
+ #ifdef USE_NEON
218
+ const int32x4_t vmax = vdupq_n_s32(activation_max);
219
+ const int32x4_t vmin = vdupq_n_s32(activation_min);
220
+ const int32x4_t vb = vdupq_n_s32(broadcast_data[0]);
221
+ for (; c + 4 <= size; c += 4) {
222
+ const int32x4_t va = vld1q_s32(&input_data[c]);
223
+ int32x4_t vres = vaddq_s32(va, vb);
224
+ vres = vmaxq_s32(vmin, vres);
225
+ vres = vminq_s32(vmax, vres);
226
+ vst1q_s32(&output_data[c], vres);
227
+ }
228
+ #endif
229
+ for (; c < size; ++c) {
230
+ output_data[c] = ActivationFunctionWithMinMax<int32_t>(
231
+ input_data[c] + broadcast_data[0], activation_min, activation_max);
232
+ }
233
+ }
234
+
235
+ template <typename T>
236
+ void AddElementwise(const T* input1_data, const T* input2_data, T* output_data,
237
+ size_t size, T activation_min, T activation_max) {
238
+ for (size_t c = 0; c < size; ++c) {
239
+ output_data[c] = ActivationFunctionWithMinMax<T>(
240
+ input1_data[c] + input2_data[c], activation_min, activation_max);
241
+ }
242
+ }
243
+
244
+ template <>
245
+ inline void AddElementwise<int32_t>(const int32_t* input1_data,
246
+ const int32_t* input2_data,
247
+ int32_t* output_data, size_t size,
248
+ int32_t activation_min,
249
+ int32_t activation_max) {
250
+ size_t c = 0;
251
+ #ifdef USE_NEON
252
+ const int32x4_t vmax = vdupq_n_s32(activation_max);
253
+ const int32x4_t vmin = vdupq_n_s32(activation_min);
254
+ for (; c + 4 <= size; c += 4) {
255
+ const int32x4_t va = vld1q_s32(&input1_data[c]);
256
+ const int32x4_t vb = vld1q_s32(&input2_data[c]);
257
+ int32x4_t vres = vaddq_s32(va, vb);
258
+ vres = vmaxq_s32(vmin, vres);
259
+ vres = vminq_s32(vmax, vres);
260
+ vst1q_s32(&output_data[c], vres);
261
+ }
262
+ #endif
263
+ for (; c < size; ++c) {
264
+ output_data[c] = ActivationFunctionWithMinMax<int32_t>(
265
+ input1_data[c] + input2_data[c], activation_min, activation_max);
266
+ }
267
+ }
268
+
269
+ template <typename T>
270
+ inline void BroadcastAddRecursiveDimensions(
271
+ int dimension, size_t* input1_offset_p, size_t* input2_offset_p,
272
+ size_t* output_offset, size_t* compressed_input1_stride,
273
+ size_t* compressed_input2_stride, size_t* compressed_output_shape,
274
+ T activation_min, T activation_max, const T* input1_data,
275
+ const T* input2_data, T* output_data) {
276
+ if (dimension > 0) {
277
+ for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
278
+ size_t input1_offset_c = *input1_offset_p;
279
+ size_t input2_offset_c = *input2_offset_p;
280
+ BroadcastAddRecursiveDimensions(
281
+ dimension - 1, &input1_offset_c, &input2_offset_c, output_offset,
282
+ compressed_input1_stride, compressed_input2_stride,
283
+ compressed_output_shape, activation_min, activation_max, input1_data,
284
+ input2_data, output_data);
285
+ *input1_offset_p += compressed_input1_stride[dimension];
286
+ *input2_offset_p += compressed_input2_stride[dimension];
287
+ }
288
+ } else {
289
+ TFLITE_DCHECK(dimension == 0);
290
+ bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
291
+ bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
292
+ TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
293
+ const T* input1_data_ptr = input1_data + *input1_offset_p;
294
+ const T* input2_data_ptr = input2_data + *input2_offset_p;
295
+ T* output_data_ptr = output_data + *output_offset;
296
+ if (input1_is_broadcast) {
297
+ // input1 is broadcast.
298
+ AddBroadcast<T>(input2_data_ptr, input1_data_ptr, output_data_ptr,
299
+ compressed_output_shape[dimension], activation_min,
300
+ activation_max);
301
+ *input2_offset_p += compressed_output_shape[dimension];
302
+ } else if (input2_is_broadcast) {
303
+ // input2 is broadcast.
304
+ AddBroadcast<T>(input1_data_ptr, input2_data_ptr, output_data_ptr,
305
+ compressed_output_shape[dimension], activation_min,
306
+ activation_max);
307
+ *input1_offset_p += compressed_output_shape[dimension];
308
+ } else {
309
+ // Add element-wise.
310
+ AddElementwise<T>(input1_data_ptr, input2_data_ptr, output_data_ptr,
311
+ compressed_output_shape[dimension], activation_min,
312
+ activation_max);
313
+ *input1_offset_p += compressed_output_shape[dimension];
314
+ *input2_offset_p += compressed_output_shape[dimension];
315
+ }
316
+ *output_offset += compressed_output_shape[dimension];
317
+ }
318
+ }
319
+
197
320
  template <typename T,
198
- // For unquantized add for small integers, explictly set to true.
321
+ // For unquantized add for small integers, explicitly set to true.
199
322
  bool dummy = false>
200
323
  inline typename std::enable_if<!is_small_integer<T>::value || dummy, void>::type
201
324
  BroadcastAdd6DSlow(const ArithmeticParams& params,
202
325
  const RuntimeShape& input1_shape, const T* input1_data,
203
326
  const RuntimeShape& input2_shape, const T* input2_data,
204
327
  const RuntimeShape& output_shape, T* output_data) {
205
- NdArrayDesc<6> desc1;
206
- NdArrayDesc<6> desc2;
207
- NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
208
- &desc2);
209
- const RuntimeShape extended_output_shape =
210
- RuntimeShape::ExtendedShape(6, output_shape);
211
-
328
+ constexpr int kMaxBroadcastDim = 6;
212
329
  T activation_min, activation_max;
213
330
  GetActivationParams(params, &activation_min, &activation_max);
214
331
 
@@ -223,64 +340,74 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
223
340
  // We name our variables by their Tensorflow convention, but generate C code
224
341
  // nesting loops such that the innermost loop has the smallest stride for the
225
342
  // best cache behavior.
226
- size_t input1_offset_a = 0;
227
- size_t input2_offset_a = 0;
228
- size_t output_offset_a = 0;
229
- for (int a = 0; a < extended_output_shape.Dims(0); ++a) {
230
- size_t input1_offset_d = input1_offset_a;
231
- size_t input2_offset_d = input2_offset_a;
232
- size_t output_offset_d = output_offset_a;
233
- for (int d = 0; d < extended_output_shape.Dims(1); ++d) {
234
- size_t input1_offset_b = input1_offset_d;
235
- size_t input2_offset_b = input2_offset_d;
236
- size_t output_offset_b = output_offset_d;
237
- for (int b = 0; b < extended_output_shape.Dims(2); ++b) {
238
- size_t input1_offset_y = input1_offset_b;
239
- size_t input2_offset_y = input2_offset_b;
240
- size_t output_offset_y = output_offset_b;
241
- for (int y = 0; y < extended_output_shape.Dims(3); ++y) {
242
- size_t input1_offset_x = input1_offset_y;
243
- size_t input2_offset_x = input2_offset_y;
244
- size_t output_offset_x = output_offset_y;
245
- for (int x = 0; x < extended_output_shape.Dims(4); ++x) {
246
- size_t input1_offset_c = input1_offset_x;
247
- size_t input2_offset_c = input2_offset_x;
248
- size_t output_offset_c = output_offset_x;
249
- for (int c = 0; c < extended_output_shape.Dims(5); ++c) {
250
- output_data[output_offset_c] = ActivationFunctionWithMinMax<T>(
251
- input1_data[input1_offset_c] + input2_data[input2_offset_c],
252
- activation_min, activation_max);
253
- input1_offset_c += desc1.strides[5];
254
- input2_offset_c += desc2.strides[5];
255
- ++output_offset_c;
256
- }
257
- input1_offset_x += desc1.strides[4];
258
- input2_offset_x += desc2.strides[4];
259
- output_offset_x += extended_output_shape.Dims(5);
260
- }
261
- input1_offset_y += desc1.strides[3];
262
- input2_offset_y += desc2.strides[3];
263
- output_offset_y +=
264
- extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
265
- }
266
- input1_offset_b += desc1.strides[2];
267
- input2_offset_b += desc2.strides[2];
268
- output_offset_b += extended_output_shape.Dims(3) *
269
- extended_output_shape.Dims(4) *
270
- extended_output_shape.Dims(5);
271
- }
272
- input1_offset_d += desc1.strides[1];
273
- input2_offset_d += desc2.strides[1];
274
- output_offset_d +=
275
- extended_output_shape.Dims(2) * extended_output_shape.Dims(3) *
276
- extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
343
+ size_t compressed_input1_stride[kMaxBroadcastDim];
344
+ size_t compressed_input2_stride[kMaxBroadcastDim];
345
+ size_t compressed_output_shape[kMaxBroadcastDim];
346
+ bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
347
+ input1_shape, input2_shape, compressed_input1_stride,
348
+ compressed_input2_stride, compressed_output_shape);
349
+ // Skip broadcasting for degenerate shapes.
350
+ if (!broadcastable_shape) {
351
+ return;
352
+ }
353
+
354
+ size_t input1_offset = 0;
355
+ size_t input2_offset = 0;
356
+ size_t output_offset = 0;
357
+ BroadcastAddRecursiveDimensions<T>(
358
+ kMaxBroadcastDim - 1, &input1_offset, &input2_offset, &output_offset,
359
+ compressed_input1_stride, compressed_input2_stride,
360
+ compressed_output_shape, activation_min, activation_max, input1_data,
361
+ input2_data, output_data);
362
+ }
363
+
364
+ // This function is used for 8-bit as well as for 16-bit, but the accumulator
365
+ // is 32-bit for both cases. The overflow does not happen due to the
366
+ // choice of the shift (20 or 15, accordingly - see add.cc for more comments).
367
+ template <typename T>
368
+ inline void BroadcastAddRecursiveDimensions(
369
+ const ArithmeticParams& params, int dimension, size_t* input1_offset_p,
370
+ size_t* input2_offset_p, size_t* output_offset,
371
+ size_t* compressed_input1_stride, size_t* compressed_input2_stride,
372
+ size_t* compressed_output_shape, const T* input1_data, const T* input2_data,
373
+ T* output_data) {
374
+ for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
375
+ if (dimension > 0) {
376
+ size_t input1_offset_c = *input1_offset_p;
377
+ size_t input2_offset_c = *input2_offset_p;
378
+ BroadcastAddRecursiveDimensions(
379
+ params, dimension - 1, &input1_offset_c, &input2_offset_c,
380
+ output_offset, compressed_input1_stride, compressed_input2_stride,
381
+ compressed_output_shape, input1_data, input2_data, output_data);
382
+ } else {
383
+ TFLITE_DCHECK(dimension == 0);
384
+ const int32_t input1_val =
385
+ params.input1_offset + input1_data[*input1_offset_p];
386
+ const int32_t input2_val =
387
+ params.input2_offset + input2_data[*input2_offset_p];
388
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
389
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
390
+ const int32_t scaled_input1_val =
391
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
392
+ shifted_input1_val, params.input1_multiplier,
393
+ params.input1_shift);
394
+ const int32_t scaled_input2_val =
395
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
396
+ shifted_input2_val, params.input2_multiplier,
397
+ params.input2_shift);
398
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
399
+ const int32_t raw_output =
400
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
401
+ raw_sum, params.output_multiplier, params.output_shift) +
402
+ params.output_offset;
403
+ const int32_t clamped_output =
404
+ std::min(params.quantized_activation_max,
405
+ std::max(params.quantized_activation_min, raw_output));
406
+ output_data[*output_offset] = static_cast<T>(clamped_output);
407
+ ++(*output_offset);
277
408
  }
278
- input1_offset_a += desc1.strides[0];
279
- input2_offset_a += desc2.strides[0];
280
- output_offset_a +=
281
- extended_output_shape.Dims(1) * extended_output_shape.Dims(2) *
282
- extended_output_shape.Dims(3) * extended_output_shape.Dims(4) *
283
- extended_output_shape.Dims(5);
409
+ *input1_offset_p += compressed_input1_stride[dimension];
410
+ *input2_offset_p += compressed_input2_stride[dimension];
284
411
  }
285
412
  }
286
413
 
@@ -293,12 +420,7 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
293
420
  const RuntimeShape& input1_shape, const T* input1_data,
294
421
  const RuntimeShape& input2_shape, const T* input2_data,
295
422
  const RuntimeShape& output_shape, T* output_data) {
296
- NdArrayDesc<6> desc1;
297
- NdArrayDesc<6> desc2;
298
- NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
299
- &desc2);
300
- const RuntimeShape extended_output_shape =
301
- RuntimeShape::ExtendedShape(6, output_shape);
423
+ constexpr int kMaxBroadcastDim = 6;
302
424
 
303
425
  // In Tensorflow, the dimensions are canonically named (batch_number, row,
304
426
  // col, channel), with extents (batches, height, width, depth), with the
@@ -311,87 +433,24 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
311
433
  // We name our variables by their Tensorflow convention, but generate C code
312
434
  // nesting loops such that the innermost loop has the smallest stride for the
313
435
  // best cache behavior.
314
- size_t input1_offset_a = 0;
315
- size_t input2_offset_a = 0;
316
- size_t output_offset_a = 0;
317
- for (int a = 0; a < extended_output_shape.Dims(0); ++a) {
318
- size_t input1_offset_d = input1_offset_a;
319
- size_t input2_offset_d = input2_offset_a;
320
- size_t output_offset_d = output_offset_a;
321
- for (int d = 0; d < extended_output_shape.Dims(1); ++d) {
322
- size_t input1_offset_b = input1_offset_d;
323
- size_t input2_offset_b = input2_offset_d;
324
- size_t output_offset_b = output_offset_d;
325
- for (int b = 0; b < extended_output_shape.Dims(2); ++b) {
326
- size_t input1_offset_y = input1_offset_b;
327
- size_t input2_offset_y = input2_offset_b;
328
- size_t output_offset_y = output_offset_b;
329
- for (int y = 0; y < extended_output_shape.Dims(3); ++y) {
330
- size_t input1_offset_x = input1_offset_y;
331
- size_t input2_offset_x = input2_offset_y;
332
- size_t output_offset_x = output_offset_y;
333
- for (int x = 0; x < extended_output_shape.Dims(4); ++x) {
334
- size_t input1_offset_c = input1_offset_x;
335
- size_t input2_offset_c = input2_offset_x;
336
- size_t output_offset_c = output_offset_x;
337
- for (int c = 0; c < extended_output_shape.Dims(5); ++c) {
338
- const int32_t input1_val =
339
- params.input1_offset + input1_data[input1_offset_c];
340
- const int32_t input2_val =
341
- params.input2_offset + input2_data[input2_offset_c];
342
- const int32_t shifted_input1_val =
343
- input1_val * (1 << params.left_shift);
344
- const int32_t shifted_input2_val =
345
- input2_val * (1 << params.left_shift);
346
- const int32_t scaled_input1_val =
347
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
348
- shifted_input1_val, params.input1_multiplier,
349
- params.input1_shift);
350
- const int32_t scaled_input2_val =
351
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
352
- shifted_input2_val, params.input2_multiplier,
353
- params.input2_shift);
354
- const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
355
- const int32_t raw_output =
356
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
357
- raw_sum, params.output_multiplier, params.output_shift) +
358
- params.output_offset;
359
- const int32_t clamped_output = std::min(
360
- params.quantized_activation_max,
361
- std::max(params.quantized_activation_min, raw_output));
362
- output_data[output_offset_c] = static_cast<T>(clamped_output);
363
- input1_offset_c += desc1.strides[5];
364
- input2_offset_c += desc2.strides[5];
365
- ++output_offset_c;
366
- }
367
- input1_offset_x += desc1.strides[4];
368
- input2_offset_x += desc2.strides[4];
369
- output_offset_x += extended_output_shape.Dims(5);
370
- }
371
- input1_offset_y += desc1.strides[3];
372
- input2_offset_y += desc2.strides[3];
373
- output_offset_y +=
374
- extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
375
- }
376
- input1_offset_b += desc1.strides[2];
377
- input2_offset_b += desc2.strides[2];
378
- output_offset_b += extended_output_shape.Dims(3) *
379
- extended_output_shape.Dims(4) *
380
- extended_output_shape.Dims(5);
381
- }
382
- input1_offset_d += desc1.strides[1];
383
- input2_offset_d += desc2.strides[1];
384
- output_offset_d +=
385
- extended_output_shape.Dims(2) * extended_output_shape.Dims(3) *
386
- extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
387
- }
388
- input1_offset_a += desc1.strides[0];
389
- input2_offset_a += desc2.strides[0];
390
- output_offset_a +=
391
- extended_output_shape.Dims(1) * extended_output_shape.Dims(2) *
392
- extended_output_shape.Dims(3) * extended_output_shape.Dims(4) *
393
- extended_output_shape.Dims(5);
436
+ size_t compressed_input1_stride[kMaxBroadcastDim];
437
+ size_t compressed_input2_stride[kMaxBroadcastDim];
438
+ size_t compressed_output_shape[kMaxBroadcastDim];
439
+ bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
440
+ input1_shape, input2_shape, compressed_input1_stride,
441
+ compressed_input2_stride, compressed_output_shape);
442
+ // Skip broadcasting for degenerate shapes.
443
+ if (!broadcastable_shape) {
444
+ return;
394
445
  }
446
+
447
+ size_t input1_offset = 0;
448
+ size_t input2_offset = 0;
449
+ size_t output_offset = 0;
450
+ BroadcastAddRecursiveDimensions(
451
+ params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
452
+ &output_offset, compressed_input1_stride, compressed_input2_stride,
453
+ compressed_output_shape, input1_data, input2_data, output_data);
395
454
  }
396
455
 
397
456
  template <typename T>
@@ -112,20 +112,11 @@ struct BroadcastComparison4DSlowCommon {
112
112
  NdArrayDesc<4> desc2;
113
113
  };
114
114
 
115
- inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
115
+ TFLITE_NOINLINE
116
+ BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
116
117
  const RuntimeShape& unextended_input1_shape,
117
118
  const RuntimeShape& unextended_input2_shape,
118
- const RuntimeShape& unextended_output_shape) {
119
- TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
120
- TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
121
- TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
122
- NdArrayDesc<4> desc1;
123
- NdArrayDesc<4> desc2;
124
- NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
125
- unextended_input2_shape, &desc1, &desc2);
126
- return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
127
- desc2};
128
- }
119
+ const RuntimeShape& unextended_output_shape);
129
120
 
130
121
  template <typename T, ComparisonFn<T> F>
131
122
  inline void BroadcastComparison4DSlowImpl(
@@ -266,12 +257,12 @@ inline void BroadcastComparison4DSlowWithScaling(
266
257
  op_params, input1_shape, input1_data, input2_shape, input2_data, \
267
258
  output_shape, output_data); \
268
259
  }
269
- TFLITE_COMPARISON_OP(Equal);
270
- TFLITE_COMPARISON_OP(NotEqual);
271
- TFLITE_COMPARISON_OP(Greater);
272
- TFLITE_COMPARISON_OP(GreaterEqual);
273
- TFLITE_COMPARISON_OP(Less);
274
- TFLITE_COMPARISON_OP(LessEqual);
260
+ TFLITE_COMPARISON_OP(Equal)
261
+ TFLITE_COMPARISON_OP(NotEqual)
262
+ TFLITE_COMPARISON_OP(Greater)
263
+ TFLITE_COMPARISON_OP(GreaterEqual)
264
+ TFLITE_COMPARISON_OP(Less)
265
+ TFLITE_COMPARISON_OP(LessEqual)
275
266
  #undef TFLITE_COMPARISON_OP
276
267
 
277
268
  } // namespace reference_ops
@@ -56,8 +56,10 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
56
56
  const int filter_width = filter_shape.Dims(2);
57
57
  const int filter_input_depth = filter_shape.Dims(3);
58
58
  const int groups = input_depth / filter_input_depth;
59
+ TFLITE_DCHECK_NE(groups, 0);
59
60
  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
60
61
  const int filters_per_group = output_depth / groups;
62
+ TFLITE_DCHECK_NE(filters_per_group, 0);
61
63
  const int output_height = output_shape.Dims(1);
62
64
  const int output_width = output_shape.Dims(2);
63
65