xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,9 @@ namespace tflite {
 
 namespace reference_ops {
 
+// Maximum dimension supported by the broadcast mul operation.
+constexpr int kMaxMulBroadcastDim = 6;
+
 // Element-wise mul that can often be used for inner loop of broadcast Mul as
 // well as the non-broadcast Mul.
 inline void MulElementwise(int size, const ArithmeticParams& params,
@@ -88,128 +91,174 @@ inline void Mul(const ArithmeticParams& params,
   MulElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+template <typename T, typename F>
+void BroadcastMulRecursiveDimensions(
+    const ArithmeticParams& params, int dimension, const T* input1_data,
+    const T* input2_data, T* output_data, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    const NdArrayDesc<kMaxMulBroadcastDim>& desc1,
+    const NdArrayDesc<kMaxMulBroadcastDim>& desc2,
+    const int32_t extended_output_shape_dims[kMaxMulBroadcastDim],
+    F binary_func) {
+  if (dimension == kMaxMulBroadcastDim - 1) {
+    for (int c = 0; c < extended_output_shape_dims[dimension]; ++c) {
+      const T input1_val = input1_data[*input1_offset_p];
+      const T input2_val = input2_data[*input2_offset_p];
+      output_data[*output_offset] = binary_func(params, input1_val, input2_val);
+      *input1_offset_p += desc1.strides[dimension];
+      *input2_offset_p += desc2.strides[dimension];
+      ++(*output_offset);
+    }
+  } else {
+    for (int a = 0; a < extended_output_shape_dims[dimension]; ++a) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastMulRecursiveDimensions(
+          params, dimension + 1, input1_data, input2_data, output_data,
+          &input1_offset_c, &input2_offset_c, output_offset, desc1, desc2,
+          extended_output_shape_dims, binary_func);
+      *input1_offset_p += desc1.strides[dimension];
+      *input2_offset_p += desc2.strides[dimension];
+    }
+  }
+}
+
+inline void BroadcastMul6DSlow(const ArithmeticParams& params,
                                const RuntimeShape& input1_shape,
                                const uint8_t* input1_data,
                                const RuntimeShape& input2_shape,
                                const uint8_t* input2_data,
                                const RuntimeShape& output_shape,
                                uint8_t* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  NdArrayDesc<kMaxMulBroadcastDim> desc1;
+  NdArrayDesc<kMaxMulBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
   const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
-
-  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
-    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
-      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
-        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          const int32_t input1_val =
-              params.input1_offset +
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
-          const int32_t input2_val =
-              params.input2_offset +
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-          const int32_t unclamped_result =
-              params.output_offset +
-              MultiplyByQuantizedMultiplier(input1_val * input2_val,
-                                            params.output_multiplier,
-                                            params.output_shift);
-          const int32_t clamped_output = std::min(
-              params.quantized_activation_max,
-              std::max(params.quantized_activation_min, unclamped_result));
-          output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8_t>(clamped_output);
-        }
-      }
-    }
-  }
+      RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
+
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastMulRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const uint8_t input1_val,
+         const uint8_t input2_val) {
+        const int32_t offsetted_input1_val = params.input1_offset + input1_val;
+        const int32_t offsetted_input2_val = params.input2_offset + input2_val;
+        const int32_t unclamped_result =
+            params.output_offset +
+            MultiplyByQuantizedMultiplier(
+                offsetted_input1_val * offsetted_input2_val,
+                params.output_multiplier, params.output_shift);
+        const int32_t clamped_output = std::min(
+            params.quantized_activation_max,
+            std::max(params.quantized_activation_min, unclamped_result));
+        return static_cast<uint8_t>(clamped_output);
+      });
 }
 
 template <typename T,
-          // For unquantized mul on small integers, explictly set to true.
+          // For unquantized mul on small integers, explicitly set to true.
          bool enable_for_short_integers = false>
 inline typename std::enable_if<
     !is_small_integer<T>::value || enable_for_short_integers, void>::type
-BroadcastMul4DSlow(const ArithmeticParams& params,
+BroadcastMul6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& unextended_input1_shape,
                    const T* input1_data,
                    const RuntimeShape& unextended_input2_shape,
                    const T* input2_data,
                    const RuntimeShape& unextended_output_shape,
                    T* output_data) {
-  T output_activation_min;
-  T output_activation_max;
-  GetActivationParams(params, &output_activation_min, &output_activation_max);
-
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 6);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 6);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 6);
+  NdArrayDesc<kMaxMulBroadcastDim> desc1;
+  NdArrayDesc<kMaxMulBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                       unextended_input2_shape, &desc1, &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
   //
   // In generated C code, we store arrays with the dimensions reversed. The
   // first dimension has smallest stride.
   //
   // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < output_shape.Dims(0); ++b) {
-    for (int y = 0; y < output_shape.Dims(1); ++y) {
-      for (int x = 0; x < output_shape.Dims(2); ++x) {
-        for (int c = 0; c < output_shape.Dims(3); ++c) {
-          output_data[Offset(output_shape, b, y, x, c)] =
-              ActivationFunctionWithMinMax<T>(
-                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
-                  input2_data[SubscriptToIndex(desc2, b, y, x, c)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastMulRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const T input1_val,
+         const T input2_val) {
+        T output_activation_min;
+        T output_activation_max;
+        GetActivationParams(params, &output_activation_min,
+                            &output_activation_max);
+        return ActivationFunctionWithMinMax<T>(input1_val * input2_val,
+                                               output_activation_min,
+                                               output_activation_max);
+      });
 }
 
-inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+inline void BroadcastMul6DSlow(const ArithmeticParams& params,
                                const RuntimeShape& unextended_input1_shape,
                                const std::complex<float>* input1_data,
                                const RuntimeShape& unextended_input2_shape,
                                const std::complex<float>* input2_data,
                                const RuntimeShape& unextended_output_shape,
                                std::complex<float>* output_data) {
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 6);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 6);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 6);
+
+  NdArrayDesc<kMaxMulBroadcastDim> desc1;
+  NdArrayDesc<kMaxMulBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                       unextended_input2_shape, &desc1, &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
-  for (int b = 0; b < output_shape.Dims(0); ++b) {
-    for (int y = 0; y < output_shape.Dims(1); ++y) {
-      for (int x = 0; x < output_shape.Dims(2); ++x) {
-        for (int c = 0; c < output_shape.Dims(3); ++c) {
-          output_data[Offset(output_shape, b, y, x, c)] =
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-        }
-      }
-    }
-  }
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastMulRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const std::complex<float> input1_val,
+         const std::complex<float> input2_val) {
+        return input1_val * input2_val;
+      });
+}
+
+template <typename T>
+inline void BroadcastMul4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
+  return BroadcastMul6DSlow(params, input1_shape, input1_data, input2_shape,
+                            input2_data, output_shape, output_data);
 }
 
 }  // namespace reference_ops
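The hunk above (apparently tensorflow/lite/kernels/internal/reference/mul.h, item 60 in the file list) swaps the hard-coded four-level loop nest for BroadcastMulRecursiveDimensions, which walks the extended output shape one dimension at a time and applies a caller-supplied lambda at the innermost level, so a single helper now serves the uint8, templated, and complex overloads up to kMaxMulBroadcastDim = 6. The standalone sketch below illustrates only that recursion pattern; the names (Desc, broadcast_apply) and the 3-D example standing in for the 6-D case are made up and are not the library code.

// Minimal standalone sketch (not the library code) of the recursion used by
// BroadcastMulRecursiveDimensions: walk the output shape one dimension at a
// time, advance each input by its own stride (0 along a broadcast dimension),
// and apply a binary functor at the innermost level.
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr int kDims = 3;  // stand-in for kMaxMulBroadcastDim = 6

struct Desc {
  int strides[kDims];  // stride is 0 along a broadcast dimension
};

template <typename T, typename F>
void broadcast_apply(int dim, const int shape[kDims], const Desc& d1,
                     const Desc& d2, const T* in1, const T* in2, T* out,
                     size_t* off1, size_t* off2, size_t* out_off, F f) {
  if (dim == kDims - 1) {
    // Innermost dimension: apply the functor element by element.
    for (int i = 0; i < shape[dim]; ++i) {
      out[(*out_off)++] = f(in1[*off1], in2[*off2]);
      *off1 += d1.strides[dim];
      *off2 += d2.strides[dim];
    }
  } else {
    for (int i = 0; i < shape[dim]; ++i) {
      size_t o1 = *off1, o2 = *off2;  // inner levels work on copies
      broadcast_apply(dim + 1, shape, d1, d2, in1, in2, out, &o1, &o2, out_off, f);
      *off1 += d1.strides[dim];
      *off2 += d2.strides[dim];
    }
  }
}

int main() {
  // [2,1,3] * [1,2,1] broadcast to [2,2,3].
  const int out_shape[kDims] = {2, 2, 3};
  const std::vector<float> a = {1, 2, 3, 4, 5, 6};  // shape [2,1,3]
  const std::vector<float> b = {10, 100};           // shape [1,2,1]
  Desc da{{3, 0, 1}};  // dimension 1 is broadcast for a
  Desc db{{0, 1, 0}};  // dimensions 0 and 2 are broadcast for b
  std::vector<float> out(2 * 2 * 3);
  size_t o1 = 0, o2 = 0, oo = 0;
  broadcast_apply(0, out_shape, da, db, a.data(), b.data(), out.data(), &o1,
                  &o2, &oo, [](float x, float y) { return x * y; });
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");  // 10 20 30 100 200 300 40 50 60 400 500 600
  return 0;
}

The input offsets advance by per-dimension strides (zero along a broadcast dimension) while the output offset simply increments, which is what lets one recursive helper cover every broadcast pattern regardless of rank.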
@@ -268,11 +268,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
   return true;
 }
 
-template <typename T>
 inline void Mean(const tflite::MeanParams& op_params,
                  const RuntimeShape& unextended_input_shape,
-                 const T* input_data,
-                 const RuntimeShape& unextended_output_shape, T* output_data) {
+                 const float* input_data,
+                 const RuntimeShape& unextended_output_shape,
+                 float* output_data) {
   ruy::profiler::ScopeLabel label("Mean4D");
 
   // Current implementation only supports dimension equals 4 and simultaneous
@@ -312,78 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
-inline void Mean(const tflite::MeanParams& op_params,
-                 const RuntimeShape& unextended_input_shape,
-                 const uint8_t* input_data, int32_t input_zero_point,
-                 float input_scale, const RuntimeShape& unextended_output_shape,
-                 uint8_t* output_data, int32_t output_zero_point,
-                 float output_scale) {
-  ruy::profiler::ScopeLabel label("Mean4D/Uint8");
-
-  // Current implementation only supports dimension equals 4 and simultaneous
-  // reduction over width and height.
-  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape input_shape =
-      RuntimeShape::ExtendedShape(4, unextended_input_shape);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-  const int output_batch = output_shape.Dims(0);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  const int output_depth = output_shape.Dims(3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const float num_elements_in_axis = input_width * input_height;
-
-  TFLITE_CHECK_EQ(op_params.axis_count, 2);
-  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
-               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
-  TFLITE_CHECK_EQ(output_height, 1);
-  TFLITE_CHECK_EQ(output_width, 1);
-
-  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
-  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
-
-  float temp = input_zero_point * input_scale / output_scale;
-  temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
-  int32_t bias = output_zero_point - static_cast<int32_t>(temp);
-  double real_scale =
-      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
-
-  int32_t multiplier;
-  int shift;
-  QuantizeMultiplier(real_scale, &multiplier, &shift);
-  for (int out_b = 0; out_b < output_batch; ++out_b) {
-    for (int out_d = 0; out_d < output_depth; ++out_d) {
-      int32_t acc = 0;
-      for (int in_h = 0; in_h < input_height; ++in_h) {
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
-        }
-      }
-      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
-      acc += bias;
-      acc = std::min(std::max(acc, kMinValue), kMaxValue);
-      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
-          static_cast<uint8_t>(acc);
-    }
-  }
-}
-
 // Computes the mean of elements across dimensions given in axis.
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
 template <typename T, typename U>
 inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
-                               float input_scale, const int* input_dims,
-                               const int input_num_dims, T* output_data,
-                               int32_t output_zero_point, float output_scale,
+                               const int* input_dims, const int input_num_dims,
+                               T* output_data, int32_t output_multiplier,
+                               int output_shift, int32_t output_zero_point,
                                const int* output_dims,
                                const int output_num_dims, const int* axis,
                                const int num_axis_dimensions, bool keep_dims,
                                int* temp_index, int* resolved_axis, U* temp_sum,
                                bool compute_sum) {
+  const int32_t kMinValue = std::numeric_limits<T>::min();
+  const int32_t kMaxValue = std::numeric_limits<T>::max();
   const bool uint8_case = std::is_same<T, uint8_t>::value;
   const bool int16_case = std::is_same<T, int16_t>::value;
   if (uint8_case) {
@@ -430,40 +373,46 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
   }
 
   // Calculate mean by dividing output_data by num of aggregated element.
-  size_t num_elements_in_axis = 1;
+  int64_t num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
     size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
     // Overflow prevention.
-    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
+    if (current > static_cast<size_t>(std::numeric_limits<int64_t>::max() /
+                                      num_elements_in_axis)) {
       return false;
     }
     num_elements_in_axis *= current;
   }
 
-  if (num_elements_in_axis > 0) {
-    const float scale = input_scale / output_scale;
-    if (compute_sum) {
-      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
-      const float bias = -input_zero_point * scale * num_elements_in_axis;
-      for (size_t idx = 0; idx < num_outputs; ++idx) {
-        const U value =
-            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
-            output_zero_point;
-        output_data[idx] = static_cast<T>(value);
-      }
-    } else {
-      const float bias = -input_zero_point * scale;
-      for (size_t idx = 0; idx < num_outputs; ++idx) {
-        float float_mean = static_cast<float>(temp_sum[idx]) /
-                           static_cast<float>(num_elements_in_axis);
-        float result = TfLiteMin(
-            TfLiteRound(float_mean * scale + bias) + output_zero_point,
-            static_cast<float>(std::numeric_limits<T>::max()));
-        result = TfLiteMax(result,
-                           static_cast<float>(std::numeric_limits<T>::min()));
-        output_data[idx] = static_cast<T>(result);
-      }
-    }
+  if (num_elements_in_axis == 0) {
+    return true;
+  }
+
+  // Readapt output rescaling when calculating the mean to integrate a
+  // 1/num_elements_in_axis multiplier.
+  if (!compute_sum) {
+    TFLITE_DCHECK_GE(num_elements_in_axis, 0);
+    int shift =
+        63 - CountLeadingZeros(static_cast<uint64_t>(num_elements_in_axis));
+    // To avoid any overflow risk 'shift' should be <= 32 and to satisfy
+    // 'MultiplyByQuantizedMultiplier' pre-conditions 'output_shift - shift'
+    // should be >= -31. Clamp the value at the price of some precision loss.
+    shift = std::min(shift, 32);
+    shift = std::min(shift, 31 + output_shift);
+    output_multiplier = static_cast<int32_t>(
+        (static_cast<int64_t>(output_multiplier) << shift) /
+        num_elements_in_axis);
+    output_shift = output_shift - shift;
+  }
+
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    const U shifted_sum =
+        static_cast<U>(temp_sum[idx] - input_zero_point * num_elements_in_axis);
+    int32_t output = MultiplyByQuantizedMultiplier(
+                         shifted_sum, output_multiplier, output_shift) +
+                     output_zero_point;
+    output = std::min(std::max(output, kMinValue), kMaxValue);
+    output_data[idx] = static_cast<T>(output);
   }
   return true;
 }
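The rewritten QuantizedMeanOrSum above drops the float input_scale/output_scale path: the 1/num_elements_in_axis factor needed for the mean is folded into the fixed-point (output_multiplier, output_shift) pair, and the clamped result comes from MultiplyByQuantizedMultiplier alone. Below is a rough standalone check of that rescaling step; the multiplier, shift, and element count are made-up examples, and __builtin_clzll (gcc/clang) stands in for the library's CountLeadingZeros.

// Hedged sketch of the rescaling idea only, not the library code.
// MultiplyByQuantizedMultiplier(x, M, s) applies roughly x * M * 2^s / 2^31,
// so dividing M by n (with a compensating shift) divides the result by n.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t output_multiplier = 1518500250;  // example fixed-point multiplier
  int output_shift = -8;                   // example shift
  int64_t num_elements_in_axis = 49;       // e.g. a 7x7 spatial mean

  // Same clamping logic as in the diff: pick the largest shift that keeps the
  // 64-bit intermediate and the multiplier pre-conditions safe.
  int shift = 63 - __builtin_clzll(static_cast<uint64_t>(num_elements_in_axis));
  shift = std::min(shift, 32);
  shift = std::min(shift, 31 + output_shift);
  int32_t new_multiplier = static_cast<int32_t>(
      (static_cast<int64_t>(output_multiplier) << shift) /
      num_elements_in_axis);
  int new_shift = output_shift - shift;

  // Sanity check: the new effective scale should be close to old scale / n.
  double old_scale = output_multiplier * std::ldexp(1.0, output_shift - 31);
  double new_scale = new_multiplier * std::ldexp(1.0, new_shift - 31);
  std::printf("old/n = %.9g  new = %.9g\n",
              old_scale / num_elements_in_axis, new_scale);
  return 0;
}

The two printed values should agree to several digits, which is the sense in which the integer-only rescale tracks the old float division by num_elements_in_axis.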
@@ -478,8 +427,8 @@ inline bool QuantizedMeanOrSumExtraArgs(
     bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum,
     bool compute_sum) {
   return QuantizedMeanOrSum<T, U>(
-      input_data, input_zero_point, input_scale, input_dims, input_num_dims,
-      output_data, output_zero_point, output_scale, output_dims,
+      input_data, input_zero_point, input_dims, input_num_dims, output_data,
+      output_multiplier, output_shift, output_zero_point, output_dims,
       output_num_dims, axis, num_axis_dimensions, keep_dims, temp_index,
       resolved_axis, temp_sum, compute_sum);
 }
@@ -212,9 +212,14 @@ inline void ResizeBilinearInteger(
               (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
           const int64_t output_20 =
               output_20_ll + output_20_lu + output_20_rl + output_20_ru;
+#if TFLITE_SINGLE_ROUNDING
+          const int64_t round = 1 << 19;
+          const T interpolation = static_cast<T>((output_20 + round) >> 20);
+#else
           const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
           const T interpolation =
               static_cast<T>((output_20 + round) / (1 << 20));
+#endif  // TFLITE_SINGLE_ROUNDING
           output_data[Offset(output_shape, b, y, x, c)] = interpolation;
         }
       }
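The resize-bilinear hunk above puts a simpler rounding path behind TFLITE_SINGLE_ROUNDING: add a fixed 2^19 and arithmetic-shift right by 20, instead of rounding half away from zero with a signed divide. The small standalone comparison below, on a made-up negative Q20 intermediate, shows a case where the two branches disagree (a negative value exactly halfway between integers); it is an illustration, not the kernel code.

// Hedged illustration of the two rounding modes guarded by
// TFLITE_SINGLE_ROUNDING: single rounding rounds the half case toward
// +infinity, the older path rounds half away from zero.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t output_20 = -3 * (1 << 19);  // -1.5 in Q20 fixed point
  // New #if branch: add 2^19, arithmetic shift right by 20.
  const int64_t single = (output_20 + (1 << 19)) >> 20;        // -1
  // #else branch: round half away from zero via a signed divide.
  const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
  const int64_t doubled = (output_20 + round) / (1 << 20);     // -2
  std::printf("single=%lld double=%lld\n",
              (long long)single, (long long)doubled);
  return 0;
}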