xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h

@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdint.h>
 
 #include <algorithm>
+#include <cstddef>
 #include <limits>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
@@ -29,100 +30,179 @@ namespace tflite {
 
 namespace reference_ops {
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const float* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const float* input2_data,
-                            const RuntimeShape& output_shape,
-                            float* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.float_activation_min,
-        params.float_activation_max);
+template <class T>
+struct SubImpl {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
   }
-}
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const int32_t* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const int32_t* input2_data,
-                            const RuntimeShape& output_shape,
-                            int32_t* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.quantized_activation_min,
-        params.quantized_activation_max);
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
   }
-}
 
-// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const float* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const float* input2_data,
-                             const RuntimeShape& output_shape,
-                             float* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params, const T* input1_data,
+                          const T* input2_data, T* output_data, size_t size,
+                          F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
+  }
+};
+
+template <>
+struct SubImpl<int32_t> {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t va = vdupq_n_s32(input1_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
+  }
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.float_activation_min, params.float_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t vb = vdupq_n_s32(input2_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
+  }
+
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params,
+                          const int32_t* input1_data,
+                          const int32_t* input2_data, int32_t* output_data,
+                          size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    int32x4_t vmax = vdupq_n_s32(activation_max);
+    int32x4_t vmin = vdupq_n_s32(activation_min);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
+  }
+};
+
+template <typename T, typename F>
+inline void BroadcastSubRecursiveDimensions(
+    int dimension, const ArithmeticParams& params, const T* input1_data,
+    const T* input2_data, T* output_data, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    size_t* compressed_input1_stride, size_t* compressed_input2_stride,
+    size_t* compressed_output_shape, F binary_func) {
+  if (dimension > 0) {
+    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastSubRecursiveDimensions(
+          dimension - 1, params, input1_data, input2_data, output_data,
+          &input1_offset_c, &input2_offset_c, output_offset,
+          compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, binary_func);
+      *input1_offset_p += compressed_input1_stride[dimension];
+      *input2_offset_p += compressed_input2_stride[dimension];
+    }
+  } else {
+    TFLITE_DCHECK(dimension == 0);
+    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
+    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
+    TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
+    const T* input1_data_ptr = input1_data + *input1_offset_p;
+    const T* input2_data_ptr = input2_data + *input2_offset_p;
+    T* output_data_ptr = output_data + *output_offset;
+    if (input1_is_broadcast) {
+      // input1 is broadcast.
+      SubImpl<T>::BroadcastInput1(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input2_offset_p += compressed_output_shape[dimension];
+    } else if (input2_is_broadcast) {
+      // input2 is broadcast.
+      SubImpl<T>::BroadcastInput2(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+    } else {
+      // Add element-wise.
+      SubImpl<T>::ElementWise(params, input1_data_ptr, input2_data_ptr,
+                              output_data_ptr,
+                              compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+      *input2_offset_p += compressed_output_shape[dimension];
+    }
+    *output_offset += compressed_output_shape[dimension];
+  }
 }
 
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const int32_t* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const int32_t* input2_data,
-                             const RuntimeShape& output_shape,
-                             int32_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+// TODO: b/296510380 - we may be able to factor out this to common.h for all
+// binary arithmetic ops (add, sub, mul).
+template <typename T, typename F>
+inline void BroadcastSubCommon(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const T* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const T* input2_data,
+                               const RuntimeShape& output_shape, T* output_data,
+                               F binary_func) {
+  constexpr int kMaxBroadcastDim = 6;
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxBroadcastDim);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
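
The SubImpl<int32_t> specialization above follows a standard SIMD shape: under USE_NEON, a vector loop handles four int32 lanes per iteration, and the scalar loop that follows resumes from the shared index c, so it covers the remainder (or, without NEON, the whole buffer). Below is a minimal standalone sketch of the same pattern, using __ARM_NEON feature detection rather than TFLite's USE_NEON macro; it is an illustration, not the TFLite kernel.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #ifdef __ARM_NEON
    #include <arm_neon.h>
    #endif

    // Clamped element-wise subtract: out[i] = clamp(a[i] - b[i], lo, hi).
    void SubClamped(const int32_t* a, const int32_t* b, int32_t* out,
                    size_t size, int32_t lo, int32_t hi) {
      size_t c = 0;
    #ifdef __ARM_NEON
      const int32x4_t vmin = vdupq_n_s32(lo);
      const int32x4_t vmax = vdupq_n_s32(hi);
      for (; c + 4 <= size; c += 4) {  // vector main loop: 4 lanes per pass
        int32x4_t vres = vsubq_s32(vld1q_s32(&a[c]), vld1q_s32(&b[c]));
        vres = vminq_s32(vmax, vmaxq_s32(vmin, vres));
        vst1q_s32(&out[c], vres);
      }
    #endif
      for (; c < size; ++c) {  // scalar tail (the whole buffer without NEON)
        out[c] = std::min(hi, std::max(lo, a[c] - b[c]));
      }
    }

    int main() {
      int32_t a[5] = {10, 20, 30, 40, 50}, b[5] = {1, 2, 3, 4, 100}, out[5];
      SubClamped(a, b, out, 5, /*lo=*/0, /*hi=*/40);
      for (int32_t v : out) std::printf("%d ", v);  // prints: 9 18 27 36 0
      std::printf("\n");
    }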
@@ -135,33 +215,6 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
-}
-
-template <int N = 5>
-void BroadcastSubSlow(const ArithmeticParams& params,
-                      const RuntimeShape& input1_shape,
-                      const int64_t* input1_data,
-                      const RuntimeShape& input2_shape,
-                      const int64_t* input2_data,
-                      const RuntimeShape& output_shape, int64_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -174,54 +227,48 @@ void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.int64_activation_min, params.int64_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
+  }
+
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      kMaxBroadcastDim - 1, params, input1_data, input2_data, output_data,
+      &input1_offset, &input2_offset, &output_offset, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape, binary_func);
 }
 
-template <typename T, int N = 5>
+// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
 void BroadcastSubSlow(const ArithmeticParams& params,
                       const RuntimeShape& input1_shape, const T* input1_data,
                       const RuntimeShape& input2_shape, const T* input2_data,
                       const RuntimeShape& output_shape, T* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        T activation_min, activation_max;
+        GetActivationParams(params, &activation_min, &activation_max);
+        return ActivationFunctionWithMinMax(input1_val - input2_val,
+                                            activation_min, activation_max);
+      });
 }
 
-template <int N = 5>
 inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& input1_shape,
                                   const int16_t* input1_data,
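
BroadcastSubCommon replaces the old per-element NdArrayDesc indexing: ReduceDimensionsForBroadcast collapses runs of dimensions that broadcast the same way into at most kMaxBroadcastDim compressed dimensions, each carrying one stride per input (a stride of 0 marks an input that is reused along that dimension), and BroadcastSubRecursiveDimensions then only bumps offsets as it recurses. A minimal sketch of that idea, with illustrative names rather than the TFLite helpers:

    #include <cstddef>
    #include <cstdio>

    // Walk "compressed" dimensions; stride 0 means that input is broadcast
    // (reused) along the dimension. Dimension 0 does the actual arithmetic.
    void BroadcastWalk(int dim, const float* in1, const float* in2, float* out,
                       size_t off1, size_t off2, size_t* out_pos,
                       const size_t* stride1, const size_t* stride2,
                       const size_t* out_shape) {
      if (dim > 0) {
        for (size_t c = 0; c < out_shape[dim]; ++c) {
          BroadcastWalk(dim - 1, in1, in2, out, off1, off2, out_pos, stride1,
                        stride2, out_shape);
          off1 += stride1[dim];  // bump each input by its own stride
          off2 += stride2[dim];
        }
      } else {
        for (size_t c = 0; c < out_shape[0]; ++c) {  // innermost: do the math
          out[(*out_pos)++] =
              in1[off1 + c * stride1[0]] - in2[off2 + c * stride2[0]];
        }
      }
    }

    int main() {
      // [2,1] minus [2,3]: the single column of in1 is broadcast across columns.
      const float in1[] = {10.f, 20.f};
      const float in2[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      float out[6];
      const size_t out_shape[] = {3, 2};  // dim 0 is innermost (3 columns)
      const size_t stride1[] = {0, 1};    // stride 0: broadcast across columns
      const size_t stride2[] = {1, 3};    // contiguous columns, 3 per row
      size_t out_pos = 0;
      BroadcastWalk(1, in1, in2, out, 0, 0, &out_pos, stride1, stride2,
                    out_shape);
      for (float v : out) std::printf("%.0f ", v);  // prints: 9 8 7 16 15 14
      std::printf("\n");
    }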
@@ -230,42 +277,24 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& output_shape,
                                   int16_t* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t scaled_input1_val =
-        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
-    const int32_t scaled_input2_val =
-        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
-    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<int16_t>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  BroadcastSubCommon<int16_t>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](int16_t input1_val, int16_t input2_val,
+         const ArithmeticParams& params) {
+        const int32_t scaled_input1_val =
+            gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
+        const int32_t scaled_input2_val =
+            gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
+        const int32_t raw_output = scaled_input1_val - scaled_input2_val;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<int16_t>(clamped_output);
+      });
 }
 
-template <typename T, int N = 5>
+template <typename T>
 void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const T* input1_data,
@@ -273,52 +302,32 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const T* input2_data,
                            const RuntimeShape& output_shape, T* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val =
-        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val =
-        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32_t scaled_input1_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32_t scaled_input2_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32_t raw_output =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sub, params.output_multiplier, params.output_shift) +
-        params.output_offset;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<T>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        const int32_t shifted_input1_val =
+            (params.input1_offset + input1_val) * (1 << params.left_shift);
+        const int32_t shifted_input2_val =
+            (params.input2_offset + input2_val) * (1 << params.left_shift);
+        const int32_t scaled_input1_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input1_val, params.input1_multiplier,
+                params.input1_shift);
+        const int32_t scaled_input2_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input2_val, params.input2_multiplier,
+                params.input2_shift);
+        const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+        const int32_t raw_output =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                raw_sub, params.output_multiplier, params.output_shift) +
+            params.output_offset;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<T>(clamped_output);
+      });
 }
 
 // Element-wise add that can often be used for inner loop of broadcast add as
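
The BroadcastQuantSubSlow lambda re-centers each input by its zero point, shifts both left for headroom, rescales them to a common resolution with fixed-point multipliers, subtracts, rescales to the output quantization, and finally adds the output zero point and clamps. Under the affine scheme real = scale * (q - zero_point), the following floating-point reference sketches what the integer pipeline computes; it is a checking aid, not the integer implementation.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Float reference for quantized subtraction: dequantize both operands,
    // subtract in real space, requantize to the output scale/zero point.
    int8_t QuantSubReference(int8_t q1, float scale1, int32_t zp1, int8_t q2,
                             float scale2, int32_t zp2, float out_scale,
                             int32_t out_zp) {
      const float real = scale1 * (q1 - zp1) - scale2 * (q2 - zp2);
      const int32_t q =
          static_cast<int32_t>(std::round(real / out_scale)) + out_zp;
      return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }

    int main() {
      // 0.5 * (20 - 0) - 0.25 * (12 - 4) = 10 - 2 = 8;
      // requantized at out_scale 0.1, out_zp 0 -> 80.
      std::printf("%d\n", QuantSubReference(20, 0.5f, 0, 12, 0.25f, 4, 0.1f, 0));
    }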
@@ -405,35 +414,12 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
          const T* input1_data, const RuntimeShape& input2_shape,
          const T* input2_data, const RuntimeShape& output_shape,
          T* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
-    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
-      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
-        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          output_data[Offset(extended_output_shape, b, y, x, c)] =
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-        }
-      }
-    }
-  }
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        return input1_val - input2_val;
+      });
 }
 
 inline void SetActivationMinMax(const ArithmeticParams& params,
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h

@@ -15,6 +15,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
 
+#include <cstring>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
 namespace tflite {
 
 template <int N>
@@ -34,9 +38,12 @@ class RuntimeShape {
 
   RuntimeShape() : size_(0) {}
 
-  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {}
+  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
+    TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
+  }
 
   RuntimeShape(int shape_size, int32_t value) : size_(shape_size) {
+    TFLITE_DCHECK_LE(shape_size, kMaxSmallSize);
     for (int i = 0; i < shape_size; ++i) {
       SetDim(i, value);
     }
@@ -44,6 +51,7 @@ class RuntimeShape {
 
   RuntimeShape(int dimensions_count, const int32_t* dims_data)
       : size_(dimensions_count) {
+    // check of dimensions_count handled by ReplaceWith()
     ReplaceWith(dimensions_count, dims_data);
   }
 
@@ -69,6 +77,7 @@ class RuntimeShape {
 
   static RuntimeShape ExtendedShape(int new_shape_size,
                                     const RuntimeShape& shape) {
+    TFLITE_DCHECK_LE(new_shape_size, kMaxSmallSize);
     return RuntimeShape(new_shape_size, shape, 1);
   }
   int32_t* DimsData() { return dims_; }
@@ -76,6 +85,7 @@ class RuntimeShape {
   const int32_t* DimsDataUpTo5D() const { return dims_; }
 
   void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
+    TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
     size_ = dimensions_count;
     int32_t* dst_dims = DimsData();
     std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
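
Each new TFLITE_DCHECK_LE turns what was previously a silent out-of-bounds write into a debug-build assertion whenever a shape is constructed or resized with more than kMaxSmallSize dimensions. A sketch of the guard idiom in isolation, with assert standing in for TFLITE_DCHECK_LE and an assumed bound of 6 (not the TFLite class):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Illustrative mini-shape with the same guard idiom.
    class SmallShape {
     public:
      static constexpr int kMaxSmallSize = 6;  // assumed bound for illustration
      explicit SmallShape(int dimensions_count) : size_(dimensions_count) {
        assert(dimensions_count <= kMaxSmallSize);  // fail fast in debug builds
      }
      void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
        assert(dimensions_count <= kMaxSmallSize);  // guard before the memcpy
        size_ = dimensions_count;
        std::memcpy(dims_, dims_data, dimensions_count * sizeof(int32_t));
      }
      int32_t DimensionsCount() const { return size_; }

     private:
      int32_t size_;
      int32_t dims_[kMaxSmallSize];
    };

    int main() {
      const int32_t dims[4] = {1, 8, 8, 3};
      SmallShape shape(4);
      shape.ReplaceWith(4, dims);                    // within bounds: fine
      std::printf("%d\n", shape.DimensionsCount());  // prints: 4
      // SmallShape bad(7);  // would fire the assert in a debug build
    }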
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h

@@ -103,6 +103,7 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
   const auto shrink_axis_mask = params.shrink_axis_mask;
   const bool shrink_axis = shrink_axis_mask & (1 << axis);
   const int axis_size = input_shape.Dims(axis);
+  const bool offset = params.offset;
   if (shrink_axis) {
     if (start >= axis_size) {
       return start;
@@ -112,6 +113,9 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
   }
   const auto* indices = params.stop_indices;
   int end = indices[axis];
+  if (offset) {
+    end += start;
+  }
   const int32_t stride = params.strides[axis];
   const int32_t end_mask = (params.end_mask & 1 << axis);
   if (end < 0) {
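
The new offset flag changes how stop_indices[axis] is read: when set, it is treated as a length counted from start rather than an absolute end index, which is why the code adds start to end. A tiny standalone illustration of the two interpretations (not the TFLite helper):

    #include <cstdio>

    // End index for one axis: when `offset` is set, `stop` is a length
    // counted from `start` instead of an absolute index.
    int EndForAxis(int start, int stop, bool offset, int axis_size) {
      int end = stop;
      if (offset) end += start;  // the new behavior under params.offset
      if (end > axis_size) end = axis_size;
      return end;
    }

    int main() {
      // Absolute: slice [2, 5). Offset mode: start 2, length 5 -> slice [2, 7).
      std::printf("%d %d\n", EndForAxis(2, 5, false, 10),
                  EndForAxis(2, 5, true, 10));  // prints: 5 7
    }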
@@ -246,7 +250,7 @@ inline tflite::StridedSliceParams BuildStridedSliceParams(
     int begin_mask, int end_mask, int shrink_axis_mask,
     const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
     const std::vector<int>& strides) {
-  tflite::StridedSliceParams op_params;
+  tflite::StridedSliceParams op_params{};
   const int dims_count = start_indices.size();
 
   op_params.start_indices_count = dims_count;
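
The last hunk switches op_params from default-initialization to value-initialization. For an aggregate of scalar members, the braces zero every field, including ones the function never assigns (such as the newly read offset flag), whereas the plain declaration leaves them indeterminate. A minimal illustration of the distinction:

    #include <cstdio>

    struct Params {
      int start_indices_count;
      int offset;
    };

    int main() {
      Params b{};  // value-initialization: every member zeroed
      std::printf("%d %d\n", b.start_indices_count, b.offset);  // prints: 0 0
      // Params a;  // default-initialization: members indeterminate; reading
      //            // them before assignment would be undefined behavior
    }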