xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
- xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
- xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
- xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
- xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
- xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
- xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
- xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
- xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
- xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
- xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
- xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
- xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
- xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
- {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@ limitations under the License.
|
|
18
18
|
#include <stdint.h>
|
19
19
|
|
20
20
|
#include <algorithm>
|
21
|
+
#include <cstddef>
|
21
22
|
#include <limits>
|
22
23
|
|
23
24
|
#include "ruy/profiler/instrumentation.h" // from @ruy
|
@@ -29,100 +30,179 @@ namespace tflite {
|
|
29
30
|
|
30
31
|
namespace reference_ops {
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
for (int i = 0; i < flat_size; ++i) {
|
42
|
-
output_data[i] = ActivationFunctionWithMinMax(
|
43
|
-
input1_data[i] - input2_data[i], params.float_activation_min,
|
44
|
-
params.float_activation_max);
|
33
|
+
template <class T>
|
34
|
+
struct SubImpl {
|
35
|
+
template <class F>
|
36
|
+
static void BroadcastInput1(const ArithmeticParams& params,
|
37
|
+
const T* input1_data, const T* input2_data,
|
38
|
+
T* output_data, size_t size, F binary_func) {
|
39
|
+
for (size_t c = 0; c < size; ++c) {
|
40
|
+
output_data[c] = binary_func(input1_data[0], input2_data[c], params);
|
41
|
+
}
|
45
42
|
}
|
46
|
-
}
|
47
43
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
const int flat_size =
|
56
|
-
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
57
|
-
for (int i = 0; i < flat_size; ++i) {
|
58
|
-
output_data[i] = ActivationFunctionWithMinMax(
|
59
|
-
input1_data[i] - input2_data[i], params.quantized_activation_min,
|
60
|
-
params.quantized_activation_max);
|
44
|
+
template <class F>
|
45
|
+
static void BroadcastInput2(const ArithmeticParams& params,
|
46
|
+
const T* input1_data, const T* input2_data,
|
47
|
+
T* output_data, size_t size, F binary_func) {
|
48
|
+
for (size_t c = 0; c < size; ++c) {
|
49
|
+
output_data[c] = binary_func(input1_data[c], input2_data[0], params);
|
50
|
+
}
|
61
51
|
}
|
62
|
-
}
|
63
52
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
53
|
+
template <class F>
|
54
|
+
static void ElementWise(const ArithmeticParams& params, const T* input1_data,
|
55
|
+
const T* input2_data, T* output_data, size_t size,
|
56
|
+
F binary_func) {
|
57
|
+
for (size_t c = 0; c < size; ++c) {
|
58
|
+
output_data[c] = binary_func(input1_data[c], input2_data[c], params);
|
59
|
+
}
|
60
|
+
}
|
61
|
+
};
|
62
|
+
|
63
|
+
template <>
|
64
|
+
struct SubImpl<int32_t> {
|
65
|
+
template <class F>
|
66
|
+
static void BroadcastInput1(const ArithmeticParams& params,
|
67
|
+
const int32_t* input1_data,
|
68
|
+
const int32_t* input2_data, int32_t* output_data,
|
69
|
+
size_t size, F binary_func) {
|
70
|
+
size_t c = 0;
|
71
|
+
int32_t activation_min, activation_max;
|
72
|
+
GetActivationParams(params, &activation_min, &activation_max);
|
73
|
+
#ifdef USE_NEON
|
74
|
+
const int32x4_t vmax = vdupq_n_s32(activation_max);
|
75
|
+
const int32x4_t vmin = vdupq_n_s32(activation_min);
|
76
|
+
const int32x4_t va = vdupq_n_s32(input1_data[0]);
|
77
|
+
for (; c + 4 <= size; c += 4) {
|
78
|
+
const int32x4_t vb = vld1q_s32(&input2_data[c]);
|
79
|
+
int32x4_t vres = vsubq_s32(va, vb);
|
80
|
+
vres = vmaxq_s32(vmin, vres);
|
81
|
+
vres = vminq_s32(vmax, vres);
|
82
|
+
vst1q_s32(&output_data[c], vres);
|
83
|
+
}
|
84
|
+
#endif
|
85
|
+
for (; c < size; ++c) {
|
86
|
+
output_data[c] = binary_func(input1_data[0], input2_data[c], params);
|
87
|
+
}
|
88
|
+
}
|
86
89
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
90
|
+
template <class F>
|
91
|
+
static void BroadcastInput2(const ArithmeticParams& params,
|
92
|
+
const int32_t* input1_data,
|
93
|
+
const int32_t* input2_data, int32_t* output_data,
|
94
|
+
size_t size, F binary_func) {
|
95
|
+
size_t c = 0;
|
96
|
+
int32_t activation_min, activation_max;
|
97
|
+
GetActivationParams(params, &activation_min, &activation_max);
|
98
|
+
#ifdef USE_NEON
|
99
|
+
const int32x4_t vmax = vdupq_n_s32(activation_max);
|
100
|
+
const int32x4_t vmin = vdupq_n_s32(activation_min);
|
101
|
+
const int32x4_t vb = vdupq_n_s32(input2_data[0]);
|
102
|
+
for (; c + 4 <= size; c += 4) {
|
103
|
+
const int32x4_t va = vld1q_s32(&input1_data[c]);
|
104
|
+
int32x4_t vres = vsubq_s32(va, vb);
|
105
|
+
vres = vmaxq_s32(vmin, vres);
|
106
|
+
vres = vminq_s32(vmax, vres);
|
107
|
+
vst1q_s32(&output_data[c], vres);
|
108
|
+
}
|
109
|
+
#endif
|
110
|
+
for (; c < size; ++c) {
|
111
|
+
output_data[c] = binary_func(input1_data[c], input2_data[0], params);
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
template <class F>
|
116
|
+
static void ElementWise(const ArithmeticParams& params,
|
117
|
+
const int32_t* input1_data,
|
118
|
+
const int32_t* input2_data, int32_t* output_data,
|
119
|
+
size_t size, F binary_func) {
|
120
|
+
size_t c = 0;
|
121
|
+
int32_t activation_min, activation_max;
|
122
|
+
GetActivationParams(params, &activation_min, &activation_max);
|
123
|
+
#ifdef USE_NEON
|
124
|
+
int32x4_t vmax = vdupq_n_s32(activation_max);
|
125
|
+
int32x4_t vmin = vdupq_n_s32(activation_min);
|
126
|
+
for (; c + 4 <= size; c += 4) {
|
127
|
+
const int32x4_t va = vld1q_s32(&input1_data[c]);
|
128
|
+
const int32x4_t vb = vld1q_s32(&input2_data[c]);
|
129
|
+
int32x4_t vres = vsubq_s32(va, vb);
|
130
|
+
vres = vmaxq_s32(vmin, vres);
|
131
|
+
vres = vminq_s32(vmax, vres);
|
132
|
+
vst1q_s32(&output_data[c], vres);
|
133
|
+
}
|
134
|
+
#endif
|
135
|
+
for (; c < size; ++c) {
|
136
|
+
output_data[c] = binary_func(input1_data[c], input2_data[c], params);
|
137
|
+
}
|
138
|
+
}
|
139
|
+
};
|
140
|
+
|
141
|
+
template <typename T, typename F>
|
142
|
+
inline void BroadcastSubRecursiveDimensions(
|
143
|
+
int dimension, const ArithmeticParams& params, const T* input1_data,
|
144
|
+
const T* input2_data, T* output_data, size_t* input1_offset_p,
|
145
|
+
size_t* input2_offset_p, size_t* output_offset,
|
146
|
+
size_t* compressed_input1_stride, size_t* compressed_input2_stride,
|
147
|
+
size_t* compressed_output_shape, F binary_func) {
|
148
|
+
if (dimension > 0) {
|
149
|
+
for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
|
150
|
+
size_t input1_offset_c = *input1_offset_p;
|
151
|
+
size_t input2_offset_c = *input2_offset_p;
|
152
|
+
BroadcastSubRecursiveDimensions(
|
153
|
+
dimension - 1, params, input1_data, input2_data, output_data,
|
154
|
+
&input1_offset_c, &input2_offset_c, output_offset,
|
155
|
+
compressed_input1_stride, compressed_input2_stride,
|
156
|
+
compressed_output_shape, binary_func);
|
157
|
+
*input1_offset_p += compressed_input1_stride[dimension];
|
158
|
+
*input2_offset_p += compressed_input2_stride[dimension];
|
159
|
+
}
|
160
|
+
} else {
|
161
|
+
TFLITE_DCHECK(dimension == 0);
|
162
|
+
bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
|
163
|
+
bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
|
164
|
+
TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
|
165
|
+
const T* input1_data_ptr = input1_data + *input1_offset_p;
|
166
|
+
const T* input2_data_ptr = input2_data + *input2_offset_p;
|
167
|
+
T* output_data_ptr = output_data + *output_offset;
|
168
|
+
if (input1_is_broadcast) {
|
169
|
+
// input1 is broadcast.
|
170
|
+
SubImpl<T>::BroadcastInput1(
|
171
|
+
params, input1_data_ptr, input2_data_ptr, output_data_ptr,
|
172
|
+
compressed_output_shape[dimension], binary_func);
|
173
|
+
*input2_offset_p += compressed_output_shape[dimension];
|
174
|
+
} else if (input2_is_broadcast) {
|
175
|
+
// input2 is broadcast.
|
176
|
+
SubImpl<T>::BroadcastInput2(
|
177
|
+
params, input1_data_ptr, input2_data_ptr, output_data_ptr,
|
178
|
+
compressed_output_shape[dimension], binary_func);
|
179
|
+
*input1_offset_p += compressed_output_shape[dimension];
|
180
|
+
} else {
|
181
|
+
// Add element-wise.
|
182
|
+
SubImpl<T>::ElementWise(params, input1_data_ptr, input2_data_ptr,
|
183
|
+
output_data_ptr,
|
184
|
+
compressed_output_shape[dimension], binary_func);
|
185
|
+
*input1_offset_p += compressed_output_shape[dimension];
|
186
|
+
*input2_offset_p += compressed_output_shape[dimension];
|
187
|
+
}
|
188
|
+
*output_offset += compressed_output_shape[dimension];
|
189
|
+
}
|
106
190
|
}
|
107
191
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
TFLITE_DCHECK_LE(
|
120
|
-
|
121
|
-
|
122
|
-
NdArrayDesc<N> output_desc;
|
123
|
-
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
124
|
-
&desc2);
|
125
|
-
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
192
|
+
// TODO: b/296510380 - we may be able to factor out this to common.h for all
|
193
|
+
// binary arithmetic ops (add, sub, mul).
|
194
|
+
template <typename T, typename F>
|
195
|
+
inline void BroadcastSubCommon(const ArithmeticParams& params,
|
196
|
+
const RuntimeShape& input1_shape,
|
197
|
+
const T* input1_data,
|
198
|
+
const RuntimeShape& input2_shape,
|
199
|
+
const T* input2_data,
|
200
|
+
const RuntimeShape& output_shape, T* output_data,
|
201
|
+
F binary_func) {
|
202
|
+
constexpr int kMaxBroadcastDim = 6;
|
203
|
+
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxBroadcastDim);
|
204
|
+
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxBroadcastDim);
|
205
|
+
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxBroadcastDim);
|
126
206
|
|
127
207
|
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
128
208
|
// col, channel), with extents (batches, height, width, depth), with the
|
@@ -135,33 +215,6 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
|
135
215
|
// We name our variables by their Tensorflow convention, but generate C code
|
136
216
|
// nesting loops such that the innermost loop has the smallest stride for the
|
137
217
|
// best cache behavior.
|
138
|
-
auto sub_func = [&](int indexes[N]) {
|
139
|
-
output_data[SubscriptToIndex(output_desc, indexes)] =
|
140
|
-
ActivationFunctionWithMinMax(
|
141
|
-
input1_data[SubscriptToIndex(desc1, indexes)] -
|
142
|
-
input2_data[SubscriptToIndex(desc2, indexes)],
|
143
|
-
params.quantized_activation_min, params.quantized_activation_max);
|
144
|
-
};
|
145
|
-
NDOpsHelper<N>(output_desc, sub_func);
|
146
|
-
}
|
147
|
-
|
148
|
-
template <int N = 5>
|
149
|
-
void BroadcastSubSlow(const ArithmeticParams& params,
|
150
|
-
const RuntimeShape& input1_shape,
|
151
|
-
const int64_t* input1_data,
|
152
|
-
const RuntimeShape& input2_shape,
|
153
|
-
const int64_t* input2_data,
|
154
|
-
const RuntimeShape& output_shape, int64_t* output_data) {
|
155
|
-
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
|
156
|
-
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
157
|
-
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
158
|
-
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
159
|
-
NdArrayDesc<N> desc1;
|
160
|
-
NdArrayDesc<N> desc2;
|
161
|
-
NdArrayDesc<N> output_desc;
|
162
|
-
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
163
|
-
&desc2);
|
164
|
-
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
165
218
|
|
166
219
|
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
167
220
|
// col, channel), with extents (batches, height, width, depth), with the
|
@@ -174,54 +227,48 @@ void BroadcastSubSlow(const ArithmeticParams& params,
|
|
174
227
|
// We name our variables by their Tensorflow convention, but generate C code
|
175
228
|
// nesting loops such that the innermost loop has the smallest stride for the
|
176
229
|
// best cache behavior.
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
230
|
+
|
231
|
+
size_t compressed_input1_stride[kMaxBroadcastDim];
|
232
|
+
size_t compressed_input2_stride[kMaxBroadcastDim];
|
233
|
+
size_t compressed_output_shape[kMaxBroadcastDim];
|
234
|
+
bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
|
235
|
+
input1_shape, input2_shape, compressed_input1_stride,
|
236
|
+
compressed_input2_stride, compressed_output_shape);
|
237
|
+
// Skip broadcasting for degenerate shapes.
|
238
|
+
if (!broadcastable_shape) {
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
|
242
|
+
size_t input1_offset = 0;
|
243
|
+
size_t input2_offset = 0;
|
244
|
+
size_t output_offset = 0;
|
245
|
+
BroadcastSubRecursiveDimensions(
|
246
|
+
kMaxBroadcastDim - 1, params, input1_data, input2_data, output_data,
|
247
|
+
&input1_offset, &input2_offset, &output_offset, compressed_input1_stride,
|
248
|
+
compressed_input2_stride, compressed_output_shape, binary_func);
|
185
249
|
}
|
186
250
|
|
187
|
-
|
251
|
+
// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
|
252
|
+
// dimensionality if the runtime code does a single loop over one dimension
|
253
|
+
// that handles broadcasting as the base case. The code generator would then
|
254
|
+
// generate max(D1, D2) nested for loops.
|
255
|
+
template <typename T>
|
188
256
|
void BroadcastSubSlow(const ArithmeticParams& params,
|
189
257
|
const RuntimeShape& input1_shape, const T* input1_data,
|
190
258
|
const RuntimeShape& input2_shape, const T* input2_data,
|
191
259
|
const RuntimeShape& output_shape, T* output_data) {
|
192
|
-
ruy::profiler::ScopeLabel label("BroadcastSubSlow/
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
204
|
-
// col, channel), with extents (batches, height, width, depth), with the
|
205
|
-
// trailing dimension changing most rapidly (channels has the smallest stride,
|
206
|
-
// typically 1 element).
|
207
|
-
//
|
208
|
-
// In generated C code, we store arrays with the dimensions reversed. The
|
209
|
-
// first dimension has smallest stride.
|
210
|
-
//
|
211
|
-
// We name our variables by their Tensorflow convention, but generate C code
|
212
|
-
// nesting loops such that the innermost loop has the smallest stride for the
|
213
|
-
// best cache behavior.
|
214
|
-
auto sub_func = [&](int indexes[N]) {
|
215
|
-
output_data[SubscriptToIndex(output_desc, indexes)] =
|
216
|
-
ActivationFunctionWithMinMax(
|
217
|
-
input1_data[SubscriptToIndex(desc1, indexes)] -
|
218
|
-
input2_data[SubscriptToIndex(desc2, indexes)],
|
219
|
-
params.quantized_activation_min, params.quantized_activation_max);
|
220
|
-
};
|
221
|
-
NDOpsHelper<N>(output_desc, sub_func);
|
260
|
+
ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
|
261
|
+
BroadcastSubCommon<T>(
|
262
|
+
params, input1_shape, input1_data, input2_shape, input2_data,
|
263
|
+
output_shape, output_data,
|
264
|
+
[](T input1_val, T input2_val, const ArithmeticParams& params) {
|
265
|
+
T activation_min, activation_max;
|
266
|
+
GetActivationParams(params, &activation_min, &activation_max);
|
267
|
+
return ActivationFunctionWithMinMax(input1_val - input2_val,
|
268
|
+
activation_min, activation_max);
|
269
|
+
});
|
222
270
|
}
|
223
271
|
|
224
|
-
template <int N = 5>
|
225
272
|
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
|
226
273
|
const RuntimeShape& input1_shape,
|
227
274
|
const int16_t* input1_data,
|
@@ -230,42 +277,24 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
|
|
230
277
|
const RuntimeShape& output_shape,
|
231
278
|
int16_t* output_data) {
|
232
279
|
ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
// We name our variables by their Tensorflow convention, but generate C code
|
249
|
-
// nesting loops such that the innermost loop has the smallest stride for the
|
250
|
-
// best cache behavior.
|
251
|
-
auto sub_func = [&](int indexes[N]) {
|
252
|
-
const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
|
253
|
-
const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
|
254
|
-
const int32_t scaled_input1_val =
|
255
|
-
gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
|
256
|
-
const int32_t scaled_input2_val =
|
257
|
-
gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
|
258
|
-
const int32_t raw_output = scaled_input1_val - scaled_input2_val;
|
259
|
-
const int32_t clamped_output =
|
260
|
-
std::min(params.quantized_activation_max,
|
261
|
-
std::max(params.quantized_activation_min, raw_output));
|
262
|
-
output_data[SubscriptToIndex(output_desc, indexes)] =
|
263
|
-
static_cast<int16_t>(clamped_output);
|
264
|
-
};
|
265
|
-
NDOpsHelper<N>(output_desc, sub_func);
|
280
|
+
BroadcastSubCommon<int16_t>(
|
281
|
+
params, input1_shape, input1_data, input2_shape, input2_data,
|
282
|
+
output_shape, output_data,
|
283
|
+
[](int16_t input1_val, int16_t input2_val,
|
284
|
+
const ArithmeticParams& params) {
|
285
|
+
const int32_t scaled_input1_val =
|
286
|
+
gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
|
287
|
+
const int32_t scaled_input2_val =
|
288
|
+
gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
|
289
|
+
const int32_t raw_output = scaled_input1_val - scaled_input2_val;
|
290
|
+
const int32_t clamped_output =
|
291
|
+
std::min(params.quantized_activation_max,
|
292
|
+
std::max(params.quantized_activation_min, raw_output));
|
293
|
+
return static_cast<int16_t>(clamped_output);
|
294
|
+
});
|
266
295
|
}
|
267
296
|
|
268
|
-
template <typename T
|
297
|
+
template <typename T>
|
269
298
|
void BroadcastQuantSubSlow(const ArithmeticParams& params,
|
270
299
|
const RuntimeShape& input1_shape,
|
271
300
|
const T* input1_data,
|
@@ -273,52 +302,32 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
|
|
273
302
|
const T* input2_data,
|
274
303
|
const RuntimeShape& output_shape, T* output_data) {
|
275
304
|
ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
303
|
-
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
304
|
-
const int32_t scaled_input1_val =
|
305
|
-
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
306
|
-
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
307
|
-
const int32_t scaled_input2_val =
|
308
|
-
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
309
|
-
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
310
|
-
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
311
|
-
const int32_t raw_output =
|
312
|
-
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
313
|
-
raw_sub, params.output_multiplier, params.output_shift) +
|
314
|
-
params.output_offset;
|
315
|
-
const int32_t clamped_output =
|
316
|
-
std::min(params.quantized_activation_max,
|
317
|
-
std::max(params.quantized_activation_min, raw_output));
|
318
|
-
output_data[SubscriptToIndex(output_desc, indexes)] =
|
319
|
-
static_cast<T>(clamped_output);
|
320
|
-
};
|
321
|
-
NDOpsHelper<N>(output_desc, sub_func);
|
305
|
+
BroadcastSubCommon<T>(
|
306
|
+
params, input1_shape, input1_data, input2_shape, input2_data,
|
307
|
+
output_shape, output_data,
|
308
|
+
[](T input1_val, T input2_val, const ArithmeticParams& params) {
|
309
|
+
const int32_t shifted_input1_val =
|
310
|
+
(params.input1_offset + input1_val) * (1 << params.left_shift);
|
311
|
+
const int32_t shifted_input2_val =
|
312
|
+
(params.input2_offset + input2_val) * (1 << params.left_shift);
|
313
|
+
const int32_t scaled_input1_val =
|
314
|
+
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
315
|
+
shifted_input1_val, params.input1_multiplier,
|
316
|
+
params.input1_shift);
|
317
|
+
const int32_t scaled_input2_val =
|
318
|
+
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
319
|
+
shifted_input2_val, params.input2_multiplier,
|
320
|
+
params.input2_shift);
|
321
|
+
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
322
|
+
const int32_t raw_output =
|
323
|
+
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
324
|
+
raw_sub, params.output_multiplier, params.output_shift) +
|
325
|
+
params.output_offset;
|
326
|
+
const int32_t clamped_output =
|
327
|
+
std::min(params.quantized_activation_max,
|
328
|
+
std::max(params.quantized_activation_min, raw_output));
|
329
|
+
return static_cast<T>(clamped_output);
|
330
|
+
});
|
322
331
|
}
|
323
332
|
|
324
333
|
// Element-wise add that can often be used for inner loop of broadcast add as
|
@@ -405,35 +414,12 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
|
405
414
|
const T* input1_data, const RuntimeShape& input2_shape,
|
406
415
|
const T* input2_data, const RuntimeShape& output_shape,
|
407
416
|
T* output_data) {
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
416
|
-
// col, channel), with extents (batches, height, width, depth), with the
|
417
|
-
// trailing dimension changing most rapidly (channels has the smallest stride,
|
418
|
-
// typically 1 element).
|
419
|
-
//
|
420
|
-
// In generated C code, we store arrays with the dimensions reversed. The
|
421
|
-
// first dimension has smallest stride.
|
422
|
-
//
|
423
|
-
// We name our variables by their Tensorflow convention, but generate C code
|
424
|
-
// nesting loops such that the innermost loop has the smallest stride for the
|
425
|
-
// best cache behavior.
|
426
|
-
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
427
|
-
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
428
|
-
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
429
|
-
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
430
|
-
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
431
|
-
input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
|
432
|
-
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
433
|
-
}
|
434
|
-
}
|
435
|
-
}
|
436
|
-
}
|
417
|
+
BroadcastSubCommon<T>(
|
418
|
+
params, input1_shape, input1_data, input2_shape, input2_data,
|
419
|
+
output_shape, output_data,
|
420
|
+
[](T input1_val, T input2_val, const ArithmeticParams& params) {
|
421
|
+
return input1_val - input2_val;
|
422
|
+
});
|
437
423
|
}
|
438
424
|
|
439
425
|
inline void SetActivationMinMax(const ArithmeticParams& params,
|
@@ -15,6 +15,10 @@ limitations under the License.
|
|
15
15
|
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
|
16
16
|
#define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
|
17
17
|
|
18
|
+
#include <cstring>
|
19
|
+
|
20
|
+
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
21
|
+
|
18
22
|
namespace tflite {
|
19
23
|
|
20
24
|
template <int N>
|
@@ -34,9 +38,12 @@ class RuntimeShape {
|
|
34
38
|
|
35
39
|
RuntimeShape() : size_(0) {}
|
36
40
|
|
37
|
-
explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
|
41
|
+
explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
|
42
|
+
TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
|
43
|
+
}
|
38
44
|
|
39
45
|
RuntimeShape(int shape_size, int32_t value) : size_(shape_size) {
|
46
|
+
TFLITE_DCHECK_LE(shape_size, kMaxSmallSize);
|
40
47
|
for (int i = 0; i < shape_size; ++i) {
|
41
48
|
SetDim(i, value);
|
42
49
|
}
|
@@ -44,6 +51,7 @@ class RuntimeShape {
|
|
44
51
|
|
45
52
|
RuntimeShape(int dimensions_count, const int32_t* dims_data)
|
46
53
|
: size_(dimensions_count) {
|
54
|
+
// check of dimensions_count handled by ReplaceWith()
|
47
55
|
ReplaceWith(dimensions_count, dims_data);
|
48
56
|
}
|
49
57
|
|
@@ -69,6 +77,7 @@ class RuntimeShape {
|
|
69
77
|
|
70
78
|
static RuntimeShape ExtendedShape(int new_shape_size,
|
71
79
|
const RuntimeShape& shape) {
|
80
|
+
TFLITE_DCHECK_LE(new_shape_size, kMaxSmallSize);
|
72
81
|
return RuntimeShape(new_shape_size, shape, 1);
|
73
82
|
}
|
74
83
|
int32_t* DimsData() { return dims_; }
|
@@ -76,6 +85,7 @@ class RuntimeShape {
|
|
76
85
|
const int32_t* DimsDataUpTo5D() const { return dims_; }
|
77
86
|
|
78
87
|
void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
|
88
|
+
TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
|
79
89
|
size_ = dimensions_count;
|
80
90
|
int32_t* dst_dims = DimsData();
|
81
91
|
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
|
@@ -103,6 +103,7 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
|
|
103
103
|
const auto shrink_axis_mask = params.shrink_axis_mask;
|
104
104
|
const bool shrink_axis = shrink_axis_mask & (1 << axis);
|
105
105
|
const int axis_size = input_shape.Dims(axis);
|
106
|
+
const bool offset = params.offset;
|
106
107
|
if (shrink_axis) {
|
107
108
|
if (start >= axis_size) {
|
108
109
|
return start;
|
@@ -112,6 +113,9 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
|
|
112
113
|
}
|
113
114
|
const auto* indices = params.stop_indices;
|
114
115
|
int end = indices[axis];
|
116
|
+
if (offset) {
|
117
|
+
end += start;
|
118
|
+
}
|
115
119
|
const int32_t stride = params.strides[axis];
|
116
120
|
const int32_t end_mask = (params.end_mask & 1 << axis);
|
117
121
|
if (end < 0) {
|
@@ -246,7 +250,7 @@ inline tflite::StridedSliceParams BuildStridedSliceParams(
|
|
246
250
|
int begin_mask, int end_mask, int shrink_axis_mask,
|
247
251
|
const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
|
248
252
|
const std::vector<int>& strides) {
|
249
|
-
tflite::StridedSliceParams op_params;
|
253
|
+
tflite::StridedSliceParams op_params{};
|
250
254
|
const int dims_count = start_indices.size();
|
251
255
|
|
252
256
|
op_params.start_indices_count = dims_count;
|