xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
- xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
- xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
- xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
- xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
- xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
- xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
- xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
- xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
- xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
- xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
- xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
- xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
- xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
- {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,9 @@ namespace tflite {
|
|
24
24
|
|
25
25
|
namespace reference_ops {
|
26
26
|
|
27
|
+
// Maximum dimension supported by the broadcast mul operation.
|
28
|
+
constexpr int kMaxMulBroadcastDim = 6;
|
29
|
+
|
27
30
|
// Element-wise mul that can often be used for inner loop of broadcast Mul as
|
28
31
|
// well as the non-broadcast Mul.
|
29
32
|
inline void MulElementwise(int size, const ArithmeticParams& params,
|
@@ -88,128 +91,174 @@ inline void Mul(const ArithmeticParams& params,
|
|
88
91
|
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
|
89
92
|
}
|
90
93
|
|
91
|
-
|
94
|
+
template <typename T, typename F>
|
95
|
+
void BroadcastMulRecursiveDimensions(
|
96
|
+
const ArithmeticParams& params, int dimension, const T* input1_data,
|
97
|
+
const T* input2_data, T* output_data, size_t* input1_offset_p,
|
98
|
+
size_t* input2_offset_p, size_t* output_offset,
|
99
|
+
const NdArrayDesc<kMaxMulBroadcastDim>& desc1,
|
100
|
+
const NdArrayDesc<kMaxMulBroadcastDim>& desc2,
|
101
|
+
const int32_t extended_output_shape_dims[kMaxMulBroadcastDim],
|
102
|
+
F binary_func) {
|
103
|
+
if (dimension == kMaxMulBroadcastDim - 1) {
|
104
|
+
for (int c = 0; c < extended_output_shape_dims[dimension]; ++c) {
|
105
|
+
const T input1_val = input1_data[*input1_offset_p];
|
106
|
+
const T input2_val = input2_data[*input2_offset_p];
|
107
|
+
output_data[*output_offset] = binary_func(params, input1_val, input2_val);
|
108
|
+
*input1_offset_p += desc1.strides[dimension];
|
109
|
+
*input2_offset_p += desc2.strides[dimension];
|
110
|
+
++(*output_offset);
|
111
|
+
}
|
112
|
+
} else {
|
113
|
+
for (int a = 0; a < extended_output_shape_dims[dimension]; ++a) {
|
114
|
+
size_t input1_offset_c = *input1_offset_p;
|
115
|
+
size_t input2_offset_c = *input2_offset_p;
|
116
|
+
BroadcastMulRecursiveDimensions(
|
117
|
+
params, dimension + 1, input1_data, input2_data, output_data,
|
118
|
+
&input1_offset_c, &input2_offset_c, output_offset, desc1, desc2,
|
119
|
+
extended_output_shape_dims, binary_func);
|
120
|
+
*input1_offset_p += desc1.strides[dimension];
|
121
|
+
*input2_offset_p += desc2.strides[dimension];
|
122
|
+
}
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
inline void BroadcastMul6DSlow(const ArithmeticParams& params,
|
92
127
|
const RuntimeShape& input1_shape,
|
93
128
|
const uint8_t* input1_data,
|
94
129
|
const RuntimeShape& input2_shape,
|
95
130
|
const uint8_t* input2_data,
|
96
131
|
const RuntimeShape& output_shape,
|
97
132
|
uint8_t* output_data) {
|
98
|
-
NdArrayDesc<
|
99
|
-
NdArrayDesc<
|
133
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc1;
|
134
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc2;
|
100
135
|
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
101
136
|
&desc2);
|
102
137
|
const RuntimeShape extended_output_shape =
|
103
|
-
RuntimeShape::ExtendedShape(
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
138
|
+
RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, output_shape);
|
139
|
+
// Cache output shape dimensions.
|
140
|
+
int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
|
141
|
+
std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
|
142
|
+
sizeof(extended_output_shape_dims));
|
143
|
+
|
144
|
+
size_t input1_offset = 0;
|
145
|
+
size_t input2_offset = 0;
|
146
|
+
size_t output_offset = 0;
|
147
|
+
BroadcastMulRecursiveDimensions(
|
148
|
+
params, 0, input1_data, input2_data, output_data, &input1_offset,
|
149
|
+
&input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
|
150
|
+
[](const ArithmeticParams& params, const uint8_t input1_val,
|
151
|
+
const uint8_t input2_val) {
|
152
|
+
const int32_t offsetted_input1_val = params.input1_offset + input1_val;
|
153
|
+
const int32_t offsetted_input2_val = params.input2_offset + input2_val;
|
154
|
+
const int32_t unclamped_result =
|
155
|
+
params.output_offset +
|
156
|
+
MultiplyByQuantizedMultiplier(
|
157
|
+
offsetted_input1_val * offsetted_input2_val,
|
158
|
+
params.output_multiplier, params.output_shift);
|
159
|
+
const int32_t clamped_output = std::min(
|
160
|
+
params.quantized_activation_max,
|
161
|
+
std::max(params.quantized_activation_min, unclamped_result));
|
162
|
+
return static_cast<uint8_t>(clamped_output);
|
163
|
+
});
|
129
164
|
}
|
130
165
|
|
131
166
|
template <typename T,
|
132
|
-
// For unquantized mul on small integers,
|
167
|
+
// For unquantized mul on small integers, explicitly set to true.
|
133
168
|
bool enable_for_short_integers = false>
|
134
169
|
inline typename std::enable_if<
|
135
170
|
!is_small_integer<T>::value || enable_for_short_integers, void>::type
|
136
|
-
|
171
|
+
BroadcastMul6DSlow(const ArithmeticParams& params,
|
137
172
|
const RuntimeShape& unextended_input1_shape,
|
138
173
|
const T* input1_data,
|
139
174
|
const RuntimeShape& unextended_input2_shape,
|
140
175
|
const T* input2_data,
|
141
176
|
const RuntimeShape& unextended_output_shape,
|
142
177
|
T* output_data) {
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
|
149
|
-
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
150
|
-
const RuntimeShape output_shape =
|
151
|
-
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
152
|
-
|
153
|
-
NdArrayDesc<4> desc1;
|
154
|
-
NdArrayDesc<4> desc2;
|
178
|
+
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 6);
|
179
|
+
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 6);
|
180
|
+
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 6);
|
181
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc1;
|
182
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc2;
|
155
183
|
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
156
184
|
unextended_input2_shape, &desc1, &desc2);
|
185
|
+
const RuntimeShape extended_output_shape =
|
186
|
+
RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape);
|
187
|
+
// Cache output shape dimensions.
|
188
|
+
int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
|
189
|
+
std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
|
190
|
+
sizeof(extended_output_shape_dims));
|
157
191
|
|
158
192
|
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
159
193
|
// col, channel), with extents (batches, height, width, depth), with the
|
160
|
-
// trailing dimension changing most rapidly (channels has the smallest
|
161
|
-
// typically 1 element).
|
194
|
+
// trailing dimension changing most rapidly (channels has the smallest
|
195
|
+
// stride, typically 1 element).
|
162
196
|
//
|
163
197
|
// In generated C code, we store arrays with the dimensions reversed. The
|
164
198
|
// first dimension has smallest stride.
|
165
199
|
//
|
166
200
|
// We name our variables by their Tensorflow convention, but generate C code
|
167
|
-
// nesting loops such that the innermost loop has the smallest stride for
|
168
|
-
// best cache behavior.
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
201
|
+
// nesting loops such that the innermost loop has the smallest stride for
|
202
|
+
// the best cache behavior.
|
203
|
+
size_t input1_offset = 0;
|
204
|
+
size_t input2_offset = 0;
|
205
|
+
size_t output_offset = 0;
|
206
|
+
BroadcastMulRecursiveDimensions(
|
207
|
+
params, 0, input1_data, input2_data, output_data, &input1_offset,
|
208
|
+
&input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
|
209
|
+
[](const ArithmeticParams& params, const T input1_val,
|
210
|
+
const T input2_val) {
|
211
|
+
T output_activation_min;
|
212
|
+
T output_activation_max;
|
213
|
+
GetActivationParams(params, &output_activation_min,
|
214
|
+
&output_activation_max);
|
215
|
+
return ActivationFunctionWithMinMax<T>(input1_val * input2_val,
|
216
|
+
output_activation_min,
|
217
|
+
output_activation_max);
|
218
|
+
});
|
182
219
|
}
|
183
220
|
|
184
|
-
inline void
|
221
|
+
inline void BroadcastMul6DSlow(const ArithmeticParams& params,
|
185
222
|
const RuntimeShape& unextended_input1_shape,
|
186
223
|
const std::complex<float>* input1_data,
|
187
224
|
const RuntimeShape& unextended_input2_shape,
|
188
225
|
const std::complex<float>* input2_data,
|
189
226
|
const RuntimeShape& unextended_output_shape,
|
190
227
|
std::complex<float>* output_data) {
|
191
|
-
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(),
|
192
|
-
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(),
|
193
|
-
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(),
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
NdArrayDesc<4> desc1;
|
198
|
-
NdArrayDesc<4> desc2;
|
228
|
+
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 6);
|
229
|
+
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 6);
|
230
|
+
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 6);
|
231
|
+
|
232
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc1;
|
233
|
+
NdArrayDesc<kMaxMulBroadcastDim> desc2;
|
199
234
|
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
200
235
|
unextended_input2_shape, &desc1, &desc2);
|
236
|
+
const RuntimeShape extended_output_shape =
|
237
|
+
RuntimeShape::ExtendedShape(kMaxMulBroadcastDim, unextended_output_shape);
|
238
|
+
// Cache output shape dimensions.
|
239
|
+
int32_t extended_output_shape_dims[kMaxMulBroadcastDim];
|
240
|
+
std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
|
241
|
+
sizeof(extended_output_shape_dims));
|
201
242
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
243
|
+
size_t input1_offset = 0;
|
244
|
+
size_t input2_offset = 0;
|
245
|
+
size_t output_offset = 0;
|
246
|
+
BroadcastMulRecursiveDimensions(
|
247
|
+
params, 0, input1_data, input2_data, output_data, &input1_offset,
|
248
|
+
&input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
|
249
|
+
[](const ArithmeticParams& params, const std::complex<float> input1_val,
|
250
|
+
const std::complex<float> input2_val) {
|
251
|
+
return input1_val * input2_val;
|
252
|
+
});
|
253
|
+
}
|
254
|
+
|
255
|
+
template <typename T>
|
256
|
+
inline void BroadcastMul4DSlow(
|
257
|
+
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
258
|
+
const T* input1_data, const RuntimeShape& input2_shape,
|
259
|
+
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
260
|
+
return BroadcastMul6DSlow(params, input1_shape, input1_data, input2_shape,
|
261
|
+
input2_data, output_shape, output_data);
|
213
262
|
}
|
214
263
|
|
215
264
|
} // namespace reference_ops
|
@@ -268,11 +268,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
|
|
268
268
|
return true;
|
269
269
|
}
|
270
270
|
|
271
|
-
template <typename T>
|
272
271
|
inline void Mean(const tflite::MeanParams& op_params,
|
273
272
|
const RuntimeShape& unextended_input_shape,
|
274
|
-
const
|
275
|
-
const RuntimeShape& unextended_output_shape,
|
273
|
+
const float* input_data,
|
274
|
+
const RuntimeShape& unextended_output_shape,
|
275
|
+
float* output_data) {
|
276
276
|
ruy::profiler::ScopeLabel label("Mean4D");
|
277
277
|
|
278
278
|
// Current implementation only supports dimension equals 4 and simultaneous
|
@@ -312,78 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params,
|
|
312
312
|
}
|
313
313
|
}
|
314
314
|
|
315
|
-
inline void Mean(const tflite::MeanParams& op_params,
|
316
|
-
const RuntimeShape& unextended_input_shape,
|
317
|
-
const uint8_t* input_data, int32_t input_zero_point,
|
318
|
-
float input_scale, const RuntimeShape& unextended_output_shape,
|
319
|
-
uint8_t* output_data, int32_t output_zero_point,
|
320
|
-
float output_scale) {
|
321
|
-
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
|
322
|
-
|
323
|
-
// Current implementation only supports dimension equals 4 and simultaneous
|
324
|
-
// reduction over width and height.
|
325
|
-
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
|
326
|
-
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
327
|
-
const RuntimeShape input_shape =
|
328
|
-
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
329
|
-
const RuntimeShape output_shape =
|
330
|
-
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
331
|
-
const int output_batch = output_shape.Dims(0);
|
332
|
-
const int output_height = output_shape.Dims(1);
|
333
|
-
const int output_width = output_shape.Dims(2);
|
334
|
-
const int output_depth = output_shape.Dims(3);
|
335
|
-
const int input_height = input_shape.Dims(1);
|
336
|
-
const int input_width = input_shape.Dims(2);
|
337
|
-
const float num_elements_in_axis = input_width * input_height;
|
338
|
-
|
339
|
-
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
340
|
-
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
341
|
-
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
342
|
-
TFLITE_CHECK_EQ(output_height, 1);
|
343
|
-
TFLITE_CHECK_EQ(output_width, 1);
|
344
|
-
|
345
|
-
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
|
346
|
-
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
|
347
|
-
|
348
|
-
float temp = input_zero_point * input_scale / output_scale;
|
349
|
-
temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
|
350
|
-
int32_t bias = output_zero_point - static_cast<int32_t>(temp);
|
351
|
-
double real_scale =
|
352
|
-
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
|
353
|
-
|
354
|
-
int32_t multiplier;
|
355
|
-
int shift;
|
356
|
-
QuantizeMultiplier(real_scale, &multiplier, &shift);
|
357
|
-
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
358
|
-
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
359
|
-
int32_t acc = 0;
|
360
|
-
for (int in_h = 0; in_h < input_height; ++in_h) {
|
361
|
-
for (int in_w = 0; in_w < input_width; ++in_w) {
|
362
|
-
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
|
363
|
-
}
|
364
|
-
}
|
365
|
-
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
366
|
-
acc += bias;
|
367
|
-
acc = std::min(std::max(acc, kMinValue), kMaxValue);
|
368
|
-
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
|
369
|
-
static_cast<uint8_t>(acc);
|
370
|
-
}
|
371
|
-
}
|
372
|
-
}
|
373
|
-
|
374
315
|
// Computes the mean of elements across dimensions given in axis.
|
375
316
|
// It does so in two stages, first calculates the sum of elements along the axis
|
376
317
|
// then divides it by the number of element in axis for quantized values.
|
377
318
|
template <typename T, typename U>
|
378
319
|
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
|
379
|
-
|
380
|
-
|
381
|
-
|
320
|
+
const int* input_dims, const int input_num_dims,
|
321
|
+
T* output_data, int32_t output_multiplier,
|
322
|
+
int output_shift, int32_t output_zero_point,
|
382
323
|
const int* output_dims,
|
383
324
|
const int output_num_dims, const int* axis,
|
384
325
|
const int num_axis_dimensions, bool keep_dims,
|
385
326
|
int* temp_index, int* resolved_axis, U* temp_sum,
|
386
327
|
bool compute_sum) {
|
328
|
+
const int32_t kMinValue = std::numeric_limits<T>::min();
|
329
|
+
const int32_t kMaxValue = std::numeric_limits<T>::max();
|
387
330
|
const bool uint8_case = std::is_same<T, uint8_t>::value;
|
388
331
|
const bool int16_case = std::is_same<T, int16_t>::value;
|
389
332
|
if (uint8_case) {
|
@@ -430,40 +373,46 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
|
|
430
373
|
}
|
431
374
|
|
432
375
|
// Calculate mean by dividing output_data by num of aggregated element.
|
433
|
-
|
376
|
+
int64_t num_elements_in_axis = 1;
|
434
377
|
for (int idx = 0; idx < num_resolved_axis; ++idx) {
|
435
378
|
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
|
436
379
|
// Overflow prevention.
|
437
|
-
if (current > (std::numeric_limits<
|
380
|
+
if (current > static_cast<size_t>(std::numeric_limits<int64_t>::max() /
|
381
|
+
num_elements_in_axis)) {
|
438
382
|
return false;
|
439
383
|
}
|
440
384
|
num_elements_in_axis *= current;
|
441
385
|
}
|
442
386
|
|
443
|
-
if (num_elements_in_axis
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
387
|
+
if (num_elements_in_axis == 0) {
|
388
|
+
return true;
|
389
|
+
}
|
390
|
+
|
391
|
+
// Readapt output rescaling when calculating the mean to integrate a
|
392
|
+
// 1/num_elements_in_axis multiplier.
|
393
|
+
if (!compute_sum) {
|
394
|
+
TFLITE_DCHECK_GE(num_elements_in_axis, 0);
|
395
|
+
int shift =
|
396
|
+
63 - CountLeadingZeros(static_cast<uint64_t>(num_elements_in_axis));
|
397
|
+
// To avoid any overflow risk 'shift' should be <= 32 and to satisfy
|
398
|
+
// 'MultiplyByQuantizedMultiplier' pre-conditions 'output_shift - shift'
|
399
|
+
// should be >= -31. Clamp the value at the price of some precision loss.
|
400
|
+
shift = std::min(shift, 32);
|
401
|
+
shift = std::min(shift, 31 + output_shift);
|
402
|
+
output_multiplier = static_cast<int32_t>(
|
403
|
+
(static_cast<int64_t>(output_multiplier) << shift) /
|
404
|
+
num_elements_in_axis);
|
405
|
+
output_shift = output_shift - shift;
|
406
|
+
}
|
407
|
+
|
408
|
+
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
409
|
+
const U shifted_sum =
|
410
|
+
static_cast<U>(temp_sum[idx] - input_zero_point * num_elements_in_axis);
|
411
|
+
int32_t output = MultiplyByQuantizedMultiplier(
|
412
|
+
shifted_sum, output_multiplier, output_shift) +
|
413
|
+
output_zero_point;
|
414
|
+
output = std::min(std::max(output, kMinValue), kMaxValue);
|
415
|
+
output_data[idx] = static_cast<T>(output);
|
467
416
|
}
|
468
417
|
return true;
|
469
418
|
}
|
@@ -478,8 +427,8 @@ inline bool QuantizedMeanOrSumExtraArgs(
|
|
478
427
|
bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum,
|
479
428
|
bool compute_sum) {
|
480
429
|
return QuantizedMeanOrSum<T, U>(
|
481
|
-
input_data, input_zero_point,
|
482
|
-
|
430
|
+
input_data, input_zero_point, input_dims, input_num_dims, output_data,
|
431
|
+
output_multiplier, output_shift, output_zero_point, output_dims,
|
483
432
|
output_num_dims, axis, num_axis_dimensions, keep_dims, temp_index,
|
484
433
|
resolved_axis, temp_sum, compute_sum);
|
485
434
|
}
|
@@ -212,9 +212,14 @@ inline void ResizeBilinearInteger(
|
|
212
212
|
(input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
|
213
213
|
const int64_t output_20 =
|
214
214
|
output_20_ll + output_20_lu + output_20_rl + output_20_ru;
|
215
|
+
#if TFLITE_SINGLE_ROUNDING
|
216
|
+
const int64_t round = 1 << 19;
|
217
|
+
const T interpolation = static_cast<T>((output_20 + round) >> 20);
|
218
|
+
#else
|
215
219
|
const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
|
216
220
|
const T interpolation =
|
217
221
|
static_cast<T>((output_20 + round) / (1 << 20));
|
222
|
+
#endif // TFLITE_SINGLE_ROUNDING
|
218
223
|
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
|
219
224
|
}
|
220
225
|
}
|