xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
- xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
- xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
- xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
- xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
- xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
- xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
- xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
- xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
- xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
- xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
- xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
- xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
- xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
- xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
- xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
- xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
- xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
- xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
- xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
- xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
- xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
- {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
- xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
- xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
- {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h

@@ -32,8 +32,8 @@ namespace tflite {
     return TF_LITE_GLOBAL_STD_PREFIX::std_name(x); \
   }
 
-DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round)
-DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1)
+DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round)
+DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1)
 
 }  // namespace tflite
 
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h

@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
 
+#include <cstddef>
 #include <vector>
 
 #include "tensorflow/lite/core/c/common.h"
@@ -23,10 +24,6 @@ limitations under the License.
 
 namespace tflite {
 
-inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
-  return RuntimeShape(data.size(), data.data());
-}
-
 // A list of tensors in a format that can be used by kernels like split and
 // concatenation.
 template <typename T>
@@ -54,6 +51,26 @@ class VectorOfTensors {
       all_shape_ptr_.push_back(&all_shape_[i]);
     }
   }
+
+  explicit VectorOfTensors(const std::vector<TfLiteTensor*>& tensors) {
+    int num_tensors = tensors.size();
+
+    all_data_.reserve(num_tensors);
+    all_shape_.reserve(num_tensors);
+    all_shape_ptr_.reserve(num_tensors);
+
+    for (auto* t : tensors) {
+      all_data_.push_back(GetTensorData<T>(t));
+      all_shape_.push_back(GetTensorShape(t));
+    }
+
+    // Taking the pointer from inside a std::vector is only OK if the vector is
+    // never modified, so we populate all_shape in the previous loop and then we
+    // are free to grab iterators here.
+    for (int i = 0; i < num_tensors; ++i) {
+      all_shape_ptr_.push_back(&all_shape_[i]);
+    }
+  }
   // Return a pointer to the data pointers of all tensors in the list. For
   // example:
   //   float* const* f = v.data();
@@ -66,6 +83,8 @@ class VectorOfTensors {
   //   dims[1] are the dimensions of the second tensor in the list.
   const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); }
 
+  size_t size() const { return all_data_.size(); }
+
  private:
   std::vector<T*> all_data_;
   std::vector<RuntimeShape> all_shape_;
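The new constructor accepts a plain std::vector<TfLiteTensor*> (the existing one requires a TfLiteContext and a TfLiteIntArray), and the new size() accessor reports how many tensors the wrapper holds. A minimal usage sketch; the helper name and the float element type are illustrative only, not part of this diff:

```cpp
#include <cstddef>
#include <vector>

#include "tensorflow/lite/core/c/common.h"
#include "tensorflow/lite/kernels/internal/portable_tensor.h"

// Hypothetical helper: sum the element counts of a list of float tensors
// using the constructor and size() accessor added in this version.
size_t TotalElements(const std::vector<TfLiteTensor*>& tensors) {
  tflite::VectorOfTensors<float> all(tensors);
  size_t total = 0;
  for (size_t i = 0; i < all.size(); ++i) {
    total += all.shapes()[i]->FlatSize();
  }
  return total;
}
```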
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h

@@ -16,10 +16,13 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
 #include <type_traits>
 
 #include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 
 namespace tflite {
 
@@ -194,21 +197,135 @@ inline void Add(const ArithmeticParams& params,
   }
 }
 
+template <typename T>
+inline void AddBroadcast(const T* input_data, const T* broadcast_data,
+                         T* output_data, size_t size, T activation_min,
+                         T activation_max) {
+  for (size_t c = 0; c < size; ++c) {
+    output_data[c] = ActivationFunctionWithMinMax<T>(
+        input_data[c] + broadcast_data[0], activation_min, activation_max);
+  }
+}
+
+template <>
+inline void AddBroadcast<int32_t>(const int32_t* input_data,
+                                  const int32_t* broadcast_data,
+                                  int32_t* output_data, size_t size,
+                                  int32_t activation_min,
+                                  int32_t activation_max) {
+  size_t c = 0;
+#ifdef USE_NEON
+  const int32x4_t vmax = vdupq_n_s32(activation_max);
+  const int32x4_t vmin = vdupq_n_s32(activation_min);
+  const int32x4_t vb = vdupq_n_s32(broadcast_data[0]);
+  for (; c + 4 <= size; c += 4) {
+    const int32x4_t va = vld1q_s32(&input_data[c]);
+    int32x4_t vres = vaddq_s32(va, vb);
+    vres = vmaxq_s32(vmin, vres);
+    vres = vminq_s32(vmax, vres);
+    vst1q_s32(&output_data[c], vres);
+  }
+#endif
+  for (; c < size; ++c) {
+    output_data[c] = ActivationFunctionWithMinMax<int32_t>(
+        input_data[c] + broadcast_data[0], activation_min, activation_max);
+  }
+}
+
+template <typename T>
+void AddElementwise(const T* input1_data, const T* input2_data, T* output_data,
+                    size_t size, T activation_min, T activation_max) {
+  for (size_t c = 0; c < size; ++c) {
+    output_data[c] = ActivationFunctionWithMinMax<T>(
+        input1_data[c] + input2_data[c], activation_min, activation_max);
+  }
+}
+
+template <>
+inline void AddElementwise<int32_t>(const int32_t* input1_data,
+                                    const int32_t* input2_data,
+                                    int32_t* output_data, size_t size,
+                                    int32_t activation_min,
+                                    int32_t activation_max) {
+  size_t c = 0;
+#ifdef USE_NEON
+  const int32x4_t vmax = vdupq_n_s32(activation_max);
+  const int32x4_t vmin = vdupq_n_s32(activation_min);
+  for (; c + 4 <= size; c += 4) {
+    const int32x4_t va = vld1q_s32(&input1_data[c]);
+    const int32x4_t vb = vld1q_s32(&input2_data[c]);
+    int32x4_t vres = vaddq_s32(va, vb);
+    vres = vmaxq_s32(vmin, vres);
+    vres = vminq_s32(vmax, vres);
+    vst1q_s32(&output_data[c], vres);
+  }
+#endif
+  for (; c < size; ++c) {
+    output_data[c] = ActivationFunctionWithMinMax<int32_t>(
+        input1_data[c] + input2_data[c], activation_min, activation_max);
+  }
+}
+
+template <typename T>
+inline void BroadcastAddRecursiveDimensions(
+    int dimension, size_t* input1_offset_p, size_t* input2_offset_p,
+    size_t* output_offset, size_t* compressed_input1_stride,
+    size_t* compressed_input2_stride, size_t* compressed_output_shape,
+    T activation_min, T activation_max, const T* input1_data,
+    const T* input2_data, T* output_data) {
+  if (dimension > 0) {
+    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastAddRecursiveDimensions(
+          dimension - 1, &input1_offset_c, &input2_offset_c, output_offset,
+          compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, activation_min, activation_max, input1_data,
+          input2_data, output_data);
+      *input1_offset_p += compressed_input1_stride[dimension];
+      *input2_offset_p += compressed_input2_stride[dimension];
+    }
+  } else {
+    TFLITE_DCHECK(dimension == 0);
+    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
+    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
+    TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
+    const T* input1_data_ptr = input1_data + *input1_offset_p;
+    const T* input2_data_ptr = input2_data + *input2_offset_p;
+    T* output_data_ptr = output_data + *output_offset;
+    if (input1_is_broadcast) {
+      // input1 is broadcast.
+      AddBroadcast<T>(input2_data_ptr, input1_data_ptr, output_data_ptr,
+                      compressed_output_shape[dimension], activation_min,
+                      activation_max);
+      *input2_offset_p += compressed_output_shape[dimension];
+    } else if (input2_is_broadcast) {
+      // input2 is broadcast.
+      AddBroadcast<T>(input1_data_ptr, input2_data_ptr, output_data_ptr,
+                      compressed_output_shape[dimension], activation_min,
+                      activation_max);
+      *input1_offset_p += compressed_output_shape[dimension];
+    } else {
+      // Add element-wise.
+      AddElementwise<T>(input1_data_ptr, input2_data_ptr, output_data_ptr,
+                        compressed_output_shape[dimension], activation_min,
+                        activation_max);
+      *input1_offset_p += compressed_output_shape[dimension];
+      *input2_offset_p += compressed_output_shape[dimension];
+    }
+    *output_offset += compressed_output_shape[dimension];
+  }
+}
+
 template <typename T,
-          // For unquantized add for small integers,
+          // For unquantized add for small integers, explicitly set to true.
           bool dummy = false>
 inline typename std::enable_if<!is_small_integer<T>::value || dummy, void>::type
 BroadcastAdd6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape& input2_shape, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
-  NdArrayDesc<6> desc1;
-  NdArrayDesc<6> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(6, output_shape);
-
+  constexpr int kMaxBroadcastDim = 6;
   T activation_min, activation_max;
   GetActivationParams(params, &activation_min, &activation_max);
 
@@ -223,64 +340,74 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  size_t
-  size_t
-  size_t
-  [old lines 229-276, the rest of the removed nested-loop implementation, were not captured in this rendering]
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
+  }
+
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastAddRecursiveDimensions<T>(
+      kMaxBroadcastDim - 1, &input1_offset, &input2_offset, &output_offset,
+      compressed_input1_stride, compressed_input2_stride,
+      compressed_output_shape, activation_min, activation_max, input1_data,
+      input2_data, output_data);
+}
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAddRecursiveDimensions(
+    const ArithmeticParams& params, int dimension, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    size_t* compressed_input1_stride, size_t* compressed_input2_stride,
+    size_t* compressed_output_shape, const T* input1_data, const T* input2_data,
+    T* output_data) {
+  for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
+    if (dimension > 0) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastAddRecursiveDimensions(
+          params, dimension - 1, &input1_offset_c, &input2_offset_c,
+          output_offset, compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, input1_data, input2_data, output_data);
+    } else {
+      TFLITE_DCHECK(dimension == 0);
+      const int32_t input1_val =
+          params.input1_offset + input1_data[*input1_offset_p];
+      const int32_t input2_val =
+          params.input2_offset + input2_data[*input2_offset_p];
+      const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+      const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+      const int32_t scaled_input1_val =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              shifted_input1_val, params.input1_multiplier,
+              params.input1_shift);
+      const int32_t scaled_input2_val =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              shifted_input2_val, params.input2_multiplier,
+              params.input2_shift);
+      const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+      const int32_t raw_output =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              raw_sum, params.output_multiplier, params.output_shift) +
+          params.output_offset;
+      const int32_t clamped_output =
+          std::min(params.quantized_activation_max,
+                   std::max(params.quantized_activation_min, raw_output));
+      output_data[*output_offset] = static_cast<T>(clamped_output);
+      ++(*output_offset);
     }
-  [old lines 278-279 were not captured in this rendering]
-    output_offset_a +=
-        extended_output_shape.Dims(1) * extended_output_shape.Dims(2) *
-        extended_output_shape.Dims(3) * extended_output_shape.Dims(4) *
-        extended_output_shape.Dims(5);
+    *input1_offset_p += compressed_input1_stride[dimension];
+    *input2_offset_p += compressed_input2_stride[dimension];
   }
 }
 
@@ -293,12 +420,7 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape& input2_shape, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
-  NdArrayDesc<6> desc1;
-  NdArrayDesc<6> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(6, output_shape);
+  constexpr int kMaxBroadcastDim = 6;
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -311,87 +433,24 @@ BroadcastAdd6DSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  size_t
-  size_t
-  size_t
-  [old lines 317-322 were not captured in this rendering]
-      size_t input2_offset_b = input2_offset_d;
-      size_t output_offset_b = output_offset_d;
-      for (int b = 0; b < extended_output_shape.Dims(2); ++b) {
-        size_t input1_offset_y = input1_offset_b;
-        size_t input2_offset_y = input2_offset_b;
-        size_t output_offset_y = output_offset_b;
-        for (int y = 0; y < extended_output_shape.Dims(3); ++y) {
-          size_t input1_offset_x = input1_offset_y;
-          size_t input2_offset_x = input2_offset_y;
-          size_t output_offset_x = output_offset_y;
-          for (int x = 0; x < extended_output_shape.Dims(4); ++x) {
-            size_t input1_offset_c = input1_offset_x;
-            size_t input2_offset_c = input2_offset_x;
-            size_t output_offset_c = output_offset_x;
-            for (int c = 0; c < extended_output_shape.Dims(5); ++c) {
-              const int32_t input1_val =
-                  params.input1_offset + input1_data[input1_offset_c];
-              const int32_t input2_val =
-                  params.input2_offset + input2_data[input2_offset_c];
-              const int32_t shifted_input1_val =
-                  input1_val * (1 << params.left_shift);
-              const int32_t shifted_input2_val =
-                  input2_val * (1 << params.left_shift);
-              const int32_t scaled_input1_val =
-                  MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                      shifted_input1_val, params.input1_multiplier,
-                      params.input1_shift);
-              const int32_t scaled_input2_val =
-                  MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                      shifted_input2_val, params.input2_multiplier,
-                      params.input2_shift);
-              const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
-              const int32_t raw_output =
-                  MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                      raw_sum, params.output_multiplier, params.output_shift) +
-                  params.output_offset;
-              const int32_t clamped_output = std::min(
-                  params.quantized_activation_max,
-                  std::max(params.quantized_activation_min, raw_output));
-              output_data[output_offset_c] = static_cast<T>(clamped_output);
-              input1_offset_c += desc1.strides[5];
-              input2_offset_c += desc2.strides[5];
-              ++output_offset_c;
-            }
-            input1_offset_x += desc1.strides[4];
-            input2_offset_x += desc2.strides[4];
-            output_offset_x += extended_output_shape.Dims(5);
-          }
-          input1_offset_y += desc1.strides[3];
-          input2_offset_y += desc2.strides[3];
-          output_offset_y +=
-              extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
-        }
-        input1_offset_b += desc1.strides[2];
-        input2_offset_b += desc2.strides[2];
-        output_offset_b += extended_output_shape.Dims(3) *
-                           extended_output_shape.Dims(4) *
-                           extended_output_shape.Dims(5);
-      }
-      input1_offset_d += desc1.strides[1];
-      input2_offset_d += desc2.strides[1];
-      output_offset_d +=
-          extended_output_shape.Dims(2) * extended_output_shape.Dims(3) *
-          extended_output_shape.Dims(4) * extended_output_shape.Dims(5);
-    }
-    input1_offset_a += desc1.strides[0];
-    input2_offset_a += desc2.strides[0];
-    output_offset_a +=
-        extended_output_shape.Dims(1) * extended_output_shape.Dims(2) *
-        extended_output_shape.Dims(3) * extended_output_shape.Dims(4) *
-        extended_output_shape.Dims(5);
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
   }
+
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastAddRecursiveDimensions(
+      params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
+      &output_offset, compressed_input1_stride, compressed_input2_stride,
+      compressed_output_shape, input1_data, input2_data, output_data);
 }
 
 template <typename T>
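With this change, BroadcastAdd6DSlow no longer builds NdArrayDesc<6> descriptors and six explicit nested loops; it compresses both shapes into per-dimension strides with ReduceDimensionsForBroadcast and recurses through BroadcastAddRecursiveDimensions. A minimal caller sketch for the float path is below; the wrapper function name, shapes, and values are illustrative only and not taken from this package:

```cpp
#include <limits>

#include "tensorflow/lite/kernels/internal/reference/add.h"

// Illustrative driver: add a per-column bias of shape {1, 3} onto a {2, 3}
// matrix via the compressed-broadcast path shown in the diff above.
void BroadcastAddExample() {
  const float input1[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // shape {2, 3}
  const float input2[3] = {10.f, 20.f, 30.f};              // shape {1, 3}
  float output[6] = {};

  tflite::ArithmeticParams params{};
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  tflite::reference_ops::BroadcastAdd6DSlow(
      params, tflite::RuntimeShape({2, 3}), input1,
      tflite::RuntimeShape({1, 3}), input2,
      tflite::RuntimeShape({2, 3}), output);
  // output is now {11, 22, 33, 14, 25, 36}.
}
```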
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h

@@ -112,20 +112,11 @@ struct BroadcastComparison4DSlowCommon {
   NdArrayDesc<4> desc2;
 };
 
-inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
+TFLITE_NOINLINE
+BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
     const RuntimeShape& unextended_input1_shape,
     const RuntimeShape& unextended_input2_shape,
-    const RuntimeShape& unextended_output_shape) {
-  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
-                                      unextended_input2_shape, &desc1, &desc2);
-  return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
-          desc2};
-}
+    const RuntimeShape& unextended_output_shape);
 
 template <typename T, ComparisonFn<T> F>
 inline void BroadcastComparison4DSlowImpl(
@@ -266,12 +257,12 @@ inline void BroadcastComparison4DSlowWithScaling(
         op_params, input1_shape, input1_data, input2_shape, input2_data, \
         output_shape, output_data);                                      \
   }
-TFLITE_COMPARISON_OP(Equal)
-TFLITE_COMPARISON_OP(NotEqual)
-TFLITE_COMPARISON_OP(Greater)
-TFLITE_COMPARISON_OP(GreaterEqual)
-TFLITE_COMPARISON_OP(Less)
-TFLITE_COMPARISON_OP(LessEqual)
+TFLITE_COMPARISON_OP(Equal)
+TFLITE_COMPARISON_OP(NotEqual)
+TFLITE_COMPARISON_OP(Greater)
+TFLITE_COMPARISON_OP(GreaterEqual)
+TFLITE_COMPARISON_OP(Less)
+TFLITE_COMPARISON_OP(LessEqual)
 #undef TFLITE_COMPARISON_OP
 
 }  // namespace reference_ops
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h

@@ -56,8 +56,10 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   const int filter_width = filter_shape.Dims(2);
   const int filter_input_depth = filter_shape.Dims(3);
   const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
   TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
   const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
 
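The two added TFLITE_DCHECK_NE calls guard the integer divisions used for grouped convolution: if a malformed model declares filter_input_depth larger than input_depth, groups truncates to zero and the subsequent output_depth / groups would divide by zero. A small standalone illustration with made-up depths (not values from this package):

```cpp
#include <cstdio>

// Illustrative only: the degenerate case the new TFLITE_DCHECK_NE calls reject.
int main() {
  const int input_depth = 8;
  const int output_depth = 16;
  const int filter_input_depth = 32;  // malformed model: larger than input_depth
  const int groups = input_depth / filter_input_depth;  // integer division -> 0
  if (groups == 0) {
    // Without the new check, output_depth / groups below would divide by zero.
    std::printf("degenerate grouped conv: groups == 0\n");
    return 1;
  }
  const int filters_per_group = output_depth / groups;
  std::printf("filters_per_group = %d\n", filters_per_group);
  return 0;
}
```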