xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h

@@ -38,6 +38,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/common.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/common.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_COMMON_H_
 #define TENSORFLOW_LITE_CORE_C_COMMON_H_
@@ -157,6 +158,10 @@ int TfLiteFloatArrayGetSizeInBytes(int size);
 // This returns a pointer, that you must free using TfLiteFloatArrayFree().
 TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
 
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteFloatArrayFree.
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src);
+
 // Free memory of array `a`.
 void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 #endif  // TF_LITE_STATIC_MEMORY
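A minimal lifecycle sketch for the new `TfLiteFloatArrayCopy`, assuming the usual pairing with `TfLiteFloatArrayCreate`/`TfLiteFloatArrayFree` (values are illustrative):

```
#include "tensorflow/lite/core/c/common.h"

void CopyScales() {
  TfLiteFloatArray* scales = TfLiteFloatArrayCreate(2);
  scales->data[0] = 0.5f;
  scales->data[1] = 0.25f;
  // The copy is independently owned and must be freed separately.
  TfLiteFloatArray* scales_copy = TfLiteFloatArrayCopy(scales);
  TfLiteFloatArrayFree(scales_copy);
  TfLiteFloatArrayFree(scales);
}
```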
@@ -345,6 +350,8 @@ typedef union TfLitePtrUnion {
 //    as constant inputs for downstream ops (also in prepare).
 //  * kTfLiteCustom: Custom memory allocation provided by the user. See
 //    TfLiteCustomAllocation below.
+//  * kTfLiteVariantObject: Allocation is an arbitrary type-erased C++ object.
+//    Allocation and deallocation are done through `new` and `delete`.
 typedef enum TfLiteAllocationType {
   kTfLiteMemNone = 0,
   kTfLiteMmapRo,
@@ -353,8 +360,40 @@ typedef enum TfLiteAllocationType {
   kTfLiteDynamic,
   kTfLitePersistentRo,
   kTfLiteCustom,
+  kTfLiteVariantObject,
 } TfLiteAllocationType;
 
+// Memory allocation strategies.
+//
+// TfLiteAllocationType values have been overloaded to mean more than their
+// original intent. This enum should only be used to document the allocation
+// strategy used by a tensor for its data.
+typedef enum TfLiteAllocationStrategy {
+  kTfLiteAllocationStrategyUnknown,
+  kTfLiteAllocationStrategyNone,    // No data is allocated.
+  kTfLiteAllocationStrategyMMap,    // Data is mmaped.
+  kTfLiteAllocationStrategyArena,   // Handled by the arena.
+  kTfLiteAllocationStrategyMalloc,  // Uses `malloc`/`free`.
+  kTfLiteAllocationStrategyNew      // Uses `new[]`/`delete[]`.
+} TfLiteAllocationStrategy;
+
+// Describes how stable a tensor attribute is with regard to interpreter runs.
+typedef enum TfLiteRunStability {
+  kTfLiteRunStabilityUnknown,
+  kTfLiteRunStabilityUnstable,   // May change at any time.
+  kTfLiteRunStabilitySingleRun,  // Will stay the same for one run.
+  kTfLiteRunStabilityAcrossRuns  // Will stay the same across all runs.
+} TfLiteRunStability;
+
+// Describes the steps of a TFLite operation life cycle.
+typedef enum TfLiteRunStep {
+  kTfLiteRunStepUnknown,
+  kTfLiteRunStepInit,
+  kTfLiteRunStepPrepare,
+  kTfLiteRunStepEval
+} TfLiteRunStep;
+
 // The delegates should use zero or positive integers to represent handles.
 // -1 is reserved from unallocated status.
 typedef int TfLiteBufferHandle;
@@ -847,7 +886,7 @@ typedef struct TfLiteContext {
   //   }
   //
   // NOTE: The context owns the memory referenced by partition_params_array. It
-  // will be cleared with another call to PreviewDelegateParitioning, or after
+  // will be cleared with another call to PreviewDelegatePartitioning, or after
   // TfLiteDelegateParams::Prepare returns.
   //
   // WARNING: This is an experimental interface that is subject to change.
@@ -878,6 +917,27 @@ typedef struct TfLiteContext {
   TfLiteStatus (*GetModelMetadata)(const struct TfLiteContext* context,
                                    const char* name, const char** ptr,
                                    size_t* bytes);
+
+  // Retrieves the corresponding TfLiteContext of a subgraph that the given
+  // subgraph_index points to and switches to the delegate context for that
+  // subgraph. If an invalid subgraph index is given, returns kTfLiteError.
+  // NOTE: This function is expected to be paired with ReleaseSubgraphContext()
+  // once the delegate preparation is done and/or the delegate context
+  // functions are no longer needed.
+  //
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*AcquireSubgraphContext)(
+      struct TfLiteContext* context, int subgraph_index,
+      struct TfLiteContext** acquired_context);
+  // Releases the subgraph context by switching back to the TFLite kernel
+  // context for the subgraph that the given subgraph_index points to.
+  // NOTE: This function is expected to be used after AcquireSubgraphContext()
+  // once the delegate preparation is done and/or the delegate context
+  // functions are no longer needed.
+  //
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*ReleaseSubgraphContext)(struct TfLiteContext* context,
+                                         int subgraph_index);
 } TfLiteContext;
 
 // `TfLiteRegistrationExternal` is an external version of `TfLiteRegistration`
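A sketch of how a delegate might pair the two new context callbacks during preparation. `PrepareSubgraph` is a hypothetical helper, not part of the header; only the two function-pointer signatures come from the diff above:

```
#include "tensorflow/lite/core/c/common.h"

// Acquire the delegate context for a subgraph, do the preparation work,
// then switch back to the kernel context.
TfLiteStatus PrepareSubgraph(TfLiteContext* context, int subgraph_index) {
  struct TfLiteContext* subgraph_context = nullptr;
  if (context->AcquireSubgraphContext(context, subgraph_index,
                                      &subgraph_context) != kTfLiteOk) {
    return kTfLiteError;  // e.g. invalid subgraph index
  }
  // ... inspect tensors / prepare kernels through subgraph_context ...
  return context->ReleaseSubgraphContext(context, subgraph_index);
}
```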
@@ -886,6 +946,64 @@ typedef struct TfLiteContext {
 // field is exactly the same as with `TfLiteRegistration`.
 typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
 
+// The valid values of the `inplace_operator` field in `TfLiteRegistration`.
+// This allows an op to signal to the runtime that the same data pointer
+// may be passed as an input and output without impacting the result.
+// This does not mean that the memory can safely be reused; it is up to the
+// runtime to determine this, e.g. whether another op consumes the same input
+// or whether an input tensor has sufficient memory allocated to store the
+// output data.
+//
+// Setting these flags authorizes the runtime to set the data pointers of an
+// input and output tensor to the same value. In such cases, the memory
+// required by the output must be less than or equal to that required by the
+// shared input, never greater. If kTfLiteInplaceOpDataUnmodified is set, then
+// the runtime can share the same input tensor with multiple operators'
+// outputs, provided that kTfLiteInplaceOpDataUnmodified is set for all of
+// them. Otherwise, if an input tensor is consumed by multiple operators, it
+// may only be shared with the operator which is the last to consume it.
+//
+// Note that this is a bitmask, so the values should be 1, 2, 4, 8, etc.
+typedef enum {
+  // The default value. This indicates that the same data pointer cannot
+  // safely be passed as an op's input and output.
+  kTfLiteInplaceOpNone = 0,
+  // This indicates that an op's first output's data is identical to its
+  // first input's data, for example Reshape.
+  kTfLiteInplaceOpDataUnmodified = 1,
+  // Setting kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput means
+  // that InputN may be shared with OutputN instead of with the first output.
+  // This flag requires one or more of kTfLiteInplaceOpInputNShared to be set.
+  kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput = 2,
+  // kTfLiteInplaceOpInputNShared indicates that it is safe for an op to share
+  // InputN's data pointer with an output tensor. If
+  // kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is set then
+  // kTfLiteInplaceOpInputNShared indicates that InputN may be shared
+  // with OutputN, otherwise kTfLiteInplaceOpInputNShared indicates that
+  // InputN may be shared with the first output.
+  //
+  // Indicates that an op's first input may be shared with the first output
+  // tensor. kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput has
+  // no impact on the behavior allowed by this flag.
+  kTfLiteInplaceOpInput0Shared = 4,
+  // Indicates that an op's second input may be shared with the first output
+  // if kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is not set,
+  // or the second output if it is set.
+  kTfLiteInplaceOpInput1Shared = 8,
+  // Indicates that an op's third input may be shared with the first output
+  // if kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is not set,
+  // or the third output if it is set.
+  kTfLiteInplaceOpInput2Shared = 16,
+  // Placeholder to ensure that the enum can hold 64-bit values to
+  // accommodate future fields.
+  kTfLiteInplaceOpMaxValue = UINT64_MAX,
+} TfLiteInPlaceOp;
+
+// The number of shareable inputs supported.
+static const int kTfLiteMaxSharableOpInputs = 3;
+
 typedef struct TfLiteRegistration {
   // Initializes the op from serialized data.
   // Called only *once* for the lifetime of the op, so any one-time allocations
@@ -966,8 +1084,37 @@ typedef struct TfLiteRegistration {
   // does not support asynchronous execution for this `node`.
   struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
                                             TfLiteNode* node);
+
+  // Indicates if an operator's output may safely overwrite its inputs.
+  // See the comments in `TfLiteInPlaceOp`.
+  uint64_t inplace_operator;
 } TfLiteRegistration;
 
+/// \private
+// Old version of `TfLiteRegistration` to maintain binary backward
+// compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
+typedef struct TfLiteRegistration_V3 {
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+  void (*free)(TfLiteContext* context, void* buffer);
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+  const char* (*profiling_string)(const TfLiteContext* context,
+                                  const TfLiteNode* node);
+  int32_t builtin_code;
+  const char* custom_name;
+  int version;
+  TfLiteRegistrationExternal* registration_external;
+  struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
+                                            TfLiteNode* node);
+} TfLiteRegistration_V3;
+
 /// \private
 // Old version of `TfLiteRegistration` to maintain binary backward
 // compatibility.
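A sketch of how a kernel might advertise in-place behavior through the new `inplace_operator` field. The `Elementwise*` callbacks are hypothetical helpers assumed to be defined elsewhere; only the registration fields and flag values come from the header:

```
#include "tensorflow/lite/core/c/common.h"

// Assumed helpers, defined elsewhere in the kernel.
void* ElementwiseInit(TfLiteContext* context, const char* buffer,
                      size_t length);
TfLiteStatus ElementwisePrepare(TfLiteContext* context, TfLiteNode* node);
TfLiteStatus ElementwiseEval(TfLiteContext* context, TfLiteNode* node);

// An element-wise op whose first input may safely be overwritten by its
// first output advertises that with the new bitmask field.
TfLiteRegistration ElementwiseRegistration() {
  TfLiteRegistration reg = {};
  reg.init = ElementwiseInit;
  reg.prepare = ElementwisePrepare;
  reg.invoke = ElementwiseEval;
  reg.inplace_operator = kTfLiteInplaceOpInput0Shared;
  return reg;
}
```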
@@ -1158,6 +1305,7 @@ typedef struct TfLiteOpaqueDelegateBuilder {
   int64_t flags;
 } TfLiteOpaqueDelegateBuilder;
 
+#ifndef TF_LITE_STATIC_MEMORY
 // Creates an opaque delegate and returns its address. The opaque delegate will
 // behave according to the provided 'opaque_delegate_builder'. The lifetime of
 // the objects pointed to by any of the fields within the
@@ -1174,6 +1322,7 @@ TfLiteOpaqueDelegate* TfLiteOpaqueDelegateCreate(
 // Deletes the provided opaque 'delegate'. This function has no effect if the
 // 'delegate' is a null pointer.
 void TfLiteOpaqueDelegateDelete(TfLiteOpaqueDelegate* delegate);
+#endif  // TF_LITE_STATIC_MEMORY
 
 // Returns a pointer to the data associated with the provided opaque 'delegate'.
 //
@@ -1189,7 +1338,159 @@ void TfLiteOpaqueDelegateDelete(TfLiteOpaqueDelegate* delegate);
 // 'opaque_delegate_builder' field is null.
 void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate);
 
+// Returns a tensor's data allocation strategy.
+TfLiteAllocationStrategy TfLiteTensorGetAllocationStrategy(
+    const TfLiteTensor* t);
+
+// Returns how stable a tensor's data buffer address is across runs.
+TfLiteRunStability TfLiteTensorGetBufferAddressStability(const TfLiteTensor* t);
+
+// Returns how stable a tensor's data values are across runs.
+TfLiteRunStability TfLiteTensorGetDataStability(const TfLiteTensor* t);
+
+// Returns the operation step when the data of a tensor is populated.
+//
+// Some operations can precompute their results before the evaluation step.
+// This makes the data available earlier for subsequent operations.
+TfLiteRunStep TfLiteTensorGetDataKnownStep(const TfLiteTensor* t);
+
+// Returns the operation step when the shape of a tensor is computed.
+//
+// Some operations can precompute the shape of their results before the
+// evaluation step. This makes the shape available earlier for subsequent
+// operations.
+TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t);
+
 #ifdef __cplusplus
 }  // extern "C"
+
+#include <utility>
+
+// --- TFLITE VARIANT TENSORS ----
+// Programming languages usually define "variant" as a type that can hold an
+// unbounded set of types. See std::any
+// (https://en.cppreference.com/w/cpp/utility/any) for a related standard
+// library construct. In tensorflow, variant tensors have a data member which
+// is an Object that is destructible and copy constructible.
+// Variant tensors are commonly used to represent non-trivial data
+// semantics that don't fit into simple primitives, such as lists of tensors
+// and datasets. Additionally, they can facilitate containers for optimizing
+// memory movement of tensor data.
+//
+// The following set of classes define the variant tensor member for tflite.
+// They implement a type-erased container intended to be used behind the
+// `data.data : void*` member of `TfLiteTensor`s. Runtime functions interact
+// with the variant member at the level of a `VariantData`, whereas kernels
+// operate with full knowledge of the un-erased type. The `VariantData`
+// class provides abstract methods for destroying and copying `VariantData`.
+// Invoking these methods will dispatch to the erased type opaquely.
+// The contents of any object of a type derived from `AbstractVariant` can be
+// written to `TfLiteTensor::data::data : void*` from kernels. If the runtime
+// were to copy such a tensor through `TfLiteTensorCopy`, the destination data
+// member will contain the result of invoking the erased type's copy
+// constructor. Similarly, when the runtime releases tensors from memory, the
+// erased type's destructor will be invoked. There are a few caveats to
+// consider when using these safely, which we discuss below.
+//
+// EXAMPLE: READING VARIANT TENSORS
+// ```
+// // retrieve input with `type == kTfLiteVariant`
+// TfLiteTensor* input = ...
+// // must first static cast to `VariantData`, more on this below.
+// VariantData* vd_input = static_cast<VariantData*>(t->data.data);
+// CustomType* typed_input =
+//     static_cast<CustomType*>(vd_input);
+// // do custom work on `typed_input`...
+// ```
+//
+// EXAMPLE: WRITING VARIANT TENSORS
+// ```
+// TfLiteTensor* output = ...
+// // construct a new variant object behind the target tensor
+// TfLiteVariantRealloc<DerivedType, DerivedArgs...>(output, args...);
+// // again must static cast to `VariantData*` before writing to `void*`.
+// output->data.data = static_cast<VariantData*>(typed_output);
+// ```
+//
+// WHY STATIC CAST TO `VariantData*`
+// The Standard defines a `reinterpret_cast` from a derived type to its
+// parents as undefined behavior when the parent is a non-standard layout.
+// https://en.cppreference.com/w/cpp/language/reinterpret_cast (see bullet 5).
+// Due to the `VariantData` having virtual members it is indeed non-standard
+// layout, and any type derived from `VariantData` fails to be
+// "transparently-replaceable". I.e. an implicit cast from derived to base in
+// this case may adjust the pointer, and by definition `reinterpret_cast` will
+// not adjust the pointer.
+// Thus, dereferencing a pointer of type `VariantData` which addresses
+// the first byte of an object of said derived type is UB unless it was first
+// implicitly or statically cast to a `VariantData`. Writing the object of
+// derived type directly to `void*` which is dereferenced as a `VariantData`
+// is then UB, and so the intermediate cast through `VariantData` must be
+// enforced.
+// A good example of this issue is elucidated in the bottom code snippet
+// here: https://en.cppreference.com/w/cpp/utility/launder.
+class VariantData {
+ public:
+  // All variant objects must be able to be destroyed and copied.
+  virtual ~VariantData() = default;
+  // A "virtual copy-constructor". Often the destination tensor of a variant
+  // copy may have been previously allocated in a prior call to inference. We
+  // allow the copy to target the destination's buffer (`maybe_alloc`),
+  // for potential reuse and optimizations. `maybe_alloc` must be of the same
+  // underlying derived type. References to whatever object is at
+  // `maybe_alloc` may be invalidated.
+  virtual VariantData* CloneTo(VariantData* maybe_alloc) const = 0;
+};
+
+// Concrete implementations extend `AbstractVariantData` with CRTP.
+template <typename ErasedDerived>
+class AbstractVariantData : public VariantData {
+ public:
+  VariantData* CloneTo(VariantData* maybe_alloc) const override {
+    if (maybe_alloc != nullptr) {
+      // If the output is still allocated, then its object may still be
+      // within its lifetime and the destructor must be called before
+      // re-using the buffer.
+      // This may actually have a non-negligible effect on performance if the
+      // destructor is complex. A future iteration may
+      // introduce copy or move assignment semantics, allowing the
+      // underlying implementation to optimize for this case.
+      auto* derived = static_cast<ErasedDerived*>(maybe_alloc);
+      derived->~ErasedDerived();
+      return new (derived)
+          ErasedDerived(static_cast<ErasedDerived const&>(*this));
+    }
+    return new ErasedDerived(static_cast<ErasedDerived const&>(*this));
+  }
+
+ protected:
+  AbstractVariantData() = default;
+  AbstractVariantData(const AbstractVariantData&) = default;
+  AbstractVariantData(AbstractVariantData&&) = delete;
+};
+
+// Analogous to `TfLiteTensorRealloc` for allocation of tensors whose
+// data member points to an arbitrary C++ object. `VariantType` refers
+// to the erased type of said object and `VariantArgs` refers to
+// a list of argument types with which to construct a new `VariantType`.
+// `VariantArgs` must match a constructor of `VariantType`.
+template <class VariantType, class... VariantArgs>
+TfLiteStatus TfLiteTensorVariantRealloc(TfLiteTensor* t,
+                                        VariantArgs&&... args) {
+  if (t->type != kTfLiteVariant) return kTfLiteError;
+  VariantType* new_vd;
+  if (t->data.raw != nullptr) {
+    auto* target_vd = static_cast<VariantData*>(t->data.data);
+    target_vd->~VariantData();
+    // As above, we assume if `t` is already allocated then it was allocated
+    // with the same `VariantType` as templated.
+    new_vd = new (t->data.raw) VariantType(std::forward<VariantArgs>(args)...);
+  } else {
+    new_vd = new VariantType(std::forward<VariantArgs>(args)...);
+  }
+  t->data.data = static_cast<VariantData*>(new_vd);
+  t->allocation_type = kTfLiteVariantObject;
+  return kTfLiteOk;
+}
+
 #endif  // __cplusplus
 #endif  // TENSORFLOW_LITE_CORE_C_COMMON_H_
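A sketch tying the pieces together: a concrete payload built on the CRTP base and written behind a tensor with `TfLiteTensorVariantRealloc`. `TensorList` and `WriteList` are hypothetical; only `AbstractVariantData` and the realloc template come from the header above:

```
#include <utility>
#include <vector>

#include "tensorflow/lite/core/c/common.h"

// Hypothetical payload: a list of tensor pointers held by a variant tensor.
class TensorList : public AbstractVariantData<TensorList> {
 public:
  explicit TensorList(std::vector<TfLiteTensor*> elems)
      : elems_(std::move(elems)) {}

 private:
  std::vector<TfLiteTensor*> elems_;
};

TfLiteStatus WriteList(TfLiteTensor* output,
                       std::vector<TfLiteTensor*> elems) {
  // Constructs a TensorList behind output->data.data and tags the tensor as
  // kTfLiteVariantObject; output->type must already be kTfLiteVariant.
  return TfLiteTensorVariantRealloc<TensorList>(output, std::move(elems));
}
```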
xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h (new file)

@@ -0,0 +1,78 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This provides utility macros and functions that are inherently platform
+// specific or shared across runtime & converter.
+#ifndef TENSORFLOW_LITE_CORE_MACROS_H_
+#define TENSORFLOW_LITE_CORE_MACROS_H_
+
+#ifdef __has_builtin
+#define TFLITE_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TFLITE_HAS_BUILTIN(x) 0
+#endif
+
+#if (!defined(__NVCC__)) && (TFLITE_HAS_BUILTIN(__builtin_expect) || \
+                             (defined(__GNUC__) && __GNUC__ >= 3))
+#define TFLITE_EXPECT_FALSE(cond) __builtin_expect(cond, false)
+#define TFLITE_EXPECT_TRUE(cond) __builtin_expect(!!(cond), true)
+#else
+#define TFLITE_EXPECT_FALSE(cond) (cond)
+#define TFLITE_EXPECT_TRUE(cond) (cond)
+#endif
+
+#ifdef _WIN32
+#define TFLITE_NOINLINE __declspec(noinline)
+#else
+#ifdef __has_attribute
+#if __has_attribute(noinline)
+#define TFLITE_NOINLINE __attribute__((noinline))
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute(noinline)
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute
+#endif  // _WIN32
+
+// Normally we'd use ABSL_HAVE_ATTRIBUTE_WEAK and ABSL_ATTRIBUTE_WEAK, but
+// we avoid the absl dependency for binary size reasons.
+#ifdef __has_attribute
+#define TFLITE_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define TFLITE_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if (TFLITE_HAS_ATTRIBUTE(weak) ||                  \
+     (defined(__GNUC__) && !defined(__clang__))) && \
+    !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__)
+#undef TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_ATTRIBUTE_WEAK __attribute__((weak))
+#define TFLITE_HAS_ATTRIBUTE_WEAK 1
+#else
+#define TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_HAS_ATTRIBUTE_WEAK 0
+#endif
+
+#ifndef TF_LITE_STATIC_MEMORY
+// maximum size of a valid flatbuffer
+inline constexpr unsigned int flatbuffer_size_max = 2147483648;
+// If non-zero, the buffer is stored outside of the flatbuffers, string
+inline constexpr char tflite_metadata_buffer_location[] = "buffer_location";
+// field for minimum runtime version, string
+inline constexpr char tflite_metadata_min_runtime_version[] =
+    "min_runtime_version";
+#endif
+
+#endif  // TENSORFLOW_LITE_CORE_MACROS_H_
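A brief sketch of how the new macros are meant to be used; the function is illustrative:

```
#include "tensorflow/lite/core/macros.h"

// Keep the cold path out of line and hint the branch predictor that
// out-of-range indices are rare.
TFLITE_NOINLINE int CheckedIndex(int i, int size) {
  if (TFLITE_EXPECT_FALSE(i < 0 || i >= size)) {
    return -1;
  }
  return i;
}
```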
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h

@@ -16,6 +16,10 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
@@ -26,6 +30,7 @@ limitations under the License.
 #include <functional>
 
 #include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/core/macros.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 #include "tensorflow/lite/kernels/internal/types.h"
@@ -34,6 +39,117 @@ namespace tflite {
 
 constexpr int kReverseShift = -1;
 
+// Reduces and compresses dimensions so that broadcast handling becomes more
+// efficient. Returns true if the output shape is broadcastable; it doesn't
+// contain any degenerate dimension, i.e. shape dimension = 0. False otherwise.
+template <int MAX_DIM = 6>
+bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
+                                  const RuntimeShape& input2_shape,
+                                  size_t* compressed_input1_stride,
+                                  size_t* compressed_input2_stride,
+                                  size_t* compressed_output_shape) {
+  size_t num_compressed_dims = 0;
+  size_t compressed_input1_shape[MAX_DIM];
+  size_t compressed_input2_shape[MAX_DIM];
+  std::fill(compressed_input1_shape, compressed_input1_shape + MAX_DIM, 1);
+  std::fill(compressed_input2_shape, compressed_input2_shape + MAX_DIM, 1);
+  std::fill(compressed_output_shape, compressed_output_shape + MAX_DIM, 1);
+  bool broadcast_input1 = false;
+  bool broadcast_input2 = false;
+  bool first_nonunit = true;
+  const size_t num_input1_dims = input1_shape.DimensionsCount();
+  const size_t num_input2_dims = input2_shape.DimensionsCount();
+  const int32_t* input1_dims = input1_shape.DimsData();
+  const int32_t* input2_dims = input2_shape.DimsData();
+  const size_t num_common_dims = std::min(num_input1_dims, num_input2_dims);
+  for (size_t i = 1; i <= num_common_dims; i++) {
+    const size_t input1_dim = input1_dims[num_input1_dims - i];
+    const size_t input2_dim = input2_dims[num_input2_dims - i];
+    if (input1_dim == 0 || input2_dim == 0) {
+      return false;
+    }
+    if (input1_dim == 1 && input2_dim == 1) {
+      continue;
+    }
+    assert(!broadcast_input1 || !broadcast_input2);
+
+    if (input1_dim == 1) {
+      if (!broadcast_input1) {
+        broadcast_input1 = true;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    } else if (input2_dim == 1) {
+      if (!broadcast_input2) {
+        broadcast_input1 = false;
+        broadcast_input2 = true;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    } else {
+      TFLITE_DCHECK(input1_dim == input2_dim);
+      if (broadcast_input1 || broadcast_input2 || first_nonunit) {
+        broadcast_input1 = false;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    }
+    first_nonunit = false;
+  }
+  if (num_input1_dims > num_input2_dims) {
+    if (!broadcast_input2) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) {
+      const size_t input1_dim = input1_dims[i];
+      if (input1_dim == 0) {
+        return false;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    }
+  } else if (num_input2_dims > num_input1_dims) {
+    if (!broadcast_input1) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) {
+      const size_t input2_dim = input2_dims[i];
+      if (input2_dim == 0) {
+        return false;
+      }
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    }
+  }
+  num_compressed_dims = (num_compressed_dims > 1) ? num_compressed_dims : 1;
+
+  int input1_stride = 1;
+  int input2_stride = 1;
+  for (int i = 0; i < MAX_DIM; ++i) {
+    compressed_input1_stride[i] = input1_stride;
+    input1_stride *= compressed_input1_shape[i];
+    compressed_input2_stride[i] = input2_stride;
+    input2_stride *= compressed_input2_shape[i];
+  }
+  for (int i = 0; i < MAX_DIM; ++i) {
+    if (compressed_input1_shape[i] != compressed_input2_shape[i]) {
+      if (compressed_input1_shape[i] == 1) {
+        compressed_input1_stride[i] = 0;
+      } else {
+        TFLITE_DCHECK_EQ(compressed_input2_shape[i], 1);
+        compressed_input2_stride[i] = 0;
+      }
+    }
+  }
+  return true;
+}
+
 inline void GetActivationMinMax(FusedActivationFunctionType ac,
                                 float* output_activation_min,
                                 float* output_activation_max) {
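A usage sketch for the new helper: collapsing two broadcastable shapes into compressed strides before an element-wise loop. The shapes are illustrative, and this assumes `RuntimeShape`'s initializer-list constructor:

```
#include <cstddef>

#include "tensorflow/lite/kernels/internal/common.h"

void PrepareBroadcast() {
  // Dim 1 of `a` broadcasts against dim 1 of `b`.
  const tflite::RuntimeShape a({2, 1, 3});
  const tflite::RuntimeShape b({2, 4, 3});
  size_t a_strides[6], b_strides[6], out_shape[6];
  const bool ok = tflite::ReduceDimensionsForBroadcast<6>(
      a, b, a_strides, b_strides, out_shape);
  // On success a broadcast dimension gets stride 0, so inner loops can walk
  // both inputs with plain stride arithmetic; `ok` is false only when a
  // shape contains a degenerate (zero) dimension.
  (void)ok;
}
```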
@@ -250,42 +366,11 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
       quantized_multiplier);
 }
 
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                                 x * (1 << left_shift), quantized_multiplier),
-                             right_shift);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
 
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0  (the usual range is (1<<30) to (1>>31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  assert(quantized_multiplier >= 0);
-  assert(shift >= -31 && shift < 8);
-  assert(x >= -(static_cast<int64_t>(1) << 47) &&
-         x < (static_cast<int64_t>(1) << 47));
-
-  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
-                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
-                                   : 0x7FFF;
-  int total_shift = 15 - shift;
-  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
-  int32_t result = x >> total_shift;
-  return result;
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
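The definitions move out of line (now tagged `TFLITE_NOINLINE`), but the declarations keep the usual Q0.31 requantization semantics, roughly `result ≈ x * quantized_multiplier * 2^shift / 2^31`. A worked example under that assumption:

```
#include <cstdint>

#include "tensorflow/lite/kernels/internal/common.h"

int32_t RequantizeByHalf(int32_t acc) {
  // 1 << 30 encodes 0.5 in Q0.31 fixed point; with shift == 0 this computes
  // round(acc * 0.5), e.g. acc == 100 -> 50.
  return tflite::MultiplyByQuantizedMultiplier(acc, 1 << 30, /*shift=*/0);
}
```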
@@ -328,14 +413,16 @@ template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
                 "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input)
-                       : std::numeric_limits<T>::digits;
-#else
   if (integer_input == 0) {
     return std::numeric_limits<T>::digits;
   }
-
+#if defined(__GNUC__)
+  if (std::is_same<T, uint32_t>::value) {
+    return __builtin_clz(integer_input);
+  } else if (std::is_same<T, uint64_t>::value) {
+    return __builtin_clzll(integer_input);
+  }
+#endif
   const T one_in_leading_positive = static_cast<T>(1)
                                     << (std::numeric_limits<T>::digits - 1);
   int leading_zeros = 0;
@@ -344,7 +431,6 @@ int CountLeadingZeros(T integer_input) {
     ++leading_zeros;
   }
   return leading_zeros;
-#endif
 }
 
 template <typename T>
@@ -1039,8 +1125,8 @@ inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
 
 // Copies dims to desc, calculating strides.
 template <int N>
-inline void CopyDimsToDesc(const RuntimeShape& input_shape,
-                           NdArrayDesc<N>* desc_out) {
+TFLITE_NOINLINE void CopyDimsToDesc(const RuntimeShape& input_shape,
+                                    NdArrayDesc<N>* desc_out) {
   int desc_stride = 1;
   for (int i = N - 1; i >= 0; --i) {
     desc_out->extents[i] = input_shape.Dims(i);