xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h

@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdint.h>
 
 #include <algorithm>
+#include <cstddef>
 #include <limits>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
@@ -29,100 +30,179 @@ namespace tflite {
 
 namespace reference_ops {
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const float* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const float* input2_data,
-                            const RuntimeShape& output_shape,
-                            float* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.float_activation_min,
-        params.float_activation_max);
+template <class T>
+struct SubImpl {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
   }
-}
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const int32_t* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const int32_t* input2_data,
-                            const RuntimeShape& output_shape,
-                            int32_t* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.quantized_activation_min,
-        params.quantized_activation_max);
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
   }
-}
 
-// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const float* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const float* input2_data,
-                             const RuntimeShape& output_shape,
-                             float* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params, const T* input1_data,
+                          const T* input2_data, T* output_data, size_t size,
+                          F binary_func) {
+    for (size_t c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
+  }
+};
+
+template <>
+struct SubImpl<int32_t> {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t va = vdupq_n_s32(input1_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
+  }
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.float_activation_min, params.float_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t vb = vdupq_n_s32(input2_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
+  }
+
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params,
+                          const int32_t* input1_data,
+                          const int32_t* input2_data, int32_t* output_data,
+                          size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    int32x4_t vmax = vdupq_n_s32(activation_max);
+    int32x4_t vmin = vdupq_n_s32(activation_min);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
+  }
+};
+
+template <typename T, typename F>
+inline void BroadcastSubRecursiveDimensions(
+    int dimension, const ArithmeticParams& params, const T* input1_data,
+    const T* input2_data, T* output_data, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    size_t* compressed_input1_stride, size_t* compressed_input2_stride,
+    size_t* compressed_output_shape, F binary_func) {
+  if (dimension > 0) {
+    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastSubRecursiveDimensions(
+          dimension - 1, params, input1_data, input2_data, output_data,
+          &input1_offset_c, &input2_offset_c, output_offset,
+          compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, binary_func);
+      *input1_offset_p += compressed_input1_stride[dimension];
+      *input2_offset_p += compressed_input2_stride[dimension];
+    }
+  } else {
+    TFLITE_DCHECK(dimension == 0);
+    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
+    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
+    TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
+    const T* input1_data_ptr = input1_data + *input1_offset_p;
+    const T* input2_data_ptr = input2_data + *input2_offset_p;
+    T* output_data_ptr = output_data + *output_offset;
+    if (input1_is_broadcast) {
+      // input1 is broadcast.
+      SubImpl<T>::BroadcastInput1(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input2_offset_p += compressed_output_shape[dimension];
+    } else if (input2_is_broadcast) {
+      // input2 is broadcast.
+      SubImpl<T>::BroadcastInput2(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+    } else {
+      // Add element-wise.
+      SubImpl<T>::ElementWise(params, input1_data_ptr, input2_data_ptr,
+                              output_data_ptr,
+                              compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+      *input2_offset_p += compressed_output_shape[dimension];
+    }
+    *output_offset += compressed_output_shape[dimension];
+  }
 }
 
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const int32_t* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const int32_t* input2_data,
-                             const RuntimeShape& output_shape,
-                             int32_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+// TODO: b/296510380 - we may be able to factor out this to common.h for all
+// binary arithmetic ops (add, sub, mul).
+template <typename T, typename F>
+inline void BroadcastSubCommon(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const T* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const T* input2_data,
+                               const RuntimeShape& output_shape, T* output_data,
+                               F binary_func) {
+  constexpr int kMaxBroadcastDim = 6;
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxBroadcastDim);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
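
The SubImpl<int32_t> specialization above follows a standard SIMD shape: under USE_NEON, a vector loop handles four int32 lanes per iteration, and the scalar loop that follows resumes from the shared index c, so it covers the remainder (or, without NEON, the whole buffer). Below is a minimal standalone sketch of the same pattern, using __ARM_NEON feature detection rather than TFLite's USE_NEON macro; it is an illustration, not the TFLite kernel.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #ifdef __ARM_NEON
    #include <arm_neon.h>
    #endif

    // Clamped element-wise subtract: out[i] = clamp(a[i] - b[i], lo, hi).
    void SubClamped(const int32_t* a, const int32_t* b, int32_t* out,
                    size_t size, int32_t lo, int32_t hi) {
      size_t c = 0;
    #ifdef __ARM_NEON
      const int32x4_t vmin = vdupq_n_s32(lo);
      const int32x4_t vmax = vdupq_n_s32(hi);
      for (; c + 4 <= size; c += 4) {  // vector main loop: 4 lanes per pass
        int32x4_t vres = vsubq_s32(vld1q_s32(&a[c]), vld1q_s32(&b[c]));
        vres = vminq_s32(vmax, vmaxq_s32(vmin, vres));
        vst1q_s32(&out[c], vres);
      }
    #endif
      for (; c < size; ++c) {  // scalar tail (the whole buffer without NEON)
        out[c] = std::min(hi, std::max(lo, a[c] - b[c]));
      }
    }

    int main() {
      int32_t a[5] = {10, 20, 30, 40, 50}, b[5] = {1, 2, 3, 4, 100}, out[5];
      SubClamped(a, b, out, 5, /*lo=*/0, /*hi=*/40);
      for (int32_t v : out) std::printf("%d ", v);  // prints: 9 18 27 36 0
      std::printf("\n");
    }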
@@ -135,33 +215,6 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
-}
-
-template <int N = 5>
-void BroadcastSubSlow(const ArithmeticParams& params,
-                      const RuntimeShape& input1_shape,
-                      const int64_t* input1_data,
-                      const RuntimeShape& input2_shape,
-                      const int64_t* input2_data,
-                      const RuntimeShape& output_shape, int64_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -174,54 +227,48 @@ void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.int64_activation_min, params.int64_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
+  }
+
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      kMaxBroadcastDim - 1, params, input1_data, input2_data, output_data,
+      &input1_offset, &input2_offset, &output_offset, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape, binary_func);
 }
 
-template <typename T, int N = 5>
+// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
 void BroadcastSubSlow(const ArithmeticParams& params,
                       const RuntimeShape& input1_shape, const T* input1_data,
                       const RuntimeShape& input2_shape, const T* input2_data,
                       const RuntimeShape& output_shape, T* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        T activation_min, activation_max;
+        GetActivationParams(params, &activation_min, &activation_max);
+        return ActivationFunctionWithMinMax(input1_val - input2_val,
+                                            activation_min, activation_max);
+      });
 }
 
-template <int N = 5>
 inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& input1_shape,
                                   const int16_t* input1_data,
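
BroadcastSubCommon replaces the old per-element NdArrayDesc indexing: ReduceDimensionsForBroadcast collapses runs of dimensions that broadcast the same way into at most kMaxBroadcastDim compressed dimensions, each carrying one stride per input (a stride of 0 marks an input that is reused along that dimension), and BroadcastSubRecursiveDimensions then only bumps offsets as it recurses. A minimal sketch of that idea, with illustrative names rather than the TFLite helpers:

    #include <cstddef>
    #include <cstdio>

    // Walk "compressed" dimensions; stride 0 means that input is broadcast
    // (reused) along the dimension. Dimension 0 does the actual arithmetic.
    void BroadcastWalk(int dim, const float* in1, const float* in2, float* out,
                       size_t off1, size_t off2, size_t* out_pos,
                       const size_t* stride1, const size_t* stride2,
                       const size_t* out_shape) {
      if (dim > 0) {
        for (size_t c = 0; c < out_shape[dim]; ++c) {
          BroadcastWalk(dim - 1, in1, in2, out, off1, off2, out_pos, stride1,
                        stride2, out_shape);
          off1 += stride1[dim];  // bump each input by its own stride
          off2 += stride2[dim];
        }
      } else {
        for (size_t c = 0; c < out_shape[0]; ++c) {  // innermost: do the math
          out[(*out_pos)++] =
              in1[off1 + c * stride1[0]] - in2[off2 + c * stride2[0]];
        }
      }
    }

    int main() {
      // [2,1] minus [2,3]: the single column of in1 is broadcast across columns.
      const float in1[] = {10.f, 20.f};
      const float in2[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      float out[6];
      const size_t out_shape[] = {3, 2};  // dim 0 is innermost (3 columns)
      const size_t stride1[] = {0, 1};    // stride 0: broadcast across columns
      const size_t stride2[] = {1, 3};    // contiguous columns, 3 per row
      size_t out_pos = 0;
      BroadcastWalk(1, in1, in2, out, 0, 0, &out_pos, stride1, stride2,
                    out_shape);
      for (float v : out) std::printf("%.0f ", v);  // prints: 9 8 7 16 15 14
      std::printf("\n");
    }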
@@ -230,42 +277,24 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& output_shape,
                                   int16_t* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t scaled_input1_val =
-        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
-    const int32_t scaled_input2_val =
-        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
-    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<int16_t>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  BroadcastSubCommon<int16_t>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](int16_t input1_val, int16_t input2_val,
+         const ArithmeticParams& params) {
+        const int32_t scaled_input1_val =
+            gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
+        const int32_t scaled_input2_val =
+            gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
+        const int32_t raw_output = scaled_input1_val - scaled_input2_val;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<int16_t>(clamped_output);
+      });
 }
 
-template <typename T, int N = 5>
+template <typename T>
 void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const T* input1_data,
@@ -273,52 +302,32 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const T* input2_data,
                            const RuntimeShape& output_shape, T* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val =
-        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val =
-        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32_t scaled_input1_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32_t scaled_input2_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32_t raw_output =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sub, params.output_multiplier, params.output_shift) +
-        params.output_offset;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<T>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        const int32_t shifted_input1_val =
+            (params.input1_offset + input1_val) * (1 << params.left_shift);
+        const int32_t shifted_input2_val =
+            (params.input2_offset + input2_val) * (1 << params.left_shift);
+        const int32_t scaled_input1_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input1_val, params.input1_multiplier,
+                params.input1_shift);
+        const int32_t scaled_input2_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input2_val, params.input2_multiplier,
+                params.input2_shift);
+        const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+        const int32_t raw_output =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                raw_sub, params.output_multiplier, params.output_shift) +
+            params.output_offset;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<T>(clamped_output);
+      });
 }
 
 // Element-wise add that can often be used for inner loop of broadcast add as
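
The BroadcastQuantSubSlow lambda re-centers each input by its zero point, shifts both left for headroom, rescales them to a common resolution with fixed-point multipliers, subtracts, rescales to the output quantization, and finally adds the output zero point and clamps. Under the affine scheme real = scale * (q - zero_point), the following floating-point reference sketches what the integer pipeline computes; it is a checking aid, not the integer implementation.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Float reference for quantized subtraction: dequantize both operands,
    // subtract in real space, requantize to the output scale/zero point.
    int8_t QuantSubReference(int8_t q1, float scale1, int32_t zp1, int8_t q2,
                             float scale2, int32_t zp2, float out_scale,
                             int32_t out_zp) {
      const float real = scale1 * (q1 - zp1) - scale2 * (q2 - zp2);
      const int32_t q =
          static_cast<int32_t>(std::round(real / out_scale)) + out_zp;
      return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }

    int main() {
      // 0.5 * (20 - 0) - 0.25 * (12 - 4) = 10 - 2 = 8;
      // requantized at out_scale 0.1, out_zp 0 -> 80.
      std::printf("%d\n", QuantSubReference(20, 0.5f, 0, 12, 0.25f, 4, 0.1f, 0));
    }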
@@ -405,35 +414,12 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
          const T* input1_data, const RuntimeShape& input2_shape,
          const T* input2_data, const RuntimeShape& output_shape,
          T* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
-    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
-      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
-        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          output_data[Offset(extended_output_shape, b, y, x, c)] =
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-        }
-      }
-    }
-  }
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        return input1_val - input2_val;
+      });
 }
 
 inline void SetActivationMinMax(const ArithmeticParams& params,
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h

@@ -15,6 +15,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
 
+#include <cstring>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
 namespace tflite {
 
 template <int N>
@@ -34,9 +38,12 @@ class RuntimeShape {
 
   RuntimeShape() : size_(0) {}
 
-  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {}
+  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
+    TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
+  }
 
   RuntimeShape(int shape_size, int32_t value) : size_(shape_size) {
+    TFLITE_DCHECK_LE(shape_size, kMaxSmallSize);
     for (int i = 0; i < shape_size; ++i) {
       SetDim(i, value);
     }
@@ -44,6 +51,7 @@ class RuntimeShape {
 
   RuntimeShape(int dimensions_count, const int32_t* dims_data)
       : size_(dimensions_count) {
+    // check of dimensions_count handled by ReplaceWith()
     ReplaceWith(dimensions_count, dims_data);
   }
 
@@ -69,6 +77,7 @@ class RuntimeShape {
 
   static RuntimeShape ExtendedShape(int new_shape_size,
                                     const RuntimeShape& shape) {
+    TFLITE_DCHECK_LE(new_shape_size, kMaxSmallSize);
     return RuntimeShape(new_shape_size, shape, 1);
   }
   int32_t* DimsData() { return dims_; }
@@ -76,6 +85,7 @@ class RuntimeShape {
   const int32_t* DimsDataUpTo5D() const { return dims_; }
 
   void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
+    TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize);
     size_ = dimensions_count;
     int32_t* dst_dims = DimsData();
     std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
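
Each new TFLITE_DCHECK_LE turns what was previously a silent out-of-bounds write into a debug-build assertion whenever a shape is constructed or resized with more than kMaxSmallSize dimensions. A sketch of the guard idiom in isolation, with assert standing in for TFLITE_DCHECK_LE and an assumed bound of 6 (not the TFLite class):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Illustrative mini-shape with the same guard idiom.
    class SmallShape {
     public:
      static constexpr int kMaxSmallSize = 6;  // assumed bound for illustration
      explicit SmallShape(int dimensions_count) : size_(dimensions_count) {
        assert(dimensions_count <= kMaxSmallSize);  // fail fast in debug builds
      }
      void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
        assert(dimensions_count <= kMaxSmallSize);  // guard before the memcpy
        size_ = dimensions_count;
        std::memcpy(dims_, dims_data, dimensions_count * sizeof(int32_t));
      }
      int32_t DimensionsCount() const { return size_; }

     private:
      int32_t size_;
      int32_t dims_[kMaxSmallSize];
    };

    int main() {
      const int32_t dims[4] = {1, 8, 8, 3};
      SmallShape shape(4);
      shape.ReplaceWith(4, dims);                    // within bounds: fine
      std::printf("%d\n", shape.DimensionsCount());  // prints: 4
      // SmallShape bad(7);  // would fire the assert in a debug build
    }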
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h

@@ -103,6 +103,7 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
   const auto shrink_axis_mask = params.shrink_axis_mask;
   const bool shrink_axis = shrink_axis_mask & (1 << axis);
   const int axis_size = input_shape.Dims(axis);
+  const bool offset = params.offset;
   if (shrink_axis) {
     if (start >= axis_size) {
       return start;
@@ -112,6 +113,9 @@ inline int StridedSliceEndForAxis(const tflite::StridedSliceParams& params,
   }
   const auto* indices = params.stop_indices;
   int end = indices[axis];
+  if (offset) {
+    end += start;
+  }
   const int32_t stride = params.strides[axis];
   const int32_t end_mask = (params.end_mask & 1 << axis);
   if (end < 0) {
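
The new offset flag changes how stop_indices[axis] is read: when set, it is treated as a length counted from start rather than an absolute end index, which is why the code adds start to end. A tiny standalone illustration of the two interpretations (not the TFLite helper):

    #include <cstdio>

    // End index for one axis: when `offset` is set, `stop` is a length
    // counted from `start` instead of an absolute index.
    int EndForAxis(int start, int stop, bool offset, int axis_size) {
      int end = stop;
      if (offset) end += start;  // the new behavior under params.offset
      if (end > axis_size) end = axis_size;
      return end;
    }

    int main() {
      // Absolute: slice [2, 5). Offset mode: start 2, length 5 -> slice [2, 7).
      std::printf("%d %d\n", EndForAxis(2, 5, false, 10),
                  EndForAxis(2, 5, true, 10));  // prints: 5 7
    }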
@@ -246,7 +250,7 @@ inline tflite::StridedSliceParams BuildStridedSliceParams(
     int begin_mask, int end_mask, int shrink_axis_mask,
     const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
     const std::vector<int>& strides) {
-  tflite::StridedSliceParams op_params;
+  tflite::StridedSliceParams op_params{};
   const int dims_count = start_indices.size();
 
   op_params.start_indices_count = dims_count;
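
The last hunk switches op_params from default-initialization to value-initialization. For an aggregate of scalar members, the braces zero every field, including ones the function never assigns (such as the newly read offset flag), whereas the plain declaration leaves them indeterminate. A minimal illustration of the distinction:

    #include <cstdio>

    struct Params {
      int start_indices_count;
      int offset;
    };

    int main() {
      Params b{};  // value-initialization: every member zeroed
      std::printf("%d %d\n", b.start_indices_count, b.offset);  // prints: 0 0
      // Params a;  // default-initialization: members indeterminate; reading
      //            // them before assignment would be undefined behavior
    }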