PyPI - xmos-ai-tools - Versions diffs - 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl - Mend

xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/hard_swish.h ADDED Viewed

@@ -0,0 +1,168 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_HARD_SWISH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_HARD_SWISH_H_
+#include <algorithm>
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite_micro {
+namespace reference_ops {
+inline int16_t SaturatingLeftShift(int16_t value, int amount) {
+  int64_t result = static_cast<int64_t>(value) * (1 << amount);
+  result = std::min<int64_t>(result, std::numeric_limits<int16_t>::max());
+  result = std::max<int64_t>(result, std::numeric_limits<int16_t>::min());
+  return result;
+}
+// Similar to ARM instruction SQDMULH.
+// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
+// rounding to zero instead of to nearest (SQRDMULH).
+inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
+  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
+  std::int32_t a_32(a);
+  std::int32_t b_32(b);
+  std::int32_t ab_32 = a_32 * b_32;
+  std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
+  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
+}
+template <typename T>
+inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
+  auto matching_size = MatchingFlatSize(input_shape, output_shape);
+  const T* in_end = input_data + matching_size;
+  for (; input_data < in_end; input_data++, output_data++) {
+    const float in = *input_data;
+    *output_data =
+        in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
+        6;
+  }
+}
+template <typename T>
+inline void HardSwish(const HardSwishParams& params,
+                      const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; i++) {
+    const int16_t input_value = input_data[i] - params.input_zero_point;
+    // Left-shift as much as we can without overflow/saturation to put
+    // significant bits in the high bits of our 16-bit fixedpoint values, so
+    // that fixed-point approximate computations below are as accurate as
+    // possible.
+    const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
+    // Compute the input value on essentially the output scale, just not
+    // right-shifted yet. This is the value that we'll use in the (x >= +3)
+    // case, and that in the general case we'll multiply against the "relu-ish"
+    // fixed-point multiplier in [0, 1].
+    const int16_t input_value_on_preshift_output_scale =
+        gemmlowp::SaturatingRoundingDoublingHighMul(
+            input_value_on_hires_input_scale,
+            params.output_multiplier_fixedpoint_int16);
+    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
+    // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
+    // case, it is just that plus saturation at the boundaries of [-3, 3].
+    // First, we rescale from [-3, 3] to [-1, 1], saturating.
+    // That is done by rescaling the input value with a fixed-point multiplier
+    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
+    // that input value on the scale where the real value 3.0f is represented
+    // by the quantized value 32768.  (+32768 is actually not representable as
+    // int16_t, so this saturates at +32767, and that is seen empirically to be
+    // a negligible contribution to numerical error/bias).
+    //
+    // This code is careful to correctly implement any magnitude of multiplier,
+    // involving either a right shift or a left shift, with correct saturation
+    // behavior in the left-shift case. This forces this code to be more
+    // complicated, but is necessary for real applications: a partially
+    // trained quantized MobileNet v3-small model that motivated this code
+    // exhibits some large [min, max] range boundaries, of the order of
+    // magnitude of 10 or 100 depending on layers.
+    //
+    // The next few lines are basically just an ordinary
+    // MultiplyByQuantizedMultiplier, except that we are more careful here
+    // about the fine details of saturation when left-shifting, because here
+    // overflow in left-shift is a common case, not an anomaly as
+    // MultiplyByQuantizedMultiplier assumes.
+    int16_t reluish_value = input_value_on_hires_input_scale;
+    // Shift left, saturating, as much as we can while ensuring that this
+    // saturation will not contribute to the result. That is, left shift amount
+    // reduced by 1.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(
+          reluish_value, params.reluish_multiplier_exponent - 1);
+    }
+    // Apply the fixed-point multiplier, dividing the value by a divisor
+    // ranging in [1, 2].
+    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
+        reluish_value, params.reluish_multiplier_fixedpoint_int16);
+    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
+    // any saturation affects the result, it is happening here --- any
+    // saturation having occurred above is overwritten here, not affecting the
+    // result.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(reluish_value, 1);
+    }
+    // Shift right, in the right-shifting case.
+    if (params.reluish_multiplier_exponent < 0) {
+      reluish_value = gemmlowp::RoundingDivideByPOT(
+          reluish_value, -params.reluish_multiplier_exponent);
+    }
+    // At this point we have rescaled the value into a 16bit fixedpoint
+    // reluish_value in [-1, 1].
+    // We now convert that to a 16bit fixedpoint value in [0, 1].
+    reluish_value = (reluish_value + (1 << 15)) >> 1;
+    // Use of SaturatingDoublingHighMul here is important to cancel the biases
+    // from the above SaturatingRoundingDoublingHighMul.
+    //
+    // On a partially trained MobileNet-v3-small,
+    //
+    //                                       | bias on    |  ImageNet
+    //                                       | quantized  |  Top-1
+    // Operation used here                   | values     |  accuracy (50k)
+    // --------------------------------------+------------+-----------
+    // SaturatingDoublingHighMul             | -0.0024    |  58.920
+    // SaturatingRoundingDoublingHighMul     | -0.0067    |  58.064
+    //
+    // In activations_test, this is covered by this testcase:
+    //     QuantizedActivationsOpTest.HardSwishBias
+    //
+    const int16_t preshift_output_value = SaturatingDoublingHighMul(
+        reluish_value, input_value_on_preshift_output_scale);
+    // We were so far operating on the pre-shift output scale. Now we finally
+    // apply that output shift, arriving at the final output scale.
+    int16_t output_value = gemmlowp::RoundingDivideByPOT(
+        preshift_output_value, -params.output_multiplier_exponent);
+    output_value += params.output_zero_point;
+    output_value =
+        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
+    output_value =
+        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
+    output_data[i] = output_value;
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_HARD_SWISH_H_

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h ADDED Viewed

@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite_micro {
+namespace reference_integer_ops {
+inline void CheckArithmeticParams(const ArithmeticParams& params) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  // Input offset is negative input zero point. Activation tensors are
+  // asymmetric quantized so they span the full int8 range.
+  TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
+}
+// TODO: b/270589088 - move to a more appropriate file (b/270589088#comment2)
+template <typename T>
+void BroadcastInput1(int size, const ArithmeticParams& params,
+                     const T* input1_data, const T* input2_data, T* output_data,
+                     void (*check_arithmetic_params)(const ArithmeticParams&),
+                     T (*binary_func)(T, T, const ArithmeticParams&)) {
+  CheckArithmeticParams(params);
+  for (int i = 0; i < size; ++i) {
+    output_data[i] = binary_func(input1_data[0], input2_data[i], params);
+  }
+}
+template <typename T>
+void BroadcastInput2(int size, const ArithmeticParams& params,
+                     const T* input1_data, const T* input2_data, T* output_data,
+                     void (*check_arithmetic_params)(const ArithmeticParams&),
+                     T (*binary_func)(T, T, const ArithmeticParams&)) {
+  CheckArithmeticParams(params);
+  for (int i = 0; i < size; ++i) {
+    output_data[i] = binary_func(input1_data[i], input2_data[0], params);
+  }
+}
+// TODO: b/270589088 - move to a more appropriate file (b/270589088#comment2)
+template <typename T>
+void ElementWise(int size, const ArithmeticParams& params, const T* input1_data,
+                 const T* input2_data, T* output_data,
+                 void (*check_arithmetic_params)(const ArithmeticParams&),
+                 T (*binary_func)(T, T, const ArithmeticParams&)) {
+  CheckArithmeticParams(params);
+  for (int i = 0; i < size; ++i) {
+    output_data[i] = binary_func(input1_data[i], input2_data[i], params);
+  }
+}
+template <typename T>
+inline void BroadcastAddRecursiveDimensions(
+    const ArithmeticParams& params, int dimension, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    size_t* compressed_input1_stride, size_t* compressed_input2_stride,
+    size_t* compressed_output_shape, const T* input1_data, const T* input2_data,
+    T* output_data, void (*check_arithmetic_params)(const ArithmeticParams&),
+    T (*binary_func)(T, T, const ArithmeticParams&)) {
+  if (dimension > 0) {
+    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastAddRecursiveDimensions(
+          params, dimension - 1, &input1_offset_c, &input2_offset_c,
+          output_offset, compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, input1_data, input2_data, output_data,
+          check_arithmetic_params, binary_func);
+      *input1_offset_p += compressed_input1_stride[dimension];
+      *input2_offset_p += compressed_input2_stride[dimension];
+    }
+  } else {
+    TFLITE_DCHECK(dimension == 0);
+    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
+    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
+    TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
+    const T* input1_data_ptr = input1_data + *input1_offset_p;
+    const T* input2_data_ptr = input2_data + *input2_offset_p;
+    T* output_data_ptr = output_data + *output_offset;
+    if (input1_is_broadcast) {
+      // input1 is broadcast.
+      BroadcastInput1<T>(compressed_output_shape[dimension], params,
+                         input1_data_ptr, input2_data_ptr, output_data_ptr,
+                         check_arithmetic_params, binary_func);
+      *input2_offset_p += compressed_output_shape[dimension];
+    } else if (input2_is_broadcast) {
+      // input2 is broadcast.
+      BroadcastInput2<T>(compressed_output_shape[dimension], params,
+                         input1_data_ptr, input2_data_ptr, output_data_ptr,
+                         check_arithmetic_params, binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+    } else {
+      // Add element-wise.
+      ElementWise<T>(compressed_output_shape[dimension], params,
+                     input1_data_ptr, input2_data_ptr, output_data_ptr,
+                     check_arithmetic_params, binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+      *input2_offset_p += compressed_output_shape[dimension];
+    }
+    *output_offset += compressed_output_shape[dimension];
+  }
+}
+// TODO: b/270589088 - move to a more appropriate file. (b/270589088#comment2)
+template <typename T>
+void BroadcastBinaryFunction6DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data,
+    void (*check_arithmetic_params)(const ArithmeticParams&),
+    T (*binary_func)(T, T, const ArithmeticParams&)) {
+  constexpr int kMaxBroadcastDim = 6;
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
+  }
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastAddRecursiveDimensions(
+      params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
+      &output_offset, compressed_input1_stride, compressed_input2_stride,
+      compressed_output_shape, input1_data, input2_data, output_data,
+      check_arithmetic_params, binary_func);
+}
+template <typename T>
+void BroadcastBinaryFunction4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data,
+    void (*check_arithmetic_params)(const ArithmeticParams&),
+    T (*binary_func)(T, T, const ArithmeticParams&)) {
+  BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape,
+                                input2_data, output_shape, output_data,
+                                check_arithmetic_params, binary_func);
+}
+inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
+  const int32_t input1_val = params.input1_offset + x;
+  const int32_t input2_val = params.input2_offset + y;
+  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+  const int32_t scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  const int32_t scaled_input2_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input2_val, params.input2_multiplier, params.input2_shift);
+  const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+  const int32_t raw_output =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          raw_sum, params.output_multiplier, params.output_shift) +
+      params.output_offset;
+  const int32_t clamped_output =
+      std::min(params.quantized_activation_max,
+               std::max(params.quantized_activation_min, raw_output));
+  return static_cast<int8_t>(clamped_output);
+}
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  ElementWise(size, params, input1_data, input2_data, output_data,
+              CheckArithmeticParams, AddFunc);
+}
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  CheckArithmeticParams(params);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+inline void BroadcastAdd6DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape,
+                                input2_data, output_shape, output_data,
+                                CheckArithmeticParams, AddFunc);
+}
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape,
+                                input2_data, output_shape, output_data,
+                                CheckArithmeticParams, AddFunc);
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h ADDED Viewed

@@ -0,0 +1,241 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+#include <algorithm>
+#include "tensorflow/lite/kernels/internal/common.h"
+namespace tflite_micro {
+namespace reference_integer_ops {
+// Fixed-point per-channel-quantization convolution reference kernel.
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          auto group = out_channel / filters_per_group;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+              if (!is_point_inside_image) {
+                continue;
+              }
+              for (int in_channel = 0; in_channel < filter_input_depth;
+                   ++in_channel) {
+                int32_t input_val =
+                    input_data[Offset(input_shape, batch, in_y, in_x,
+                                      in_channel + group * filter_input_depth)];
+                int32_t filter_val = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                // Accumulate with 32 bits accumulator.
+                // In the nudging process during model quantization, we force
+                // real value of 0.0 be represented by a quantized value. This
+                // guarantees that the input_offset is a int8_t, even though
+                // it is represented using int32_t. int32_t += int8_t *
+                // (int8_t - int8_t) so the highest value we can get from each
+                // accumulation is [-127, 127] * ([-128, 127] -
+                // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                // = 14.98, which means we can accumulate at least 2^16
+                // multiplications without overflow. The accumulator is
+                // applied to a filter so the accumulation logic will hold as
+                // long as the filter size (filter_y * filter_x * in_channel)
+                // does not exceed 2^16, which is the case in all the models
+                // we have seen so far.
+                // TODO(b/174275578): Add a check to make sure the
+                // accumulator depth is smaller than 2^16.
+                acc += filter_val * (input_val + input_offset);
+              }
+            }
+          }
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+// Fixed-point per-channel-quantization convolution reference kernel.
+// 16-bit data and 8-bit filter
+template <typename AccumScalar>
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const AccumScalar* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          auto group = out_channel / filters_per_group;
+          AccumScalar acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+              if (!is_point_inside_image) {
+                continue;
+              }
+              for (int in_channel = 0; in_channel < filter_input_depth;
+                   ++in_channel) {
+                int32_t input_val =
+                    input_data[Offset(input_shape, batch, in_y, in_x,
+                                      in_channel + group * filter_input_depth)];
+                int32_t filter_val = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                // Accumulate with 64 bits accumulator.
+                // int64_t += int8_t * int16_t so the highest value we can
+                // get from each accumulation is [-127, 127] * ([-32768,
+                // 32767] -
+                // [-32768, 32767]), which is [-8322945, 8322945].
+                // log2(8322945) = 22.99.
+                acc += filter_val * input_val;
+              }
+            }
+          }
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          scaled_acc = std::max(scaled_acc, output_activation_min);
+          scaled_acc = std::min(scaled_acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int16_t>(scaled_acc);
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_