PyPI - xmos-ai-tools - Versions diffs - 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl - Mend

xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h ADDED Viewed

@@ -0,0 +1,224 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
+#include <algorithm>
+#include "tensorflow/lite/kernels/internal/common.h"
+namespace tflite_micro {
+namespace reference_integer_ops {
+// Fixed-point per-channel-quantization transpose convolution reference kernel.
+inline void TransposeConv(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
+    int32_t* scratch_buffer) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int num_elements = output_shape.FlatSize();
+  // We need to initialize scratch_buffer to all 0s, as we apply the same
+  // 'scatter' based trick as in float version.
+  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence.
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location.
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds.
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  const int8_t input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  const int8_t filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                        out_channel)] +=
+                      (input_value + input_offset) * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                              out_channel)];
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+// int16_t input (zero_point=0), int8_t filter, int32 or int64 accumulator
+template <typename Scalar>
+inline void TransposeConv(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const Scalar* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
+    Scalar* scratch_buffer) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int num_elements = output_shape.FlatSize();
+  // We need to initialize scratch_buffer to all 0s, as we apply the same
+  // 'scatter' based trick as in float version.
+  memset(scratch_buffer, 0, num_elements * sizeof(Scalar));
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence.
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location.
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds.
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  const int32_t input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  const int32_t filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                        out_channel)] +=
+                      input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          Scalar acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                             out_channel)];
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          scaled_acc = std::max(scaled_acc, output_activation_min);
+          scaled_acc = std::min(scaled_acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int16_t>(scaled_acc);
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/l2normalization.h ADDED Viewed

@@ -0,0 +1,90 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
+#include <algorithm>
+#include <cmath>
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite_micro {
+namespace reference_ops {
+inline void L2Normalization(const tflite_micro::L2NormalizationParams& op_params,
+                            const RuntimeShape& input_shape,
+                            const float* input_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data, float epsilon = 1e-6) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    float l2_norm = std::sqrt(squared_l2_norm);
+    l2_norm = std::max(l2_norm, epsilon);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
+    }
+  }
+}
+inline void L2Normalization(const tflite_micro::L2NormalizationParams& op_params,
+                            const RuntimeShape& input_shape,
+                            const uint8_t* input_data,
+                            const RuntimeShape& output_shape,
+                            uint8_t* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int32_t input_zero_point = op_params.input_zero_point;
+  for (int i = 0; i < outer_size; ++i) {
+    int32_t square_l2_norm = 0;
+    for (int c = 0; c < depth; c++) {
+      int32_t diff = input_data[depth * i + c] - input_zero_point;
+      square_l2_norm += diff * diff;
+    }
+    int32_t inv_l2norm_multiplier;
+    int inv_l2norm_shift;
+    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
+                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
+    for (int c = 0; c < depth; c++) {
+      int32_t diff = input_data[depth * i + c] - input_zero_point;
+      int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+      int32_t unclamped_output_val = 128 + rescaled_diff;
+      int32_t output_val =
+          std::min(static_cast<int32_t>(255),
+                   std::max(static_cast<int32_t>(0), unclamped_output_val));
+      output_data[depth * i + c] = static_cast<uint8_t>(output_val);
+    }
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/leaky_relu.h ADDED Viewed

@@ -0,0 +1,69 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
+#include <algorithm>
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+namespace tflite_micro {
+namespace reference_ops {
+inline void LeakyRelu(const tflite_micro::LeakyReluParams& params,
+                      const RuntimeShape& input_shape, const float* input_data,
+                      const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    // Note that alpha might be > 1 or < 0, so we don't use std::max here.
+    output_data[i] = val > 0 ? val : val * params.alpha;
+  }
+}
+template <typename T>
+inline void QuantizeLeakyRelu(const LeakyReluParams& params,
+                              const RuntimeShape& input_shape,
+                              const T* input_data,
+                              const RuntimeShape& output_shape,
+                              T* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  static const int32_t quantized_min = std::numeric_limits<T>::min();
+  static const int32_t quantized_max = std::numeric_limits<T>::max();
+  for (int i = 0; i < flat_size; ++i) {
+    const int32_t input_value = input_data[i] - params.input_offset;
+    int32_t unclamped_output;
+    if (input_value >= 0) {
+      unclamped_output = params.output_offset +
+                         MultiplyByQuantizedMultiplier(
+                             input_value, params.output_multiplier_identity,
+                             params.output_shift_identity);
+    } else {
+      unclamped_output = params.output_offset +
+                         MultiplyByQuantizedMultiplier(
+                             input_value, params.output_multiplier_alpha,
+                             params.output_shift_alpha);
+    }
+    const T clamped_output =
+        std::min(quantized_max, std::max(quantized_min, unclamped_output));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_

xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/log_softmax.h ADDED Viewed

@@ -0,0 +1,256 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+namespace tflite_micro {
+namespace reference_ops {
+inline void LogSoftmax(const SoftmaxParams& params,
+                       const RuntimeShape& input_shape, const float* input_data,
+                       const RuntimeShape& output_shape, float* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(input_data[i * depth + c] - max);
+    }
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
+    }
+  }
+}
+inline void LogSoftmax(const SoftmaxParams& params,
+                       const RuntimeShape& input_shape,
+                       const uint8_t* input_data,
+                       const RuntimeShape& output_shape, uint8_t* output_data) {
+  const int32_t input_multiplier = params.input_multiplier;
+  const int32_t input_left_shift = params.input_left_shift;
+  const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
+  const int32_t reverse_scaling_right_shift =
+      params.reverse_scaling_right_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large
+  // as -32 before multiplying by input_beta_multiplier, and therefore as
+  // large as -16 afterwards.  Note that exp(-8) is definitely not
+  // insignificant to accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
+  using FixedPointAccum =
+      gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  for (int i = 0; i < outer_size; ++i) {
+    uint8_t max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+    const int32_t fixed_log_sum_of_exps =
+        log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
+            sum_of_exps)
+            .raw();
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32_t>::lowest();
+    const int adjusted_diff_min =
+        std::max(static_cast<int32_t>(
+                     diff_min - 1),  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     -reverse_scaling_right_shift));
+    for (int c = 0; c < depth; ++c) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32_t input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32_t unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+        output_data[i * depth + c] = static_cast<uint8_t>(
+            std::max(std::min(unsat_output, static_cast<int32_t>(255)),
+                     static_cast<int32_t>(0)));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+template <typename T>
+inline void LogSoftmaxQuantized(const SoftmaxParams& params,
+                                const size_t outer_size, const size_t depth,
+                                const RuntimeShape& input_shape,
+                                const T* input_data,
+                                const RuntimeShape& output_shape,
+                                T* output_data) {
+  const int32_t input_multiplier = params.input_multiplier;
+  const int32_t input_left_shift = params.input_left_shift;
+  const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
+  const int32_t reverse_scaling_right_shift =
+      params.reverse_scaling_right_shift;
+  const int diff_min = params.diff_min;
+  static constexpr T kMinT8 = std::numeric_limits<T>::min();
+  static constexpr T kMaxT8 = std::numeric_limits<T>::max();
+  static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
+  // All IntegerBits must agree with Prepare function.
+  // Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
+  static constexpr int kInputIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using F5 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+  using F12 = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
+  for (size_t outer_index = 0; outer_index < outer_size; ++outer_index) {
+    T max_in_row = kMinT8;
+    for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
+      max_in_row =
+          std::max(max_in_row, input_data[outer_index * depth + inner_index]);
+    }
+    // Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
+    F12 sum_of_exps_in_q12 = F12::FromRaw(0);
+    for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_left_shift);
+        sum_of_exps_in_q12 =
+            sum_of_exps_in_q12 +
+            gemmlowp::Rescale<kAccumulationIntegerBits>(
+                exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
+      }
+    }
+    const int32_t log_sum_of_exps_in_q5 =
+        log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
+            sum_of_exps_in_q12)
+            .raw();
+    // Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
+    // smallest representable in Q5.26 plus the log_sum_of_exps.
+    const int32_t shifted_log_sum_of_exps_in_q5 =
+        log_sum_of_exps_in_q5 + kMinInt32;
+    const int32_t adjusted_diff_min =
+        std::max(static_cast<int32_t>(diff_min - 1),
+                 MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
+                                               reverse_scaling_divisor,
+                                               -reverse_scaling_right_shift));
+    for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      // Note use of > below instead of >= above.
+      if (input_diff > adjusted_diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_left_shift);
+        // Rescale and downcast.
+        int32_t output_in_q27 =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_in_q5 - log_sum_of_exps_in_q5),
+                31 - kInputIntegerBits - kOutputIntegerBits) +
+            kMaxT8;
+        output_in_q27 =
+            std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxT8)),
+                     static_cast<int32_t>(kMinT8));
+        output_data[outer_index * depth + inner_index] =
+            static_cast<T>(output_in_q27);
+      } else {
+        output_data[outer_index * depth + inner_index] = kMinT8;
+      }
+    }
+  }
+}
+inline void LogSoftmax(const SoftmaxParams& params, const size_t outer_size,
+                       const size_t depth, const RuntimeShape& input_shape,
+                       const int8_t* input_data,
+                       const RuntimeShape& output_shape, int8_t* output_data) {
+  LogSoftmaxQuantized(params, outer_size, depth, input_shape, input_data,
+                      output_shape, output_data);
+}
+}  // namespace reference_ops
+}  // namespace tflite_micro
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_