xmos-ai-tools 1.1.2.dev216__py3-none-macosx_11_0_arm64.whl → 1.1.2.dev236__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h +16 -0
  2. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h +80 -0
  3. xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h +23 -0
  4. xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/xcore_ops.h +15 -15
  5. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/model.h → signal/micro/kernels/delay_flexbuffers_generated_data.h} +7 -9
  6. xmos_ai_tools/runtime/include/signal/micro/kernels/energy_flexbuffers_generated_data.h +28 -0
  7. xmos_ai_tools/runtime/include/signal/micro/kernels/fft_flexbuffers_generated_data.h +37 -0
  8. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_flexbuffers_generated_data.h +25 -0
  9. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_log_flexbuffers_generated_data.h +27 -0
  10. xmos_ai_tools/runtime/include/signal/micro/kernels/filter_bank_spectral_subtraction_flexbuffers_generated_data.h +26 -0
  11. xmos_ai_tools/runtime/include/signal/micro/kernels/framer_flexbuffers_generated_data.h +25 -0
  12. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/simple_features/no_simple_features_data.h → signal/micro/kernels/irfft.h} +15 -7
  13. xmos_ai_tools/runtime/include/signal/micro/kernels/overlap_add_flexbuffers_generated_data.h +25 -0
  14. xmos_ai_tools/runtime/include/signal/micro/kernels/pcan_flexbuffers_generated_data.h +7 -0
  15. xmos_ai_tools/runtime/include/signal/micro/kernels/rfft.h +31 -0
  16. xmos_ai_tools/runtime/include/signal/micro/kernels/stacker_flexbuffers_generated_data.h +25 -0
  17. xmos_ai_tools/runtime/include/signal/micro/kernels/window_flexbuffers_generated_data.h +25 -0
  18. xmos_ai_tools/runtime/include/signal/src/circular_buffer.h +118 -0
  19. xmos_ai_tools/runtime/include/signal/src/complex.h +29 -0
  20. xmos_ai_tools/runtime/include/signal/src/energy.h +38 -0
  21. xmos_ai_tools/runtime/include/signal/src/fft_auto_scale.h +35 -0
  22. xmos_ai_tools/runtime/include/signal/src/filter_bank.h +69 -0
  23. xmos_ai_tools/runtime/include/signal/src/filter_bank_log.h +38 -0
  24. xmos_ai_tools/runtime/include/signal/src/filter_bank_spectral_subtraction.h +73 -0
  25. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/main_functions.h → signal/src/filter_bank_square_root.h} +14 -17
  26. xmos_ai_tools/runtime/include/signal/src/irfft.h +84 -0
  27. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_common.h +49 -0
  28. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_float.h +31 -0
  29. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int16.h +30 -0
  30. xmos_ai_tools/runtime/include/signal/src/kiss_fft_wrappers/kiss_fft_int32.h +31 -0
  31. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h → signal/src/log.h} +13 -6
  32. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/python_utils.h → signal/src/max_abs.h} +11 -11
  33. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h → signal/src/msb.h} +15 -6
  34. xmos_ai_tools/runtime/include/signal/src/overlap_add.h +46 -0
  35. xmos_ai_tools/runtime/include/signal/src/pcan_argc_fixed.h +41 -0
  36. xmos_ai_tools/runtime/include/signal/src/rfft.h +85 -0
  37. xmos_ai_tools/runtime/include/signal/src/square_root.h +32 -0
  38. xmos_ai_tools/runtime/include/{tensorflow/lite/micro/python/interpreter/src/numpy_utils.h → signal/src/window.h} +13 -15
  39. xmos_ai_tools/runtime/include/signal/testdata/fft_test_data.h +48 -0
  40. xmos_ai_tools/runtime/include/tensorflow/lite/array.h +156 -0
  41. xmos_ai_tools/runtime/include/tensorflow/lite/builtin_ops.h +44 -0
  42. xmos_ai_tools/runtime/include/tensorflow/lite/c/c_api_types.h +6 -0
  43. xmos_ai_tools/runtime/include/tensorflow/lite/c/common.h +8 -25
  44. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/error_reporter.h +3 -3
  45. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/flatbuffer_conversions.h +15 -0
  46. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/builtin_op_data.h +92 -3
  47. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/c_api_types.h +61 -51
  48. xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h +302 -1
  49. xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h +78 -0
  50. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h +129 -43
  51. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/cppmath.h +2 -2
  52. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/portable_tensor.h +23 -4
  53. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/add.h +210 -151
  54. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/comparisons.h +9 -18
  55. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/conv.h +2 -0
  56. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +103 -72
  57. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +2 -0
  58. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +2 -63
  59. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +87 -26
  60. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/mul.h +129 -80
  61. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/reduce.h +42 -93
  62. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/resize_bilinear.h +5 -0
  63. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/reference/sub.h +249 -263
  64. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/runtime_shape.h +11 -1
  65. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/strided_slice_logic.h +5 -1
  66. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/tensor_ctypes.h +5 -10
  67. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/types.h +4 -2
  68. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/kernel_util.h +25 -14
  69. xmos_ai_tools/runtime/include/tensorflow/lite/kernels/op_macros.h +14 -3
  70. xmos_ai_tools/runtime/include/tensorflow/lite/micro/debug_log.h +10 -3
  71. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h +37 -0
  72. xmos_ai_tools/runtime/include/tensorflow/lite/micro/fake_micro_context.h +7 -0
  73. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/add.h +6 -5
  74. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/circular_buffer.h +0 -3
  75. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv.h +19 -20
  76. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/conv_test.h +8 -31
  77. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/depthwise_conv.h +8 -8
  78. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/ethosu.h +1 -1
  79. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/fully_connected.h +9 -9
  80. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_runner.h +14 -9
  81. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/kernel_util.h +9 -4
  82. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/micro_ops.h +119 -100
  83. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/mul.h +4 -4
  84. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/pooling.h +8 -8
  85. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reduce.h +4 -4
  86. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/reshape.h +26 -0
  87. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/softmax.h +12 -16
  88. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/strided_slice.h +40 -0
  89. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/svdf.h +8 -7
  90. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h +5 -5
  91. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa.h +2 -2
  92. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h +26 -21
  93. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_depthwise_conv.h +4 -4
  94. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_reshape.h +2 -4
  95. xmos_ai_tools/runtime/include/tensorflow/lite/micro/kernels/xtensa/xtensa_softmax.h +2 -2
  96. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +5 -0
  97. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/linear_memory_planner.h +4 -0
  98. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/micro_memory_planner.h +4 -0
  99. xmos_ai_tools/runtime/include/tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim.h +4 -0
  100. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_allocator.h +23 -8
  101. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_common.h +38 -0
  102. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_context.h +23 -65
  103. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_graph.h +15 -57
  104. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter.h +16 -5
  105. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_context.h +125 -0
  106. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_interpreter_graph.h +110 -0
  107. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_log.h +6 -8
  108. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_mutable_op_resolver.h +114 -32
  109. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_op_resolver.h +6 -5
  110. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_profiler.h +1 -1
  111. xmos_ai_tools/runtime/include/tensorflow/lite/micro/mock_micro_graph.h +1 -1
  112. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/python_ops_resolver.h +21 -0
  113. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helper_custom_ops.h +3 -4
  114. xmos_ai_tools/runtime/include/tensorflow/lite/micro/test_helpers.h +28 -12
  115. xmos_ai_tools/runtime/include/tensorflow/lite/micro/testing/micro_test.h +1 -0
  116. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/log_utils.h +273 -0
  117. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/metrics.h +41 -0
  118. xmos_ai_tools/runtime/include/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +127 -0
  119. xmos_ai_tools/runtime/include/tensorflow/lite/schema/schema_generated.h +9139 -5010
  120. xmos_ai_tools/runtime/lib/libhost_xtflitemicro.a +0 -0
  121. xmos_ai_tools/runtime/lib/libxtflitemicro.a +0 -0
  122. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.1.0.1.dylib +0 -0
  123. xmos_ai_tools/xinterpreters/libs/macos/xtflm_python.dylib +0 -0
  124. {xmos_ai_tools-1.1.2.dev216.data → xmos_ai_tools-1.1.2.dev236.data}/data/bin/xcore-opt +0 -0
  125. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/METADATA +3 -4
  126. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/RECORD +128 -105
  127. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/WHEEL +1 -1
  128. xmos_ai_tools/runtime/include/tensorflow/lite/core/api/op_resolver.h +0 -129
  129. xmos_ai_tools/runtime/include/tensorflow/lite/micro/all_ops_resolver.h +0 -38
  130. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/audio_provider.h +0 -44
  131. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/command_responder.h +0 -30
  132. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +0 -50
  133. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h +0 -30
  134. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h +0 -43
  135. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +0 -29
  136. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +0 -29
  137. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +0 -151
  138. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h +0 -29
  139. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_features_generator.h +0 -29
  140. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/simple_model_settings.h +0 -43
  141. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h +0 -29
  142. xmos_ai_tools/runtime/include/tensorflow/lite/micro/examples/micro_speech/simple_features/yes_simple_features_data.h +0 -23
  143. xmos_ai_tools/runtime/include/tensorflow/lite/micro/micro_string.h +0 -33
  144. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/interpreter_wrapper.h +0 -51
  145. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/pybind11_lib.h +0 -64
  146. xmos_ai_tools/runtime/include/tensorflow/lite/micro/python/interpreter/src/shared_library.h +0 -40
  147. {xmos_ai_tools-1.1.2.dev216.dist-info → xmos_ai_tools-1.1.2.dev236.dist-info}/top_level.txt +0 -0
xmos_ai_tools/runtime/include/tensorflow/lite/core/c/common.h

@@ -38,6 +38,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/common.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/common.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_COMMON_H_
 #define TENSORFLOW_LITE_CORE_C_COMMON_H_
@@ -157,6 +158,10 @@ int TfLiteFloatArrayGetSizeInBytes(int size);
 // This returns a pointer, that you must free using TfLiteFloatArrayFree().
 TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
 
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteFloatArrayFree.
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src);
+
 // Free memory of array `a`.
 void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 #endif  // TF_LITE_STATIC_MEMORY
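A minimal lifecycle sketch for the new `TfLiteFloatArrayCopy`, assuming the usual pairing with `TfLiteFloatArrayCreate`/`TfLiteFloatArrayFree` (values are illustrative):

```
#include "tensorflow/lite/core/c/common.h"

void CopyScales() {
  TfLiteFloatArray* scales = TfLiteFloatArrayCreate(2);
  scales->data[0] = 0.5f;
  scales->data[1] = 0.25f;
  // The copy is independently owned and must be freed separately.
  TfLiteFloatArray* scales_copy = TfLiteFloatArrayCopy(scales);
  TfLiteFloatArrayFree(scales_copy);
  TfLiteFloatArrayFree(scales);
}
```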
@@ -345,6 +350,8 @@ typedef union TfLitePtrUnion {
 //    as constant inputs for downstream ops (also in prepare).
 //  * kTfLiteCustom: Custom memory allocation provided by the user. See
 //    TfLiteCustomAllocation below.
+//  * kTfLiteVariantObject: Allocation is an arbitrary type-erased C++ object.
+//    Allocation and deallocation are done through `new` and `delete`.
 typedef enum TfLiteAllocationType {
   kTfLiteMemNone = 0,
   kTfLiteMmapRo,
@@ -353,8 +360,40 @@ typedef enum TfLiteAllocationType {
   kTfLiteDynamic,
   kTfLitePersistentRo,
   kTfLiteCustom,
+  kTfLiteVariantObject,
 } TfLiteAllocationType;
 
+// Memory allocation strategies.
+//
+// TfLiteAllocationType values have been overloaded to mean more than their
+// original intent. This enum should only be used to document the allocation
+// strategy used by a tensor for its data.
+typedef enum TfLiteAllocationStrategy {
+  kTfLiteAllocationStrategyUnknown,
+  kTfLiteAllocationStrategyNone,    // No data is allocated.
+  kTfLiteAllocationStrategyMMap,    // Data is mmaped.
+  kTfLiteAllocationStrategyArena,   // Handled by the arena.
+  kTfLiteAllocationStrategyMalloc,  // Uses `malloc`/`free`.
+  kTfLiteAllocationStrategyNew      // Uses `new[]`/`delete[]`.
+} TfLiteAllocationStrategy;
+
+// Describes how stable a tensor attribute is with regard to interpreter runs.
+typedef enum TfLiteRunStability {
+  kTfLiteRunStabilityUnknown,
+  kTfLiteRunStabilityUnstable,   // May change at any time.
+  kTfLiteRunStabilitySingleRun,  // Will stay the same for one run.
+  kTfLiteRunStabilityAcrossRuns  // Will stay the same across all runs.
+} TfLiteRunStability;
+
+// Describes the steps of a TFLite operation life cycle.
+typedef enum TfLiteRunStep {
+  kTfLiteRunStepUnknown,
+  kTfLiteRunStepInit,
+  kTfLiteRunStepPrepare,
+  kTfLiteRunStepEval
+} TfLiteRunStep;
+
 // The delegates should use zero or positive integers to represent handles.
 // -1 is reserved from unallocated status.
 typedef int TfLiteBufferHandle;
@@ -847,7 +886,7 @@ typedef struct TfLiteContext {
   //   }
   //
   // NOTE: The context owns the memory referenced by partition_params_array. It
-  // will be cleared with another call to PreviewDelegateParitioning, or after
+  // will be cleared with another call to PreviewDelegatePartitioning, or after
   // TfLiteDelegateParams::Prepare returns.
   //
   // WARNING: This is an experimental interface that is subject to change.
@@ -878,6 +917,27 @@ typedef struct TfLiteContext {
   TfLiteStatus (*GetModelMetadata)(const struct TfLiteContext* context,
                                    const char* name, const char** ptr,
                                    size_t* bytes);
+
+  // Retrieves the corresponding TfLiteContext of a subgraph that the given
+  // subgraph_index points to and switches to the delegate context for that
+  // subgraph. If an invalid subgraph index is given, returns kTfLiteError.
+  // NOTE: This function is expected to be paired with ReleaseSubgraphContext()
+  // once the delegate preparation is done and/or the delegate context
+  // functions are no longer needed.
+  //
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*AcquireSubgraphContext)(
+      struct TfLiteContext* context, int subgraph_index,
+      struct TfLiteContext** acquired_context);
+  // Releases the subgraph context by switching back to the TFLite kernel
+  // context for the subgraph that the given subgraph_index points to.
+  // NOTE: This function is expected to be used after AcquireSubgraphContext()
+  // once the delegate preparation is done and/or the delegate context
+  // functions are no longer needed.
+  //
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*ReleaseSubgraphContext)(struct TfLiteContext* context,
+                                         int subgraph_index);
 } TfLiteContext;
 
 // `TfLiteRegistrationExternal` is an external version of `TfLiteRegistration`
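A sketch of how a delegate might pair the two new context callbacks during preparation. `PrepareSubgraph` is a hypothetical helper, not part of the header; only the two function-pointer signatures come from the diff above:

```
#include "tensorflow/lite/core/c/common.h"

// Acquire the delegate context for a subgraph, do the preparation work,
// then switch back to the kernel context.
TfLiteStatus PrepareSubgraph(TfLiteContext* context, int subgraph_index) {
  struct TfLiteContext* subgraph_context = nullptr;
  if (context->AcquireSubgraphContext(context, subgraph_index,
                                      &subgraph_context) != kTfLiteOk) {
    return kTfLiteError;  // e.g. invalid subgraph index
  }
  // ... inspect tensors / prepare kernels through subgraph_context ...
  return context->ReleaseSubgraphContext(context, subgraph_index);
}
```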
@@ -886,6 +946,64 @@ typedef struct TfLiteContext {
 // field is exactly the same as with `TfLiteRegistration`.
 typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
 
+// The valid values of the `inplace_operator` field in `TfLiteRegistration`.
+// This allows an op to signal to the runtime that the same data pointer
+// may be passed as an input and output without impacting the result.
+// This does not mean that the memory can safely be reused; it is up to the
+// runtime to determine this, e.g. whether another op consumes the same input
+// or whether an input tensor has sufficient memory allocated to store the
+// output data.
+//
+// Setting these flags authorizes the runtime to set the data pointers of an
+// input and output tensor to the same value. In such cases, the memory
+// required by the output must be less than or equal to that required by the
+// shared input, never greater. If kTfLiteInplaceOpDataUnmodified is set, then
+// the runtime can share the same input tensor with multiple operators'
+// outputs, provided that kTfLiteInplaceOpDataUnmodified is set for all of
+// them. Otherwise, if an input tensor is consumed by multiple operators, it
+// may only be shared with the operator which is the last to consume it.
+//
+// Note that this is a bitmask, so the values should be 1, 2, 4, 8, etc.
+typedef enum {
+  // The default value. This indicates that the same data pointer cannot
+  // safely be passed as an op's input and output.
+  kTfLiteInplaceOpNone = 0,
+  // This indicates that an op's first output's data is identical to its
+  // first input's data, for example Reshape.
+  kTfLiteInplaceOpDataUnmodified = 1,
+  // Setting kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput means
+  // that InputN may be shared with OutputN instead of with the first output.
+  // This flag requires one or more of kTfLiteInplaceOpInputNShared to be set.
+  kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput = 2,
+  // kTfLiteInplaceOpInputNShared indicates that it is safe for an op to share
+  // InputN's data pointer with an output tensor. If
+  // kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is set then
+  // kTfLiteInplaceOpInputNShared indicates that InputN may be shared
+  // with OutputN, otherwise kTfLiteInplaceOpInputNShared indicates that
+  // InputN may be shared with the first output.
+  //
+  // Indicates that an op's first input may be shared with the first output
+  // tensor. kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput has
+  // no impact on the behavior allowed by this flag.
+  kTfLiteInplaceOpInput0Shared = 4,
+  // Indicates that an op's second input may be shared with the first output
+  // if kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is not set,
+  // or the second output if it is set.
+  kTfLiteInplaceOpInput1Shared = 8,
+  // Indicates that an op's third input may be shared with the first output
+  // if kTfLiteInplaceInputCanBeSharedWithCorrespondingOutput is not set,
+  // or the third output if it is set.
+  kTfLiteInplaceOpInput2Shared = 16,
+  // Placeholder to ensure that the enum can hold 64-bit values to
+  // accommodate future fields.
+  kTfLiteInplaceOpMaxValue = UINT64_MAX,
+} TfLiteInPlaceOp;
+
+// The number of shareable inputs supported.
+static const int kTfLiteMaxSharableOpInputs = 3;
+
 typedef struct TfLiteRegistration {
   // Initializes the op from serialized data.
   // Called only *once* for the lifetime of the op, so any one-time allocations
@@ -966,8 +1084,37 @@ typedef struct TfLiteRegistration {
   // does not support asynchronous execution for this `node`.
   struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
                                             TfLiteNode* node);
+
+  // Indicates if an operator's output may safely overwrite its inputs.
+  // See the comments in `TfLiteInPlaceOp`.
+  uint64_t inplace_operator;
 } TfLiteRegistration;
 
+/// \private
+// Old version of `TfLiteRegistration` to maintain binary backward
+// compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
+typedef struct TfLiteRegistration_V3 {
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+  void (*free)(TfLiteContext* context, void* buffer);
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+  const char* (*profiling_string)(const TfLiteContext* context,
+                                  const TfLiteNode* node);
+  int32_t builtin_code;
+  const char* custom_name;
+  int version;
+  TfLiteRegistrationExternal* registration_external;
+  struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
+                                            TfLiteNode* node);
+} TfLiteRegistration_V3;
+
 /// \private
 // Old version of `TfLiteRegistration` to maintain binary backward
 // compatibility.
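A sketch of how a kernel might advertise in-place behavior through the new `inplace_operator` field. The `Elementwise*` callbacks are hypothetical helpers assumed to be defined elsewhere; only the registration fields and flag values come from the header:

```
#include "tensorflow/lite/core/c/common.h"

// Assumed helpers, defined elsewhere in the kernel.
void* ElementwiseInit(TfLiteContext* context, const char* buffer,
                      size_t length);
TfLiteStatus ElementwisePrepare(TfLiteContext* context, TfLiteNode* node);
TfLiteStatus ElementwiseEval(TfLiteContext* context, TfLiteNode* node);

// An element-wise op whose first input may safely be overwritten by its
// first output advertises that with the new bitmask field.
TfLiteRegistration ElementwiseRegistration() {
  TfLiteRegistration reg = {};
  reg.init = ElementwiseInit;
  reg.prepare = ElementwisePrepare;
  reg.invoke = ElementwiseEval;
  reg.inplace_operator = kTfLiteInplaceOpInput0Shared;
  return reg;
}
```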
@@ -1158,6 +1305,7 @@ typedef struct TfLiteOpaqueDelegateBuilder {
   int64_t flags;
 } TfLiteOpaqueDelegateBuilder;
 
+#ifndef TF_LITE_STATIC_MEMORY
 // Creates an opaque delegate and returns its address. The opaque delegate will
 // behave according to the provided 'opaque_delegate_builder'. The lifetime of
 // the objects pointed to by any of the fields within the
@@ -1174,6 +1322,7 @@ TfLiteOpaqueDelegate* TfLiteOpaqueDelegateCreate(
 // Deletes the provided opaque 'delegate'. This function has no effect if the
 // 'delegate' is a null pointer.
 void TfLiteOpaqueDelegateDelete(TfLiteOpaqueDelegate* delegate);
+#endif  // TF_LITE_STATIC_MEMORY
 
 // Returns a pointer to the data associated with the provided opaque 'delegate'.
 //
@@ -1189,7 +1338,159 @@ void TfLiteOpaqueDelegateDelete(TfLiteOpaqueDelegate* delegate);
 // 'opaque_delegate_builder' field is null.
 void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate);
 
+// Returns a tensor's data allocation strategy.
+TfLiteAllocationStrategy TfLiteTensorGetAllocationStrategy(
+    const TfLiteTensor* t);
+
+// Returns how stable a tensor's data buffer address is across runs.
+TfLiteRunStability TfLiteTensorGetBufferAddressStability(const TfLiteTensor* t);
+
+// Returns how stable a tensor's data values are across runs.
+TfLiteRunStability TfLiteTensorGetDataStability(const TfLiteTensor* t);
+
+// Returns the operation step when the data of a tensor is populated.
+//
+// Some operations can precompute their results before the evaluation step.
+// This makes the data available earlier for subsequent operations.
+TfLiteRunStep TfLiteTensorGetDataKnownStep(const TfLiteTensor* t);
+
+// Returns the operation step when the shape of a tensor is computed.
+//
+// Some operations can precompute the shape of their results before the
+// evaluation step. This makes the shape available earlier for subsequent
+// operations.
+TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t);
+
 #ifdef __cplusplus
 }  // extern "C"
+
+#include <utility>
+
+// --- TFLITE VARIANT TENSORS ----
+// Programming languages usually define "variant" as a type that can hold an
+// unbounded set of types. See std::any
+// (https://en.cppreference.com/w/cpp/utility/any) for a related standard
+// library construct. In tensorflow, variant tensors have a data member which
+// is an Object that is destructible and copy constructible.
+// Variant tensors are commonly used to represent non-trivial data
+// semantics that don't fit into simple primitives, such as lists of tensors
+// and datasets. Additionally, they can facilitate containers for optimizing
+// memory movement of tensor data.
+//
+// The following set of classes define the variant tensor member for tflite.
+// They implement a type-erased container intended to be used behind the
+// `data.data : void*` member of `TfLiteTensor`s. Runtime functions interact
+// with the variant member at the level of a `VariantData`, whereas kernels
+// operate with full knowledge of the un-erased type. The `VariantData`
+// class provides abstract methods for destroying and copying `VariantData`.
+// Invoking these methods will dispatch to the erased type opaquely.
+// The contents of any object of a type derived from `AbstractVariant` can be
+// written to `TfLiteTensor::data::data : void*` from kernels. If the runtime
+// were to copy such a tensor through `TfLiteTensorCopy`, the destination data
+// member will contain the result of invoking the erased type's copy
+// constructor. Similarly, when the runtime releases tensors from memory, the
+// erased type's destructor will be invoked. There are a few caveats to
+// consider when using these safely, which we discuss below.
+//
+// EXAMPLE: READING VARIANT TENSORS
+// ```
+// // retrieve input with `type == kTfLiteVariant`
+// TfLiteTensor* input = ...
+// // must first static cast to `VariantData`, more on this below.
+// VariantData* vd_input = static_cast<VariantData*>(t->data.data);
+// CustomType* typed_input =
+//     static_cast<CustomType*>(vd_input);
+// // do custom work on `typed_input`...
+// ```
+//
+// EXAMPLE: WRITING VARIANT TENSORS
+// ```
+// TfLiteTensor* output = ...
+// // construct a new variant object behind the target tensor
+// TfLiteVariantRealloc<DerivedType, DerivedArgs...>(output, args...);
+// // again must static cast to `VariantData*` before writing to `void*`.
+// output->data.data = static_cast<VariantData*>(typed_output);
+// ```
+//
+// WHY STATIC CAST TO `VariantData*`
+// The Standard defines a `reinterpret_cast` from a derived type to its
+// parents as undefined behavior when the parent is a non-standard layout.
+// https://en.cppreference.com/w/cpp/language/reinterpret_cast (see bullet 5).
+// Due to the `VariantData` having virtual members it is indeed non-standard
+// layout, and any type derived from `VariantData` fails to be
+// "transparently-replaceable". I.e. an implicit cast from derived to base in
+// this case may adjust the pointer, and by definition `reinterpret_cast` will
+// not adjust the pointer.
+// Thus, dereferencing a pointer of type `VariantData` which addresses
+// the first byte of an object of said derived type is UB unless it was first
+// implicitly or statically cast to a `VariantData`. Writing the object of
+// derived type directly to `void*` which is dereferenced as a `VariantData`
+// is then UB, and so the intermediate cast through `VariantData` must be
+// enforced.
+// A good example of this issue is elucidated in the bottom code snippet
+// here: https://en.cppreference.com/w/cpp/utility/launder.
+class VariantData {
+ public:
+  // All variant objects must be able to be destroyed and copied.
+  virtual ~VariantData() = default;
+  // A "virtual copy-constructor". Often the destination tensor of a variant
+  // copy may have been previously allocated in a prior call to inference. We
+  // allow the copy to target the destination's buffer (`maybe_alloc`),
+  // for potential reuse and optimizations. `maybe_alloc` must be of the same
+  // underlying derived type. References to whatever object is at
+  // `maybe_alloc` may be invalidated.
+  virtual VariantData* CloneTo(VariantData* maybe_alloc) const = 0;
+};
+
+// Concrete implementations extend `AbstractVariantData` with CRTP.
+template <typename ErasedDerived>
+class AbstractVariantData : public VariantData {
+ public:
+  VariantData* CloneTo(VariantData* maybe_alloc) const override {
+    if (maybe_alloc != nullptr) {
+      // If the output is still allocated, then its object may still be
+      // within its lifetime and the destructor must be called before
+      // re-using the buffer.
+      // This may actually have a non-negligible effect on performance if the
+      // destructor is complex. A future iteration may
+      // introduce copy or move assignment semantics, allowing the
+      // underlying implementation to optimize for this case.
+      auto* derived = static_cast<ErasedDerived*>(maybe_alloc);
+      derived->~ErasedDerived();
+      return new (derived)
+          ErasedDerived(static_cast<ErasedDerived const&>(*this));
+    }
+    return new ErasedDerived(static_cast<ErasedDerived const&>(*this));
+  }
+
+ protected:
+  AbstractVariantData() = default;
+  AbstractVariantData(const AbstractVariantData&) = default;
+  AbstractVariantData(AbstractVariantData&&) = delete;
+};
+
+// Analogous to `TfLiteTensorRealloc` for allocation of tensors whose
+// data member points to an arbitrary C++ object. `VariantType` refers
+// to the erased type of said object and `VariantArgs` refers to
+// a list of argument types with which to construct a new `VariantType`.
+// `VariantArgs` must match a constructor of `VariantType`.
+template <class VariantType, class... VariantArgs>
+TfLiteStatus TfLiteTensorVariantRealloc(TfLiteTensor* t,
+                                        VariantArgs&&... args) {
+  if (t->type != kTfLiteVariant) return kTfLiteError;
+  VariantType* new_vd;
+  if (t->data.raw != nullptr) {
+    auto* target_vd = static_cast<VariantData*>(t->data.data);
+    target_vd->~VariantData();
+    // As above, we assume if `t` is already allocated then it was allocated
+    // with the same `VariantType` as templated.
+    new_vd = new (t->data.raw) VariantType(std::forward<VariantArgs>(args)...);
+  } else {
+    new_vd = new VariantType(std::forward<VariantArgs>(args)...);
+  }
+  t->data.data = static_cast<VariantData*>(new_vd);
+  t->allocation_type = kTfLiteVariantObject;
+  return kTfLiteOk;
+}
+
 #endif  // __cplusplus
 #endif  // TENSORFLOW_LITE_CORE_C_COMMON_H_
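A sketch tying the pieces together: a concrete payload built on the CRTP base and written behind a tensor with `TfLiteTensorVariantRealloc`. `TensorList` and `WriteList` are hypothetical; only `AbstractVariantData` and the realloc template come from the header above:

```
#include <utility>
#include <vector>

#include "tensorflow/lite/core/c/common.h"

// Hypothetical payload: a list of tensor pointers held by a variant tensor.
class TensorList : public AbstractVariantData<TensorList> {
 public:
  explicit TensorList(std::vector<TfLiteTensor*> elems)
      : elems_(std::move(elems)) {}

 private:
  std::vector<TfLiteTensor*> elems_;
};

TfLiteStatus WriteList(TfLiteTensor* output,
                       std::vector<TfLiteTensor*> elems) {
  // Constructs a TensorList behind output->data.data and tags the tensor as
  // kTfLiteVariantObject; output->type must already be kTfLiteVariant.
  return TfLiteTensorVariantRealloc<TensorList>(output, std::move(elems));
}
```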
xmos_ai_tools/runtime/include/tensorflow/lite/core/macros.h (new file)

@@ -0,0 +1,78 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This provides utility macros and functions that are inherently platform
+// specific or shared across runtime & converter.
+#ifndef TENSORFLOW_LITE_CORE_MACROS_H_
+#define TENSORFLOW_LITE_CORE_MACROS_H_
+
+#ifdef __has_builtin
+#define TFLITE_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TFLITE_HAS_BUILTIN(x) 0
+#endif
+
+#if (!defined(__NVCC__)) && (TFLITE_HAS_BUILTIN(__builtin_expect) || \
+                             (defined(__GNUC__) && __GNUC__ >= 3))
+#define TFLITE_EXPECT_FALSE(cond) __builtin_expect(cond, false)
+#define TFLITE_EXPECT_TRUE(cond) __builtin_expect(!!(cond), true)
+#else
+#define TFLITE_EXPECT_FALSE(cond) (cond)
+#define TFLITE_EXPECT_TRUE(cond) (cond)
+#endif
+
+#ifdef _WIN32
+#define TFLITE_NOINLINE __declspec(noinline)
+#else
+#ifdef __has_attribute
+#if __has_attribute(noinline)
+#define TFLITE_NOINLINE __attribute__((noinline))
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute(noinline)
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute
+#endif  // _WIN32
+
+// Normally we'd use ABSL_HAVE_ATTRIBUTE_WEAK and ABSL_ATTRIBUTE_WEAK, but
+// we avoid the absl dependency for binary size reasons.
+#ifdef __has_attribute
+#define TFLITE_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define TFLITE_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if (TFLITE_HAS_ATTRIBUTE(weak) ||                  \
+     (defined(__GNUC__) && !defined(__clang__))) && \
+    !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__)
+#undef TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_ATTRIBUTE_WEAK __attribute__((weak))
+#define TFLITE_HAS_ATTRIBUTE_WEAK 1
+#else
+#define TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_HAS_ATTRIBUTE_WEAK 0
+#endif
+
+#ifndef TF_LITE_STATIC_MEMORY
+// maximum size of a valid flatbuffer
+inline constexpr unsigned int flatbuffer_size_max = 2147483648;
+// If non-zero, the buffer is stored outside of the flatbuffers, string
+inline constexpr char tflite_metadata_buffer_location[] = "buffer_location";
+// field for minimum runtime version, string
+inline constexpr char tflite_metadata_min_runtime_version[] =
+    "min_runtime_version";
+#endif
+
+#endif  // TENSORFLOW_LITE_CORE_MACROS_H_
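A brief sketch of how the new macros are meant to be used; the function is illustrative:

```
#include "tensorflow/lite/core/macros.h"

// Keep the cold path out of line and hint the branch predictor that
// out-of-range indices are rare.
TFLITE_NOINLINE int CheckedIndex(int i, int size) {
  if (TFLITE_EXPECT_FALSE(i < 0 || i >= size)) {
    return -1;
  }
  return i;
}
```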
xmos_ai_tools/runtime/include/tensorflow/lite/kernels/internal/common.h

@@ -16,6 +16,10 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
@@ -26,6 +30,7 @@ limitations under the License.
 #include <functional>
 
 #include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/core/macros.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 #include "tensorflow/lite/kernels/internal/types.h"
@@ -34,6 +39,117 @@ namespace tflite {
 
 constexpr int kReverseShift = -1;
 
+// Reduces and compresses dimensions so that broadcast handling becomes more
+// efficient. Returns true if the output shape is broadcastable; it doesn't
+// contain any degenerate dimension, i.e. shape dimension = 0. False otherwise.
+template <int MAX_DIM = 6>
+bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
+                                  const RuntimeShape& input2_shape,
+                                  size_t* compressed_input1_stride,
+                                  size_t* compressed_input2_stride,
+                                  size_t* compressed_output_shape) {
+  size_t num_compressed_dims = 0;
+  size_t compressed_input1_shape[MAX_DIM];
+  size_t compressed_input2_shape[MAX_DIM];
+  std::fill(compressed_input1_shape, compressed_input1_shape + MAX_DIM, 1);
+  std::fill(compressed_input2_shape, compressed_input2_shape + MAX_DIM, 1);
+  std::fill(compressed_output_shape, compressed_output_shape + MAX_DIM, 1);
+  bool broadcast_input1 = false;
+  bool broadcast_input2 = false;
+  bool first_nonunit = true;
+  const size_t num_input1_dims = input1_shape.DimensionsCount();
+  const size_t num_input2_dims = input2_shape.DimensionsCount();
+  const int32_t* input1_dims = input1_shape.DimsData();
+  const int32_t* input2_dims = input2_shape.DimsData();
+  const size_t num_common_dims = std::min(num_input1_dims, num_input2_dims);
+  for (size_t i = 1; i <= num_common_dims; i++) {
+    const size_t input1_dim = input1_dims[num_input1_dims - i];
+    const size_t input2_dim = input2_dims[num_input2_dims - i];
+    if (input1_dim == 0 || input2_dim == 0) {
+      return false;
+    }
+    if (input1_dim == 1 && input2_dim == 1) {
+      continue;
+    }
+    assert(!broadcast_input1 || !broadcast_input2);
+
+    if (input1_dim == 1) {
+      if (!broadcast_input1) {
+        broadcast_input1 = true;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    } else if (input2_dim == 1) {
+      if (!broadcast_input2) {
+        broadcast_input1 = false;
+        broadcast_input2 = true;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    } else {
+      TFLITE_DCHECK(input1_dim == input2_dim);
+      if (broadcast_input1 || broadcast_input2 || first_nonunit) {
+        broadcast_input1 = false;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    }
+    first_nonunit = false;
+  }
+  if (num_input1_dims > num_input2_dims) {
+    if (!broadcast_input2) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) {
+      const size_t input1_dim = input1_dims[i];
+      if (input1_dim == 0) {
+        return false;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    }
+  } else if (num_input2_dims > num_input1_dims) {
+    if (!broadcast_input1) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) {
+      const size_t input2_dim = input2_dims[i];
+      if (input2_dim == 0) {
+        return false;
+      }
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    }
+  }
+  num_compressed_dims = (num_compressed_dims > 1) ? num_compressed_dims : 1;
+
+  int input1_stride = 1;
+  int input2_stride = 1;
+  for (int i = 0; i < MAX_DIM; ++i) {
+    compressed_input1_stride[i] = input1_stride;
+    input1_stride *= compressed_input1_shape[i];
+    compressed_input2_stride[i] = input2_stride;
+    input2_stride *= compressed_input2_shape[i];
+  }
+  for (int i = 0; i < MAX_DIM; ++i) {
+    if (compressed_input1_shape[i] != compressed_input2_shape[i]) {
+      if (compressed_input1_shape[i] == 1) {
+        compressed_input1_stride[i] = 0;
+      } else {
+        TFLITE_DCHECK_EQ(compressed_input2_shape[i], 1);
+        compressed_input2_stride[i] = 0;
+      }
+    }
+  }
+  return true;
+}
+
 inline void GetActivationMinMax(FusedActivationFunctionType ac,
                                 float* output_activation_min,
                                 float* output_activation_max) {
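A usage sketch for the new helper: collapsing two broadcastable shapes into compressed strides before an element-wise loop. The shapes are illustrative, and this assumes `RuntimeShape`'s initializer-list constructor:

```
#include <cstddef>

#include "tensorflow/lite/kernels/internal/common.h"

void PrepareBroadcast() {
  // Dim 1 of `a` broadcasts against dim 1 of `b`.
  const tflite::RuntimeShape a({2, 1, 3});
  const tflite::RuntimeShape b({2, 4, 3});
  size_t a_strides[6], b_strides[6], out_shape[6];
  const bool ok = tflite::ReduceDimensionsForBroadcast<6>(
      a, b, a_strides, b_strides, out_shape);
  // On success a broadcast dimension gets stride 0, so inner loops can walk
  // both inputs with plain stride arithmetic; `ok` is false only when a
  // shape contains a degenerate (zero) dimension.
  (void)ok;
}
```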
@@ -250,42 +366,11 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
       quantized_multiplier);
 }
 
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                                 x * (1 << left_shift), quantized_multiplier),
-                             right_shift);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
 
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0  (the usual range is (1<<30) to (1>>31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  assert(quantized_multiplier >= 0);
-  assert(shift >= -31 && shift < 8);
-  assert(x >= -(static_cast<int64_t>(1) << 47) &&
-         x < (static_cast<int64_t>(1) << 47));
-
-  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
-                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
-                                   : 0x7FFF;
-  int total_shift = 15 - shift;
-  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
-  int32_t result = x >> total_shift;
-  return result;
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
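The definitions move out of line (now tagged `TFLITE_NOINLINE`), but the declarations keep the usual Q0.31 requantization semantics, roughly `result ≈ x * quantized_multiplier * 2^shift / 2^31`. A worked example under that assumption:

```
#include <cstdint>

#include "tensorflow/lite/kernels/internal/common.h"

int32_t RequantizeByHalf(int32_t acc) {
  // 1 << 30 encodes 0.5 in Q0.31 fixed point; with shift == 0 this computes
  // round(acc * 0.5), e.g. acc == 100 -> 50.
  return tflite::MultiplyByQuantizedMultiplier(acc, 1 << 30, /*shift=*/0);
}
```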
@@ -328,14 +413,16 @@ template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
                 "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input)
-                       : std::numeric_limits<T>::digits;
-#else
   if (integer_input == 0) {
     return std::numeric_limits<T>::digits;
   }
-
+#if defined(__GNUC__)
+  if (std::is_same<T, uint32_t>::value) {
+    return __builtin_clz(integer_input);
+  } else if (std::is_same<T, uint64_t>::value) {
+    return __builtin_clzll(integer_input);
+  }
+#endif
   const T one_in_leading_positive = static_cast<T>(1)
                                     << (std::numeric_limits<T>::digits - 1);
   int leading_zeros = 0;
@@ -344,7 +431,6 @@ int CountLeadingZeros(T integer_input) {
     ++leading_zeros;
   }
   return leading_zeros;
-#endif
 }
 
 template <typename T>
@@ -1039,8 +1125,8 @@ inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
 
 // Copies dims to desc, calculating strides.
 template <int N>
-inline void CopyDimsToDesc(const RuntimeShape& input_shape,
-                           NdArrayDesc<N>* desc_out) {
+TFLITE_NOINLINE void CopyDimsToDesc(const RuntimeShape& input_shape,
+                                    NdArrayDesc<N>* desc_out) {
   int desc_stride = 1;
   for (int i = N - 1; i >= 0; --i) {
     desc_out->extents[i] = input_shape.Dims(i);