PyPI - xmos-ai-tools - Versions diffs - 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl - Mend

xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

xmos_ai_tools/runtime/include/lib_nn/api/nn_config.h ADDED Viewed

@@ -0,0 +1,287 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#pragma once
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_GLOBAL
+ * @brief Configure whether (supported) operators use `-127` or `-128` as
+ * their lower saturation bound.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is defined, it is used as the value
+ * for each config macro `CONFIG_SYMMETRIC_SATURATION_*` (e.g.
+ * `CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8`), unless that macro has been
+ * explicitly set.
+ *
+ * Bypassing the symmetric saturation bound requires additional logic, and so
+ * will generally make the operators slower, though this will be more or less
+ * significant, depending on the specific operators.
+ */
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_conv2d_deep
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `conv2d_deep()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `conv2d_deep()`, define `CONFIG_SYMMETRIC_SATURATION_conv2d_deep` to be
+ * `1`. If it is defined to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_conv2d_deep` is undefined, then the value of
+ * `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is defined. If
+ * neither symbol is defined, `CONFIG_SYMMETRIC_SATURATION_conv2d_deep` defaults
+ * to 0, using a lower saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_deep) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_deep \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_deep)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_deep (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `conv2d_shallowin()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `conv2d_shallowin()`, define
+ * `CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin` to be `1`. If it is defined to
+ * `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin` is undefined, then the
+ * value of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is
+ * defined. If neither symbol is defined,
+ * `CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin` defaults to 0, using a lower
+ * saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_shallowin (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_conv2d_im2col
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `conv2d_im2col()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `conv2d_im2col()`, define `CONFIG_SYMMETRIC_SATURATION_conv2d_im2col`
+ * to be `1`. If it is defined to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_conv2d_im2col` is undefined, then the value
+ * of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is defined.
+ * If neither symbol is defined, `CONFIG_SYMMETRIC_SATURATION_conv2d_im2col`
+ * defaults to 0, using a lower saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_im2col) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_im2col \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_im2col)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_im2col (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `conv2d_depthwise()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `conv2d_depthwise()`, define
+ * `CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise` to be `1`. If it is defined to
+ * `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise` is undefined, then the
+ * value of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is
+ * defined. If neither symbol is defined,
+ * `CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise` defaults to 0, using a lower
+ * saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_depthwise (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_conv2d_1x1
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `conv2d_1x1()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `conv2d_1x1()`, define `CONFIG_SYMMETRIC_SATURATION_conv2d_1x1` to be
+ * `1`. If it is defined to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_conv2d_1x1` is undefined, then the value of
+ * `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is defined. If
+ * neither symbol is defined, `CONFIG_SYMMETRIC_SATURATION_conv2d_1x1` defaults
+ * to 0, using a lower saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_1x1) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_1x1 \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_conv2d_1x1)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_conv2d_1x1 (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_avgpool2d
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `avgpool2d()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `avgpool2d()`, define `CONFIG_SYMMETRIC_SATURATION_avgpool2d` to be `1`.
+ * If it is defined to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_avgpool2d` is undefined, then the value of
+ * `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is defined. If
+ * neither symbol is defined, `CONFIG_SYMMETRIC_SATURATION_avgpool2d` defaults
+ * to 0, using a lower saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_avgpool2d) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_avgpool2d CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_avgpool2d)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_avgpool2d (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_avgpool2d_global
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `avgpool2d_global()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `avgpool2d_global()`, define
+ * `CONFIG_SYMMETRIC_SATURATION_avgpool2d_global` to be `1`. If it is defined to
+ * `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_avgpool2d_global` is undefined, then the
+ * value of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is
+ * defined. If neither symbol is defined,
+ * `CONFIG_SYMMETRIC_SATURATION_avgpool2d_global` defaults to 0, using a lower
+ * saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_avgpool2d_global) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_avgpool2d_global \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_avgpool2d_global)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_avgpool2d_global (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `requantize_16_to_8()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `requantize_16_to_8()`, define
+ * `CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8` to be `1`. If it is defined
+ * to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8` is undefined, then the
+ * value of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is
+ * defined. If neither symbol is defined,
+ * `CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8` defaults to 0, using a lower
+ * saturation bound of `-128`.
+ *
+ * Unfortunately, bypassing the symmetric saturation bounds requires significant
+ * additional logic, and so with the symmetric saturation bound,
+ * `requantize_16_to_8()` is approximately 2.5x faster.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8 \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_requantize_16_to_8 (0)
+#endif
+/**
+ * @macro CONFIG_SYMMETRIC_SATURATION_fully_connected_8
+ * @brief Configure whether `-127` or `-128` is used as the saturation limit for
+ * `fully_connected_8()`.
+ *
+ * The output of 8-bit arithmetic on the XS3 VPU has natural symmetric
+ * saturation bounds of (`-127`, `127`). This may be unacceptable, in which case
+ * (`-128`, `127`) can be used instead.
+ *
+ * To specify that the symmetric saturation lower bound (`-127`) should be used
+ * for `fully_connected_8()`, define
+ * `CONFIG_SYMMETRIC_SATURATION_fully_connected_8` to be `1`. If it is defined
+ * to `0`, `-128` will be used instead.
+ *
+ * If `CONFIG_SYMMETRIC_SATURATION_fully_connected_8` is undefined, then the
+ * value of `CONFIG_SYMMETRIC_SATURATION_GLOBAL` is used instead, if that is
+ * defined. If neither symbol is defined,
+ * `CONFIG_SYMMETRIC_SATURATION_fully_connected_8` defaults to 0, using a lower
+ * saturation bound of `-128`.
+ *
+ */
+#if !defined(CONFIG_SYMMETRIC_SATURATION_fully_connected_8) && \
+    defined(CONFIG_SYMMETRIC_SATURATION_GLOBAL)
+// No per-operator setting was supplied: inherit the global one.
+#define CONFIG_SYMMETRIC_SATURATION_fully_connected_8 \
+  CONFIG_SYMMETRIC_SATURATION_GLOBAL
+#elif !defined(CONFIG_SYMMETRIC_SATURATION_fully_connected_8)
+// Neither symbol is defined: default to 0 (lower bound of `-128`).
+#define CONFIG_SYMMETRIC_SATURATION_fully_connected_8 (0)
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/nn_conv2d_structs.h ADDED Viewed

@@ -0,0 +1,72 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#ifndef CONV2D_STRUCTS_H_
+#define CONV2D_STRUCTS_H_
+#include "nn_image.h"
+/**
+ * Number of output elements produced along one axis of a convolution.
+ *
+ * The dilated window spans `filter_size + (filter_size - 1) * (dilation - 1)`
+ * input elements. The number of positions that window can occupy inside
+ * `input_length` is divided by `stride`, rounding up for non-negative values.
+ *
+ * Every argument is parenthesized in the expansion, so compound expressions
+ * (e.g. a stride of `s + 1`) expand with the intended precedence. Arguments
+ * are evaluated more than once and must therefore be free of side effects.
+ */
+#define CONV2D_OUTPUT_LENGTH(input_length, filter_size, dilation, stride) \
+  ((((input_length) -                                                     \
+     ((filter_size) + ((filter_size) - 1) * ((dilation) - 1)) + 1) +      \
+    (stride) - 1) /                                                       \
+   (stride))
+/**
+ * Input length along one axis that yields `output_length` outputs:
+ * `(output_length - 1) * stride` plus the span of the dilated window,
+ * `filter_size + (filter_size - 1) * (dilation - 1)`.
+ *
+ * Every argument is parenthesized in the expansion. Arguments are evaluated
+ * more than once and must therefore be free of side effects.
+ */
+#define CONV2D_INPUT_LENGTH(output_length, filter_size, dilation, stride) \
+  ((output_length) * (stride) - ((stride) - 1) - 1 +                      \
+   ((filter_size) + ((filter_size) - 1) * ((dilation) - 1)))
+/**
+ * Describes the relationship between the convolution window and the
+ * input image: the window's shape, where it starts, how far it moves per
+ * output pixel, and its dilation.
+ */
+typedef struct {
+  /** The shape of the convolution window */
+  struct {
+    /** Height of the convolution window in pixels */
+    unsigned height;
+    /** Width of the convolution window in pixels */
+    unsigned width;
+  } shape;
+  /**
+   * The initial position of the convolution window, relative to the input
+   * image.
+   *
+   * The position given by this pair indicates where the top-left pixel of the
+   * convolution window begins relative to the top-left pixel of the input
+   * image.
+   *
+   * If this pair is, for example, `(0, 0)`, then the convolution window starts
+   * at the top left of the input image and involves no top or left padding.
+   */
+  struct {
+    /** Row offset of convolution window initial position */
+    int row;
+    /** Column offset of convolution window initial position */
+    int column;
+  } start;
+  /**
+   * The strides of the convolution window. These are the number of (input
+   * image) pixels that the convolution window moves down and right for each
+   * pixel moved down or right in the output image.
+   */
+  struct {
+    /** Vertical stride of the convolution window. */
+    int vertical;
+    /** Horizontal stride of the convolution window. */
+    int horizontal;
+  } stride;
+  /**
+   * The dilation of the convolution window.
+   *
+   * Note: Only supported where explicitly mentioned.
+   */
+  struct {
+    /** Vertical dilation of the convolution window. */
+    int vertical;
+    /** Horizontal dilation of the convolution window. */
+    int horizontal;
+  } dilation;
+} nn_window_params_t;
+#endif  // CONV2D_STRUCTS_H_

xmos_ai_tools/runtime/include/lib_nn/api/nn_image.h ADDED Viewed

@@ -0,0 +1,26 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#ifndef IMAGE_H_
+#define IMAGE_H_
+#include "nn_types.h"
+/**
+ * This struct describes the basic parameters for an image tensor: its height,
+ * its width and the number of channels per pixel.
+ */
+typedef struct {
+  /**
+   * Height of the image (in pixels)
+   */
+  uint32_t height;
+  /**
+   * Width of the image (in pixels)
+   */
+  uint32_t width;
+  /**
+   * Number of channels per pixel
+   */
+  channel_count_t channels;
+} nn_image_params_t;
+#endif  // IMAGE_H_

xmos_ai_tools/runtime/include/lib_nn/api/nn_layers.h ADDED Viewed

@@ -0,0 +1,303 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#ifndef LAYERS_H_
+#define LAYERS_H_
+#include "nn_api.h"
+#include "nn_bin_types.h"
+#include "nn_image.h"
+#include <string.h>
+/**
+ * Struct represents the parameters needed by each `bsign_8()` job.
+ *
+ * Values are set by `bsign_8_prepare()`.
+ *
+ * @note This struct is intended to be opaque.
+ */
+typedef struct {
+  mem_stride_t start;  // NOTE(review): presumably where this job's range begins -- confirm
+  int32_t length;      // NOTE(review): presumably the size of this job's range -- confirm
+} nn_bsign_8_job_t;
+/**
+ * @brief Initialize an instance of the @oper{bsign_8} operator.
+ *
+ * See @oper_ref{bsign_8} for more details about the @oper{bsign_8} operator. To
+ * invoke a @oper{bsign_8} job, call bsign_8().
+ *
+ * When bsign_8() is called, a job (`nn_bsign_8_job_t`) must be supplied to tell
+ * it how to do its work. This function initializes one or more jobs to be
+ * supplied in subsequent calls to bsign_8().
+ *
+ * Each job computes a range of elements in the output vector (possibly the
+ * entire vector).
+ *
+ * `jobs` points to an array of `nn_bsign_8_job_t` to be initialized. Each
+ * element represents one job. There should be `job_count` elements in the
+ * array.
+ *
+ * `zero_point_vect` is passed as a non-const pointer. NOTE(review): presumably
+ * it is filled in by this function from `zero_point` and later handed to
+ * bsign_8(); its required size is not stated in this header -- confirm.
+ *
+ * `N` is the number of elements @math{N} in the input vector @tensor{x} and
+ * output vector @tensor{y}.
+ *
+ * `job_count` indicates the number of jobs to be initialized (and thus the
+ * number of elements in the `jobs` array).
+ *
+ * Unlike many other operators, @oper{bsign_8} will automatically divide the
+ * work to be done as evenly as possible between jobs.
+ *
+ * @param jobs            [out]  Array of jobs to be initialized.
+ * @param zero_point_vect [out]  Zero-point vector (direction assumed, see
+ *                               above).
+ * @param N               [in]   The number of elements in the input.
+ * @param zero_point      [in]   The value @math{z_0} to be used for padding
+ *                               (for all channels)
+ * @param job_count       [in]   The number of jobs to be initialized.
+ */
+void bsign_8_prepare(nn_bsign_8_job_t *jobs, int8_t *zero_point_vect,
+                     const uint32_t N, const int8_t zero_point,
+                     const int32_t job_count);
+/**
+ * @brief Execute @oper{bsign_8} job.
+ *
+ * See @oper_ref{bsign_8} for more details about the @oper{bsign_8} operator.
+ *
+ * An instance of the @oper{bsign_8} operator requires a job (but no plan is
+ * required). See bsign_8_prepare() for more details.
+ *
+ * `Y` points to the output vector @tensor{y} with length @math{N}. The address
+ * supplied for `Y` should be the start address of the output vector (for any
+ * job being processed).
+ *
+ * `X` points to the input vector @tensor{x} with length @math{N}. The address
+ * supplied for `X` should be the start address of the input vector (for any job
+ * being processed).
+ *
+ * `zero_point_vect` is read-only here. NOTE(review): presumably the vector
+ * passed to bsign_8_prepare() under the same name -- confirm.
+ *
+ * `job` points to the (initialized) @oper{bsign_8} job to be performed with
+ * this call.
+ *
+ * @requires_word_alignment{Y,X}
+ *
+ * @param Y               [out]  The output vector @tensor{y}
+ * @param X               [in]   The input vector @tensor{x}
+ * @param zero_point_vect [in]   Zero-point vector (see above)
+ * @param job             [in]   The @oper{bsign_8} job to be processed
+ */
+void bsign_8(bnn_b32_t *Y, const int8_t *X, const int8_t *zero_point_vect,
+             const nn_bsign_8_job_t *job);
+/**
+ * Struct represents the parameters needed by each `pad_run()` job.
+ *
+ * Values are set by `pad_prepare()`.
+ *
+ * NOTE(review): neither `pad_run()` nor `pad_prepare()` is declared in the
+ * visible part of this header -- confirm they still exist under those names.
+ * The field names suggest byte counts for the top/left/right/bottom padding
+ * and the copied middle section, plus a loop count for the middle rows;
+ * verify against the implementation.
+ *
+ * @note This struct is intended to be opaque.
+ */
+typedef struct nn_pad_plan_t {
+  unsigned top_pad_bytes;
+  unsigned mid_loop_count;
+  unsigned left_pad_bytes;
+  unsigned mid_copy_bytes;
+  unsigned right_pad_bytes;
+  unsigned bottom_pad_bytes;
+} nn_pad_plan_t;
+/**
+ * Signed padding sizes for the top, bottom, left and right sides.
+ *
+ * NOTE(review): the units (pixels vs. bytes) and whether negative values are
+ * meaningful are not stated in this header -- confirm against callers.
+ */
+typedef struct padding_sizes_t {
+  int32_t top;
+  int32_t bottom;
+  int32_t left;
+  int32_t right;
+} padding_sizes_t;
+/**
+ * Calculates the value written to `*n_3` from the image `height` and `width`.
+ *
+ * NOTE(review): presumably the result is the `N_3` block count expected by
+ * pad_3_to_4_run() (i.e. the number of 3-byte pixels) -- confirm.
+ *
+ * @param n_3     [out]  Receives the calculated value
+ * @param height  [in]   Image height
+ * @param width   [in]   Image width
+ */
+void pad_3_to_4_prepare(uint32_t *n_3, const unsigned height,
+                        const unsigned width);
+/** Function that pads an image with 3-byte values with a 0.
+ * The output image must be word aligned. This function solves the general
+ * case and calls an optimised assembly version for the bulk copy.
+ *
+ * NOTE(review): the text above says the padding byte is 0, but the prototype
+ * takes `pad_val`; presumably `pad_val` supplies the padding -- confirm.
+ *
+ * @param    outputs    output values, every word contains 3 bytes and a zero
+ * @param    inputs     input values, RGBRGBRGBRGB...
+ * @param    N_3        number of blocks of 3 bytes to copy
+ * @param    pad_val    padding value (see note above)
+ *
+ * The function is declared `void` and returns nothing.
+ *
+ * `pad_3_to_4_ref()` has the same signature. NOTE(review): presumably it is a
+ * portable reference implementation of `pad_3_to_4_run()` -- confirm.
+ */
+extern void pad_3_to_4_run(int8_t outputs[], int8_t inputs[], uint32_t N_3,
+                           uint32_t pad_val);
+extern void pad_3_to_4_ref(int8_t outputs[], int8_t inputs[], uint32_t N_3,
+                           uint32_t pad_val);
+/**
+ * Parameter block passed to mul_boggle() and mul_elementwise().
+ *
+ * NOTE(review): field meanings are inferred from their names only (zero
+ * points of the two inputs, a bias, a scalar multiplier and a shift amount)
+ * -- confirm against the implementation.
+ */
+typedef struct nn_mul_params_t {
+  int8_t in1_zero_point;
+  int8_t in2_zero_point;
+  int16_t bias;
+  int16_t scalar;
+  int16_t vlashr_shr;
+} nn_mul_params_t;
+/**
+ * Takes a writable `nn_mul_params_t` together with the scales and zero points
+ * of both inputs and of the output.
+ *
+ * NOTE(review): presumably this fills `*params` for later use by
+ * mul_elementwise() -- confirm against the implementation.
+ */
+void mul_boggle(nn_mul_params_t *params, double in1Scale, double in2Scale,
+                double outputScale, int8_t in1ZeroPoint, int8_t in2ZeroPoint,
+                int8_t outputZeroPoint);
+/**
+ * Element-wise multiply over two read-only 8-bit inputs, writing to
+ * `out_data`.
+ *
+ * NOTE(review): presumably `element_count` elements are read from each of
+ * `in1_data` and `in2_data` and written to `out_data`, using `params` as
+ * prepared by mul_boggle() -- confirm against the implementation.
+ */
+void mul_elementwise(const int8_t *in1_data, const int8_t *in2_data,
+                     int element_count, nn_mul_params_t *params,
+                     int8_t *out_data);
+/**
+ * Describes the parameters needed for an @oper{add_elementwise} operator.
+ *
+ * @see add_elementwise().
+ */
+typedef struct {
+  /** `m1` and `m2` are the multipliers for the inputs. */
+  int16_t m1[16];
+  int16_t m2[16];
+  /**
+   * `shift` is the number of bits the 32-bit accumulator is right-shifted by
+   * to obtain a final result for each element.
+   */
+  int16_t shift[16];
+  /**
+   * `bias_hi` and `bias_lo` are, together, the 32-bit bias to which the
+   * scaled inputs are added.
+   */
+  int16_t bias_hi[16];
+  int16_t bias_lo[16];
+} nn_add_params_t;
+/**
+ * @brief Invoke an @oper{add_elementwise} job.
+ *
+ * The @oper{add_elementwise} operator adds together two quantized 8-bit input
+ * vectors, @tensor{x_0} and @tensor{x_1} element-by-element to produce the
+ * output vector @tensor{y}. This function assumes that the input vectors and
+ * the output vector each require different quantization parameters.
+ *
+ * In order to add together two quantized vectors, their quantization parameters
+ * must match. The contents of `p` indicate how to do this.
+ *
+ * @par Parameter Details
+ *
+ * `Y` points to the output vector @tensor{y} with shape @tensor_shape{N}.
+ *
+ * `X1` and `X2` respectively point to the first and second input vectors
+ * @tensor{x_0} and @tensor{x_1}, each with shape
+ * @tensor_shape{N}.
+ *
+ * `p` describes the parameters @math{s_i}, @math{m_i}, @math{b} and
+ * @math{s_{out}} which are applied for each output element.
+ *
+ * `elm_start` and `elm_count` together specify which output elements
+ * @math{y[k]} should be calculated by this invocation. Specifically, this
+ * invocation will calculate @math{y[k]} for which `elm_start` @math{\le k \lt}
+ * `(elm_start + elm_count)`.
+ *
+ * @param[out]  Y           The output vector @tensor{y}
+ * @param[in]   X1          The first input vector @tensor{x_0}
+ * @param[in]   X2          The second input vector @tensor{x_1}
+ * @param[in]   p           The scaling and bias parameters
+ * @param[in]   elm_start   Index of first output element to be computed
+ * @param[in]   elm_count   Number of output elements to be computed
+ */
+void add_elementwise(int8_t Y[], const int8_t X1[], const int8_t X2[],
+                     nn_add_params_t *p, const int elm_start,
+                     const int elm_count);
+/**
+ * @brief Execute @oper{lookup8} job.
+ *
+ * See @oper_ref{lookup8} for more details about the @oper{lookup8} operator.
+ *
+ * Unlike other operators, instances of @oper{lookup8} do not require plans or
+ * jobs and no initialization is necessary.
+ *
+ * `Y` points to the output vector @tensor{y} with length @math{N}.
+ *
+ * `X` points to the input vector @tensor{x} with length @math{N}.
+ *
+ * `lut` points to the look-up table @math{T} with shape @tensor_shape{256}.
+ * NOTE(review): this text previously gave the table dtype as `int8`, while the
+ * prototype declares `Y`, `X` and `lut` as `uint8_t` pointers -- confirm which
+ * is intended.
+ *
+ * `elm_start` and `elm_count`: NOTE(review): presumably these select the
+ * elements `elm_start` up to (but excluding) `elm_start + elm_count` to be
+ * processed by this call -- confirm against the implementation.
+ *
+ * @requires_word_alignment{Y,X}
+ *
+ * @param Y         [out]  The output vector @tensor{y}
+ * @param X         [in]   The input vector @tensor{x}
+ * @param lut       [in]   Look-up table @tensor{T}
+ * @param elm_start [in]   Index of the first element (see note above)
+ * @param elm_count [in]   Number of elements (see note above)
+ */
+void lookup8(uint8_t *Y, const uint8_t *X, const uint8_t *lut,
+             const unsigned elm_start, const unsigned elm_count);
+/**
+ * @brief Execute @oper{softmax_exp_sum} job.
+ *
+ * `Y` points to the output scalar.
+ *
+ * `X` points to the input vector @tensor{x} with length @math{N}.
+ *
+ * `lut` points to the look-up table @math{T} with shape @tensor_shape{256} and
+ * dtype `float32`.
+ *
+ * `elm_start` and `elm_count` together specify which elements should be
+ * summed into the output scalar.
+ *
+ * @param Y         [out]  The output scalar
+ * @param X         [in]   The input vector @tensor{x}
+ * @param lut       [in]   Look-up table @math{T}
+ * @param elm_start [in]   First element to be summed
+ * @param elm_count [in]   Number of elements to be summed
+ */
+void softmax_exp_sum(float *Y, const int8_t *X, const float *lut,
+                     const unsigned elm_start, const unsigned elm_count);
+/**
+ * @brief Execute @oper{softmax_exp_div} job.
+ *
+ * `Y` points to the output vector @tensor{y} with length @math{N}.
+ *
+ * `X` points to the input vector @tensor{x} with length @math{N}.
+ *
+ * `lut` points to the look-up table @math{T} with shape @tensor_shape{256} and
+ * dtype `float32`.
+ *
+ * `inv_sum` is the reciprocal of the sum of the exponentials of the inputs.
+ *
+ * `elm_start` and `elm_count` together specify which output elements should be
+ * calculated by this invocation.
+ *
+ * @param Y         [out]  The output vector @tensor{y}
+ * @param X         [in]   The input vector @tensor{x}
+ * @param lut       [in]   Look-up table @math{T}
+ * @param inv_sum   [in]   Reciprocal of the sum of the exponentials
+ * @param elm_start [in]   First output element to be calculated
+ * @param elm_count [in]   Number of output elements to be calculated
+ */
+void softmax_exp_div(int8_t *Y, const int8_t *X, const float *lut,
+                     const float inv_sum, const unsigned elm_start,
+                     const unsigned elm_count);
+/**
+ * NOTE(review): presumably writes to `*inv_sum` the reciprocal of the total of
+ * the partial sums in `sums`; the number of entries read from `sums` is not
+ * visible in this header -- confirm against the implementation.
+ */
+void softmax_calculate_inv_sum(float *inv_sum, const float sums[]);
+/**
+ * NOTE(review): presumably fills `lut` with the `float` exponential table
+ * used by the softmax operators, derived from the quantization `zero_point`
+ * and `scale`; the required size of `lut` is not stated here -- confirm.
+ */
+void softmax_generate_exp_lut(int zero_point, float scale, float *lut);
+/**
+ * NOTE(review): presumably a reference softmax over `length` elements of `X`,
+ * written to `Y`, using the given quantization `zero_point` and `scale` --
+ * confirm against the implementation.
+ */
+void softmax_ref(int8_t *Y, const int8_t *X, const float zero_point,
+                 const float scale, const int length);
+/**
+ * NOTE(review): takes an output pointer `Y`, an input pointer `X`, a `float`
+ * look-up table and an `offset`; the meaning of `offset` and the number of
+ * elements processed are not visible in this header -- confirm.
+ */
+void softmax_single(int8_t *Y, const int8_t *X, const float *lut,
+                    const int offset);
+/**
+ * NOTE(review): presumably computes a quantized mean of `input` over the
+ * middle dimension of a (start_dim_size, mean_dim_size, end_dim_size) layout
+ * and writes it to `output`, applying the zero points and `scale_mul` --
+ * confirm the layout and rounding against the implementation.
+ */
+void mean_int8(const int8_t *input, int8_t *output, const int start_dim_size,
+               const int mean_dim_size, const int end_dim_size,
+               const float in_zero_point, const float out_zero_point,
+               const float scale_mul);
+#endif // LAYERS_H_