PyPI - xmos-ai-tools - Versions diffs - 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl - Mend

xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16.h ADDED Viewed

@@ -0,0 +1,54 @@
+#ifndef _output_transform_fn_int16_h_
+#define _output_transform_fn_int16_h_
+#include <stdint.h>
+typedef struct {  /* Parameter block passed as `params` to output_transform_fn_int16(). */
+    int32_t output_slice_channel_count;  /* NOTE(review): presumably the number of output channels in the slice being processed - confirm. */
+} otfn_int16_params_t;
+/** Function that transforms a ring buffer accumulator
+ * into a vector of 16 bit numbers after multiplying and scaling.
+ * In the name of efficiency the inputs should be provided in the following
+ * order (K indicates the value that affects output channel K):
+ *
+ * vD/vR:   0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15.
+ *          (within each group of four, the second and third element are
+ *          swapped)
+ *          If only few channels are being used, eg two, they
+ *          must be: 0, X, 1, X, X, X, X, X, X, X, X, X, X, X, X, X.
+ *
+ * mul_add: m1, m3, m5, m7, m9, m11, m13, m15,
+ *          a1, a3, a5, a7, a9, a11, a13, a15,
+ *          m0, m2, m4, m6, m8, m10, m12, m14,
+ *          a0, a2, a4, a6, a8, a10, a12, a14.
+ *          If only few channels are being used, eg two, they
+ *          must be: m1, X, X, X, X, X, X, X, a1, X, X, X, X, X, X, X,
+ *                   m0, X, X, X, X, X, X, X, a0, X, X, X, X, X, X, X.
+ *
+ * output:  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ *          Only the channels used are written, eg, for two channels
+ *          0, 1.
+ *
+ * In the descriptions below, N is the number of vector elements to
+ * process; N <= 16.
+ * NOTE(review): the prototype has no N argument; presumably N is taken from
+ * ``params->output_slice_channel_count`` - confirm.
+ *
+ * \param  params  Pointer to the parameter block of type
+ *                 ``otfn_int16_params_t``.
+ *
+ * \param  output  Pointer to the desired place where N values will be
+ *                 outputted
+ *
+ * \param  vDvR    Pointer to 32 shorts storing 16 upper halves
+ *                 of the ring buffer, and 16 lower halves in that
+ *                 order
+ *
+ * \param  output_channel_group
+ *                 NOTE(review): not described by the original comment;
+ *                 presumably the index of the group of output channels
+ *                 being processed - confirm.
+ *
+ * \param  mul_add Pointer to four vectors of length N/2 integers, the first
+ *                 and third vector are multipliers (Q2.30), the second
+ *                 and fourth vectors are adders (Q16.16).
+ *
+ * \returns        A pointer to int16_t.
+ *                 NOTE(review): presumably the position in ``output`` just
+ *                 after the values written - confirm.
+ */
+extern int16_t *output_transform_fn_int16(otfn_int16_params_t *params,
+                                          int16_t *output,
+                                          int16_t *vDvR,
+                                          int32_t output_channel_group,
+                                          int32_t *mul_add);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16_kernel_transform.h ADDED Viewed

@@ -0,0 +1,37 @@
+#ifndef _output_transform_fn_int16_kernel_transform_h_
+#define _output_transform_fn_int16_kernel_transform_h_
+#include <stdint.h>
+/**
+ * Function that performs all the transformations needed for a 16-bit convolution
+ *  1. It transforms the weight ordering to reverse the weights in groups of 16 in preparation of VLMACCR shift-and-rotate
+ *  2. It converts the channel multipliers to integer multipliers
+ *  3. It interleaves the channel multipliers and channel biases into a single blob
+ * The number of kernels is assumed to be input_channels x output_channels elements
+ * The number of bias terms and multipliers is assumed to be output_channels
+ * The number of elements in the array mul_add_out should be 2 x output_channels
+ * The number of elements in the kernel_weights_out array should be input_channels x output_channels
+ *
+ * @param kernel_weights_in       kernel weights input
+ *
+ * @param channel_multipliers_in  per-channel multipliers.
+ *
+ * @param channel_bias_terms_in   per-channel bias terms.
+ *
+ * @param kernel_weights_out      reordered kernel weights
+ *
+ * @param mul_add_out             mixed per-channel multipliers and bias-terms
+ *
+ * @param input_channels          number of input channels
+ *
+ * @param output_channels         number of output channels
+ */
+extern void output_transform_fn_int16_kernel_transform(
+    const int8_t *kernel_weights_in,
+    const float *channel_multipliers_in, const int *channel_bias_terms_in,
+    int8_t *kernel_weights_out, int32_t *mul_add_out,
+    int input_channels, int output_channels);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/output_transform_fn_int16_mappings.h ADDED Viewed

@@ -0,0 +1,13 @@
+#ifndef _output_transform_fn_int16_mappings_h_
+#define _output_transform_fn_int16_mappings_h_
+// This defines the mapping of the output transform multipliers from output channels
+extern int ot_int16_mul_index_used_for_output[];
+// This defines the mapping of the output transform biases from output channels
+extern int ot_int16_add_index_used_for_output[];
+// This defines the kernel mapping from output channels
+extern int aggr_ot_int16_input_channel_used_for_output[];
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/quadratic_approximation.h ADDED Viewed

@@ -0,0 +1,82 @@
+#ifndef _quadratic_approximation_h_
+#define _quadratic_approximation_h_
+#ifdef __xcore__
+#define ACTIVATION_FUNCTION __attribute__(( fptrgroup("activation_functions") ))
+#else
+#define ACTIVATION_FUNCTION /**/
+#endif
+#include "nn_api.h"
+#include <stdint.h>
+#define QUADRATIC_APPROXIMATION_MAX_CHUNKS   2048
+/** Type that stores an approximation table.
+ * Must be stored 64-bit aligned when presented to assembly code.
+ * Use:
+ *    * ``quadratic_function_table_number_bytes()`` to query the size
+ *    * ``quadratic_function_table_bytes()`` to obtain a pointer to the table
+ */
+struct quadratic_function_table {
+    struct {          // The order matters - this is how the assembly code expects them
+        int32_t c;
+        int8_t a;
+        int8_t padding;
+        int16_t b;
+    } coefficients[QUADRATIC_APPROXIMATION_MAX_CHUNKS];
+    int data_bytes;
+};
+typedef struct quadratic_function_table quadratic_function_table_t;
+/* Function pointer type - any float -> float
+ */
+typedef float (*float_function_t)(float x);
+/** Function that builds a quadratic approximation table
+ * The function passed in must be monotonic.
+ * NOTE(review): the original wording here was garbled - confirm whether
+ * continuity is also required.
+ * Any number of chunks will work but the assembly implementation assumes 128.
+ * NOTE(review): the table type holds QUADRATIC_APPROXIMATION_MAX_CHUNKS
+ * entries, so presumably chunks must not exceed that - confirm.
+ * The function fills in the table passed in, and reports through two
+ * arguments the max error, and the sqrt of the sum of squared errors as a
+ * goodness metric.
+ *
+ * \param table         interpolation table to be filled in
+ * \param av            function to be interpolated
+ * \param input_scaler  scale that is applied to the input, eg 8.0/32768.0
+ * \param output_scaler scale that is applied to the output, eg 32768.0
+ * \param chunks        number of interpolations. Set to 128.
+ * \param max_error     maximum error, ought to be 1
+ * \param error         sqrt of sum of squared errors.
+ */
+C_API void quadratic_approximation_generator(
+    quadratic_function_table_t *table,
+    ACTIVATION_FUNCTION float_function_t av,
+    double input_scaler,
+    double output_scaler,
+    int chunks,
+    int *max_error,
+    double *error);
+/** Function that returns the number of bytes in an approximation table
+ *
+ * \param x   the table
+ * \returns   The number of bytes in the table
+ */
+C_API uint32_t quadratic_function_table_number_bytes(quadratic_function_table_t *x);
+/** Function that returns a pointer to the bytes in an approximation table
+ *
+ * \param x   the table
+ * \returns   Pointer to the bytes in the table
+ */
+C_API uint8_t *quadratic_function_table_bytes(quadratic_function_table_t *x);
+/** Example functions that can be passed in
+ */
+C_API float approximation_function_tanh(float x);
+C_API float approximation_function_logistics(float x);
+C_API float approximation_function_elu(float x);
+C_API float approximation_function_relu(float x);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/quadratic_interpolation.h ADDED Viewed

@@ -0,0 +1,23 @@
+#ifndef _quadratic_interpolation_h_
+#define _quadratic_interpolation_h_
+#include <stdint.h>
+/** Function that performs a quadratic interpolation on a vector of inputs given a table
+ * of coefficients
+ *
+ * \param    outputs       Output vector, 16-bit signed integers
+ *
+ * \param    inputs        Input vector, 16-bit signed integers
+ *
+ * \param    coeffs        The bytes comprising the table of coefficients produced with
+ *                         ``quadratic_approximation_generator()``.
+ *                         Must be 64-bit aligned.
+ *
+ * \param    N             Number of 16-bit elements in the vector
+ */
+extern void quadratic_interpolation_128(int16_t *outputs, int16_t *inputs,
+                                        uint8_t *coeffs,
+                                        uint32_t N);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/quantize_int16.h ADDED Viewed

@@ -0,0 +1,22 @@
+#ifndef _quantize_int16_h_
+#define _quantize_int16_h_
+#include <stdint.h>
+/**
+ * Function that implements quantization of a float tensor to a 16-bit tensor.
+ * The blob must have been created by a call to ``quantize_int16_tensor_blob()``
+ *
+ * @param output         Output tensor (int16_t elements)
+ *
+ * @param input          Input tensor (float elements)
+ *                       Must be word-aligned
+ *
+ * @param tensor_length  Number of elements in the tensor (product of all dimensions)
+ *                       There are no constraints on this number.
+ *
+ * @param blob           Transformed constant input tensor
+ *                       Must be word-aligned
+ */
+void quantize_int16_tensor(int16_t *output, float *input,
+                             int tensor_length, void *blob);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/quantize_int16_transform.h ADDED Viewed

@@ -0,0 +1,33 @@
+#ifndef _quantize_int16_transform_h_
+#define _quantize_int16_transform_h_
+#include "nn_api.h"
+#include <stdint.h>
+/**
+ * Function that builds the constant blob used when quantizing a tensor
+ * to int16.
+ * NOTE(review): the previous wording described "an int16 addition between
+ * two tensors", which does not match this prototype (a single
+ * output_scaler) - confirm the intended description.
+ *
+ * This function should be
+ * called at compile-time, and at run-time the output of this function shall be passed
+ * as the ``blob`` argument of ``quantize_int16_tensor``
+ *
+ * @param output            Output of the function; a blob of
+ *                          ``QUANTIZE_INT16_TENSOR_BYTES()`` bytes.
+ *                          Must be word-aligned.
+ *
+ * @param output_scaler     Quantisation scaler for the output
+ *
+ * @returns 1 on success, 0 on fail (fallback required)
+ */
+C_API int quantize_int16_tensor_blob(void *output,
+                                 float output_scaler);
+/**
+ * Macro that calculates the number of bytes that should be allocated to
+ * store the output of ``quantize_int16_tensor_blob()``
+ */
+#define QUANTIZE_INT16_TENSOR_BYTES()  (1 * sizeof(float))
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/version.h ADDED Viewed

@@ -0,0 +1,13 @@
+// Copyright (c) 2020, XMOS Ltd, All rights reserved
+#ifndef LIB_NN_VERSION_H_
+#define LIB_NN_VERSION_H_
+namespace lib_nn {  // NOTE(review): `namespace` is C++-only, so this header cannot be included from C sources.
+static const unsigned major_version = 0;  // version reads as major.minor.patch = 0.3.0
+static const unsigned minor_version = 3;
+static const unsigned patch_version = 0;
+}  // namespace lib_nn
+#endif  // LIB_NN_VERSION_H_

xmos_ai_tools/runtime/include/lib_nn/api/vpu_memmove_word_aligned.h ADDED Viewed

@@ -0,0 +1,15 @@
+#ifndef _vpu_memmove_word_aligned_h_
+#define _vpu_memmove_word_aligned_h_
+/**
+ * Function that copies a block of memory. Both source and destination
+ * address must be word aligned. Any number of bytes can be copied. There
+ * may be an overlap between the destination and source.
+ *
+ * @param     dst         Destination address, must be word aligned.
+ * @param     src         Source address, must be word aligned.
+ * @param     byte_count  Number of bytes to copy - may be zero
+ */
+void vpu_memmove_word_aligned(void * dst, const void * src, unsigned int byte_count);
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/vpu_memset_256.h ADDED Viewed

@@ -0,0 +1,55 @@
+#include <stdint.h>
+#ifndef _vpu_memset_256_h_
+#define _vpu_memset_256_h_
+/**
+ * Function that replicates a vector. The source address must be word
+ * aligned, the destination address is assumed to be aligned with the
+ * replication pattern in the source. Any number of bytes can be copied.
+ * There should not be an overlap between the destination and source.
+ *
+ * It is assumed that the source address contains 32 replicated bytes (if
+ * the destination address is byte aligned), or that it contains 16
+ * replicated shorts (if the destination address is 16-bit aligned), or
+ * that it contains 8 replicated ints.
+ *
+ * broadcast_32_to_256() and BROADCAST_8_TO_32() can be used to
+ * create the source vector
+ *
+ * @param     dst         Destination address
+ * @param     src         Source address, must be word aligned.
+ * @param     byte_count  Number of bytes to copy - may be zero
+ */
+void vpu_memset_256(void *dst, const void *src, unsigned int byte_count);
+/**
+ * Function that replicates an int over a vector. The vector must be
+ * aligned on an 8-byte boundary. In order to replicate a byte or short over
+ * a vector, combine this with a call to BROADCAST_8_TO_32() or
+ * BROADCAST_16_TO_32(). Declare the vector as a uint64_t x[] in order to
+ * guarantee 8-byte alignment.
+ *
+ * @param     dst         Destination address, must be 8-byte aligned
+ * @param     from        Value to be replicated
+ */
+void broadcast_32_to_256(void *dst, uint32_t from);
+/**
+ * Macro that replicates a byte over an int.
+ * Use with broadcast_32_to_256() in order to replicate a byte over a vector.
+ * The argument is truncated to 8 bits as a whole expression, and the
+ * multiplication is done in uint32_t so that bytes >= 0x80 cannot overflow
+ * a signed int.
+ */
+#define BROADCAST_8_TO_32(f) ((uint32_t)(uint8_t)(f) * UINT32_C(0x01010101))
+/**
+ * Macro that replicates a short over an int
+ * Use with broadcast_32_to_256() in order to replicate a short over a vector.
+ * The argument is truncated to 16 bits as a whole expression, and the
+ * multiplication is done in uint32_t so that shorts >= 0x8000 cannot
+ * overflow a signed int.
+ */
+#define BROADCAST_16_TO_32(f) ((uint32_t)(uint16_t)(f) * UINT32_C(0x00010001))
+/**
+ * Macro that replicates a byte over a short.
+ * The argument is truncated to 8 bits as a whole expression; the result
+ * (at most 0xFFFF) always fits in an int.
+ */
+#define BROADCAST_8_TO_16(f) ((uint8_t)(f) * 0x00000101)
+#endif

xmos_ai_tools/runtime/include/lib_nn/api/vpu_sim.h ADDED Viewed

@@ -0,0 +1,118 @@
+// Copyright 2020 XMOS LIMITED. This Software is subject to the terms of the
+// XMOS Public License: Version 1
+#ifndef LIB_NN_VPU_SIM_H_
+#define LIB_NN_VPU_SIM_H_
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include "nn_types.h"
+#include "xs3_vpu.h"
+C_API
+typedef union {
+  uint8_t u8[VPU_INT8_EPV];
+  int8_t s8[VPU_INT8_EPV];
+  uint16_t u16[VPU_INT16_EPV];
+  int16_t s16[VPU_INT16_EPV];
+  uint32_t u32[VPU_INT32_EPV];
+  int32_t s32[VPU_INT32_EPV];
+} vpu_vector_t;
+C_API
+typedef enum {
+  MODE_S32 = 0x00,
+  MODE_S16 = 0x100,
+  MODE_S8 = 0x200,
+} vector_mode;
+C_API
+typedef struct {
+  vector_mode mode;
+  vpu_vector_t vR;
+  vpu_vector_t vD;
+  vpu_vector_t vC;
+} xs3_vpu;
+C_API void VSETC(xs3_vpu* vpu, const vector_mode mode);
+C_API void VCLRDR(xs3_vpu* vpu);
+C_API void VLDR(xs3_vpu* vpu, const void* addr);
+C_API void VLDD(xs3_vpu* vpu, const void* addr);
+C_API void VLDC(xs3_vpu* vpu, const void* addr);
+C_API void VSTR(const xs3_vpu* vpu, void* addr);
+C_API void VSTD(const xs3_vpu* vpu, void* addr);
+C_API void VSTC(const xs3_vpu* vpu, void* addr);
+C_API void VSTRPV(const xs3_vpu* vpu, void* addr, unsigned mask);
+C_API void VLMACC(xs3_vpu* vpu, const void* addr);
+C_API void VLMACCR(xs3_vpu* vpu, const void* addr);
+C_API void VLMACCR1(xs3_vpu* vpu, const void* addr);
+C_API void VPOS(xs3_vpu* vpu);
+C_API void VLSAT(xs3_vpu* vpu, const void* addr);
+C_API void VLSAT_FIXED(xs3_vpu* vpu, const void* addr);
+C_API void VLASHR(xs3_vpu* vpu, const void* addr, const int32_t shr);
+C_API void VLADD(xs3_vpu* vpu, const void* addr);
+C_API void VLSUB(xs3_vpu* vpu, const void* addr);
+C_API void VLMUL(xs3_vpu* vpu, const void* addr);
+C_API void VDEPTH1(xs3_vpu* vpu);
+C_API void VDEPTH8(xs3_vpu* vpu);
+C_API void VDEPTH16(xs3_vpu* vpu);
+/** Print vector register contents based on current vector_mode **/
+C_API void vpu_accu_print(xs3_vpu* vpu);
+C_API void vpu_sim_print(xs3_vpu* vpu);
+C_API void vpu_sim_mem_print(void* address, vector_mode mode);
+// Function for implementing the saturation logic within the VPU.
+C_API int64_t vpu_saturate(const int64_t input, const unsigned bits);
+// Assert if the memory access is non-word aligned.
+// Used like: void assert_word_aligned(const void* address);
+// The argument is parenthesised so that an expression such as `p + 1` is evaluated as pointer arithmetic before the cast; the trailing semicolon is kept for compatibility with existing call sites.
+#define assert_word_aligned(address) assert(((uintptr_t)(address) & 0x3) == 0);
+#ifdef __cplusplus
+namespace nn {
+/**
+ * C++ wrapper around the C VPU simulator interface: owns one xs3_vpu state
+ * and forwards each method to the upper-case C function of the same name.
+ */
+class VPU {
+ private:
+  xs3_vpu vpu;
+ public:
+  // References to the fields of the owned `vpu` state.
+  vpu_vector_t& vD;
+  vpu_vector_t& vR;
+  vpu_vector_t& vC;
+  vector_mode& mode;
+ public:
+  VPU() : vD(vpu.vD), vR(vpu.vR), vC(vpu.vC), mode(vpu.mode) {}
+  /**
+   * Copies the simulator state and binds the references to this object's
+   * own state. The implicit copy constructor would bind them to the source
+   * object's state instead.
+   */
+  VPU(const VPU& other)
+      : vpu(other.vpu), vD(vpu.vD), vR(vpu.vR), vC(vpu.vC), mode(vpu.mode) {}
+  /**
+   * Copies the simulator state only; the references stay bound to this
+   * object. The implicit copy assignment is deleted because of the
+   * reference members.
+   */
+  VPU& operator=(const VPU& other) {
+    vpu = other.vpu;
+    return *this;
+  }
+  /** `mode` should be one of `MODE_S32`, `MODE_S16` or `MODE_S8` */
+  void vsetc(const vector_mode mode) { VSETC(&this->vpu, mode); }
+  void vclrdr() { VCLRDR(&this->vpu); }
+  void vldr(void const* addr) { VLDR(&this->vpu, addr); }
+  void vldd(void const* addr) { VLDD(&this->vpu, addr); }
+  void vldc(void const* addr) { VLDC(&this->vpu, addr); }
+  void vstr(void* addr) { VSTR(&this->vpu, addr); }
+  void vstd(void* addr) { VSTD(&this->vpu, addr); }
+  void vstc(void* addr) { VSTC(&this->vpu, addr); }
+  void vstrpv(void* addr, uint32_t mask) { VSTRPV(&this->vpu, addr, mask); }
+  void vlmacc(void const* addr) { VLMACC(&this->vpu, addr); }
+  void vlmaccr(void const* addr) { VLMACCR(&this->vpu, addr); }
+  void vlmaccr1(void const* addr) { VLMACCR1(&this->vpu, addr); }
+  void vlsat(void const* addr) { VLSAT(&this->vpu, addr); }
+  void vlashr(void const* addr, int32_t shr) { VLASHR(&this->vpu, addr, shr); }
+  void vladd(void const* addr) { VLADD(&this->vpu, addr); }
+  void vlsub(void const* addr) { VLSUB(&this->vpu, addr); }
+  void vlmul(void const* addr) { VLMUL(&this->vpu, addr); }
+  void vdepth1() { VDEPTH1(&this->vpu); }
+  void vdepth8() { VDEPTH8(&this->vpu); }
+  void vdepth16() { VDEPTH16(&this->vpu); }
+};
+}  // namespace nn
+#endif
+#endif  // LIB_NN_VPU_SIM_H_

xmos_ai_tools/runtime/include/lib_nn/api/xs3_vpu.h ADDED Viewed

@@ -0,0 +1,216 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#ifndef XS3_VPU_H_
+#define XS3_VPU_H_
+#include <xs3a_registers.h>
+#include "nn_api.h"
+#ifndef XS1_VSR_TYPE
+/* TODO use from xs3a_kernel.h in a future tools release */
+#define XS1_VSR_HEADROOM_SHIFT 0x0
+#define XS1_VSR_HEADROOM_SIZE 0x5
+#define XS1_VSR_HEADROOM_MASK \
+  (((1 << XS1_VSR_HEADROOM_SIZE) - 1) << XS1_VSR_HEADROOM_SHIFT)
+#define XS1_VSR_HEADROOM(x) \
+  (((x)&XS1_VSR_HEADROOM_MASK) >> XS1_VSR_HEADROOM_SHIFT)
+#define XS1_VSR_HEADROOM_SET(x, v)  \
+  (((x) & ~XS1_VSR_HEADROOM_MASK) | \
+   (((v) << XS1_VSR_HEADROOM_SHIFT) & XS1_VSR_HEADROOM_MASK))
+#define XS1_VSR_SHIFT_SHIFT 0x6
+#define XS1_VSR_SHIFT_SIZE 0x2
+#define XS1_VSR_SHIFT_MASK \
+  (((1 << XS1_VSR_SHIFT_SIZE) - 1) << XS1_VSR_SHIFT_SHIFT)
+#define XS1_VSR_SHIFT(x) (((x)&XS1_VSR_SHIFT_MASK) >> XS1_VSR_SHIFT_SHIFT)
+#define XS1_VSR_SHIFT_SET(x, v)  \
+  (((x) & ~XS1_VSR_SHIFT_MASK) | \
+   (((v) << XS1_VSR_SHIFT_SHIFT) & XS1_VSR_SHIFT_MASK))
+#define XS1_VSR_TYPE_SHIFT 0x8
+#define XS1_VSR_TYPE_SIZE 0x4
+#define XS1_VSR_TYPE_MASK (((1 << XS1_VSR_TYPE_SIZE) - 1) << XS1_VSR_TYPE_SHIFT)
+#define XS1_VSR_TYPE(x) (((x)&XS1_VSR_TYPE_MASK) >> XS1_VSR_TYPE_SHIFT)
+#define XS1_VSR_TYPE_SET(x, v)  \
+  (((x) & ~XS1_VSR_TYPE_MASK) | \
+   (((v) << XS1_VSR_TYPE_SHIFT) & XS1_VSR_TYPE_MASK))
+#define XS1_VSR_LENGTH_SHIFT 0xc
+#define XS1_VSR_LENGTH_SIZE 0x4
+#define XS1_VSR_LENGTH_MASK \
+  (((1 << XS1_VSR_LENGTH_SIZE) - 1) << XS1_VSR_LENGTH_SHIFT)
+#define XS1_VSR_LENGTH(x) (((x)&XS1_VSR_LENGTH_MASK) >> XS1_VSR_LENGTH_SHIFT)
+#define XS1_VSR_LENGTH_SET(x, v)  \
+  (((x) & ~XS1_VSR_LENGTH_MASK) | \
+   (((v) << XS1_VSR_LENGTH_SHIFT) & XS1_VSR_LENGTH_MASK))
+#endif
+#define XS1_VSETC_SHIFT_NOSHIFT 0x0
+#define XS1_VSETC_SHIFT_SHIFTLEFT 0x1
+#define XS1_VSETC_SHIFT_SHIFTRIGHT 0x2
+#define XS1_VSETC_TYPE_INT32 0x0
+#define XS1_VSETC_TYPE_INT16 0x1
+#define XS1_VSETC_TYPE_INT8 0x2
+#define XS1_NUM_WORDS_PER_VECTOR 0x8
+/* End of xs3a_kernel.h */
+#define XS3_VPU_VREG_WIDTH_BITS (XS1_NUM_WORDS_PER_VECTOR * XS1_ALL_BITS_SIZE)
+#define XS3_VPU_VREG_WIDTH_BYTES (XS3_VPU_VREG_WIDTH_BITS >> 3)
+#define XS3_VPU_VREG_WIDTH_WORDS (XS3_VPU_VREG_WIDTH_BYTES >> 2)
+#ifndef __ASSEMBLER__
+C_API enum {
+  VEC_INT_32 = 0, /**< 0 */
+  VEC_INT_16 = 1, /**< 1 */
+  VEC_INT_8 = 2,  /**< 2 */
+  VEC_FLT_32 = 4, /**< 4 */
+  VEC_FLT_16 = 5, /**< 5 */
+  VEC_FLT_8 = 6,  /**< 6 */
+};
+C_API enum {
+  VEC_SH0 = 0, /**< 0 */
+  VEC_SHL = 1, /**< 1 */
+  VEC_SHR = 2, /**< 2 */
+};
+/**
+ * The saturation bounds for signed integers in each VPU operating mode.
+ *
+ * The bounds are symmetric: each minimum is the negation of the
+ * corresponding maximum (e.g. -0x7F rather than -0x80 for 8 bits).
+ */
+C_API enum {
+  VPU_INT8_MAX = 0x7F,  /**<  0x7F */
+  VPU_INT8_MIN = -0x7F, /**< -0x7F */
+  VPU_INT16_MAX = 0x7FFF,  /**<  0x7FFF */
+  VPU_INT16_MIN = -0x7FFF, /**< -0x7FFF */
+  VPU_INT32_MAX = 0x7FFFFFFF,  /**<  0x7FFFFFFF */
+  VPU_INT32_MIN = -0x7FFFFFFF, /**< -0x7FFFFFFF */
+};
+/**
+ * Number of accumulator bits in each operating mode.
+ *
+ * In each operating mode, the VLMACC, VLMACCR and VLSAT instructions operate on
+ * an array of accumulators in the vector registers vR and vD. In each case, the
+ * most significant bits are stored in vD, and the least significant bits are
+ * stored in vR.
+ */
+C_API enum {
+  VPU_INT8_ACC_SIZE = 32,  /**< 32 */
+  VPU_INT16_ACC_SIZE = 32, /**< 32 */
+  VPU_INT32_ACC_SIZE = 40, /**< 40 */
+};
+/**
+ * When vD and vR contain accumulators, the values in this enum indicate how
+ * many least significant bits are stored in vR, with the remaining bits stored
+ * in vD.
+ */
+C_API enum {
+  VPU_INT8_ACC_VR_BITS = 16,  /**< 16 */
+  VPU_INT16_ACC_VR_BITS = 16, /**< 16 */
+  VPU_INT32_ACC_VR_BITS = 32, /**< 32 */
+};
+/**
+ * When vD and vR contain accumulators, the values in this enum can be used to
+ * mask off the bits of the accumulator value which correspond to the portion in
+ * vR.
+ */
+C_API enum {
+  VPU_INT8_ACC_VR_MASK = 0xFFFF,      /**< 0xFFFF */
+  VPU_INT16_ACC_VR_MASK = 0xFFFF,     /**< 0xFFFF */
+  VPU_INT32_ACC_VR_MASK = 0xFFFFFFFF, /**< 0xFFFFFFFF */
+};
+/**
+ * Integer type which fits a single accumulator (32-bits) corresponding to the
+ * 8-bit VPU mode.
+ */
+C_API typedef int32_t vpu_int8_acc_t;
+/**
+ * Integer type which fits a single accumulator (32-bits) corresponding to the
+ * 16-bit VPU mode.
+ */
+C_API typedef int32_t vpu_int16_acc_t;
+/**
+ * Integer type which fits a single accumulator (40-bits) corresponding to the
+ * 32-bit VPU mode.
+ */
+C_API typedef int64_t vpu_int32_acc_t;
+/**
+ * The number of elements which fit into a vector register for each operating
+ * mode.
+ *
+ * This is also the number of elements which are operated on in the following
+ * instructions: VDEPTH1, VDEPTH16, VDEPTH8, VLADD, VLADDD, VLASHR, VLMACCR,
+ * VLMUL, VLSUB, VPOS, VSIGN
+ *
+ */
+C_API enum {
+  VPU_INT8_EPV = 32,  /**< 32 */
+  VPU_INT16_EPV = 16, /**< 16 */
+  VPU_INT32_EPV = 8,  /**< 8 */
+};
+/**
+ * log-base-2 of the corresponding VPU_INT*_EPV values.
+ */
+C_API enum {
+  VPU_INT8_EPV_LOG2 = 5,  /**< 5 */
+  VPU_INT16_EPV_LOG2 = 4, /**< 4 */
+  VPU_INT32_EPV_LOG2 = 3, /**< 3 */
+};
+/**
+ * The number of accumulators, spread across vR and vD, in each operating mode.
+ *
+ * This is also the number of elements consumed (number of multiplies) by the
+ * VLMACC instruction.
+ */
+C_API enum {
+  VPU_BIN_ACC_PERIOD = 16,   /**< 16 */
+  VPU_INT8_ACC_PERIOD = 16,  /**< 16 */
+  VPU_INT16_ACC_PERIOD = 16, /**< 16 */
+  VPU_INT32_ACC_PERIOD = 8,  /**< 8 */
+};
+/**
+ * log-base-2 of the corresponding VPU_INT*_ACC_PERIOD values.
+ */
+C_API enum {
+  VPU_INT8_ACC_PERIOD_LOG2 = 4,  /**< 4 */
+  VPU_INT16_ACC_PERIOD_LOG2 = 4, /**< 4 */
+  VPU_INT32_ACC_PERIOD_LOG2 = 3, /**< 3 */
+};
+/**
+ * The number of elements consumed by a VLMACC instruction in each operating
+ * mode. In other words, the number of simultaneous multiply-accumulates
+ * performed by the VLMACC instruction.
+ */
+C_API enum {
+  VPU_INT8_VLMACC_ELMS = 16,  /**< 16 */
+  VPU_INT16_VLMACC_ELMS = 16, /**< 16 */
+  VPU_INT32_VLMACC_ELMS = 8,  /**< 8 */
+};
+/**
+ * log-base-2 of the corresponding VPU_INT*_VLMACC_ELMS values.
+ */
+C_API enum {
+  VPU_INT8_VLMACC_ELMS_LOG2 = 4,  /**< 4 */
+  VPU_INT16_VLMACC_ELMS_LOG2 = 4, /**< 4 */
+  VPU_INT32_VLMACC_ELMS_LOG2 = 3, /**< 3 */
+};
+#endif  // __ASSEMBLER__
+#endif  // XS3_VPU_H_