PyPI - xmos-ai-tools - Versions diffs - 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl - Mend

xmos-ai-tools 1.3.2.dev80__py3-none-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

xmos_ai_tools/runtime/include/lib_nn/src/asm/asm_constants.h ADDED Viewed

@@ -0,0 +1,41 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#pragma once
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#include "xs3_vpu.h"
+typedef struct {
+  // Word offset = 0; same value as the VPU_VEC_0x007F macro
+  int16_t vec_0x007F[VPU_INT8_ACC_PERIOD];
+  // Word offset = 8; same value as the VPU_VEC_0x01 macro
+  int8_t vec_0x01[VPU_INT8_ACC_PERIOD];
+  // Word offset = 12; same value as the VPU_VEC_0x0002 macro
+  int16_t vec_0x0002[VPU_INT8_ACC_PERIOD];
+  // Word offset = 20; same value as the VPU_VEC_0x80 macro
+  int8_t vec_0x80[VPU_INT8_EPV];
+  // Word offset = 28 (first word after the last member)
+} vpu_constants_t;
+extern const vpu_constants_t vpu_vects;
+extern const uint32_t vpu_vect_zero[VPU_INT32_EPV];
+extern const int16_t vpu_vect_0x007F[VPU_INT16_EPV];
+extern const int8_t vpu_vect_0x01[VPU_INT8_EPV];
+extern const int8_t vpu_vect_0x02[VPU_INT8_EPV];
+extern const int8_t vpu_vect_0x80[VPU_INT8_EPV];
+#endif  // __ASSEMBLER__
+#define VPU_MODE_32BIT 0x0000
+#define VPU_MODE_16BIT 0x0100
+#define VPU_MODE_8BIT 0x0200
+#define VPU_VEC_0x007F (0) /* word offset of vpu_constants_t.vec_0x007F */
+#define VPU_VEC_0x01 (8) /* word offset of vpu_constants_t.vec_0x01 */
+#define VPU_VEC_0x0002 (12) /* word offset of vpu_constants_t.vec_0x0002 */
+#define VPU_VEC_0x80 (20) /* word offset of vpu_constants_t.vec_0x80 */

xmos_ai_tools/runtime/include/lib_nn/src/asm/window_op_plan.h ADDED Viewed

@@ -0,0 +1,25 @@
+// Copyright 2020-2021 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#ifndef WINDOW_OP_PLAN_H_
+#define WINDOW_OP_PLAN_H_
+#define WOP_OUTPUT_ROWS (0)
+#define WOP_OUTPUT_COLS (1)
+#define WOP_OUTPUT_CHANS (2)
+#define WOP_WINDOW_ROWS (3)
+#define WOP_WINDOW_COLS (4)
+#define WOP_START_STRIDE_X (5)
+#define WOP_START_STRIDE_Y (6)
+#define WOP_INNER_STRIDE_VERT (7)
+#define WOP_INNER_STRIDE_HORI (8)
+#define WOP_OUTER_STRIDE_VERT_X (9)
+#define WOP_OUTER_STRIDE_VERT_Y (10)
+#define WOP_OUTER_STRIDE_HORI_X (11)
+#define WOP_OUTER_STRIDE_HORI_Y (12)
+#define WOP_CHAN_STRIDE_X (13)
+#define WOP_CHAN_STRIDE_Y (14)
+#define WOP_WORDS (15) /* one past the last index above; presumably the plan size in words - TODO confirm */
+#endif  // WINDOW_OP_PLAN_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/api/fast_flash.h ADDED Viewed

@@ -0,0 +1,47 @@
+#ifndef _FAST_FLASH_H_
+#define _FAST_FLASH_H_
+#include <quadflash.h>
+/** Fast flash library.
+ * Before calling any of the functions in here, lib_quad_flash must be initialised as normal by using
+ * fl_connectToDevice(qspi, flash_spec, n_flash_spec).
+ * After that, a call to fast_flash_init shall be made.
+ * After that, a sequence of calls to fast_flash_read can be made.
+ *
+ * The data partition must start with the following 32 bytes: **NOTE: REMOVE THE +4 in fast_flash_init**
+ *
+ *   0xff, 0x00, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ *   0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
+ *   0x31, 0xf7, 0xce, 0x08, 0x31, 0xf7, 0xce, 0x08,
+ *   0x9c, 0x63, 0x9c, 0x63, 0x9c, 0x63, 0x9c, 0x63
+ *
+ * This pattern is designed to create maximum difficulties electrically and is used
+ * to calibrate the electrical settings. Note that this pattern must be nibble reversed
+ * before being written to flash; just like all other data.
+ * The rest of the data partition can be used as normal
+ */
+/** Function that initialises the fast_flash library
+ *
+ * \param      qspi        ports that connect to flash
+ *
+ * \returns    a negative value of -1..-5 if the window is too small (size 0..4)
+ *             zero if successful
+ */
+int fast_flash_init(fl_QSPIPorts &qspi);
+/** Function that reads a sequential block of data from flash. The length is
+ * given in words (see word_count).
+ * This function assumes that nibbles have been reversed ((x << 4) & 0xf0 | (x >> 4) & 0x0f)
+ * before the data was written to flash.
+ * Note that reading 32 bytes from offset 0 shall yield the special pattern above.
+ *
+ * \param      qspi        ports that connect to flash
+ * \param      addr        address in flash data segment
+ * \param      word_count  Number of words to read
+ * \param      read_data   array to store data in to.
+ * \param      c_data_out  optional channel end over which data is out() instead.
+ */
+void fast_flash_read(fl_QSPIPorts &qspi, unsigned addr, unsigned word_count, unsigned read_data[], chanend ?c_data_out);
+#endif

xmos_ai_tools/runtime/include/lib_tflite_micro/api/inference_engine.h ADDED Viewed

@@ -0,0 +1,218 @@
+// Copyright (c) 2020, XMOS Ltd, All rights reserved
+#ifndef INFERENCE_ENGINE_H_
+#define INFERENCE_ENGINE_H_
+#ifndef __XC__
+#define UNSAFE /**/
+#else
+#define UNSAFE unsafe
+#endif
+#if !defined(XTFLM_DISABLED)
+#if defined(__xtflm_conf_h_exists__)
+#include "xtflm_conf.h"
+#else
+#ifndef XTFLM_OPERATORS
+#define XTFLM_OPERATORS 10
+#endif
+#endif
+#include "tensorflow/lite/c/c_api_types.h"
+#include "xcore_config.h"
+#ifdef __cplusplus
+#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
+#include "xcore_device_memory.h"
+#include "xcore_error_reporter.h"
+#include "xcore_interpreter.h"
+#include "xcore_ops.h"
+#include "xcore_profiler.h"
+/** Structure that contains all the TensorFlowLite for Micro objects that must
+ * be allocated to create an interpreter. One of these structures has to be
+ * allocated for each inference engine. This structure contains C++ objects, and
+ * must therefore be allocated inside a C++ source file.
+ *
+ * interpreter_buffer is raw storage: its element count is
+ * sizeof(XCoreInterpreter) rounded up to a whole number of uint64_t, and the
+ * uint64_t element type is what gives it double-word alignment.
+ * NOTE(review): presumably `interpreter` points into interpreter_buffer once
+ * the interpreter has been constructed - confirm against the implementation.
+ */
+struct tflite_micro_objects {
+  tflite_micro::micro::xcore::XCoreErrorReporter error_reporter;
+  tflite_micro::micro::xcore::XCoreProfiler xcore_profiler;
+  uint64_t interpreter_buffer
+      [(sizeof(tflite_micro::micro::xcore::XCoreInterpreter) + sizeof(uint64_t) - 1) /
+       sizeof(uint64_t)]; // uint64_t elements keep this aligned on a double word boundary
+  tflite_micro::MicroMutableOpResolver<XTFLM_OPERATORS> resolver;
+  tflite_micro::micro::xcore::XCoreInterpreter *interpreter;
+  const tflite_micro::Model *model;
+};
+#endif
+#endif
+// Opaque definition for the C++ struct above, used in C.
+struct tflite_micro_objects;
+/** Structure that contains all the data needed to describe an inference engine.
+ * This structure contains no C++, just standard C pointers and arrays.
+ *
+ * NOTE(review): NUM_INPUT_TENSORS, NUM_OUTPUT_TENSORS, AISRV_GPIO_LENGTH and
+ * MAX_DEBUG_LOG_LENGTH are used below but not defined in this header -
+ * presumably they come from xtflm_conf.h; confirm.
+ */
+typedef struct inference_engine {
+  uint32_t *UNSAFE
+      memory_primary; ///< Pointer to space for tensor arena and optional model
+  uint32_t *UNSAFE
+      memory_secondary; ///< Pointer to secondary space. If null,
+                        ///< use the primary for model and tensor arena
+  uint32_t
+      outputs; ///< Number of output tensors, initialised on loading a model.
+  uint32_t inputs; ///< Number of input tensors, initialised on loading a model.
+  uint32_t *UNSAFE
+      output_buffers[NUM_OUTPUT_TENSORS]; ///< Pointers to output tensors.
+  uint32_t *UNSAFE
+      input_buffers[NUM_INPUT_TENSORS];      ///< Pointers to input tensors.
+  uint32_t output_sizes[NUM_OUTPUT_TENSORS]; ///< Size of each output tensor in
+                                             ///< bytes.
+  uint32_t
+      input_sizes[NUM_INPUT_TENSORS]; ///< Size of each input tensor in bytes.
+  uint32_t output_size; ///< Total size of all outputs - TODO: obsolete?
+  uint32_t input_size;  ///< Total size of all inputs - TODO: obsolete?
+  uint32_t
+      memory_primary_bytes; ///< Number of bytes available in primary memory
+  uint32_t
+      memory_secondary_bytes; ///< Number of bytes available in secondary memory
+  uint32_t
+      output_times_size; ///< Number of bytes available to store profiling data
+  uint32_t operators_size;       ///< NOTE(review): meaning undocumented - confirm
+  uint32_t *UNSAFE output_times; ///< pointer to profiling data, one per layer
+  struct tflite_micro_objects *UNSAFE
+      xtflm;             ///< Pointer to C++ XTFLM object - opaque to C
+                         // status for the engine to maintain
+  uint32_t haveModel;    ///< if 1: we have a model
+  uint32_t chainToNext;  ///< if 1: we are chained (could be implicit in c_push
+                         ///< being non-null?)
+  uint32_t acquireMode;  ///< if non zero we're acquiring data
+  uint32_t outputGpioEn; // TODO: should this be here? Possibly not
+  int8_t outputGpioThresh[AISRV_GPIO_LENGTH];
+  uint8_t outputGpioMode;
+  uint32_t debug_log_buffer[MAX_DEBUG_LOG_LENGTH /
+                            sizeof(uint32_t)]; ///< buffer for error messages
+  uint32_t arena_needed_bytes;                 ///< Total arena needed in bytes.
+  uint32_t num_threads;
+  struct xc_context_config_t xc_config;
+} inference_engine_t;
+#ifdef __cplusplus
+#ifndef XTFLM_DISABLED
+/** Function that initializes the inference_engine object, given a
+ * tflite_micro_objects object. This function has to be called from a C++ source
+ * file, and it performs the initialisation of the inference engine. This
+ * involves setting up basic pointers inside the IE object, nothing else.
+ *
+ * The function returns the operator-resolver, which must be used to add all
+ * necessary operators to the inference engine. A typical calling sequence is as
+ * follows::
+ *
+ *    uint32_t data_ext[100000/sizeof(int)];
+ *    [...]
+ *        static struct tflite_micro_objects s0;
+ *        auto *resolver = inference_engine_initialize(ie,
+ *                                                     data_ext, 100000,
+ *                                                     nullptr,  0,
+ *                                                     &s0);
+ *        resolver->AddAdd();
+ *        resolver->AddConv2D();
+ *        resolver->AddCustom(tflite_micro::ops::micro::xcore::Conv2D_V2_OpCode,
+ *                   tflite_micro::ops::micro::xcore::Register_Conv2D_V2());
+ *    [...]
+ *
+ * Note that when tensorflow lite for micro is disabled this function will not
+ * exist as it depends on all and sundry in XTFLM.
+ *
+ * Note that the lifetime of all spaces passed to this function should be longer
+ * than the lifetime of the inference engine. Typically all spaces are declared
+ * globally.
+ *
+ * \param ie                  Pointer to an uninitialized inference engine
+ *                            object, allocated by the caller.
+ * \param memory_primary      Pointer to the space to be used for the tensor
+ *                            arena, allocated by the caller. If the fourth
+ *                            parameter (memory_secondary) is null, then this
+ *                            space will be used for both model and tensor
+ *                            arena.
+ * \param n_memory_primary    Number of bytes available in primary memory
+ * \param memory_secondary    Pointer to the space to be used for the model,
+ *                            allocated by the caller. If this parameter is
+ *                            null, then the primary memory will be used for
+ *                            both model and tensor arena.
+ * \param n_secondary         Number of bytes available in secondary memory
+ * \param xtflmo              C++ structure for storing the XTFLM data
+ *                            structures. Must be allocated by the caller.
+ *
+ * \returns                   Pointer to the operator resolver (see above).
+ */
+tflite_micro::MicroMutableOpResolver<XTFLM_OPERATORS> *inference_engine_initialize(
+    inference_engine_t *UNSAFE ie, uint32_t memory_primary[],
+    uint32_t n_memory_primary, uint32_t memory_secondary[],
+    uint32_t n_secondary, struct tflite_micro_objects *UNSAFE xtflmo);
+#endif
+extern "C" {
+#endif
+/** Function that unloads a model from the inference engine. This function
+ * must be called before overwriting the model. For example, if you
+ * have run a model successfully, then before you store a new model over the
+ * top of it (from flash or anywhere else) you must first unload the
+ * model; then you can overwrite the model, and finally you can call
+ * inference_engine_load_model to set up the new model.
+ *
+ * It is safe to call unload model if there is no model loaded.
+ *
+ * \param ie           pointer to inference engine.
+ */
+void inference_engine_unload_model(inference_engine_t *UNSAFE ie);
+/** Function that loads a model into the inference engine. The model must be
+ * stored in either of the two spaces passed to the inference_engine_initialize
+ * function above: either the secondary (model) space or the primary space that
+ * is shared with the tensor arena. This function assumes the model is in place
+ * already and will simply parse it, not copy it.
+ *
+ * \param ie               pointer to inference engine.
+ * \param model_bytes      Number of bytes in the model
+ * \param model_data       Pointer to the model (inside one of the
+ *                         memory_primary or memory_secondary spaces passed
+ *                         above)
+ * \param c_flash_or_tile  (XC prototype only) Optional channel to flash
+ *                         or tile server
+ * \param weights_data_ptr (C/C++ prototype only) takes the place of
+ *                         c_flash_or_tile. NOTE(review): presumably the DDR
+ *                         pointer or channel described on
+ *                         xc_context_config_t - confirm.
+ *
+ * \returns                non zero indicates an error
+ */
+#ifdef __XC__
+    int inference_engine_load_model(inference_engine_t * UNSAFE ie, uint32_t model_bytes, uint32_t * UNSAFE model_data, chanend ?c_flash_or_tile);
+#else
+int inference_engine_load_model(inference_engine_t *UNSAFE ie,
+                                uint32_t model_bytes,
+                                uint32_t *UNSAFE model_data, void *weights_data_ptr);
+#endif
+    /** Function that invokes the inference engine. This function will create an
+     * extra four threads enabling a model to run in 5 threads.
+     *
+     * NOTE(review): the interp_invoke_par_4/3/2, interp_invoke and
+     * interp_invoke_internal declarations that follow have no documentation
+     * of their own; going by their names the par_N variants presumably run
+     * the model in N threads - confirm. The meaning of the int return value
+     * is not documented here.
+     *
+     * \param ie           pointer to inference engine.
+     */
+    int interp_invoke_par_5(inference_engine_t *ie);
+    int interp_invoke_par_4(inference_engine_t *ie);
+    int interp_invoke_par_3(inference_engine_t *ie);
+    int interp_invoke_par_2(inference_engine_t *ie);
+    int interp_invoke(inference_engine_t *ie);
+    TfLiteStatus interp_invoke_internal(inference_engine_t *ie);
+    /** Function that resets variable tensors.
+     * This should be called after invoking a model with stateful ops such as LSTM.
+     *
+     * \param ie           pointer to inference engine.
+     */
+    int interp_reset(inference_engine_t *ie);
+    /** Function that prints a summary of the time each operator took. This
+     * function uses printf - you may want to avoid calling it.
+     *
+     * \param ie           pointer to inference engine.
+     */
+    void print_profiler_summary(inference_engine_t *UNSAFE ie);
+#ifdef __cplusplus
+    };
+#endif
+#endif // INFERENCE_ENGINE_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/api/memory_parallel_transport.h ADDED Viewed

@@ -0,0 +1,52 @@
+#include <stdint.h>
+#include "thread_call.h"
+#ifdef __XC__
+#include <xs1.h>
+#else
+#include <xcore/chanend.h>
+#endif
+#ifdef __XC__
+extern void memory_parallel_receive(chanend c, uint32_t data[], uint32_t bytes);
+extern void memory_parallel_receive_thread_call(chanend c, uint32_t data[], uint32_t bytes, thread_info_t &ptr);
+extern void memory_parallel_send(chanend c, uint32_t data[], uint32_t bytes);
+#else
+/** Function that receives a block of data.
+ * The number of bytes must be a multiple of 4.
+ * This function creates three threads and three channel ends in order to
+ * make full use of the bandwidth of the switch.
+ *
+ * \param c        channel end to the sender
+ * \param data     pointer where data must be stored
+ * \param bytes    number of bytes that will be received.
+ */
+extern void memory_parallel_receive(chanend_t c, uint32_t *data, uint32_t bytes);
+/** Function that receives a block of data.
+ * The number of bytes must be a multiple of 4.
+ * This function assumes that at least three threads have been created by the
+ * thread_call library and will use those together with three fresh channel
+ * ends in order to make full use of the bandwidth of the switch.
+ *
+ * \param c        channel end to the sender
+ * \param data     pointer where data must be stored
+ * \param bytes    number of bytes that will be received.
+ * \param ptr      thread_info_t of the thread_call library. NOTE(review):
+ *                 presumably describes the already-created threads mentioned
+ *                 above - confirm against thread_call.h.
+ */
+extern void memory_parallel_receive_thread_call(chanend_t c, uint32_t *data, uint32_t bytes, thread_info_t *ptr);
+/** Function that sends a block of data.
+ * The number of bytes must be a multiple of 4.
+ * This function creates three threads and three channel ends in order to
+ * make full use of the bandwidth of the switch.
+ *
+ * \param c        channel end to the receiver
+ * \param data     pointer where data must be loaded from
+ * \param bytes    number of bytes that will be sent.
+ */
+extern void memory_parallel_send(chanend_t c, uint32_t *data, uint32_t bytes);
+#endif

xmos_ai_tools/runtime/include/lib_tflite_micro/api/version.h ADDED Viewed

@@ -0,0 +1,13 @@
+// Copyright (c) 2020, XMOS Ltd, All rights reserved
+#ifndef XCORE_VERSION_H_
+#define XCORE_VERSION_H_
+namespace lib_tflite_micro {
+static const unsigned major_version = 0; // library version is major.minor.patch,
+static const unsigned minor_version = 6; // i.e. 0.6.0 with the values given here
+static const unsigned patch_version = 0;
+} // namespace lib_tflite_micro
+#endif // XCORE_VERSION_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_config.h ADDED Viewed

@@ -0,0 +1,17 @@
+#ifndef XCORE_CONFIG_H_
+#define XCORE_CONFIG_H_
+#include "../src/thread_call.h"
+struct xc_context_config_t {
+  // Thread count specified in the compiler for this model. Ops such as
+  // the lookup op and the beta float ops use it in the Prepare phase to
+  // split up work. Conv ops have their own thread count, as their thread
+  // work is calculated in the compiler. NOTE(review): UNSAFE (used below) is
+  // not defined in this header - presumably the includer provides it; confirm.
+  int model_thread_count;
+  thread_info_t thread_info;
+  void *UNSAFE weights_data_ptr; // DDR ptr or channel to flash/tile server.
+};
+#endif // XCORE_CONFIG_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_device_memory.h ADDED Viewed

@@ -0,0 +1,62 @@
+// Copyright (c) 2020, XMOS Ltd, All rights reserved
+#ifndef XCORE_DEVICE_MEMORY_H_
+#define XCORE_DEVICE_MEMORY_H_
+#include <stddef.h>
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef XCORE
+#ifdef _TIME_H_
+#define _clock_defined
+#endif
+#include <xcore/thread.h>
+#define STRINGIFY(NAME) #NAME
+#define GET_STACKWORDS(DEST, NAME)                                             \
+  asm("ldc %[__dest], " STRINGIFY(NAME) ".nstackwords" : [__dest] "=r"(DEST))
+#define GET_STACKSIZE(DEST, NAME)                                              \
+  {                                                                            \
+    size_t _stack_words;                                                       \
+    asm("ldc %[__dest], " STRINGIFY(NAME) ".nstackwords"                       \
+        : [__dest] "=r"(_stack_words));                                        \
+    DEST = (_stack_words + 2) * 4;                                             \
+  }
+#define IS_RAM(a) (((uintptr_t)a >= 0x80000) && ((uintptr_t)a <= 0x100000)) /* both bounds inclusive; `a` is not parenthesised, so pass a plain pointer, not an expression */
+#define IS_NOT_RAM(a) ((uintptr_t)a > 0x100000) /* not the complement of IS_RAM: also false for addresses below 0x80000 */
+#define IS_EXTMEM(a)                                                           \
+  (((uintptr_t)a >= 0x10000000) && (((uintptr_t)a <= 0x20000000)))
+#define IS_SWMEM(a)                                                            \
+  (((uintptr_t)a >= 0x40000000) && (((uintptr_t)a <= 0x80000000)))
+#ifdef USE_SWMEM
+#ifndef USE_QSPI_SWMEM_DEV
+void swmem_setup();
+#else
+#include <xcore/chanend.h>
+void swmem_setup(chanend_t ctrl_swmem_c);
+#endif // USE_QSPI_SWMEM_DEV
+#endif // USE_SWMEM
+void swmem_handler(void *ignored);
+void swmem_teardown();
+#else // not XCORE
+#define GET_STACKSIZE(DEST, NAME) DEST = 0
+#define GET_STACKWORDS(DEST, NAME) DEST = 0
+#define IS_RAM(a) (1)
+#define IS_NOT_RAM(a) (0)
+#endif // XCORE
+void memload(void *dest, void *src, size_t size);
+#ifdef __cplusplus
+}
+#endif
+#endif // XCORE_DEVICE_MEMORY_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/api/xcore_shared_config.h ADDED Viewed

@@ -0,0 +1,31 @@
+// Copyright (c) 2020, XMOS Ltd, All rights reserved
+#ifndef XCORE_SHARED_CONFIG_H_
+#define XCORE_SHARED_CONFIG_H_
+namespace shared_config {
+// Key under which the shared config is stored in the flatbuffer;
+// the config is shared between xformer and lib_tflite_micro
+constexpr char xcoreMetadataName[] = "xcoreSharedConfig";
+struct xcore_metadata {
+  // Padding words: with them the struct holds 12 x int32_t (48 bytes), a
+  // multiple of 16 bytes. We cannot use alignas(16) yet in xcore
+  int32_t padding[2];
+  // Versions of libraries used to build the model
+  int32_t lib_nn_major_version;
+  int32_t lib_nn_minor_version;
+  int32_t lib_nn_patch_version;
+  int32_t lib_tflite_micro_major_version;
+  int32_t lib_tflite_micro_minor_version;
+  int32_t lib_tflite_micro_patch_version;
+  int32_t xformer_major_version;
+  int32_t xformer_minor_version;
+  int32_t xformer_patch_version;
+  // Number of threads required from the runtime to execute the model
+  int32_t required_thread_count;
+};
+} // namespace shared_config
+#endif // XCORE_SHARED_CONFIG_H_

xmos_ai_tools/runtime/include/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h ADDED Viewed

@@ -0,0 +1,155 @@
+#ifndef _conv2d_float_h_
+#define _conv2d_float_h_
+#ifdef __cplusplus
+extern "C" {
+#endif
+/** Function that calculates a fully connected layer (the `_ref` variant;
+ * xc_fc_float_opt below is documented as the optimized one).
+ *
+ * @param  outputs              pointer to the output data, the output data will
+ *                              be stored as an array [out_features]
+ * @param  inputs               pointer to the input data, the input data must
+ *                              be stored as an array [input_features]
+ * @param  kernels              pointer to the kernels, the kernels
+ *                              must be stored as an array
+ *                              [out_features][input_features]
+ * @param  out_features         dimension 1 of the output array
+ * @param  input_features       dimension 1 of the input array
+ * @returns                     number of MACCs
+ */
+extern int xc_fc_float_ref(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features);
+/** Optimized function that calculates a fully connected layer for the output
+ * features in the half-open range [out_f_start, out_f_end).
+ *
+ * @param  outputs              pointer to the output data, the output data will
+ *                              be stored as an array [out_features]
+ * @param  inputs               pointer to the input data, the input data must
+ *                              be stored as an array [input_features]
+ * @param  kernels              pointer to the kernels, the kernels
+ *                              must be stored as an array
+ *                              [out_features][input_features]
+ * @param  out_features         dimension 1 of the output array
+ * @param  input_features       dimension 1 of the input array
+ * @param  out_f_start          output features to start at
+ * @param  out_f_end            output features to end at plus one
+ * @returns                     number of MACCs
+ */
+extern int xc_fc_float_opt(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features, int out_f_start,
+                           int out_f_end);
+/** Function that calculates a convolution with a 5x2 filter with stride 2
+ * over dimension 2 of a tensor.
+ * NOTE(review): the function name says stride_w3 while this text says
+ * stride 2 - confirm which one is correct.
+ *
+ * @param  outputs     pointer to the output data, the output data will be
+ *                     stored as an array [out_w][out_h][out_depth]
+ * @param  inputs      pointer to the input data, the input data must be
+ *                     stored as an array [input_w][input_h][input_depth]
+ * @param  kernels     pointer to the kernels, the kernels
+ *                     must be stored as an array
+ *                     [out_depth][5][2][depth]
+ * @param  biases      pointer to the biases, the biases must be stored as an
+ *                     array [out_depth]
+ * @param  out_w       dimension 2 of the output array
+ * @param  out_h       dimension 1 of the output array
+ * @param  out_depth   dimension 3 of the output array
+ * @param  input_w     dimension 2 of the input array
+ * @param  input_h     dimension 1 of the input array
+ * @param  input_depth dimension 3 of the input array
+ * @returns            number of MACCs
+ */
+extern int xc_conv2d_float_kw5xh2_stride_w3_ref(float *outputs, float *inputs,
+                                                float *kernels, float *biases,
+                                                int out_w, int out_h,
+                                                int out_depth, int input_w,
+                                                int input_h, int input_depth);
+/** Optimised function that calculates a convolution with a 5x2 filter with
+ * stride 2 over dimension 2 of a tensor. For parallel usage, supply
+ * multiple invocations with different values of out_depth_start and
+ * out_depth_end so that the whole output depth is covered between all of
+ * them.
+ * NOTE(review): the function name says stride_w3 while this text says
+ * stride 2 - confirm which one is correct.
+ *
+ * @param  outputs     pointer to the output data, the output data will be
+ *                     stored as an array [out_w][out_h][out_depth]
+ * @param  inputs      pointer to the input data, the input data must be
+ *                     stored as an array [input_w][input_h][input_depth]
+ * @param  kernels     pointer to the kernels, the kernels
+ *                     must be stored as an array
+ *                     [out_depth][5][2][depth]
+ * @param  biases      pointer to the biases, the biases must be stored as an
+ *                     array [out_depth]
+ * @param  out_w       dimension 2 of the output array
+ * @param  out_h       dimension 1 of the output array
+ * @param  out_depth   dimension 3 of the output array
+ * @param  input_w     dimension 2 of the input array
+ * @param  input_h     dimension 1 of the input array
+ * @param  input_depth dimension 3 of the input array
+ * @param  out_depth_start output depth to start at
+ * @param  out_depth_end output depth to end at plus one
+ */
+extern void xc_conv2d_float_kw5xh2_stride_w3_opt(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end);
+/** Function that calculates a transposed convolution with a 5x2 filter with
+ * stride 3 in dimension 2 over a tensor.
+ * NOTE(review): the function name says stride_h3, but out_h/input_h are
+ * documented below as dimension 1, not dimension 2 - confirm.
+ *
+ * @param  outputs     pointer to the output data, the output data will be
+ *                     stored as an array [out_w][out_h][out_depth]
+ * @param  inputs      pointer to the input data, the input data must be
+ *                     stored as an array [input_w][input_h][input_depth]
+ * @param  kernels     pointer to the kernels, the kernels
+ *                     must be stored as an array
+ *                     [out_depth][5][2][depth]
+ * @param  biases      pointer to the biases, the biases must be stored as an
+ *                     array [out_depth]
+ * @param  out_w       dimension 2 of the output array
+ * @param  out_h       dimension 1 of the output array
+ * @param  out_depth   dimension 3 of the output array
+ * @param  input_w     dimension 2 of the input array
+ * @param  input_h     dimension 1 of the input array
+ * @param  input_depth dimension 3 of the input array
+ * @returns            number of MACCs
+ */
+extern int xc_transpose_conv2d_float_kw5xh2_stride_h3_ref(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth);
+/** Optimised function that calculates a transposed convolution with a 5x2
+ * filter with stride 3 over dimension 2 of a tensor. For parallel usage,
+ * supply multiple invocations with different values of out_depth_start and
+ * out_depth_end so that the whole output depth is covered between all of
+ * them.
+ * NOTE(review): the function name says stride_h3, but out_h/input_h are
+ * documented below as dimension 1, not dimension 2 - confirm.
+ *
+ * @param  outputs     pointer to the output data, the output data will be
+ *                     stored as an array [out_w][out_h][out_depth]
+ * @param  inputs      pointer to the input data, the input data must be
+ *                     stored as an array [input_w][input_h][input_depth]
+ * @param  kernels     pointer to the kernels, the kernels
+ *                     must be stored as an array
+ *                     [out_depth][5][2][depth]
+ * @param  biases      pointer to the biases, the biases must be stored as an
+ *                     array [out_depth]
+ * @param  out_w       dimension 2 of the output array
+ * @param  out_h       dimension 1 of the output array
+ * @param  out_depth   dimension 3 of the output array
+ * @param  input_w     dimension 2 of the input array
+ * @param  input_h     dimension 1 of the input array
+ * @param  input_depth dimension 3 of the input array
+ * @param  out_depth_start output depth to start at
+ * @param  out_depth_end output depth to end at plus one
+ */
+extern void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end);
+#ifdef __cplusplus
+};
+#endif
+#endif