npm - @pinkparrot/qsafe-mayo-wasm - Versions diffs - 0.0.3 - Mend

@pinkparrot/qsafe-mayo-wasm 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

package/.gitmodules +3 -0
package/.vscode/launch.json +12 -0
package/LICENSE +201 -0
package/bridge/mayo1_bridge.c +26 -0
package/bridge/mayo2_bridge.c +26 -0
package/bridge/randombytes_inject.c +44 -0
package/build_mayo1.ps1 +36 -0
package/build_mayo2.ps1 +36 -0
package/dist/mayo.browser.min.js +216 -0
package/dist/mayo1.js +0 -0
package/dist/mayo2.js +0 -0
package/dist/mayo_api.js +139 -0
package/dist/package.json +1 -0
package/gitignore +2 -0
package/index.mjs +1 -0
package/mayo-c/.astylerc +16 -0
package/mayo-c/.cmake/flags.cmake +45 -0
package/mayo-c/.cmake/sanitizers.cmake +81 -0
package/mayo-c/.cmake/target.cmake +71 -0
package/mayo-c/.github/workflows/ci_clang.yml +61 -0
package/mayo-c/.github/workflows/ci_gcc.yml +60 -0
package/mayo-c/.github/workflows/cmake.yml +160 -0
package/mayo-c/.github/workflows/macos_m1.yml +68 -0
package/mayo-c/CMakeLists.txt +35 -0
package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.req +900 -0
package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.rsp +902 -0
package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.req +900 -0
package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.rsp +902 -0
package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.req +900 -0
package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.rsp +902 -0
package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.req +900 -0
package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.rsp +902 -0
package/mayo-c/LICENSE +202 -0
package/mayo-c/META/MAYO-1_META.yml +52 -0
package/mayo-c/META/MAYO-2_META.yml +52 -0
package/mayo-c/META/MAYO-3_META.yml +52 -0
package/mayo-c/META/MAYO-5_META.yml +52 -0
package/mayo-c/NOTICE +13 -0
package/mayo-c/README.md +183 -0
package/mayo-c/apps/CMakeLists.txt +31 -0
package/mayo-c/apps/PQCgenKAT_sign.c +281 -0
package/mayo-c/apps/example.c +151 -0
package/mayo-c/apps/example_nistapi.c +124 -0
package/mayo-c/include/mayo.h +442 -0
package/mayo-c/include/mem.h +25 -0
package/mayo-c/include/randombytes.h +31 -0
package/mayo-c/scripts/contstants.py +141 -0
package/mayo-c/scripts/find_irred_poly.sage +39 -0
package/mayo-c/src/AVX2/arithmetic_common.h +159 -0
package/mayo-c/src/AVX2/echelon_form.h +91 -0
package/mayo-c/src/AVX2/echelon_form_loop.h +58 -0
package/mayo-c/src/AVX2/shuffle_arithmetic.h +442 -0
package/mayo-c/src/CMakeLists.txt +98 -0
package/mayo-c/src/arithmetic.c +128 -0
package/mayo-c/src/arithmetic.h +124 -0
package/mayo-c/src/common/aes128ctr.c +293 -0
package/mayo-c/src/common/aes_c.c +741 -0
package/mayo-c/src/common/aes_ctr.h +32 -0
package/mayo-c/src/common/aes_neon.c +201 -0
package/mayo-c/src/common/debug_bench_tools.h +69 -0
package/mayo-c/src/common/fips202.c +1093 -0
package/mayo-c/src/common/fips202.h +12 -0
package/mayo-c/src/common/mem.c +19 -0
package/mayo-c/src/common/randombytes_ctrdrbg.c +141 -0
package/mayo-c/src/common/randombytes_system.c +399 -0
package/mayo-c/src/generic/arithmetic_dynamic.h +68 -0
package/mayo-c/src/generic/arithmetic_fixed.h +84 -0
package/mayo-c/src/generic/echelon_form.h +152 -0
package/mayo-c/src/generic/ef_inner_loop.h +56 -0
package/mayo-c/src/generic/generic_arithmetic.h +294 -0
package/mayo-c/src/mayo.c +675 -0
package/mayo-c/src/mayo_1/api.c +46 -0
package/mayo-c/src/mayo_1/api.h +43 -0
package/mayo-c/src/mayo_2/api.c +46 -0
package/mayo-c/src/mayo_2/api.h +43 -0
package/mayo-c/src/mayo_3/api.c +46 -0
package/mayo-c/src/mayo_3/api.h +43 -0
package/mayo-c/src/mayo_5/api.c +46 -0
package/mayo-c/src/mayo_5/api.h +43 -0
package/mayo-c/src/neon/arithmetic_common.h +132 -0
package/mayo-c/src/neon/echelon_form.h +55 -0
package/mayo-c/src/neon/echelon_form_loop.h +58 -0
package/mayo-c/src/neon/shuffle_arithmetic.h +462 -0
package/mayo-c/src/params.c +42 -0
package/mayo-c/src/simple_arithmetic.h +138 -0
package/mayo-c/test/CMakeLists.txt +51 -0
package/mayo-c/test/bench.c +166 -0
package/mayo-c/test/m1cycles.c +155 -0
package/mayo-c/test/m1cycles.h +13 -0
package/mayo-c/test/test_kat.c +271 -0
package/mayo-c/test/test_mayo.c +139 -0
package/mayo-c/test/test_sample_solution.c +75 -0
package/mayo-c/test/test_various.c +680 -0
package/package.json +39 -0
package/publish.bat +22 -0
package/readme.md +80 -0
package/test/test.mjs +42 -0

package/mayo-c/src/generic/arithmetic_fixed.h ADDED Viewed

@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: Apache-2.0
+#ifndef ARITHMETIC_FIXED_H
+#define ARITHMETIC_FIXED_H
+#include <stdint.h>
+#include <mayo.h>
+#include <simple_arithmetic.h>
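+// This implements arithmetic for nibble-packed vectors of field elements in
+// GF(16) = Z_2[x]/(x^4+x+1): each uint64_t limb holds 16 elements and every
+// vector spans a fixed M_VEC_LIMBS_MAX limbs.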
+// Copy one packed vector of M_VEC_LIMBS_MAX limbs from `in` to `out`.
+// `m_vec_limbs` is ignored here: the limb count is a compile-time constant in
+// this fixed-parameter variant (presumably kept so the signature matches the
+// dynamic variant in arithmetic_dynamic.h -- confirm).
+static
+inline void m_vec_copy (int m_vec_limbs, const uint64_t *in, uint64_t *out) {
+    (void) m_vec_limbs;
+    size_t limb = 0;
+    while (limb < M_VEC_LIMBS_MAX) {
+        out[limb] = in[limb];
+        ++limb;
+    }
+}
+// XOR the packed vector `in` into the accumulator `acc` (addition in a
+// characteristic-2 field), over M_VEC_LIMBS_MAX limbs.
+// `m_vec_limbs` is ignored; the limb count is fixed at compile time.
+static
+inline void m_vec_add (int m_vec_limbs, const uint64_t *in, uint64_t *acc) {
+    (void) m_vec_limbs;
+    size_t limb = 0;
+    while (limb < M_VEC_LIMBS_MAX) {
+        acc[limb] ^= in[limb];
+        ++limb;
+    }
+}
+// acc += a * in, where `in`/`acc` are packed vectors and `a` is a scalar.
+// The scalar is expanded once through mul_table(a); the four table fields
+// (bits 0-7, 8-11, 16-19, 24-27) are then multiplied against bit-planes 0..3
+// of every nibble of `in`, and the partial products are XORed into `acc`.
+// `m_vec_limbs` is ignored; the limb count is fixed to M_VEC_LIMBS_MAX.
+static
+inline void m_vec_mul_add (int m_vec_limbs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    (void) m_vec_limbs;
+    const uint32_t tab = mul_table(a);
+    // selects the lowest bit of each of the 16 nibbles in a limb
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    for (int limb = 0; limb < M_VEC_LIMBS_MAX; limb++) {
+        const uint64_t word = in[limb];
+        const uint64_t plane0 = ( word       & nibble_lsb) * (tab & 0xff);
+        const uint64_t plane1 = ((word >> 1) & nibble_lsb) * ((tab >> 8)  & 0xf);
+        const uint64_t plane2 = ((word >> 2) & nibble_lsb) * ((tab >> 16) & 0xf);
+        const uint64_t plane3 = ((word >> 3) & nibble_lsb) * ((tab >> 24) & 0xf);
+        acc[limb] ^= plane0 ^ plane1 ^ plane2 ^ plane3;
+    }
+}
+// acc += x * in, element-wise in Z_2[x]/(x^4+x+1).
+// For each nibble the top bit (the x^3 coefficient) is split off; the three
+// low bits are shifted up by one, and an overflowing top bit is folded back
+// as x + 1 (binary 0011, hence the multiplication by 3) because x^4 = x + 1.
+// `m_vec_limbs` is ignored; the limb count is fixed to M_VEC_LIMBS_MAX.
+static
+inline void m_vec_mul_add_x (int m_vec_limbs, const uint64_t *in, uint64_t *acc) {
+    (void) m_vec_limbs;
+    // selects the highest bit of each of the 16 nibbles in a limb
+    const uint64_t mask_msb = 0x8888888888888888ULL;
+    for (size_t i = 0; i < M_VEC_LIMBS_MAX; i++)
+    {
+        const uint64_t t = in[i] & mask_msb;
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+    }
+}
+// acc += x^-1 * in, element-wise in Z_2[x]/(x^4+x+1).
+// For each nibble the lowest bit (the constant coefficient) is split off; the
+// three high bits are shifted down by one, and a set low bit contributes
+// x^-1 = x^3 + 1 (binary 1001, hence the multiplication by 9), since
+// x * (x^3 + 1) = x^4 + x = 1 in this field.
+// `m_vec_limbs` is ignored; the limb count is fixed to M_VEC_LIMBS_MAX.
+static
+inline void m_vec_mul_add_x_inv (int m_vec_limbs, const uint64_t *in, uint64_t *acc) {
+    (void) m_vec_limbs;
+    // selects the lowest bit of each of the 16 nibbles in a limb
+    const uint64_t mask_lsb = 0x1111111111111111ULL;
+    for (size_t i = 0; i < M_VEC_LIMBS_MAX; i++)
+    {
+        const uint64_t t = in[i] & mask_lsb;
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+    }
+}
+// Collapse 16 accumulator bins into one vector. `bins` holds 16 consecutive
+// packed vectors; bin i collects everything that must be multiplied by the
+// field element i. Instead of 15 general multiplications, bins are folded
+// into each other along two chains that only need multiplication by x^-1 or
+// by x, both ending in bin 1, which is then copied to `out`. Bin 0 is never
+// read. The bins are modified in place.
+static
+inline void m_vec_multiply_bins (int m_vec_limbs, uint64_t *bins, uint64_t *out) {
+    // each entry is folded into its successor using multiplication by x^-1
+    const unsigned char x_inv_chain[8] = { 5, 10, 7, 14, 15, 13, 9, 1 };
+    // each entry is folded into its successor using multiplication by x
+    const unsigned char x_chain[8]     = { 11, 12, 6, 3, 8, 4, 2, 1 };
+    // the two chains are advanced in lock step, x^-1 step first
+    for (int step = 0; step + 1 < 8; step++) {
+        m_vec_mul_add_x_inv (m_vec_limbs,
+                             bins + x_inv_chain[step] * M_VEC_LIMBS_MAX,
+                             bins + x_inv_chain[step + 1] * M_VEC_LIMBS_MAX);
+        m_vec_mul_add_x (m_vec_limbs,
+                         bins + x_chain[step] * M_VEC_LIMBS_MAX,
+                         bins + x_chain[step + 1] * M_VEC_LIMBS_MAX);
+    }
+    m_vec_copy (m_vec_limbs, bins + M_VEC_LIMBS_MAX, out);
+}
+#endif

package/mayo-c/src/generic/echelon_form.h ADDED Viewed

@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: Apache-2.0
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+// Return the 4-bit element at position `index` of a nibble-packed vector:
+// 16 elements per uint64_t limb, element 0 in the least significant nibble.
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    const uint64_t limb = in[index / 16];
+    const int shift = (index % 16) * 4;
+    return (limb >> shift) & 0xF;
+}
+// Pack `ncols` field elements (one per input byte, low nibble used) into
+// `out`, two elements per byte: the even-indexed element in the low nibble,
+// the odd-indexed one in the high nibble. With an odd `ncols` the final byte
+// gets only its low nibble set. Bytes of `out` beyond the packed data are not
+// written. When TARGET_BIG_ENDIAN is defined the byte order inside each
+// 8-byte group is mirrored.
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+    unsigned char *out8 = (unsigned char *)out;
+    int byte;
+    for (byte = 0; 2 * byte + 1 < ncols; byte++) {
+#ifdef TARGET_BIG_ENDIAN
+        const int pos = (((byte + 8) / 8) * 8) - 1 - byte % 8;
+#else
+        const int pos = byte;
+#endif
+        out8[pos] = (in[2 * byte] << 0) | (in[2 * byte + 1] << 4);
+    }
+    if (ncols % 2 == 1) {
+        // trailing unpaired element: `byte` is now the first unwritten byte
+#ifdef TARGET_BIG_ENDIAN
+        const int pos = (((byte + 8) / 8) * 8) - 1 - byte % 8;
+#else
+        const int pos = byte;
+#endif
+        out8[pos] = (in[2 * byte] << 0);
+    }
+}
+// Inverse of the packing above: expand `legs` uint64_t limbs into
+// legs * 16 field elements, one per output byte. Each input byte yields its
+// low nibble first, then its high nibble. When TARGET_BIG_ENDIAN is defined
+// the byte order inside each 8-byte group is mirrored.
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+    const unsigned char *in8 = (const unsigned char *)in;
+    for (int byte = 0; 2 * byte < legs * 16; byte++) {
+#ifdef TARGET_BIG_ENDIAN
+        const unsigned char packed = in8[(((byte + 8) / 8) * 8) - 1 - byte % 8];
+#else
+        const unsigned char packed = in8[byte];
+#endif
+        out[2 * byte]     = packed & 0xF;
+        out[2 * byte + 1] = packed >> 4;
+    }
+}
+// Put the nrows x ncols matrix A (row-major, one field element per byte) in
+// row echelon form with ones on the first nonzero entries, *in constant
+// time*. A is overwritten with the result.
+//
+// The matrix is first packed 16 elements per uint64_t, processed column by
+// column with mask-based selection instead of data-dependent branches, and
+// unpacked again at the end. All scratch buffers are wiped before returning.
+// NOTE(review): the ct_* helpers are defined elsewhere; from their use here
+// ct_compare_64(a, b) appears to yield 0 when a == b and all-ones otherwise,
+// and ct_64_is_greater_than an all-ones mask when a > b -- confirm.
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+    // scratch: candidate pivot row, and the same row scaled so its pivot is 1
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+    // packed copy of A, row_len limbs per row
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = {0};
+    // number of uint64_t limbs needed for one row of ncols nibbles
+    int row_len = (ncols + 15) / 16;
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+        // try to get a pivot row in constant time
+        unsigned char pivot = 0;
+        // all-ones while no nonzero pivot has been found for this column
+        uint64_t pivot_is_zero = -1;
+        // the scan looks up to 32 rows past the upper bound (capped at the
+        // last row)
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+            // add this row into the candidate if it is the pivot row itself, or
+            // if it lies below it and the candidate's pivot entry is still zero
+            for (int j = 0; j < row_len; j++) {
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            // the two masks are complementary, so `+` acts as a select here
+            for (int col = 0; col < row_len; col++) {
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+        // eliminate entries below pivot
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            // NOTE(review): plain comparison on the secret pivot_row, unlike the
+            // ct_* helpers used above; confirm it compiles branch-free.
+            unsigned char below_pivot = (row > pivot_row);
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+            // rows at or above the pivot row get multiplier 0 (no change)
+            vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+                                    packed_A + row * row_len);
+        }
+        // advance to the next pivot row only if a nonzero pivot was found:
+        // ~pivot_is_zero is all-ones (-1 as int64_t) in that case, negated to +1
+        pivot_row += (-(int64_t)(~pivot_is_zero));
+    }
+    // one unpacked row; row_len * 16 entries are written per unpack call
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+    // un-nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = temp[j];
+        }
+    }
+    // wipe all scratch buffers; the uint64_t ones are sized in bytes (limbs * 8)
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+#endif

package/mayo-c/src/generic/ef_inner_loop.h ADDED Viewed

@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: Apache-2.0
+// Body of one pivot-column step of echelon-form elimination, written as a
+// bare fragment with no enclosing function (presumably textually included
+// inside a loop over pivot_col -- confirm against the includer).
+// It relies on these names being in scope at the point of use: nrows, ncols,
+// row_len, pivot_col, pivot_row, inverse, packed_A, _pivot_row, _pivot_row2,
+// and `offset`, the index of the first limb of each row that is processed
+// (limbs below `offset` are left untouched).
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+// the pivot row is guaranteed to be between these lower and upper bounds if
+// A has full rank
+// zero out pivot row
+for (int i = offset; i < row_len; i++) {
+    _pivot_row[i] = 0;
+    _pivot_row2[i] = 0;
+}
+// try to get a pivot row in constant time
+unsigned char pivot = 0;
+// all-ones while no nonzero pivot has been found for this column
+uint64_t pivot_is_zero = -1;
+// the scan looks up to 32 rows past the upper bound (capped at the last row)
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+    uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+    // add this row into the candidate if it is the pivot row itself, or if it
+    // lies below it and the candidate's pivot entry is still zero
+    for (int j = offset; j < row_len; j++) {
+        _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                            packed_A[row * row_len + j];
+    }
+    pivot = m_extract_element(_pivot_row, pivot_col);
+    pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+}
+// multiply pivot row by inverse of pivot
+inverse = inverse_f(pivot);
+vec_mul_add_u64(row_len - offset, _pivot_row + offset, inverse, _pivot_row2 + offset);
+// conditionally write pivot row to the correct row, if there is a nonzero
+// pivot
+for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+    uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+    uint64_t do_not_copy = ~do_copy;
+    // the two masks are complementary, so `+` acts as a select here
+    for (int col = offset; col < row_len; col++) {
+        packed_A[row * row_len + col] =
+            (do_not_copy & packed_A[row * row_len + col]) +
+            (do_copy & _pivot_row2[col]);
+    }
+}
+// eliminate entries below pivot
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    // NOTE(review): plain comparison on pivot_row, unlike the ct_* helpers used
+    // above; confirm it compiles branch-free.
+    unsigned char below_pivot = (row > pivot_row);
+    unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+    // rows at or above the pivot row get multiplier 0 (no change)
+    vec_mul_add_u64(row_len - offset, _pivot_row2 + offset, below_pivot * elt_to_elim,
+                            packed_A + row * row_len + offset);
+}
+// advance pivot_row by one only if a nonzero pivot was found
+pivot_row += (-(int32_t)(~pivot_is_zero));

package/mayo-c/src/generic/generic_arithmetic.h ADDED Viewed

@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: Apache-2.0
+#ifndef GENERIC_ARITHMETIC_H
+#define GENERIC_ARITHMETIC_H
+#include <simple_arithmetic.h>
+#ifdef ENABLE_PARAMS_DYNAMIC
+#include <arithmetic_dynamic.h>
+#else
+#include <arithmetic_fixed.h>
+#endif
+// acc += bs_mat * mat, for m matrices at once.
+// bs_mat: bs_mat_rows x bs_mat_cols matrix whose entries are packed vectors of
+//         m_vec_limbs limbs, stored row by row. With triangular == 1 only the
+//         entries with column >= row are stored; with 0 all of them are.
+// mat:    bs_mat_cols x mat_cols matrix of field elements, row-major.
+// acc:    bs_mat_rows x mat_cols matrix of packed vectors, accumulated into.
+static inline
+void mul_add_m_upper_triangular_mat_x_mat(const int m_vec_limbs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc,
+                                          const int bs_mat_rows, const int bs_mat_cols, const int mat_cols, const int triangular) {
+    // running index of the next stored entry of bs_mat
+    int entry = 0;
+    for (int r = 0; r < bs_mat_rows; r++) {
+        const int first_col = triangular * r;
+        for (int c = first_col; c < bs_mat_cols; c++, entry++) {
+            const uint64_t *src = bs_mat + m_vec_limbs * entry;
+            for (int k = 0; k < mat_cols; k++) {
+                const unsigned char scalar = mat[c * mat_cols + k];
+                uint64_t *dst = acc + m_vec_limbs * (r * mat_cols + k);
+                m_vec_mul_add(m_vec_limbs, src, scalar, dst);
+            }
+        }
+    }
+}
+// acc += bs_mat * mat^t, for m matrices at once.
+// bs_mat: bs_mat_rows x bs_mat_cols matrix whose entries are packed vectors of
+//         m_vec_limbs limbs, stored row by row. With triangular == 1 only the
+//         entries with column >= row are stored; with 0 all of them are.
+// mat:    mat_rows x bs_mat_cols matrix of field elements, row-major
+//         (it is used transposed).
+// acc:    bs_mat_rows x mat_rows matrix of packed vectors, accumulated into.
+static inline
+void mul_add_m_upper_triangular_mat_x_mat_trans(const int m_vec_limbs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc,
+                                                const int bs_mat_rows, const int bs_mat_cols, const int mat_rows, const int triangular) {
+    // running index of the next stored entry of bs_mat
+    int entry = 0;
+    for (int r = 0; r < bs_mat_rows; r++) {
+        const int first_col = triangular * r;
+        for (int c = first_col; c < bs_mat_cols; c++, entry++) {
+            const uint64_t *src = bs_mat + m_vec_limbs * entry;
+            for (int k = 0; k < mat_rows; k++) {
+                const unsigned char scalar = mat[k * bs_mat_cols + c];
+                uint64_t *dst = acc + m_vec_limbs * (r * mat_rows + k);
+                m_vec_mul_add(m_vec_limbs, src, scalar, dst);
+            }
+        }
+    }
+}
+// acc += mat^t * bs_mat, for m matrices at once.
+// mat:    mat_rows x mat_cols matrix of field elements, row-major
+//         (it is used transposed).
+// bs_mat: mat_rows x bs_mat_cols matrix of packed vectors (m_vec_limbs limbs).
+// acc:    mat_cols x bs_mat_cols matrix of packed vectors, accumulated into.
+static inline
+void mul_add_mat_trans_x_m_mat(const int m_vec_limbs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc,
+                               const int mat_rows, const int mat_cols, const int bs_mat_cols) {
+    for (int r = 0; r < mat_cols; r++) {
+        for (int c = 0; c < mat_rows; c++) {
+            for (int k = 0; k < bs_mat_cols; k++) {
+                const uint64_t *src = bs_mat + m_vec_limbs * (c * bs_mat_cols + k);
+                const unsigned char scalar = mat[c * mat_cols + r];
+                uint64_t *dst = acc + m_vec_limbs * (r * bs_mat_cols + k);
+                m_vec_mul_add(m_vec_limbs, src, scalar, dst);
+            }
+        }
+    }
+}
+// acc += mat * bs_mat, for m matrices at once.
+// mat:    mat_rows x mat_cols matrix of field elements, row-major.
+// bs_mat: mat_cols x bs_mat_cols matrix of packed vectors (m_vec_limbs limbs).
+// acc:    mat_rows x bs_mat_cols matrix of packed vectors, accumulated into.
+static inline
+void mul_add_mat_x_m_mat(const int m_vec_limbs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc,
+                         const int mat_rows, const int mat_cols, const int bs_mat_cols) {
+    for (int r = 0; r < mat_rows; r++) {
+        for (int c = 0; c < mat_cols; c++) {
+            for (int k = 0; k < bs_mat_cols; k++) {
+                const uint64_t *src = bs_mat + m_vec_limbs * (c * bs_mat_cols + k);
+                const unsigned char scalar = mat[r * mat_cols + c];
+                uint64_t *dst = acc + m_vec_limbs * (r * bs_mat_cols + k);
+                m_vec_mul_add(m_vec_limbs, src, scalar, dst);
+            }
+        }
+    }
+}
+// acc += P1 * O, with P1 the upper triangular v x v matrix of packed vectors
+// and O a v x o matrix of field elements (dimensions taken from the PARAM_*
+// macros).
+static inline
+void P1_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    // silences the unused-parameter warning when parameters are not dynamic
+    (void) p;
+    #endif
+    const int limbs = PARAM_m_vec_limbs(p);
+    const int rows = PARAM_v(p);
+    const int cols = PARAM_v(p);
+    const int o_cols = PARAM_o(p);
+    mul_add_m_upper_triangular_mat_x_mat(limbs, P1, O, acc, rows, cols, o_cols, 1);
+}
+// acc += P1 * V^t, with P1 the upper triangular v x v matrix of packed vectors
+// and V a k x v matrix of field elements (dimensions taken from the PARAM_*
+// macros).
+static inline
+void P1_times_Vt(const mayo_params_t* p, const uint64_t* P1, const unsigned char* V, uint64_t* acc){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    // silences the unused-parameter warning when parameters are not dynamic
+    (void) p;
+    #endif
+    const int limbs = PARAM_m_vec_limbs(p);
+    const int rows = PARAM_v(p);
+    const int cols = PARAM_v(p);
+    const int v_rows = PARAM_k(p);
+    mul_add_m_upper_triangular_mat_x_mat_trans(limbs, P1, V, acc, rows, cols, v_rows, 1);
+}
+#if defined(HAVE_STACKEFFICIENT) || defined(PQM4)
+// Stack-efficient variant: computes SPS = S * P * S^t one column of P * S^t
+// at a time, so only a single column of PS has to be kept around.
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+// P1 (v x v, upper triangular), P2 (v x o) and P3 (o x o, upper triangular)
+// hold packed vectors of (m + 15)/16 limbs; S is a k x n matrix of field
+// elements with n = o + v, row-major. SPS receives k x k packed vectors.
+static inline void mayo_generic_m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                                                   const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    const int n = o + v;
+    const int m_vec_limbs = (m + 15)/16;
+    // scratch for one column of P * S^t; column `col` occupies entries
+    // col .. col + n - 1, hence the N_MAX + K_MAX size
+    uint64_t PS[(N_MAX + K_MAX) * M_VEC_LIMBS_MAX] = { 0 };
+    // 16 bins per row: bin b collects the vectors to be multiplied by b
+    uint64_t accumulator[16 * ((M_MAX+15)/16) * N_MAX] = {0};
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) {
+        // reset all bins (sizeof / 8 == number of uint64_t entries)
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+            accumulator[i] = 0;
+        }
+        P1_used = 0;
+        // top v rows: contributions of P1 (triangular) and P2
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+                m_vec_add(m_vec_limbs, P1 + (P1_used * m_vec_limbs), accumulator + ( row * 16 + S[col * n + j] )*m_vec_limbs);
+                P1_used ++;
+            }
+            for (int j = 0; j < o; j++) {
+                m_vec_add(m_vec_limbs, P2 + (row * o + j)*m_vec_limbs, accumulator + ( row * 16 + S[(col * n) + j + v] )* m_vec_limbs);
+            }
+        }
+        P3_used = 0;
+        // bottom o rows: contributions of P3 (triangular)
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+                m_vec_add(m_vec_limbs, P3 + P3_used * m_vec_limbs, accumulator + ( row * 16 + S[col * n + j] )* m_vec_limbs );
+                P3_used ++;
+            }
+        }
+        // collapse the bins of every row into this column of PS
+        for (int row = 0; row < n; row++) {
+            m_vec_multiply_bins(m_vec_limbs, accumulator + row * 16 * m_vec_limbs, PS + (row + col) * m_vec_limbs);
+        }
+        // multiply each row of S with the PS column just computed
+        for (int row = 0; row < k; row++) {
+            // only the first group of 16 bins is reused here, so only that
+            // part is cleared
+            for (unsigned int i = 0; i < 16*((M_MAX+15)/16); ++i)
+                accumulator[i] = 0;
+            for (int j = 0; j < n; j++) {
+                m_vec_add(m_vec_limbs, PS + (j + col) * m_vec_limbs, accumulator + S[row * n + j]*m_vec_limbs);
+            }
+            m_vec_multiply_bins(m_vec_limbs, accumulator, SPS + (row * k + col) * m_vec_limbs);
+        }
+    }
+}
+#else
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// P1 (v x v, upper triangular), P2 (v x o) and P3 (o x o, upper triangular)
+// hold packed vectors of (m + 15)/16 limbs; S is a k x n matrix of field
+// elements with n = o + v, row-major. PS receives n x k packed vectors.
+// Every output entry gets 16 bins: bin b collects the vectors that must be
+// multiplied by the field element b, and the bins are collapsed at the end.
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                                               const int m, const int v, const int o, const int k, uint64_t *PS) {
+    const int n = o + v;
+    const int m_vec_limbs = (m + 15)/16;
+    uint64_t accumulator[16 * ((M_MAX+15)/16) * K_MAX * N_MAX] = {0};
+    // top v rows of the result: P1 (stored triangular) and P2 contributions
+    int p1_entry = 0;
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++, p1_entry++) {
+            for (int col = 0; col < k; col++) {
+                const int bin = (row * k + col) * 16 + S[col * n + j];
+                m_vec_add(m_vec_limbs, P1 + (p1_entry * m_vec_limbs), accumulator + bin * m_vec_limbs);
+            }
+        }
+        for (int j = 0; j < o; j++) {
+            for (int col = 0; col < k; col++) {
+                const int bin = (row * k + col) * 16 + S[(col * n) + j + v];
+                m_vec_add(m_vec_limbs, P2 + (row * o + j) * m_vec_limbs, accumulator + bin * m_vec_limbs);
+            }
+        }
+    }
+    // bottom o rows of the result: P3 (stored triangular) contributions
+    int p3_entry = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++, p3_entry++) {
+            for (int col = 0; col < k; col++) {
+                const int bin = (row * k + col) * 16 + S[col * n + j];
+                m_vec_add(m_vec_limbs, P3 + p3_entry * m_vec_limbs, accumulator + bin * m_vec_limbs);
+            }
+        }
+    }
+    // collapse the 16 bins of every entry and write the result to PS
+    for (int entry = 0; entry < n * k; entry++) {
+        m_vec_multiply_bins(m_vec_limbs, accumulator + entry * 16 * m_vec_limbs, PS + entry * m_vec_limbs);
+    }
+}
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+// PS holds n x k packed vectors of (m + 15)/16 limbs; S is a k x n matrix of
+// field elements, row-major. SPS receives k x k packed vectors.
+// Every output entry gets 16 bins: bin b collects the vectors that must be
+// multiplied by the field element b, and the bins are collapsed at the end.
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){
+    uint64_t accumulator[16*((M_MAX+15)/16)*K_MAX*K_MAX] = {0};
+    const int m_vec_limbs = (m + 15)/ 16;
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            // the multiplier S[row][j] selects the bin for the whole PS row j
+            const int scalar = S[row * n + j];
+            for (int col = 0; col < k; col++) {
+                const int bin = (row * k + col) * 16 + scalar;
+                m_vec_add(m_vec_limbs, PS + (j * k + col) * m_vec_limbs, accumulator + bin * m_vec_limbs);
+            }
+        }
+    }
+    // collapse the 16 bins of every entry and write the result to SPS
+    for (int entry = 0; entry < k*k; entry++) {
+        m_vec_multiply_bins(m_vec_limbs, accumulator + entry * 16 * m_vec_limbs, SPS + entry * m_vec_limbs);
+    }
+}
+#endif
+// acc += (P1 + P1^t) * O.
+// P1 is the upper triangular v x v matrix of packed vectors (diagonal
+// included in storage), O a v x o matrix of field elements. Each
+// off-diagonal entry P1[r][c] contributes once to row r (times O[c]) and once
+// to row c (times O[r]); diagonal entries are skipped.
+static inline
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    // silences the unused-parameter warning when parameters are not dynamic
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int m_vec_limbs = PARAM_m_vec_limbs(p);
+    // running index of the current stored entry of P1
+    int entry = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++, entry++) {
+            if (c != r) {
+                const uint64_t *src = P1 + m_vec_limbs * entry;
+                for (int k = 0; k < param_o; k++) {
+                    m_vec_mul_add(m_vec_limbs, src, O[c * param_o + k], acc + m_vec_limbs * (r * param_o + k));
+                    m_vec_mul_add(m_vec_limbs, src, O[r * param_o + k], acc + m_vec_limbs * (c * param_o + k));
+                }
+            }
+        }
+    }
+}
+// Accumulates two products:
+//   VL   += Vdec * L             (k x o packed vectors)
+//   VP1V += Vdec * P1 * Vdec^t   (k x k packed vectors)
+// Vdec is a k x v matrix of field elements; L is v x o and P1 is the upper
+// triangular v x v matrix, both of packed vectors.
+// NOTE(review): the Pv scratch buffer is not wiped before returning; confirm
+// whether it can hold secret-dependent data.
+static inline
+void compute_M_and_VPV(const mayo_params_t* p, const unsigned char* Vdec, const uint64_t *L, const uint64_t *P1, uint64_t *VL, uint64_t *VP1V){
+    const int k_dim = PARAM_k(p);
+    const int v_dim = PARAM_v(p);
+    const int o_dim = PARAM_o(p);
+    // VL = Vdec * L
+    mul_add_mat_x_m_mat(PARAM_m_vec_limbs(p), Vdec, L, VL, k_dim, v_dim, o_dim);
+    // VP1V = Vdec * (P1 * Vdec^t), with the inner product kept in Pv
+    uint64_t Pv[V_MAX * K_MAX * M_VEC_LIMBS_MAX] = {0};
+    P1_times_Vt(p, P1, Vdec, Pv);
+    mul_add_mat_x_m_mat(PARAM_m_vec_limbs(p), Vdec, Pv, VP1V, k_dim, v_dim, k_dim);
+}
+// Computes P3 += O^t * (P1*O + P2).
+// P2 is updated in place to P1*O + P2 as a side effect; O is a v x o matrix
+// of field elements.
+static inline
+void compute_P3(const mayo_params_t* p, const uint64_t* P1, uint64_t *P2, const unsigned char *O, uint64_t *P3){
+    const int limbs = PARAM_m_vec_limbs(p);
+    const int v_dim = PARAM_v(p);
+    const int o_dim = PARAM_o(p);
+    // step 1: P2 <- P1*O + P2
+    P1_times_O(p, P1, O, P2);
+    // step 2: P3 <- P3 + O^t * P2
+    mul_add_mat_trans_x_m_mat(limbs, O, P2, P3, v_dim, o_dim, o_dim);
+}
+// Computes SPS = S * P * S^t for the block matrix P = {(P1, P2), (0, P3)}:
+//   P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//             [  0  P3 ]   [S2]   [        P3*S2]
+//   S * PS  = [ S1 S2 ] * PS
+// With HAVE_STACKEFFICIENT or PQM4 defined, a combined routine is used;
+// otherwise the full P * S^t is materialised in a local buffer first.
+static inline void m_calculate_PS_SPS(const mayo_params_t *p, const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *s,
+                                      uint64_t *SPS) {
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    // silences the unused-parameter warning when parameters are not dynamic
+    (void) p;
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    #if defined(HAVE_STACKEFFICIENT) || defined(PQM4)
+    mayo_generic_m_calculate_PS_SPS(P1, P2, P3, s, param_m, param_v, param_o, param_k, SPS);
+    #else
+    // first P * S^t ...
+    uint64_t PS[N_MAX * K_MAX * M_VEC_LIMBS_MAX] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, s, param_m, param_v, param_o, param_k, PS);
+    // ... then S * (P * S^t)
+    mayo_generic_m_calculate_SPS(PS, s, param_m, param_k, PARAM_n(p), SPS);
+    #endif
+}
+#endif