RubyGems - sleeping_kangaroo12 - Versions diffs - 0.0.1 - Mend

sleeping_kangaroo12 0.0.1

Files changed (284) hide show

data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c ADDED Viewed

@@ -0,0 +1,1216 @@
+/*
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
+The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
+Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <immintrin.h>
+#include <emmintrin.h>
+#include "align.h"
+#include "brg_endian.h"
+#include "Xoodoo.h"
+#include "Xoodoo-times8-SnP.h"
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+#error Expecting a little-endian platform
+#endif
+/* #define SIMULATE_AVX512 */
+#define VERBOSE         0
+#if defined(SIMULATE_AVX512)
+typedef struct /* Software stand-in for the 512-bit vector type in SIMULATE_AVX512 builds: sixteen 32-bit elements. */
+{
+    uint32_t x[16];
+} __m512i;
+/* Emulated masked 64-bit store: qword i of 'a' (elements 2*i and 2*i+1) is written to
+ * mem_addr[i] only when bit i of 'k' is set; other qwords in memory are left untouched. */
+static void _mm512_mask_store_epi64(void *mem_addr, uint8_t k, __m512i a)
+{
+    uint64_t *dst = (uint64_t *)mem_addr;
+    unsigned int lane;
+    for ( lane = 0; lane < 8; ++lane ) {
+        uint64_t lo, hi;
+        if ((k & (1 << lane)) == 0)
+            continue;   /* masked out */
+        lo = (uint64_t)a.x[2*lane];
+        hi = (uint64_t)a.x[2*lane+1];
+        dst[lane] = lo | (hi << 32);
+    }
+}
+/* Emulated zero-masked 64-bit load: qword i of the result is read from mem_addr[i] when
+ * bit i of 'k' is set and is zero otherwise. Memory is only read for selected qwords. */
+static __m512i _mm512_maskz_load_epi64(uint8_t k, const void *mem_addr)
+{
+    const uint64_t *src = (const uint64_t *)mem_addr;
+    __m512i result;
+    unsigned int lane;
+    for ( lane = 0; lane < 8; ++lane ) {
+        uint64_t q = ((k & (1 << lane)) != 0) ? src[lane] : 0;
+        result.x[2*lane]   = (uint32_t)q;
+        result.x[2*lane+1] = (uint32_t)(q >> 32);
+    }
+    return result;
+}
+/* Emulated full-width store: copies the 16 dwords of 'a' to mem_addr, in order. */
+static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
+{
+    uint32_t *dst = (uint32_t *)mem_addr;
+    unsigned int n = 0;
+    while ( n < 16 ) {
+        dst[n] = a.x[n];
+        ++n;
+    }
+}
+/* The aligned store is served by the same emulation routine. */
+#define _mm512_store_si512    _mm512_storeu_si512
+typedef union /* Overlays a __m256i with eight 32-bit words so the emulation code can access its elements individually. */
+{
+    uint32_t x[8];
+    __m256i  s;
+} s__m256i;
+/* Emulated 256-bit store: copies the 8 dwords of 'aa' to mem_addr, in order. */
+static void _mm256_storeu_si256(__m256i * mem_addr, __m256i aa)
+{
+    s__m256i view;
+    uint32_t *dst = (uint32_t *)mem_addr;
+    unsigned int n;
+    view.s = aa;
+    for ( n = 0; n != 8; n++ )
+        dst[n] = view.x[n];
+}
+/* The aligned store is served by the same emulation routine. */
+#define _mm256_store_si256    _mm256_storeu_si256
+/* Emulated full-width load: reads 16 consecutive dwords starting at mem_addr. */
+static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
+{
+    const uint32_t *src = (const uint32_t *)mem_addr;
+    __m512i loaded;
+    unsigned int n = 0;
+    while ( n < 16 ) {
+        loaded.x[n] = src[n];
+        ++n;
+    }
+    return loaded;
+}
+/* The aligned load is served by the same emulation routine. */
+#define _mm512_load_si512    _mm512_loadu_si512
+/* Emulated 256-bit load: reads 8 consecutive dwords starting at mem_addr. */
+static __m256i _mm256_loadu_si256(const __m256i * mem_addr)
+{
+    const uint32_t *src = (const uint32_t *)mem_addr;
+    s__m256i loaded;
+    unsigned int n;
+    for ( n = 0; n != 8; n++ )
+        loaded.x[n] = src[n];
+    return loaded.s;
+}
+/* The aligned load is served by the same emulation routine. */
+#define _mm256_load_si256    _mm256_loadu_si256
+/* Returns an emulated 512-bit vector whose 16 elements are all zero. */
+static __m512i _mm512_setzero_si512(void)
+{
+    __m512i zero;
+    memset(zero.x, 0, sizeof(zero.x));
+    return zero;
+}
+/* Returns a 256-bit vector whose 8 dwords are all zero. */
+static __m256i _mm256_setzero_si256(void)
+{
+    s__m256i zero;
+    memset(zero.x, 0, sizeof(zero.x));
+    return zero.s;
+}
+/* Emulated bitwise XOR of two 512-bit vectors, element by element. */
+static __m512i _mm512_xor_si512( __m512i a, __m512i b)
+{
+    unsigned int lane;
+    /* 'a' is a by-value copy, so it can serve as the accumulator. */
+    for ( lane = 0; lane < 16; ++lane )
+        a.x[lane] ^= b.x[lane];
+    return a;
+}
+/* Emulated bitwise AND of two 512-bit vectors, element by element. */
+static __m512i _mm512_and_si512( __m512i a, __m512i b)
+{
+    unsigned int lane;
+    /* 'a' is a by-value copy, so it can serve as the accumulator. */
+    for ( lane = 0; lane < 16; ++lane )
+        a.x[lane] &= b.x[lane];
+    return a;
+}
+/* Emulated three-input bitwise logic on 512-bit vectors.
+ * Only two truth tables are handled: 0x96 (a ^ b ^ c) and 0xD2 (a ^ (~b & c)).
+ * Any other value of 'imm' is reported on stderr and terminates the program. */
+static __m512i _mm512_ternarylogic_epi32(__m512i a, __m512i b, __m512i c, int imm)
+{
+    if (imm == 0x96)
+        return ( _mm512_xor_si512( _mm512_xor_si512( a, b ), c ) );
+    if (imm == 0xD2) {
+        __m512i t;
+        unsigned int i;
+        for ( i = 0; i < 16; ++i )
+            t.x[i] = ~b.x[i] & c.x[i];
+        return ( _mm512_xor_si512( a, t ) );
+    }
+    /* Diagnostics go to stderr; %X expects an unsigned argument. */
+    fprintf( stderr, "_mm512_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", (unsigned int)imm );
+    exit(1);    /* declared in <stdlib.h> */
+}
+/* Emulated three-input bitwise logic on 256-bit vectors.
+ * Only two truth tables are handled: 0x96 (a ^ b ^ c) and 0xD2 (a ^ (~b & c)).
+ * Any other value of 'imm' is reported on stderr and terminates the program. */
+static __m256i _mm256_ternarylogic_epi32(__m256i a, __m256i b, __m256i c, int imm)
+{
+    if (imm == 0x96)
+        return ( _mm256_xor_si256( _mm256_xor_si256( a, b ), c ) );
+    if (imm == 0xD2) {
+        s__m256i t;
+        s__m256i bb;
+        s__m256i cc;
+        unsigned int i;
+        bb.s = b;
+        cc.s = c;
+        for ( i = 0; i < 8; ++i )
+            t.x[i] = ~bb.x[i] & cc.x[i];
+        return ( _mm256_xor_si256( a, t.s ) );
+    }
+    /* Diagnostics go to stderr; %X expects an unsigned argument. */
+    fprintf( stderr, "_mm256_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", (unsigned int)imm );
+    exit(1);    /* declared in <stdlib.h> */
+}
+/* Emulated rotate-left of each 32-bit element by 'offset' bits.
+ * The count is reduced modulo 32, so a count of 0 (or a multiple of 32) returns the
+ * input unchanged instead of performing an undefined 32-bit shift. */
+static __m512i _mm512_rol_epi32(__m512i a, int offset)
+{
+    __m512i r;
+    unsigned int i;
+    const unsigned int n = (unsigned int)offset & 31;
+    for ( i = 0; i < 16; ++i )
+        r.x[i] = (a.x[i] << n) | (a.x[i] >> ((32 - n) & 31));  /* (32-n)&31 keeps the right shift below 32 */
+    return(r);
+}
+/* Emulated rotate-left of each 32-bit element of a 256-bit vector by 'offset' bits.
+ * The count is reduced modulo 32, so a count of 0 (or a multiple of 32) returns the
+ * input unchanged instead of performing an undefined 32-bit shift. */
+static __m256i _mm256_rol_epi32(__m256i a, int offset)
+{
+    s__m256i r;
+    s__m256i aa;
+    unsigned int i;
+    const unsigned int n = (unsigned int)offset & 31;
+    aa.s = a;
+    for ( i = 0; i < 8; ++i )
+        r.x[i] = (aa.x[i] << n) | (aa.x[i] >> ((32 - n) & 31));  /* (32-n)&31 keeps the right shift below 32 */
+    return(r.s);
+}
+/* Emulated logical left shift of each 32-bit element by 'offset' bits.
+ * Counts above 31 (including negative values, seen as large unsigned counts) produce
+ * zero rather than an undefined C shift. */
+static __m512i _mm512_slli_epi32(__m512i a, int offset)
+{
+    __m512i r;
+    unsigned int i;
+    const unsigned int n = (unsigned int)offset;
+    for ( i = 0; i < 16; ++i )
+        r.x[i] = (n > 31) ? 0 : (a.x[i] << n);
+    return(r);
+}
+/* Returns an emulated 512-bit vector with all 16 elements set to 'a'. */
+static __m512i _mm512_set1_epi32(uint32_t a)
+{
+    __m512i filled;
+    unsigned int n = 16;
+    while ( n-- != 0 )
+        filled.x[n] = a;
+    return filled;
+}
+/* Emulated gather: element i of the result is the 32-bit word read from the byte
+ * address p + idx[i] * scale. */
+static __m512i _mm512_i32gather_epi32(__m512i idx, const void *p, int scale)
+{
+    const char *base = (const char *)p;
+    __m512i gathered;
+    unsigned int lane;
+    for ( lane = 0; lane < 16; ++lane ) {
+        const uint32_t *src = (const uint32_t *)(base + idx.x[lane] * scale);
+        gathered.x[lane] = *src;
+    }
+    return gathered;
+}
+/* Emulated scatter: element i of 'value' is written as a 32-bit word to the byte
+ * address p + idx[i] * scale, for i = 0..15 in ascending order. */
+static void _mm512_i32scatter_epi32( void *p, __m512i idx, __m512i value, int scale)
+{
+    char *base = (char *)p;
+    unsigned int lane;
+    for ( lane = 0; lane < 16; ++lane ) {
+        uint32_t *dst = (uint32_t *)(base + idx.x[lane] * scale);
+        *dst = value.x[lane];
+    }
+}
+/* Emulated 8-element scatter: element i of 'value' is written as a 32-bit word to the
+ * byte address p + idx[i] * scale, for i = 0..7 in ascending order. */
+static void _mm256_i32scatter_epi32( void *p, __m256i idx, __m256i value, int scale)
+{
+    char *base = (char *)p;
+    s__m256i where, what;
+    unsigned int lane;
+    where.s = idx;
+    what.s = value;
+    for ( lane = 0; lane < 8; ++lane ) {
+        uint32_t *dst = (uint32_t *)(base + where.x[lane] * scale);
+        *dst = what.x[lane];
+    }
+}
+/* Emulated masked scatter: like the unmasked 16-element scatter, but element i is only
+ * written when bit i of 'k' is set. */
+static void _mm512_mask_i32scatter_epi32( void *p, uint16_t k, __m512i idx, __m512i value, int scale)
+{
+    char *base = (char *)p;
+    unsigned int lane;
+    for ( lane = 0; lane < 16; ++lane ) {
+        uint32_t *dst;
+        if ((k & (1 << lane)) == 0)
+            continue;   /* masked out */
+        dst = (uint32_t *)(base + idx.x[lane] * scale);
+        *dst = value.x[lane];
+    }
+}
+/* Emulated masked 8-element scatter: element i of 'value' is written to the byte
+ * address p + idx[i] * scale only when bit i of 'k' is set. */
+static void _mm256_mask_i32scatter_epi32( void *p, uint16_t k, __m256i idx, __m256i value, int scale)
+{
+    char *base = (char *)p;
+    s__m256i where, what;
+    unsigned int lane;
+    where.s = idx;
+    what.s = value;
+    for ( lane = 0; lane < 8; ++lane ) {
+        uint32_t *dst;
+        if ((k & (1 << lane)) == 0)
+            continue;   /* masked out */
+        dst = (uint32_t *)(base + where.x[lane] * scale);
+        *dst = what.x[lane];
+    }
+}
+/* Builds an emulated 512-bit vector from 16 values given in element order:
+ * the first argument becomes element 0 and the last argument becomes element 15. */
+static __m512i _mm512_setr_epi32(    int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8,
+                                    int e7,  int e6,  int e5,  int e4,  int e3,  int e2,  int ee1, int ee0)
+{
+    const int inCallOrder[16] = { e15, e14, e13, e12, e11, e10, e9, e8,
+                                  e7,  e6,  e5,  e4,  e3,  e2,  ee1, ee0 };
+    __m512i built;
+    unsigned int n;
+    for ( n = 0; n < 16; ++n )
+        built.x[n] = inCallOrder[n];
+    return built;
+}
+/* Builds a 256-bit vector from 8 values given in element order:
+ * the first argument becomes element 0 and the last argument becomes element 7. */
+static __m256i _mm256_setr_epi32(int e7,  int e6,  int e5,  int e4,  int e3,  int e2,  int ee1, int ee0)
+{
+    const int inCallOrder[8] = { e7, e6, e5, e4, e3, e2, ee1, ee0 };
+    s__m256i built;
+    unsigned int n;
+    for ( n = 0; n < 8; ++n )
+        built.x[n] = inCallOrder[n];
+    return built.s;
+}
+/* Returns a copy of 'a' in which one 256-bit half is replaced by 'b':
+ * the low half (elements 0..7) when imm8 is 0, the high half (elements 8..15) otherwise. */
+static __m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
+{
+    const unsigned int first = (imm8 == 0) ? 0 : 8;
+    __m512i merged;
+    s__m256i half;
+    unsigned int n;
+    merged = a;
+    half.s = b;
+    for ( n = 0; n < 8; ++n )
+        merged.x[first + n] = half.x[n];
+    return merged;
+}
+/* Emulated two-source permute: for each element, bit 4 of the index selects the source
+ * ('b' when set, 'a' when clear) and the low 4 bits select the element within it. */
+static __m512i _mm512_permutex2var_epi32(__m512i a, __m512i idx, __m512i b)
+{
+    __m512i picked;
+    unsigned int lane;
+    for ( lane = 0; lane < 16; ++lane ) {
+        const uint32_t sel = idx.x[lane];
+        const __m512i *src = (sel & 0x10) ? &b : &a;
+        picked.x[lane] = src->x[sel & 0x0F];
+    }
+    return picked;
+}
+/* Emulated two-source permute on 256-bit vectors: for each element, bit 3 of the index
+ * selects the source ('b' when set, 'a' when clear) and the low 3 bits select the
+ * element within it. */
+static __m256i _mm256_permutex2var_epi32(__m256i a, __m256i idx, __m256i b)
+{
+    s__m256i picked;
+    s__m256i sel, first, second;
+    unsigned int lane;
+    sel.s = idx;
+    first.s = a;
+    second.s = b;
+    for ( lane = 0; lane < 8; ++lane ) {
+        const uint32_t s = sel.x[lane];
+        const s__m256i *src = (s & 8) ? &second : &first;
+        picked.x[lane] = src->x[s & 7];
+    }
+    return picked.s;
+}
+/* Emulated single-source permute: element i of the result is a.x[idx.x[i] & 15].
+ * Only the low 4 bits of each index are used, so an index with higher bits set cannot
+ * read outside a.x[] (and this matches the masking done by the two-source variant). */
+static __m512i _mm512_permutexvar_epi32(__m512i idx, __m512i a)
+{
+    __m512i r;
+    unsigned int i;
+    for ( i = 0; i < 16; ++i )
+        r.x[i] = a.x[idx.x[i] & 0x0F];
+    return(r);
+}
+/* Emulated single-source permute on 256-bit vectors: element i of the result is element
+ * (idx[i] & 7) of 'a'. Only the low 3 bits of each index are used, so an index with
+ * higher bits set cannot read outside the 8-element source. */
+static __m256i _mm256_permutexvar_epi32(__m256i idx, __m256i a)
+{
+    s__m256i r;
+    s__m256i iidx, aa;
+    unsigned int i;
+    iidx.s = idx;
+    aa.s = a;
+    for ( i = 0; i < 8; ++i )
+        r.x[i] = aa.x[iidx.x[i] & 7];
+    return(r.s);
+}
+/* Widens a 256-bit vector to the emulated 512-bit type: elements 0..7 come from 'a',
+ * elements 8..15 are zero in this emulation. */
+static __m512i _mm512_castsi256_si512(__m256i a)
+{
+    s__m256i low;
+    __m512i wide = _mm512_setzero_si512();
+    unsigned int n;
+    low.s = a;
+    for ( n = 0; n < 8; ++n )
+        wide.x[n] = low.x[n];
+    return wide;
+}
+#endif
+typedef __m128i V128; /* Short aliases for the 128-, 256- and 512-bit integer vector types. */
+typedef __m256i V256;
+typedef __m512i V512;
+/* Size in bytes of one state lane (a 32-bit word). */
+#define SnP_laneLengthInBytes   4
+/* Index, counted in 32-bit words, of lane 'lanePosition' of instance 'instanceIndex':
+ * the 8 instances' copies of a given lane sit next to each other.
+ * Both arguments are parenthesized so that any expression can be passed safely. */
+#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*8 + (instanceIndex))
+#define Chi(a,b,c)                  _mm256_ternarylogic_epi32(a,b,c,0xD2) /* truth table 0xD2: a ^ (~b & c) */
+#define CONST8_32(a)                _mm256_set1_epi32(a) /* broadcast one 32-bit value to all 8 elements */
+#define LOAD256u(a)                 _mm256_loadu_si256((const V256 *)&(a))
+#define LOAD512(a)                  _mm512_load_si512((const V512 *)&(a))
+#define LOAD512u(a)                 _mm512_loadu_si512((const V512 *)&(a))
+#define LOAD_GATHER8_32(idx,p)      _mm256_i32gather_epi32((const void*)(p), idx, 4) /* scale 4: idx counts 32-bit words */
+#define STORE_SCATTER8_32(idx,a,p)  _mm256_i32scatter_epi32((void*)(p), idx, a, 4) /* scale 4: idx counts 32-bit words */
+#define LOAD8_32(a,b,c,d,e,f,g,h)   _mm256_setr_epi32(a,b,c,d,e,f,g,h)
+#define SHUFFLE_LANES_RIGHT(idx, a) _mm256_permutexvar_epi32(idx, a)
+#define ROL32(a, o)                 _mm256_rol_epi32(a, o)
+#define SHL32(a, o)                 _mm256_slli_epi32(a, o)
+#define SET8_32                     _mm256_setr_epi32
+#define STORE128(a, b)              _mm_store_si128((V128 *)&(a), b)
+#define STORE128u(a, b)             _mm_storeu_si128((V128 *)&(a), b)
+#define STORE256u(a, b)             _mm256_storeu_si256((V256 *)&(a), b)
+#define STORE256(a, b)              _mm256_store_si256((V256 *)&(a), b)
+#define STORE512(a, b)              _mm512_store_si512((V512 *)&(a), b)
+#define STORE512u(a, b)             _mm512_storeu_si512((V512 *)&(a), b)
+#define AND(a, b)                   _mm256_and_si256(a, b)
+#define XOR128(a, b)                _mm_xor_si128(a, b)
+#define XOR256(a, b)                _mm256_xor_si256(a, b)
+#define XOR512(a, b)                _mm512_xor_si512(a, b)
+#define XOR(a, b)                   XOR256(a, b) /* the unsuffixed XOR is the 256-bit one */
+#define XOR3(a,b,c)                 _mm256_ternarylogic_epi32(a,b,c,0x96) /* truth table 0x96: a ^ b ^ c */
+#ifndef _mm256_storeu2_m128i /* fallback when no macro of this name exists: low 128 bits go to lo, high 128 bits to hi */
+#define _mm256_storeu2_m128i(hi, lo, a)    _mm_storeu_si128((V128*)(lo), _mm256_castsi256_si128(a)), _mm_storeu_si128((V128*)(hi), _mm256_extracti128_si256(a, 1))
+#endif
+#if (VERBOSE > 0)
+    /* Debug helper: stores vector __v##__i into the scratch buffer and prints its 8 words.
+     * The scratch buffer is a plain uint32_t array with no 32-byte alignment guarantee,
+     * so the unaligned store is used. */
+    #define    DumpOne(__b,__v,__i) STORE256u(__b, __v##__i); \
+                                    printf("%02u %08x %08x %08x %08x %08x %08x %08x %08x\n", __i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])
+    /* Prints the title __t followed by the twelve vectors __v00..__v03, __v10..__v13, __v20..__v23. */
+    #define    Dump(__t,__v)    {                   \
+                            uint32_t    buf[8];    \
+                            printf("%s\n", __t);    \
+                            DumpOne(buf, __v, 00);  \
+                            DumpOne(buf, __v, 01);  \
+                            DumpOne(buf, __v, 02);  \
+                            DumpOne(buf, __v, 03);  \
+                            DumpOne(buf, __v, 10);  \
+                            DumpOne(buf, __v, 11);  \
+                            DumpOne(buf, __v, 12);  \
+                            DumpOne(buf, __v, 13);  \
+                            DumpOne(buf, __v, 20);  \
+                            DumpOne(buf, __v, 21);  \
+                            DumpOne(buf, __v, 22);  \
+                            DumpOne(buf, __v, 23);  \
+                        }
+#else
+    /* Tracing disabled: Dump expands to nothing. */
+    #define    Dump(__t,__v)
+#endif
+#if (VERBOSE >= 1) /* Dump1, Dump2 and Dump3 forward to Dump when VERBOSE is at least 1, 2 and 3 respectively; otherwise they expand to nothing. */
+    #define    Dump1(__t,__v)    Dump(__t,__v)
+#else
+    #define    Dump1(__t,__v)
+#endif
+#if (VERBOSE >= 2)
+    #define    Dump2(__t,__v)    Dump(__t,__v)
+#else
+    #define    Dump2(__t,__v)
+#endif
+#if (VERBOSE >= 3)
+    #define    Dump3(__t,__v)    Dump(__t,__v)
+#else
+    #define    Dump3(__t,__v)
+#endif
+#if (VERBOSE > 0)
+/* Print 8 (DUMP32) or 12 (DUMP32_12) 32-bit words of buf in hexadecimal, prefixed by tt. */
+#define    DUMP32(tt, buf)    printf("%s %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])
+#define    DUMP32_12(tt, buf) printf("%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11])
+/* Prints the 8 words of one 256-bit vector. The local buffer has no 32-byte alignment
+ * guarantee, so the unaligned store is used. */
+#define    DumpLane(__t,__v) {  uint32_t buf[8]; \
+                                STORE256u(buf[0], __v); \
+                                printf("%s %08x %08x %08x %08x %08x %08x %08x %08x\n", __t, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); }
+#else
+/* Tracing disabled: the helpers expand to nothing. */
+#define    DUMP32(__t, buf)
+#define    DUMP32_12(__t, buf)
+#define    DumpLane(__t,__v)
+#endif
+ALIGN(32) static const uint32_t     oAllFrom1_0[]   = {   1,   2,   3,   4,   5,   6,   7, 8+0 }; /* Constant tables of 32-bit element indices and offsets.
+ * Entries written as 8+n presumably select element n of the second source of a two-source
+ * permute (cf. _mm256_permutex2var_epi32); the code using these tables lies outside this
+ * part of the file -- TODO confirm against the users.
+ */
+ALIGN(32) static const uint32_t     oAllFrom1_2[]   = {   1,   2,   3,   4,   5,   6,   7, 8+2 };
+ALIGN(32) static const uint32_t     oAllFrom1_3[]   = {   1,   2,   3,   4,   5,   6,   7, 8+3 };
+ALIGN(32) static const uint32_t     oAllFrom1_4[]   = {   1,   2,   3,   4,   5,   6,   7, 8+4 };
+ALIGN(32) static const uint32_t     oAllFrom1_5[]   = {   1,   2,   3,   4,   5,   6,   7, 8+5 };
+ALIGN(32) static const uint32_t     oAllFrom1_6[]   = {   1,   2,   3,   4,   5,   6,   7, 8+6 };
+ALIGN(32) static const uint32_t     oAllFrom2_0[]   = {   2,   3,   4,   5,   6,   7, 8+0, 8+1 };
+ALIGN(32) static const uint32_t     oAllFrom2_4[]   = {   2,   3,   4,   5,   6,   7, 8+4, 8+5 };
+ALIGN(32) static const uint32_t     oAllFrom3_2[]   = {   3,   4,   5,   6,   7, 8+2, 8+3, 8+4 };
+ALIGN(32) static const uint32_t     oAllFrom3_4[]   = {   3,   4,   5,   6,   7, 8+4, 8+5, 8+6 };
+ALIGN(32) static const uint32_t     oLow128[]       = {   0,   1,   2,   3, 8+0, 8+1, 8+2, 8+3 };
+ALIGN(32) static const uint32_t     oHigh128[]      = {   4,   5,   6,   7, 8+4, 8+5, 8+6, 8+7 };
+ALIGN(32) static const uint32_t     oLow64[]        = {   0,   1, 8+0, 8+1,   4,   5, 8+4, 8+5 };
+ALIGN(32) static const uint32_t     oHigh64[]       = {   2,   3, 8+2, 8+3,   6,   7, 8+6, 8+7 };
+ALIGN(32) static const uint32_t     oLow32[]        = {   0, 8+0,   2, 8+2,   4, 8+4,   6, 8+6 };
+ALIGN(32) static const uint32_t     oHigh32[]       = {   1, 8+1,   3, 8+3,   5, 8+5,   7, 8+7 };
+ALIGN(32) static const uint32_t     oLow32_4[]      = {   0,   2,   4,   6,   0,   2,   4,   6 };
+ALIGN(32) static const uint32_t     oHigh32_4[]     = {   1,   3,   5,   7,   0,   2,   4,   6 };
+ALIGN(32) static const uint32_t    oGatherScatterOffsets[]      = { 0*12, 1*12, 2*12, 3*12, 4*12, 5*12, 6*12, 7*12 }; /* multiples of 12, one per element */
+ALIGN(64) static const uint32_t    oGatherScatterOffsetsRoll[]  = { 0, 0, 0, 0, 0,  4,  8,  1,  5,  9,  2,  6, 10, 3, 7, 11 }; /* First 4 are dummies */
+/* Clears the whole states buffer (Xoodootimes8_statesSizeInBytes bytes) to zero. */
+void Xoodootimes8_InitializeAll(void *states)
+{
+    const size_t stateBytes = Xoodootimes8_statesSizeInBytes;
+    memset(states, 0, stateBytes);
+}
+/* XORs 'length' bytes from 'data' into the state of instance 'instanceIndex', starting at
+ * byte position 'offset' within that instance's state.
+ * The range is handled as an optional partial first lane, a run of whole 32-bit lanes and
+ * an optional partial last lane. 'data' needs no particular alignment. */
+void Xoodootimes8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    unsigned int sizeLeft = length;
+    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+    const unsigned char *curData = data;
+    uint32_t *statesAsLanes = (uint32_t *)states;
+    /* Partial first lane: place the bytes at their position inside a zeroed lane. */
+    if ((sizeLeft > 0) && (offsetInLane != 0)) {
+        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+        uint32_t lane = 0;
+        if (bytesInLane > sizeLeft)
+            bytesInLane = sizeLeft;
+        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
+        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+        sizeLeft -= bytesInLane;
+        lanePosition++;
+        curData += bytesInLane;
+    }
+    /* Whole lanes. 'curData' is a byte pointer that may be misaligned for uint32_t,
+     * so the lane is fetched with memcpy rather than through a casted pointer. */
+    while(sizeLeft >= SnP_laneLengthInBytes) {
+        uint32_t lane;
+        memcpy(&lane, curData, sizeof(lane));
+        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+        sizeLeft -= SnP_laneLengthInBytes;
+        lanePosition++;
+        curData += SnP_laneLengthInBytes;
+    }
+    /* Partial last lane. */
+    if (sizeLeft > 0) {
+        uint32_t lane = 0;
+        memcpy(&lane, curData, sizeLeft);
+        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+    }
+}
+/* XORs 'laneCount' lanes of input into all 8 states at once. For lane n, the 8 input
+ * words are gathered from data at word offsets n + i*laneOffset (i = 0..7) and XORed
+ * into the n-th 256-bit vector of the states. laneCount == 12 takes an unrolled path. */
+void Xoodootimes8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    const uint32_t *words = (const uint32_t *)data;
+    V256 *lanes = (V256 *)states;
+    V256 stride = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
+    unsigned int lane;
+    #define XorLaneIn( n )  lanes[n] = XOR(lanes[n], LOAD_GATHER8_32(stride, &words[n]))
+    if ( laneCount != 12 )  {
+        for ( lane = 0; lane < laneCount; ++lane )
+            XorLaneIn( lane );
+    }
+    else {
+        /* Full state: unrolled. */
+        XorLaneIn( 0 );  XorLaneIn( 1 );  XorLaneIn( 2 );  XorLaneIn( 3 );
+        XorLaneIn( 4 );  XorLaneIn( 5 );  XorLaneIn( 6 );  XorLaneIn( 7 );
+        XorLaneIn( 8 );  XorLaneIn( 9 );  XorLaneIn( 10 ); XorLaneIn( 11 );
+    }
+    #undef  XorLaneIn
+}
+/* Overwrites 'length' bytes of the state of instance 'instanceIndex', starting at byte
+ * position 'offset' within that instance's state, with the bytes from 'data'.
+ * The range is handled as an optional partial first lane, a run of whole 32-bit lanes and
+ * an optional partial last lane. 'data' needs no particular alignment. */
+void Xoodootimes8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    unsigned int sizeLeft = length;
+    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+    const unsigned char *curData = data;
+    uint32_t *statesAsLanes = (uint32_t *)states;
+    /* Partial first lane: copy straight into the lane's bytes. */
+    if ((sizeLeft > 0) && (offsetInLane != 0)) {
+        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+        if (bytesInLane > sizeLeft)
+            bytesInLane = sizeLeft;
+        memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
+        sizeLeft -= bytesInLane;
+        lanePosition++;
+        curData += bytesInLane;
+    }
+    /* Whole lanes. 'curData' may be misaligned for uint32_t, so copy bytes with memcpy
+     * instead of reading through a casted pointer. */
+    while(sizeLeft >= SnP_laneLengthInBytes) {
+        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, SnP_laneLengthInBytes);
+        sizeLeft -= SnP_laneLengthInBytes;
+        lanePosition++;
+        curData += SnP_laneLengthInBytes;
+    }
+    /* Partial last lane. */
+    if (sizeLeft > 0) {
+        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
+    }
+}
+/* Overwrites 'laneCount' lanes of all 8 states at once. For lane n, the 8 input words
+ * are gathered from data at word offsets n + i*laneOffset (i = 0..7) and stored as the
+ * n-th 256-bit vector of the states. laneCount == 12 takes an unrolled path. */
+void Xoodootimes8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    const uint32_t *words = (const uint32_t *)data;
+    V256 *lanes = (V256 *)states;
+    V256 stride = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
+    unsigned int lane;
+    #define SetLane( n )  lanes[n] = LOAD_GATHER8_32(stride, &words[n])
+    if ( laneCount != 12 )  {
+        for ( lane = 0; lane < laneCount; ++lane )
+            SetLane( lane );
+    }
+    else {
+        /* Full state: unrolled. */
+        SetLane( 0 );  SetLane( 1 );  SetLane( 2 );  SetLane( 3 );
+        SetLane( 4 );  SetLane( 5 );  SetLane( 6 );  SetLane( 7 );
+        SetLane( 8 );  SetLane( 9 );  SetLane( 10 ); SetLane( 11 );
+    }
+    #undef  SetLane
+}
+/* Sets the first 'byteCount' bytes of the state of instance 'instanceIndex' to zero:
+ * whole lanes are cleared one by one, then the leading bytes of the next lane if
+ * byteCount is not a multiple of the lane size. */
+void Xoodootimes8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
+{
+    uint32_t *lanes = (uint32_t *)states;
+    const unsigned int wholeLanes = byteCount / SnP_laneLengthInBytes;
+    const unsigned int tailBytes = byteCount % SnP_laneLengthInBytes;
+    unsigned int pos;
+    for ( pos = 0; pos < wholeLanes; ++pos )
+        lanes[laneIndex(instanceIndex, pos)] = 0;
+    if (tailBytes != 0)
+        memset(&lanes[laneIndex(instanceIndex, wholeLanes)], 0, tailBytes);
+}
+/* Copies 'length' bytes of the state of instance 'instanceIndex', starting at byte
+ * position 'offset' within that instance's state, into 'data'.
+ * The range is handled as an optional partial first lane, a run of whole 32-bit lanes and
+ * an optional partial last lane. 'data' needs no particular alignment. */
+void Xoodootimes8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    unsigned int sizeLeft = length;
+    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+    unsigned char *curData = data;
+    const uint32_t *statesAsLanes = (const uint32_t *)states;
+    /* Partial first lane (the state is only read, so constness is kept). */
+    if ((sizeLeft > 0) && (offsetInLane != 0)) {
+        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+        if (bytesInLane > sizeLeft)
+            bytesInLane = sizeLeft;
+        memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
+        sizeLeft -= bytesInLane;
+        lanePosition++;
+        curData += bytesInLane;
+    }
+    /* Whole lanes. 'curData' may be misaligned for uint32_t, so copy bytes with memcpy
+     * instead of writing through a casted pointer. */
+    while(sizeLeft >= SnP_laneLengthInBytes) {
+        memcpy(curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], SnP_laneLengthInBytes);
+        sizeLeft -= SnP_laneLengthInBytes;
+        lanePosition++;
+        curData += SnP_laneLengthInBytes;
+    }
+    /* Partial last lane. */
+    if (sizeLeft > 0) {
+        memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
+    }
+}
+/* Writes 'laneCount' lanes of all 8 states to 'data'. For lane n, the 8 words of the
+ * n-th 256-bit state vector are scattered to word offsets n + i*laneOffset (i = 0..7).
+ * laneCount == 12 takes an unrolled path. */
+void Xoodootimes8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    uint32_t *words = (uint32_t *)data;
+    const V256 *lanes = (const V256 *)states;
+    V256 stride = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
+    unsigned int lane;
+    #define PutLane( n )        STORE_SCATTER8_32(stride, lanes[n], &words[n])
+    if ( laneCount != 12 )  {
+        for ( lane = 0; lane < laneCount; ++lane )
+            PutLane( lane );
+    }
+    else {
+        /* Full state: unrolled. */
+        PutLane( 0 );  PutLane( 1 );  PutLane( 2 );  PutLane( 3 );
+        PutLane( 4 );  PutLane( 5 );  PutLane( 6 );  PutLane( 7 );
+        PutLane( 8 );  PutLane( 9 );  PutLane( 10 ); PutLane( 11 );
+    }
+    #undef  PutLane
+}
+/* Computes output = input XOR state bytes for 'length' bytes of instance 'instanceIndex',
+ * starting at byte position 'offset' within that instance's state.
+ * The range is handled as an optional partial first lane, a run of whole 32-bit lanes and
+ * an optional partial last lane. Neither 'input' nor 'output' needs any alignment. */
+void Xoodootimes8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    unsigned int sizeLeft = length;
+    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+    const unsigned char *curInput = input;
+    unsigned char *curOutput = output;
+    const uint32_t *statesAsLanes = (const uint32_t *)states;
+    /* Partial first lane: shift the lane so its first wanted byte is the low byte. */
+    if ((sizeLeft > 0) && (offsetInLane != 0)) {
+        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+        uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
+        if (bytesInLane > sizeLeft)
+            bytesInLane = sizeLeft;
+        sizeLeft -= bytesInLane;
+        do {
+            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+            lane >>= 8;
+        } while ( --bytesInLane != 0);
+        lanePosition++;
+    }
+    /* Whole lanes. The byte pointers may be misaligned for uint32_t, so the words are
+     * moved with memcpy instead of being accessed through casted pointers. */
+    while(sizeLeft >= SnP_laneLengthInBytes) {
+        uint32_t lane;
+        memcpy(&lane, curInput, sizeof(lane));
+        lane ^= statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+        memcpy(curOutput, &lane, sizeof(lane));
+        sizeLeft -= SnP_laneLengthInBytes;
+        lanePosition++;
+        curInput += SnP_laneLengthInBytes;
+        curOutput += SnP_laneLengthInBytes;
+    }
+    /* Partial last lane, consumed from its low byte upwards. */
+    if (sizeLeft != 0) {
+        uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+        do {
+            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+            lane >>= 8;
+        } while ( --sizeLeft != 0);
+    }
+}
+/* For 'laneCount' lanes of all 8 states: gathers the 8 input words of lane n from word
+ * offsets n + i*laneOffset (i = 0..7), XORs them with the n-th 256-bit state vector and
+ * scatters the result to the same offsets in 'output'. laneCount == 12 takes an
+ * unrolled path. */
+void Xoodootimes8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
+{
+    const uint32_t *in32 = (const uint32_t *)input;
+    uint32_t *out32 = (uint32_t *)output;
+    const V256 *lanes = (const V256 *)states;
+    V256 stride = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
+    unsigned int lane;
+    #define XorLaneOut( n ) STORE_SCATTER8_32(stride, XOR( lanes[n], LOAD_GATHER8_32(stride, &in32[n])), &out32[n])
+    if ( laneCount != 12 )  {
+        for ( lane = 0; lane < laneCount; ++lane ) {
+            XorLaneOut( lane );
+        }
+    }
+    else {
+        /* Full state: unrolled. */
+        XorLaneOut( 0 );  XorLaneOut( 1 );  XorLaneOut( 2 );  XorLaneOut( 3 );
+        XorLaneOut( 4 );  XorLaneOut( 5 );  XorLaneOut( 6 );  XorLaneOut( 7 );
+        XorLaneOut( 8 );  XorLaneOut( 9 );  XorLaneOut( 10 ); XorLaneOut( 11 );
+    }
+    #undef  XorLaneOut
+}
+#define DeclareVars     V256    a00, a01, a02, a03; \
+                        V256    a10, a11, a12, a13; \
+                        V256    a20, a21, a22, a23; \
+                        V256    v1, v2; /* twelve working vectors for the state plus the two temporaries used by Round */
+#define State2Vars2     a00 = states[0], a01 = states[1], a02 = states[ 2], a03 = states[ 3]; \
+                        a12 = states[4], a13 = states[5], a10 = states[ 6], a11 = states[ 7]; \
+                        a20 = states[8], a21 = states[9], a22 = states[10], a23 = states[11] /* like State2Vars, except states[4..7] are loaded into a12, a13, a10, a11; used by the 6-round function */
+#define State2Vars      a00 = states[0], a01 = states[1], a02 = states[ 2], a03 = states[ 3]; \
+                        a10 = states[4], a11 = states[5], a12 = states[ 6], a13 = states[ 7]; \
+                        a20 = states[8], a21 = states[9], a22 = states[10], a23 = states[11] /* straight load: states[0..11] into a00..a03, a10..a13, a20..a23 */
+#define Vars2State      states[0] = a00, states[1] = a01, states[ 2] = a02, states[ 3] = a03; \
+                        states[4] = a10, states[5] = a11, states[ 6] = a12, states[ 7] = a13; \
+                        states[8] = a20, states[9] = a21, states[10] = a22, states[11] = a23 /* writes the working vectors back in the State2Vars order */
+#define Round(a10i, a11i, a12i, a13i, a10w, a11w, a12w, a13w, a20i, a21i, a22i, a23i, __rc) \
+    /* One round on the a?? working vectors (uses v1, v2) with constant __rc. */ \
+    /* Theta: Column Parity Mixer */                        \
+    /* Iota: round constants */                             \
+    v1 = XOR3( a03, a13i, a23i );                           \
+    v2 = XOR3( a00, a10i, a20i );                           \
+    v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) );               \
+    a00  = XOR3( a00,  v1, CONST8_32(__rc) ); /* Iota */    \
+    a10i = XOR( a10i, v1 );                                 \
+    a20i = XOR( a20i, v1 );                                 \
+    v1 = XOR3( a01, a11i, a21i );                           \
+    v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) );               \
+    a01  = XOR( a01,  v2 );                                 \
+    a11i = XOR( a11i, v2 );                                 \
+    a21i = XOR( a21i, v2 );                                 \
+    v2 = XOR3( a02, a12i, a22i );                           \
+    v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) );               \
+    a02  = XOR( a02,  v1 );                                 \
+    a12i = XOR( a12i, v1 );                                 \
+    a22i = XOR( a22i, v1 );                                 \
+    v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) );               \
+    a03  = XOR( a03,  v2 );                                 \
+    a13i = XOR( a13i, v2 );                                 \
+    a23i = XOR( a23i, v2 );                                 \
+    Dump3("Theta",a);                                       \
+                                                            \
+    /* Rho-west: Plane shift (a2? rotated by 11 here; the a1? vectors are addressed through the a1?w names from Chi on) */ \
+    a20i = ROL32(a20i, 11);                                 \
+    a21i = ROL32(a21i, 11);                                 \
+    a22i = ROL32(a22i, 11);                                 \
+    a23i = ROL32(a23i, 11);                                 \
+    Dump3("Rho-west",a);                                    \
+                                                            \
+    /* Chi: non linear step, on columns */                  \
+    a00  = Chi(a00,  a10w, a20i);                           \
+    a01  = Chi(a01,  a11w, a21i);                           \
+    a02  = Chi(a02,  a12w, a22i);                           \
+    a03  = Chi(a03,  a13w, a23i);                           \
+    a10w = Chi(a10w, a20i, a00);                            \
+    a11w = Chi(a11w, a21i, a01);                            \
+    a12w = Chi(a12w, a22i, a02);                            \
+    a13w = Chi(a13w, a23i, a03);                            \
+    a20i = Chi(a20i, a00,  a10w);                           \
+    a21i = Chi(a21i, a01,  a11w);                           \
+    a22i = Chi(a22i, a02,  a12w);                           \
+    a23i = Chi(a23i, a03,  a13w);                           \
+    Dump3("Chi",a);                                         \
+                                                            \
+    /* Rho-east: Plane shift */                             \
+    a10w = ROL32(a10w, 1);                                  \
+    a11w = ROL32(a11w, 1);                                  \
+    a12w = ROL32(a12w, 1);                                  \
+    a13w = ROL32(a13w, 1);                                  \
+    a20i = ROL32(a20i, 8);                                  \
+    a21i = ROL32(a21i, 8);                                  \
+    a22i = ROL32(a22i, 8);                                  \
+    a23i = ROL32(a23i, 8);                                  \
+    Dump3("Rho-east",a)
+void Xoodootimes8_PermuteAll_6rounds(void *argstates) /* Applies six Round steps, with constants _rc6 down to _rc1,
+ * to the twelve V256 vectors at argstates and writes the result back in place.
+ */
+{
+    V256 * states = (V256 *)argstates;
+    DeclareVars;
+    State2Vars2; /* rotated load of states[4..7]; the Round argument lists below match the last six calls of the 12-round function */
+    Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc6 );
+    Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc5 );
+    Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc4 );
+    Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc3 );
+    Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc2 );
+    Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc1 );
+    Dump2("Permutation\n", a);
+    Vars2State; /* store the working vectors back into states[0..11] */
+}
+void Xoodootimes8_PermuteAll_12rounds(void *argstates) /* Applies twelve Round steps, with constants _rc12 down to _rc1,
+ * to the twelve V256 vectors at argstates and writes the result back in place.
+ */
+{
+    V256 * states = (V256 *)argstates;
+    DeclareVars;
+    State2Vars; /* straight load of states[0..11] */
+    Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc12 ); /* the a1? argument lists rotate by one name per round; the a2? lists alternate between two orders */
+    Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc11 );
+    Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc10 );
+    Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc9 );
+    Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc8 );
+    Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc7 );
+    Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc6 );
+    Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc5 );
+    Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc4 );
+    Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc3 );
+    Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc2 );
+    Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc1 );
+    Dump2("Permutation\n", a);
+    Vars2State; /* store the working vectors back into states[0..11] */
+}
+/*
+ * XORs the first bitLen bits of input into output, in place.
+ *
+ * output : buffer updated in place.
+ * input  : data XORed into output; only read.
+ * bitLen : length in bits. When it is not a multiple of 8, one more byte
+ *          is XORed and then masked so that only its low (bitLen % 8) bits
+ *          are kept; the remaining bits of that output byte are cleared.
+ *
+ * The data is processed in steps of 128, 32, 8 and 1 byte(s). The vector
+ * steps go through the LOAD512u / STORE512u / LOAD256u / STORE256u macros
+ * (defined elsewhere; the "u" presumably stands for unaligned). The 8-byte
+ * step uses memcpy, so it places no alignment requirement on either buffer.
+ */
+void Xooffftimes8_AddIs(unsigned char *output, const unsigned char *input, size_t bitLen)
+{
+    size_t  byteLen = bitLen / 8;
+    V512    lanes1, lanes2, lanes3, lanes4;
+    V256    lanesA, lanesB;
+    /* 128 bytes per iteration: two 512-bit vectors */
+    while ( byteLen >= 128 ) {
+        lanes1 = LOAD512u(input[ 0]);
+        lanes2 = LOAD512u(input[64]);
+        lanes3 = LOAD512u(output[ 0]);
+        lanes4 = LOAD512u(output[64]);
+        lanes1 = XOR512(lanes1, lanes3);
+        lanes2 = XOR512(lanes2, lanes4);
+        STORE512u(output[ 0], lanes1);
+        STORE512u(output[64], lanes2);
+        input += 128;
+        output += 128;
+        byteLen -= 128;
+    }
+    /* 32 bytes per iteration: one 256-bit vector */
+    while ( byteLen >= 32 ) {
+        lanesA = LOAD256u(input[0]);
+        lanesB = LOAD256u(output[0]);
+        input += 32;
+        lanesA = XOR256(lanesA, lanesB);
+        byteLen -= 32;
+        STORE256u(output[0], lanesA);
+        output += 32;
+    }
+    /* 8 bytes per iteration. The bytes are moved with memcpy rather than by
+       dereferencing a casted uint64_t pointer: the buffers are unsigned char
+       storage with no alignment guarantee, and input is const. */
+    while ( byteLen >= 8 ) {
+        uint64_t    in64, out64;
+        memcpy(&in64, input, sizeof(in64));
+        memcpy(&out64, output, sizeof(out64));
+        out64 ^= in64;
+        memcpy(output, &out64, sizeof(out64));
+        input += 8;
+        output += 8;
+        byteLen -= 8;
+    }
+    /* remaining whole bytes */
+    while ( byteLen-- != 0 ) {
+        *output++ ^= *input++;
+    }
+    /* trailing partial byte: keep only the low (bitLen % 8) bits */
+    bitLen &= 7;
+    if (bitLen != 0)
+    {
+        *output ^= *input;
+        *output &= (1 << bitLen) - 1;
+    }
+}
+size_t Xooffftimes8_CompressFastLoop(unsigned char *k, unsigned char *x, const unsigned char *input, size_t length)
+{   /*
+     * Bulk compression loop. Each iteration builds the twelve lane vectors
+     * a00..a23 (eight 32-bit elements each) from two rolling registers that
+     * were seeded from k, XORs gathered input words into them, applies six
+     * Round() steps and XORs the result into the accumulators x00..x23.
+     * After the loop the accumulators are XOR-folded down to twelve 32-bit
+     * words, which are XORed into x, and the rolling registers are written
+     * back to k.
+     *
+     * k      : 12 32-bit words; read on entry, overwritten on exit.
+     * x      : 12 32-bit words; XOR-updated on exit.
+     *          NOTE(review): accessed with _mm512_maskz_load_epi64 and
+     *          _mm512_mask_store_epi64, which are the aligned forms, so x
+     *          presumably has to be 64-byte aligned -- confirm at callers.
+     * input  : data to absorb; NLANES*4*8 bytes are consumed per iteration.
+     * length : number of bytes available at input. The loop is a do/while,
+     *          so one full iteration runs before length is tested; callers
+     *          presumably guarantee length >= NLANES*4*8 -- confirm.
+     *
+     * Returns the number of input bytes consumed.
+     */
+    DeclareVars;
+    uint32_t    *k32 = (uint32_t*)k;
+    uint32_t    *x32 = (uint32_t*)x;
+    uint32_t    *i32 = (uint32_t*)input; /* drops const; in this function it is only handed to LOAD_GATHER8_32 */
+    size_t      initialLength;
+    V256        r04815926; /* rolling register, seeded from k words 0,4,8,1,5,9,2,6 */
+    V256        r5926a37b; /* rolling register, seeded from k words 5,9,2,6,10,3,7,11 */
+    V256        offsets;
+    V256        x00, x01, x02, x03, x10, x11, x12, x13, x20, x21, x22, x23; /* accumulators, one per lane vector */
+    V512        x512;
+    DUMP32("k32",k32);
+    r04815926 = LOAD_GATHER8_32(LOAD8_32( 0,  4,  8,  1,  5,  9,  2,  6), k32);
+    r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5,  9,  2,  6, 10,  3,  7, 11), k32);
+    offsets = *(V256*)oGatherScatterOffsets; /* table defined elsewhere; presumably the per-element word offsets of the eight input blocks */
+    x00 = _mm256_setzero_si256();
+    x01 = _mm256_setzero_si256();
+    x02 = _mm256_setzero_si256();
+    x03 = _mm256_setzero_si256();
+    x10 = _mm256_setzero_si256();
+    x11 = _mm256_setzero_si256();
+    x12 = _mm256_setzero_si256();
+    x13 = _mm256_setzero_si256();
+    x20 = _mm256_setzero_si256();
+    x21 = _mm256_setzero_si256();
+    x22 = _mm256_setzero_si256();
+    x23 = _mm256_setzero_si256();
+    initialLength = length;
+    do {
+        #define        rCGKDHLEI    r5926a37b
+        /*  rCGKDHLEI is a second name for r5926a37b: once the roll step below
+         *  has run, the same variable holds the newly produced words.
+         *  Note that a10-a12 and a11-a13 are swapped */
+        a00 = r04815926;
+        a02 = r5926a37b;
+        DumpLane("r5926a37b",r5926a37b);
+        a01 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom3_4, r5926a37b);
+        a12 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom1_4, r5926a37b); /* 4815926 A */
+        rCGKDHLEI = XOR3(a00, SHL32(a00, 13), ROL32(a12, 3)); /* roll step (the Dump label below calls it Roll-c); overwrites r5926a37b */
+        a02 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom2_0, rCGKDHLEI);
+        a03 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom3_2, rCGKDHLEI);
+        a13 = _mm256_permutex2var_epi32(a01, *(const V256*)oAllFrom1_5, a02);       /* B */
+        a10 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom1_2, rCGKDHLEI); /* K */
+        a11 = _mm256_permutex2var_epi32(a03, *(const V256*)oAllFrom1_5, rCGKDHLEI); /* L */
+        a20 = _mm256_permutex2var_epi32(a12, *(const V256*)oAllFrom1_0, a03);       /* 3 */
+        a21 = _mm256_permutex2var_epi32(a13, *(const V256*)oAllFrom1_0, rCGKDHLEI); /* C */
+        a22 = _mm256_permutex2var_epi32(a10, *(const V256*)oAllFrom1_3, rCGKDHLEI); /* D */
+        a23 = _mm256_permutex2var_epi32(a11, *(const V256*)oAllFrom1_6, rCGKDHLEI); /* E */
+        r04815926 = a22; /* a22 is the starting a00 of the next iteration */
+        Dump("Roll-c", a);
+        a00 = XOR( a00, LOAD_GATHER8_32(offsets, i32+0)); /* add input word 0 of each gathered block; words 1..11 follow */
+        a01 = XOR( a01, LOAD_GATHER8_32(offsets, i32+1));
+        a02 = XOR( a02, LOAD_GATHER8_32(offsets, i32+2));
+        a03 = XOR( a03, LOAD_GATHER8_32(offsets, i32+3));
+        a12 = XOR( a12, LOAD_GATHER8_32(offsets, i32+4)); /* words 4..7 go to a12,a13,a10,a11: the swapped naming noted above */
+        a13 = XOR( a13, LOAD_GATHER8_32(offsets, i32+5));
+        a10 = XOR( a10, LOAD_GATHER8_32(offsets, i32+6));
+        a11 = XOR( a11, LOAD_GATHER8_32(offsets, i32+7));
+        a20 = XOR( a20, LOAD_GATHER8_32(offsets, i32+8));
+        a21 = XOR( a21, LOAD_GATHER8_32(offsets, i32+9));
+        a22 = XOR( a22, LOAD_GATHER8_32(offsets, i32+10));
+        a23 = XOR( a23, LOAD_GATHER8_32(offsets, i32+11));
+        Dump("Input Xoodoo (after add)", a);
+        Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc6 ); /* same six Round() invocations as Xoodootimes8_PermuteAll_6rounds */
+        Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc5 );
+        Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc4 );
+        Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc3 );
+        Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc2 );
+        Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc1 );
+        Dump("Output Xoodoo", a);
+        x00 = XOR(x00, a00); /* accumulate this iteration's output */
+        x01 = XOR(x01, a01);
+        x02 = XOR(x02, a02);
+        x03 = XOR(x03, a03);
+        x10 = XOR(x10, a10);
+        x11 = XOR(x11, a11);
+        x12 = XOR(x12, a12);
+        x13 = XOR(x13, a13);
+        x20 = XOR(x20, a20);
+        x21 = XOR(x21, a21);
+        x22 = XOR(x22, a22);
+        x23 = XOR(x23, a23);
+        Dump("Accu x", x);
+        i32 += NLANES*8; /* advance by NLANES*8 32-bit words, i.e. the NLANES*4*8 bytes subtracted from length below */
+        length -= NLANES*4*8;
+    }
+    while (length >= (NLANES*4*8));
+    /*    Reduce from 8 lanes to 4
+     *    x10..x13 are folded into x00..x03, x22 and x23 into x20 and x21;
+     *    v1 and v2 are presumably declared by DeclareVars. */
+    v1 = *(V256*)oLow128;
+    v2 = *(V256*)oHigh128;
+    x00 = XOR(_mm256_permutex2var_epi32(x00, v1, x10), _mm256_permutex2var_epi32(x00, v2, x10));
+    x01 = XOR(_mm256_permutex2var_epi32(x01, v1, x11), _mm256_permutex2var_epi32(x01, v2, x11));
+    x02 = XOR(_mm256_permutex2var_epi32(x02, v1, x12), _mm256_permutex2var_epi32(x02, v2, x12));
+    x03 = XOR(_mm256_permutex2var_epi32(x03, v1, x13), _mm256_permutex2var_epi32(x03, v2, x13));
+    x20 = XOR(_mm256_permutex2var_epi32(x20, v1, x22), _mm256_permutex2var_epi32(x20, v2, x22));
+    x21 = XOR(_mm256_permutex2var_epi32(x21, v1, x23), _mm256_permutex2var_epi32(x21, v2, x23));
+    /*    Reduce from 4 lanes to 2
+     *    x02, x03 and x21 are folded into x00, x01 and x20. */
+    v1 = *( V256*)oLow64;
+    v2 = *( V256*)oHigh64;
+    x00 = XOR(_mm256_permutex2var_epi32(x00, v1, x02), _mm256_permutex2var_epi32(x00, v2, x02));
+    x01 = XOR(_mm256_permutex2var_epi32(x01, v1, x03), _mm256_permutex2var_epi32(x01, v2, x03));
+    x20 = XOR(_mm256_permutex2var_epi32(x20, v1, x21), _mm256_permutex2var_epi32(x20, v2, x21));
+    /*    Reduce from 2 lanes to 1
+     *    x01 is folded into x00; x20 is folded with itself. */
+    x00 = XOR(_mm256_permutex2var_epi32(x00, *(V256*)oLow32, x01), _mm256_permutex2var_epi32(x00, *(V256*)oHigh32, x01));
+    x20 = XOR(_mm256_permutex2var_epi32(x20, *(V256*)oLow32_4, x20), _mm256_permutex2var_epi32(x20, *(V256*)oHigh32_4, x20));
+    /*    Combine x00 and x20: x00 becomes the low 256 bits, x20 the high 256 bits */
+    x512 = _mm512_inserti64x4 (_mm512_castsi256_si512(x00), x20, 1);
+    /*  load xAccu, xor and store 12 lanes
+     *  Mask 0x3F selects the six low 64-bit elements, i.e. twelve 32-bit words. */
+    x512 = XOR512(x512, _mm512_maskz_load_epi64(0x3F, x32));
+    _mm512_mask_store_epi64(x32, 0x3F, x512);
+    DUMP32_12("x32",x32);
+    /* Save new k
+     * The first scatter writes k words 0,4,8,1,5,9,2,6. The second one, with
+     * mask 0xF0, writes only elements 4..7, to k words 10,3,7,11; its four
+     * low index entries are masked off. */
+    _mm256_i32scatter_epi32(k32, LOAD8_32( 0,  4,  8,  1,  5,  9,  2,  6), r04815926, 4);
+    _mm256_mask_i32scatter_epi32(k32, 0xF0, LOAD8_32( 0,  0,  0,  0,   10,  3,  7,  11), r5926a37b, 4);
+    DUMP32_12( "k32", k32);
+    return initialLength - length;
+}
+size_t Xooffftimes8_ExpandFastLoop(unsigned char *yAccu, const unsigned char *kRoll, unsigned char *output, size_t length)
+{   /*
+     * Bulk expansion loop. Each iteration builds the twelve lane vectors
+     * a00..a23 (eight 32-bit elements each) from two rolling registers that
+     * were seeded from yAccu, applies six Round() steps, XORs the twelve
+     * words of kRoll into the result and scatters it to output. After the
+     * loop the rolling registers are written back to yAccu.
+     *
+     * yAccu  : 12 32-bit words; read on entry, overwritten on exit.
+     * kRoll  : 12 32-bit words; only read (k32[0] .. k32[11]).
+     * output : receives NLANES*4*8 bytes per iteration.
+     * length : number of bytes requested. The loop is a do/while, so one
+     *          full iteration runs before length is tested; callers
+     *          presumably guarantee length >= NLANES*4*8 -- confirm.
+     *
+     * Returns the number of output bytes produced.
+     */
+    DeclareVars;
+    uint32_t    *k32 = (uint32_t*)kRoll; /* drops const; k32 is only read in this function */
+    uint32_t    *y32 = (uint32_t*)yAccu;
+    uint32_t    *o32 = (uint32_t*)output;
+    size_t      initialLength;
+    V256        r04815926; /* rolling register, seeded from yAccu words 0,4,8,1,5,9,2,6 */
+    V256        r5926a37b; /* rolling register, seeded from yAccu words 5,9,2,6,10,3,7,11 */
+    V256        offsets;
+    r04815926 = LOAD_GATHER8_32(LOAD8_32( 0,  4,  8,  1,  5,  9,  2,  6), y32);
+    r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5,  9,  2,  6, 10,  3,  7, 11), y32);
+    offsets = *(const V256*)oGatherScatterOffsets; /* table defined elsewhere; presumably the per-element word offsets of the eight output blocks */
+    initialLength = length;
+    do {
+        #define        rCGKDHLEI    r5926a37b
+        /*  rCGKDHLEI is a second name for r5926a37b: once the roll step below
+         *  has run, the same variable holds the newly produced words.
+         *  Note that a10-a12 and a11-a13 are swapped */
+        a00 = r04815926;
+        a02 = r5926a37b;
+        DumpLane("r5926a37b",r5926a37b);
+        a01 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom3_4, r5926a37b);
+        a12 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom1_4, r5926a37b); /* 4815926 A */
+        a20 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom2_4, r5926a37b); /* 815926  A3 */
+        rCGKDHLEI = XOR3(ROL32(a00, 5), ROL32(a12, 13), AND(a20, a12)); /* roll step (the Dump label below calls it Roll-e); overwrites r5926a37b */
+        rCGKDHLEI = XOR(rCGKDHLEI, CONST8_32(7)); /* constant 7 is XORed in as part of the roll step */
+        a02 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom2_0, rCGKDHLEI);
+        a03 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom3_2, rCGKDHLEI);
+        a13 = _mm256_permutex2var_epi32(a01, *(const V256*)oAllFrom1_5, a02);       /* B */
+        a10 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom1_2, rCGKDHLEI); /* K */
+        a11 = _mm256_permutex2var_epi32(a03, *(const V256*)oAllFrom1_5, rCGKDHLEI); /* L */
+        a21 = _mm256_permutex2var_epi32(a13, *(const V256*)oAllFrom1_0, rCGKDHLEI); /* C */
+        a22 = _mm256_permutex2var_epi32(a10, *(const V256*)oAllFrom1_3, rCGKDHLEI); /* D */
+        a23 = _mm256_permutex2var_epi32(a11, *(const V256*)oAllFrom1_6, rCGKDHLEI); /* E */
+        r04815926 = a22; /* a22 is the starting a00 of the next iteration */
+        Dump("Roll-e", a);
+        Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc6 ); /* same six Round() invocations as Xoodootimes8_PermuteAll_6rounds */
+        Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc5 );
+        Round(  a10, a11, a12, a13,    a13, a10, a11, a12,    a20, a21, a22, a23,    _rc4 );
+        Round(  a13, a10, a11, a12,    a12, a13, a10, a11,    a22, a23, a20, a21,    _rc3 );
+        Round(  a12, a13, a10, a11,    a11, a12, a13, a10,    a20, a21, a22, a23,    _rc2 );
+        Round(  a11, a12, a13, a10,    a10, a11, a12, a13,    a22, a23, a20, a21,    _rc1 );
+        Dump("Xoodoo(y)", a);
+        a00 = XOR(a00, CONST8_32(k32[0])); /* add kRoll word 0; CONST8_32 presumably replicates it into all eight elements */
+        a01 = XOR(a01, CONST8_32(k32[1]));
+        a02 = XOR(a02, CONST8_32(k32[2]));
+        a03 = XOR(a03, CONST8_32(k32[3]));
+        a10 = XOR(a10, CONST8_32(k32[4]));
+        a11 = XOR(a11, CONST8_32(k32[5]));
+        a12 = XOR(a12, CONST8_32(k32[6]));
+        a13 = XOR(a13, CONST8_32(k32[7]));
+        a20 = XOR(a20, CONST8_32(k32[8]));
+        a21 = XOR(a21, CONST8_32(k32[9]));
+        a22 = XOR(a22, CONST8_32(k32[10]));
+        a23 = XOR(a23, CONST8_32(k32[11]));
+        Dump("Xoodoo(y) + kRoll", a);
+        /*  Extract
+         *  Lane vector number j is scattered to o32+j using the shared offsets. */
+        STORE_SCATTER8_32(offsets, a00, o32+0);
+        STORE_SCATTER8_32(offsets, a01, o32+1);
+        STORE_SCATTER8_32(offsets, a02, o32+2);
+        STORE_SCATTER8_32(offsets, a03, o32+3);
+        STORE_SCATTER8_32(offsets, a10, o32+4);
+        STORE_SCATTER8_32(offsets, a11, o32+5);
+        STORE_SCATTER8_32(offsets, a12, o32+6);
+        STORE_SCATTER8_32(offsets, a13, o32+7);
+        STORE_SCATTER8_32(offsets, a20, o32+8);
+        STORE_SCATTER8_32(offsets, a21, o32+9);
+        STORE_SCATTER8_32(offsets, a22, o32+10);
+        STORE_SCATTER8_32(offsets, a23, o32+11);
+        o32 += NLANES*8; /* advance by NLANES*8 32-bit words, i.e. the NLANES*4*8 bytes subtracted from length below */
+        length -= NLANES*4*8;
+    }
+    while (length >= (NLANES*4*8));
+    /* Save new y
+     * The first scatter writes yAccu words 0,4,8,1,5,9,2,6. The second one,
+     * with mask 0xF0, writes only elements 4..7, to yAccu words 10,3,7,11;
+     * its four low index entries are masked off. */
+    _mm256_i32scatter_epi32(y32, LOAD8_32( 0,  4,  8,  1,  5,  9,  2,  6), r04815926, 4);
+    _mm256_mask_i32scatter_epi32(y32, 0xF0, LOAD8_32( 0,  0,  0,  0,   10,  3,  7,  11), r5926a37b, 4);
+    DUMP32_12( "y32", y32);
+    return initialLength - length;
+}