sleeping_kangaroo12 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +127 -0
- data/ext/Rakefile +73 -0
- data/ext/binding/sleeping_kangaroo12.c +39 -0
- data/ext/config/xkcp.build +17 -0
- data/ext/xkcp/LICENSE +1 -0
- data/ext/xkcp/Makefile +15 -0
- data/ext/xkcp/Makefile.build +200 -0
- data/ext/xkcp/README.markdown +296 -0
- data/ext/xkcp/lib/HighLevel.build +143 -0
- data/ext/xkcp/lib/LowLevel.build +757 -0
- data/ext/xkcp/lib/common/align.h +33 -0
- data/ext/xkcp/lib/common/brg_endian.h +143 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
- data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
- data/ext/xkcp/lib/high/common/Phases.h +25 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
- data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
- data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
- data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
- data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
- data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
- data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
- data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
- data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
- data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
- data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
- data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
- data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
- data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
- data/ext/xkcp/util/KeccakSum/base64.c +86 -0
- data/ext/xkcp/util/KeccakSum/base64.h +12 -0
- data/lib/sleeping_kangaroo12/binding.rb +15 -0
- data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
- data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
- data/lib/sleeping_kangaroo12/build.rb +4 -0
- data/lib/sleeping_kangaroo12/digest.rb +103 -0
- data/lib/sleeping_kangaroo12/version.rb +5 -0
- data/lib/sleeping_kangaroo12.rb +7 -0
- metadata +372 -0
|
@@ -0,0 +1,1216 @@
|
|
|
1
|
+
/*
|
|
2
|
+
The eXtended Keccak Code Package (XKCP)
|
|
3
|
+
https://github.com/XKCP/XKCP
|
|
4
|
+
|
|
5
|
+
The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
|
|
6
|
+
|
|
7
|
+
Implementation by Ronny Van Keer, hereby denoted as "the implementer".
|
|
8
|
+
|
|
9
|
+
For more information, feedback or questions, please refer to the Keccak Team website:
|
|
10
|
+
https://keccak.team/
|
|
11
|
+
|
|
12
|
+
To the extent possible under law, the implementer has waived all copyright
|
|
13
|
+
and related or neighboring rights to the source code in this file.
|
|
14
|
+
http://creativecommons.org/publicdomain/zero/1.0/
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
#include <stdio.h>
|
|
18
|
+
#include <string.h>
|
|
19
|
+
#include <smmintrin.h>
|
|
20
|
+
#include <wmmintrin.h>
|
|
21
|
+
#include <immintrin.h>
|
|
22
|
+
#include <emmintrin.h>
|
|
23
|
+
#include "align.h"
|
|
24
|
+
#include "brg_endian.h"
|
|
25
|
+
#include "Xoodoo.h"
|
|
26
|
+
#include "Xoodoo-times8-SnP.h"
|
|
27
|
+
|
|
28
|
+
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
|
|
29
|
+
#error Expecting a little-endian platform
|
|
30
|
+
#endif
|
|
31
|
+
|
|
32
|
+
/* #define SIMULATE_AVX512 */
|
|
33
|
+
|
|
34
|
+
#define VERBOSE 0
|
|
35
|
+
|
|
36
|
+
#if defined(SIMULATE_AVX512)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
/* Software stand-in for a 512-bit vector register: sixteen 32-bit lanes, lane 0 first. */
typedef struct { uint32_t x[16]; } __m512i;
|
|
43
|
+
|
|
44
|
+
/*
 * Simulated masked store of 64-bit elements.
 * For each of the eight 64-bit elements of a (built from two adjacent 32-bit
 * lanes, low lane first) whose bit is set in k, the element is written to the
 * matching 64-bit slot at mem_addr. Slots whose mask bit is clear are not
 * touched.
 */
static void _mm512_mask_store_epi64(void *mem_addr, uint8_t k, __m512i a)
{
    uint64_t *dst = (uint64_t *)mem_addr;
    unsigned int lane;

    for ( lane = 0; lane < 8; ++lane ) {
        uint64_t lo;
        uint64_t hi;

        if ((k & (1 << lane)) == 0)
            continue;
        lo = (uint64_t)a.x[2*lane];
        hi = (uint64_t)a.x[2*lane+1];
        dst[lane] = lo | (hi << 32);
    }
}
|
|
54
|
+
|
|
55
|
+
static __m512i _mm512_maskz_load_epi64(uint8_t k, const void *mem_addr)
|
|
56
|
+
{
|
|
57
|
+
__m512i r;
|
|
58
|
+
const uint64_t *p64 = (const uint64_t *)mem_addr;
|
|
59
|
+
unsigned int i;
|
|
60
|
+
|
|
61
|
+
for ( i = 0; i < 8; ++i ) {
|
|
62
|
+
if ((k & (1 << i)) != 0) {
|
|
63
|
+
r.x[2*i] = (uint32_t)p64[i];
|
|
64
|
+
r.x[2*i+1] = (uint32_t)(p64[i] >> 32);
|
|
65
|
+
}
|
|
66
|
+
else {
|
|
67
|
+
r.x[2*i] = 0;
|
|
68
|
+
r.x[2*i+1] = 0;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
return(r);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/* Simulated 512-bit store: writes the sixteen 32-bit lanes of a, lane 0 first, to mem_addr. */
static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
{
    uint32_t *dst = (uint32_t *)mem_addr;
    const uint32_t *src = a.x;
    const uint32_t *const end = a.x + 16;

    while ( src != end )
        *dst++ = *src++;
}

/* The simulation makes no distinction between the aligned and unaligned store. */
#define _mm512_store_si512  _mm512_storeu_si512
|
|
84
|
+
|
|
85
|
+
typedef union
|
|
86
|
+
{
|
|
87
|
+
uint32_t x[8];
|
|
88
|
+
__m256i s;
|
|
89
|
+
} s__m256i;
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
/* Simulated 256-bit store: writes the eight 32-bit lanes of aa, lane 0 first, to mem_addr. */
static void _mm256_storeu_si256(__m256i * mem_addr, __m256i aa)
{
    s__m256i view;
    uint32_t *dst = (uint32_t *)mem_addr;
    unsigned int lane = 0;

    view.s = aa;
    while ( lane < 8 ) {
        dst[lane] = view.x[lane];
        ++lane;
    }
}

/* The simulation makes no distinction between the aligned and unaligned store. */
#define _mm256_store_si256  _mm256_storeu_si256
|
|
104
|
+
|
|
105
|
+
static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
|
|
106
|
+
{
|
|
107
|
+
__m512i r;
|
|
108
|
+
const uint32_t *p32 = (const uint32_t *)mem_addr;
|
|
109
|
+
unsigned int i;
|
|
110
|
+
|
|
111
|
+
for ( i = 0; i < 16; ++i )
|
|
112
|
+
r.x[i] = p32[i];
|
|
113
|
+
return(r);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
#define _mm512_load_si512 _mm512_loadu_si512
|
|
117
|
+
|
|
118
|
+
static __m256i _mm256_loadu_si256(const __m256i * mem_addr)
|
|
119
|
+
{
|
|
120
|
+
s__m256i r;
|
|
121
|
+
const uint32_t *p32 = (const uint32_t *)mem_addr;
|
|
122
|
+
unsigned int i;
|
|
123
|
+
|
|
124
|
+
for ( i = 0; i < 8; ++i )
|
|
125
|
+
r.x[i] = p32[i];
|
|
126
|
+
return(r.s);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
#define _mm256_load_si256 _mm256_loadu_si256
|
|
130
|
+
|
|
131
|
+
static __m512i _mm512_setzero_si512(void)
|
|
132
|
+
{
|
|
133
|
+
__m512i r;
|
|
134
|
+
unsigned int i;
|
|
135
|
+
|
|
136
|
+
for ( i = 0; i < 16; ++i )
|
|
137
|
+
r.x[i] = 0;
|
|
138
|
+
return(r);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
static __m256i _mm256_setzero_si256(void)
|
|
142
|
+
{
|
|
143
|
+
s__m256i r;
|
|
144
|
+
unsigned int i;
|
|
145
|
+
|
|
146
|
+
for ( i = 0; i < 8; ++i )
|
|
147
|
+
r.x[i] = 0;
|
|
148
|
+
return(r.s);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/* Lane-wise bitwise XOR of two simulated 512-bit vectors. */
static __m512i _mm512_xor_si512( __m512i a, __m512i b)
{
    __m512i acc = a;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane )
        acc.x[lane] ^= b.x[lane];
    return acc;
}
|
|
160
|
+
|
|
161
|
+
/* Lane-wise bitwise AND of two simulated 512-bit vectors. */
static __m512i _mm512_and_si512( __m512i a, __m512i b)
{
    __m512i acc = a;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane )
        acc.x[lane] &= b.x[lane];
    return acc;
}
|
|
170
|
+
|
|
171
|
+
/*
 * Simulated three-input bitwise function on 512-bit vectors.
 * Only two truth tables are supported:
 *   0x96 -> a ^ b ^ c
 *   0xD2 -> a ^ (~b & c)
 * Any other imm prints a diagnostic on stdout and terminates the process
 * with exit status 1.
 */
static __m512i _mm512_ternarylogic_epi32(__m512i a, __m512i b, __m512i c, int imm)
{
    switch (imm) {
    case 0x96:
        return _mm512_xor_si512( _mm512_xor_si512( a, b ), c );
    case 0xD2: {
        __m512i notb_and_c;
        unsigned int lane;

        for ( lane = 0; lane < 16; ++lane )
            notb_and_c.x[lane] = ~b.x[lane] & c.x[lane];
        return _mm512_xor_si512( a, notb_and_c );
    }
    default:
        break;
    }
    printf( "_mm512_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", imm );
    exit(1);
}
|
|
188
|
+
|
|
189
|
+
/*
 * Simulated three-input bitwise function on 256-bit vectors.
 * Only two truth tables are supported:
 *   0x96 -> a ^ b ^ c
 *   0xD2 -> a ^ (~b & c)
 * Any other imm prints a diagnostic on stdout and terminates the process
 * with exit status 1.
 */
static __m256i _mm256_ternarylogic_epi32(__m256i a, __m256i b, __m256i c, int imm)
{
    switch (imm) {
    case 0x96:
        return _mm256_xor_si256( _mm256_xor_si256( a, b ), c );
    case 0xD2: {
        s__m256i notb_and_c;
        s__m256i vb;
        s__m256i vc;
        unsigned int lane;

        vb.s = b;
        vc.s = c;
        for ( lane = 0; lane < 8; ++lane )
            notb_and_c.x[lane] = ~vb.x[lane] & vc.x[lane];
        return _mm256_xor_si256( a, notb_and_c.s );
    }
    default:
        break;
    }
    printf( "_mm256_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", imm );
    exit(1);
}
|
|
210
|
+
|
|
211
|
+
/*
 * Simulated rotate-left of every 32-bit lane by offset bits.
 *
 * The rotation count is taken modulo 32, as the hardware instruction does.
 * Both shift amounts are kept in the range 0..31 so that a count of 0 (or a
 * multiple of 32) never produces a shift by 32, which is undefined in C.
 */
static __m512i _mm512_rol_epi32(__m512i a, int offset)
{
    const unsigned int left = (unsigned int)offset & 31u;
    const unsigned int right = (32u - left) & 31u;  /* 0 when left == 0 */
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 16; ++i )
        r.x[i] = (a.x[i] << left) | (a.x[i] >> right);
    return(r);
}
|
|
220
|
+
|
|
221
|
+
/*
 * Simulated rotate-left of every 32-bit lane of a 256-bit vector by offset bits.
 *
 * The rotation count is taken modulo 32, as the hardware instruction does.
 * Both shift amounts are kept in the range 0..31 so that a count of 0 (or a
 * multiple of 32) never produces a shift by 32, which is undefined in C.
 */
static __m256i _mm256_rol_epi32(__m256i a, int offset)
{
    const unsigned int left = (unsigned int)offset & 31u;
    const unsigned int right = (32u - left) & 31u;  /* 0 when left == 0 */
    s__m256i r;
    s__m256i aa;
    unsigned int i;

    aa.s = a;
    for ( i = 0; i < 8; ++i )
        r.x[i] = (aa.x[i] << left) | (aa.x[i] >> right);
    return(r.s);
}
|
|
232
|
+
|
|
233
|
+
/*
 * Simulated logical left shift of every 32-bit lane by offset bits.
 *
 * A count above 31 yields zero in every lane, matching the hardware
 * instruction; shifting a 32-bit value by 32 or more is undefined in C, so
 * that case is handled explicitly instead of being passed to <<.
 */
static __m512i _mm512_slli_epi32(__m512i a, int offset)
{
    const unsigned int count = (unsigned int)offset;
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 16; ++i )
        r.x[i] = (count > 31u) ? 0 : (a.x[i] << count);
    return(r);
}
|
|
242
|
+
|
|
243
|
+
static __m512i _mm512_set1_epi32(uint32_t a)
|
|
244
|
+
{
|
|
245
|
+
unsigned int i;
|
|
246
|
+
__m512i r;
|
|
247
|
+
|
|
248
|
+
for ( i = 0; i < 16; ++i )
|
|
249
|
+
r.x[i] = a;
|
|
250
|
+
return(r);
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/*
 * Simulated gather: lane i of the result is the 32-bit word found at byte
 * offset idx.x[i] * scale from p.
 */
static __m512i _mm512_i32gather_epi32(__m512i idx, const void *p, int scale)
{
    const char *base = (const char *)p;
    __m512i out;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane ) {
        const uint32_t *slot = (const uint32_t *)(base + idx.x[lane] * scale);

        out.x[lane] = *slot;
    }
    return out;
}
|
|
261
|
+
|
|
262
|
+
/*
 * Simulated scatter: lane i of value is written as a 32-bit word at byte
 * offset idx.x[i] * scale from p. Lanes are stored in ascending order.
 */
static void _mm512_i32scatter_epi32( void *p, __m512i idx, __m512i value, int scale)
{
    char *base = (char *)p;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane ) {
        uint32_t *slot = (uint32_t *)(base + idx.x[lane] * scale);

        *slot = value.x[lane];
    }
}
|
|
269
|
+
|
|
270
|
+
/*
 * Simulated 256-bit scatter: lane i of value is written as a 32-bit word at
 * byte offset idx[i] * scale from p. Lanes are stored in ascending order.
 */
static void _mm256_i32scatter_epi32( void *p, __m256i idx, __m256i value, int scale)
{
    char *base = (char *)p;
    s__m256i offsets;
    s__m256i words;
    unsigned int lane;

    offsets.s = idx;
    words.s = value;
    for ( lane = 0; lane < 8; ++lane ) {
        uint32_t *slot = (uint32_t *)(base + offsets.x[lane] * scale);

        *slot = words.x[lane];
    }
}
|
|
280
|
+
|
|
281
|
+
/*
 * Simulated masked scatter: for each lane i whose bit is set in k, value.x[i]
 * is written as a 32-bit word at byte offset idx.x[i] * scale from p.
 * Lanes whose mask bit is clear cause no store.
 */
static void _mm512_mask_i32scatter_epi32( void *p, uint16_t k, __m512i idx, __m512i value, int scale)
{
    char *base = (char *)p;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane ) {
        if ((k & (1 << lane)) == 0)
            continue;
        *(uint32_t *)(base + idx.x[lane] * scale) = value.x[lane];
    }
}
|
|
289
|
+
|
|
290
|
+
/*
 * Simulated masked 256-bit scatter: for each lane i (0..7) whose bit is set
 * in k, lane i of value is written as a 32-bit word at byte offset
 * idx[i] * scale from p. Lanes whose mask bit is clear cause no store.
 */
static void _mm256_mask_i32scatter_epi32( void *p, uint16_t k, __m256i idx, __m256i value, int scale)
{
    char *base = (char *)p;
    s__m256i offsets;
    s__m256i words;
    unsigned int lane;

    offsets.s = idx;
    words.s = value;
    for ( lane = 0; lane < 8; ++lane ) {
        if ((k & (1 << lane)) == 0)
            continue;
        *(uint32_t *)(base + offsets.x[lane] * scale) = words.x[lane];
    }
}
|
|
302
|
+
|
|
303
|
+
static __m512i _mm512_setr_epi32( int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8,
|
|
304
|
+
int e7, int e6, int e5, int e4, int e3, int e2, int ee1, int ee0)
|
|
305
|
+
{
|
|
306
|
+
__m512i r;
|
|
307
|
+
|
|
308
|
+
r.x[ 0] = e15;
|
|
309
|
+
r.x[ 1] = e14;
|
|
310
|
+
r.x[ 2] = e13;
|
|
311
|
+
r.x[ 3] = e12;
|
|
312
|
+
r.x[ 4] = e11;
|
|
313
|
+
r.x[ 5] = e10;
|
|
314
|
+
r.x[ 6] = e9;
|
|
315
|
+
r.x[ 7] = e8;
|
|
316
|
+
r.x[ 8] = e7;
|
|
317
|
+
r.x[ 9] = e6;
|
|
318
|
+
r.x[10] = e5;
|
|
319
|
+
r.x[11] = e4;
|
|
320
|
+
r.x[12] = e3;
|
|
321
|
+
r.x[13] = e2;
|
|
322
|
+
r.x[14] = ee1;
|
|
323
|
+
r.x[15] = ee0;
|
|
324
|
+
return(r);
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
static __m256i _mm256_setr_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int ee1, int ee0)
|
|
328
|
+
{
|
|
329
|
+
s__m256i r;
|
|
330
|
+
|
|
331
|
+
r.x[0] = e7;
|
|
332
|
+
r.x[1] = e6;
|
|
333
|
+
r.x[2] = e5;
|
|
334
|
+
r.x[3] = e4;
|
|
335
|
+
r.x[4] = e3;
|
|
336
|
+
r.x[5] = e2;
|
|
337
|
+
r.x[6] = ee1;
|
|
338
|
+
r.x[7] = ee0;
|
|
339
|
+
return(r.s);
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/*
 * Returns a copy of a with one 256-bit half replaced by b: the low half
 * (lanes 0..7) when imm8 is 0, the high half (lanes 8..15) otherwise.
 */
static __m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
{
    const unsigned int first = (imm8 == 0) ? 0 : 8;
    __m512i out = a;
    s__m256i half;
    unsigned int lane;

    half.s = b;
    for ( lane = 0; lane < 8; ++lane )
        out.x[first + lane] = half.x[lane];
    return out;
}
|
|
359
|
+
|
|
360
|
+
/*
 * Simulated two-source permute: for each lane, bit 4 of the index selects the
 * source (b when set, a when clear) and bits 0..3 select the lane within it.
 */
static __m512i _mm512_permutex2var_epi32(__m512i a, __m512i idx, __m512i b)
{
    __m512i out;
    unsigned int lane;

    for ( lane = 0; lane < 16; ++lane ) {
        const uint32_t sel = idx.x[lane];
        const __m512i *src = (sel & 0x10) ? &b : &a;

        out.x[lane] = src->x[sel & 0x0F];
    }
    return out;
}
|
|
368
|
+
|
|
369
|
+
/*
 * Simulated two-source permute on 256-bit vectors: for each lane, bit 3 of
 * the index selects the source (b when set, a when clear) and bits 0..2
 * select the lane within it.
 */
static __m256i _mm256_permutex2var_epi32(__m256i a, __m256i idx, __m256i b)
{
    s__m256i out;
    s__m256i sel;
    s__m256i from_a;
    s__m256i from_b;
    unsigned int lane;

    sel.s = idx;
    from_a.s = a;
    from_b.s = b;
    for ( lane = 0; lane < 8; ++lane ) {
        const uint32_t s = sel.x[lane];
        const s__m256i *src = (s & 8) ? &from_b : &from_a;

        out.x[lane] = src->x[s & 7];
    }
    return out.s;
}
|
|
382
|
+
|
|
383
|
+
/*
 * Fallback definition of _mm512_permutexvar_epi32.
 *
 * Result element i is the element of `a` selected by idx element i.
 * Only the low 4 bits of each index are used, so an index with higher bits
 * set can no longer read outside the 16-element table; this also matches
 * the masking done by the permutex2var fallbacks in this file.
 */
static __m512i _mm512_permutexvar_epi32(__m512i idx, __m512i a)
{
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 16; ++i )
        r.x[i] = a.x[idx.x[i] & 0x0F];
    return(r);
}
|
|
391
|
+
|
|
392
|
+
/*
 * Fallback definition of _mm256_permutexvar_epi32.
 *
 * Result element i is the element of `a` selected by idx element i.
 * Only the low 3 bits of each index are used, so an index with higher bits
 * set can no longer read outside the 8-element table; this also matches
 * the masking done by the permutex2var fallbacks in this file.
 */
static __m256i _mm256_permutexvar_epi32(__m256i idx, __m256i a)
{
    s__m256i r;
    s__m256i iidx, aa;
    unsigned int i;

    iidx.s = idx;
    aa.s = a;
    for ( i = 0; i < 8; ++i )
        r.x[i] = aa.x[iidx.x[i] & 7];
    return(r.s);
}
|
|
404
|
+
|
|
405
|
+
/*
 * Fallback definition of _mm512_castsi256_si512.
 *
 * Widens a 256-bit vector to 512 bits: elements 0..7 are copied from `a`;
 * this implementation leaves elements 8..15 zeroed.
 */
static __m512i _mm512_castsi256_si512(__m256i a)
{
    __m512i wide = _mm512_setzero_si512();
    s__m256i narrow;
    unsigned int lane;

    narrow.s = a;
    for ( lane = 0; lane < 8; ++lane )
        wide.x[lane] = narrow.x[lane];
    return wide;
}
|
|
417
|
+
|
|
418
|
+
#endif
|
|
419
|
+
|
|
420
|
+
typedef __m128i V128;
|
|
421
|
+
typedef __m256i V256;
|
|
422
|
+
typedef __m512i V512;
|
|
423
|
+
|
|
424
|
+
/* Size in bytes of one state lane (a 32-bit word). */
#define SnP_laneLengthInBytes 4
/*
 * Index, in 32-bit words, of lane `lanePosition` of instance `instanceIndex`
 * inside the interleaved state: the 8 instances of a given lane are stored
 * next to each other. Both arguments are parenthesized so that expressions
 * such as `i & 1` or `c ? a : b` can be passed safely.
 */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*8 + (instanceIndex))
|
|
426
|
+
|
|
427
|
+
/* Ternary-logic truth table 0xD2 computes a ^ (~b & c), i.e. the Chi step in one operation. */
#define Chi(a,b,c)                  _mm256_ternarylogic_epi32(a,b,c,0xD2)

/* Broadcast of one 32-bit constant and unaligned 256-bit load. */
#define CONST8_32(a)                _mm256_set1_epi32(a)
#define LOAD256u(a)                 _mm256_loadu_si256((const V256 *)&(a))

/* 512-bit loads: aligned and unaligned. */
#define LOAD512(a)                  _mm512_load_si512((const V512 *)&(a))
#define LOAD512u(a)                 _mm512_loadu_si512((const V512 *)&(a))

/* Gather/scatter of eight 32-bit words; indexes are scaled by 4, i.e. expressed in 32-bit words. */
#define LOAD_GATHER8_32(idx,p)      _mm256_i32gather_epi32((const void*)(p), idx, 4)
#define STORE_SCATTER8_32(idx,a,p)  _mm256_i32scatter_epi32((void*)(p), idx, a, 4)
#define LOAD8_32(a,b,c,d,e,f,g,h)   _mm256_setr_epi32(a,b,c,d,e,f,g,h)


#define SHUFFLE_LANES_RIGHT(idx, a) _mm256_permutexvar_epi32(idx, a)

/* Per-element 32-bit rotate left and shift left. */
#define ROL32(a, o)                 _mm256_rol_epi32(a, o)
#define SHL32(a, o)                 _mm256_slli_epi32(a, o)

#define SET8_32                     _mm256_setr_epi32

/* Stores; the "u" variants are the unaligned forms. */
#define STORE128(a, b)              _mm_store_si128((V128 *)&(a), b)
#define STORE128u(a, b)             _mm_storeu_si128((V128 *)&(a), b)
#define STORE256u(a, b)             _mm256_storeu_si256((V256 *)&(a), b)
#define STORE256(a, b)              _mm256_store_si256((V256 *)&(a), b)
#define STORE512(a, b)              _mm512_store_si512((V512 *)&(a), b)
#define STORE512u(a, b)             _mm512_storeu_si512((V512 *)&(a), b)

/* Bitwise operations; truth table 0x96 is the three-input XOR. */
#define AND(a, b)                   _mm256_and_si256(a, b)
#define XOR128(a, b)                _mm_xor_si128(a, b)
#define XOR256(a, b)                _mm256_xor_si256(a, b)
#define XOR512(a, b)                _mm512_xor_si512(a, b)
#define XOR(a, b)                   XOR256(a, b)
#define XOR3(a,b,c)                 _mm256_ternarylogic_epi32(a,b,c,0x96)


/*
 * Provide _mm256_storeu2_m128i when no macro of that name exists: store the
 * low 128 bits of `a` at `lo` and the high 128 bits at `hi` (both unaligned).
 * The expansion is a single parenthesized expression so it behaves like a
 * function call in any expression context. Note that `a` is evaluated twice.
 */
#ifndef _mm256_storeu2_m128i
#define _mm256_storeu2_m128i(hi, lo, a) \
    ( _mm_storeu_si128((V128*)(lo), _mm256_castsi256_si128(a)), \
      _mm_storeu_si128((V128*)(hi), _mm256_extracti128_si256((a), 1)) )
#endif
|
|
465
|
+
|
|
466
|
+
/*
 * Debug tracing helpers, compiled in according to VERBOSE:
 *   VERBOSE > 0   Dump, DUMP32, DUMP32_12 and DumpLane print to stdout;
 *   VERBOSE >= n  additionally enables Dump<n> (n = 1, 2, 3).
 * When disabled, every macro expands to nothing.
 *
 * The scratch buffers are plain uint32_t arrays with no 32-byte alignment
 * guarantee, so vectors are written to them with the unaligned store.
 */
#if (VERBOSE > 0)
/* Store vector <__v><__i> (e.g. a00) into __b and print its eight 32-bit words, prefixed by __i. */
#define DumpOne(__b,__v,__i)    STORE256u(__b, __v##__i); \
                                printf("%02u %08x %08x %08x %08x %08x %08x %08x %08x\n", __i, (__b)[0], (__b)[1], (__b)[2], (__b)[3], (__b)[4], (__b)[5], (__b)[6], (__b)[7])

/* Print title __t followed by the twelve vectors <__v>00..<__v>03, <__v>10..<__v>13, <__v>20..<__v>23. */
#define Dump(__t,__v)   { \
                            uint32_t    buf[8]; \
                            printf("%s\n", __t); \
                            DumpOne(buf, __v, 00); \
                            DumpOne(buf, __v, 01); \
                            DumpOne(buf, __v, 02); \
                            DumpOne(buf, __v, 03); \
                            DumpOne(buf, __v, 10); \
                            DumpOne(buf, __v, 11); \
                            DumpOne(buf, __v, 12); \
                            DumpOne(buf, __v, 13); \
                            DumpOne(buf, __v, 20); \
                            DumpOne(buf, __v, 21); \
                            DumpOne(buf, __v, 22); \
                            DumpOne(buf, __v, 23); \
                        }
#else
#define Dump(__t,__v)
#endif

#if (VERBOSE >= 1)
#define Dump1(__t,__v)  Dump(__t,__v)
#else
#define Dump1(__t,__v)
#endif

#if (VERBOSE >= 2)
#define Dump2(__t,__v)  Dump(__t,__v)
#else
#define Dump2(__t,__v)
#endif

#if (VERBOSE >= 3)
#define Dump3(__t,__v)  Dump(__t,__v)
#else
#define Dump3(__t,__v)
#endif

#if (VERBOSE > 0)
/* Print the first 8 words of a 32-bit word array. */
#define DUMP32(tt, buf)     printf("%s %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])

/* Print the first 12 words of a 32-bit word array. */
#define DUMP32_12(tt, buf)  printf("%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11])

/* Print the eight 32-bit words of a single 256-bit vector. */
#define DumpLane(__t,__v)   { uint32_t  buf[8]; \
                              STORE256u(buf[0], __v); \
                              printf("%s %08x %08x %08x %08x %08x %08x %08x %08x\n", __t, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); }

#else
#define DUMP32(__t, buf)
#define DUMP32_12(__t, buf)
#define DumpLane(__t,__v)
#endif
|
|
522
|
+
|
|
523
|
+
/*
 * Index vectors for the two-table permute (_mm256_permutex2var_epi32).
 * A value 0..7 selects that element of the first table; a value written
 * as 8+n selects element n of the second table.
 */

/* Elements 1..7 of the first table followed by one element of the second. */
ALIGN(32) static const uint32_t oAllFrom1_0[8] = { 1, 2, 3, 4, 5, 6, 7, 8+0 };
ALIGN(32) static const uint32_t oAllFrom1_2[8] = { 1, 2, 3, 4, 5, 6, 7, 8+2 };
ALIGN(32) static const uint32_t oAllFrom1_3[8] = { 1, 2, 3, 4, 5, 6, 7, 8+3 };
ALIGN(32) static const uint32_t oAllFrom1_4[8] = { 1, 2, 3, 4, 5, 6, 7, 8+4 };
ALIGN(32) static const uint32_t oAllFrom1_5[8] = { 1, 2, 3, 4, 5, 6, 7, 8+5 };
ALIGN(32) static const uint32_t oAllFrom1_6[8] = { 1, 2, 3, 4, 5, 6, 7, 8+6 };

/* Elements 2..7 (resp. 3..7) of the first table followed by 2 (resp. 3) consecutive elements of the second. */
ALIGN(32) static const uint32_t oAllFrom2_0[8] = { 2, 3, 4, 5, 6, 7, 8+0, 8+1 };
ALIGN(32) static const uint32_t oAllFrom2_4[8] = { 2, 3, 4, 5, 6, 7, 8+4, 8+5 };
ALIGN(32) static const uint32_t oAllFrom3_2[8] = { 3, 4, 5, 6, 7, 8+2, 8+3, 8+4 };
ALIGN(32) static const uint32_t oAllFrom3_4[8] = { 3, 4, 5, 6, 7, 8+4, 8+5, 8+6 };

/* Low / high group of four elements of both tables. */
ALIGN(32) static const uint32_t oLow128[8]  = { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 };
ALIGN(32) static const uint32_t oHigh128[8] = { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 };

/* Even / odd pairs of elements of both tables, interleaved. */
ALIGN(32) static const uint32_t oLow64[8]  = { 0, 1, 8+0, 8+1, 4, 5, 8+4, 8+5 };
ALIGN(32) static const uint32_t oHigh64[8] = { 2, 3, 8+2, 8+3, 6, 7, 8+6, 8+7 };

/* Even / odd single elements of both tables, interleaved. */
ALIGN(32) static const uint32_t oLow32[8]  = { 0, 8+0, 2, 8+2, 4, 8+4, 6, 8+6 };
ALIGN(32) static const uint32_t oHigh32[8] = { 1, 8+1, 3, 8+3, 5, 8+5, 7, 8+7 };

/* Even / odd elements of the first table only, gathered into the low four positions. */
ALIGN(32) static const uint32_t oLow32_4[8]  = { 0, 2, 4, 6, 0, 2, 4, 6 };
ALIGN(32) static const uint32_t oHigh32_4[8] = { 1, 3, 5, 7, 0, 2, 4, 6 };


/* Gather/scatter word offsets of eight consecutive 12-word blocks. */
ALIGN(32) static const uint32_t oGatherScatterOffsets[8] = { 0*12, 1*12, 2*12, 3*12, 4*12, 5*12, 6*12, 7*12 };
ALIGN(64) static const uint32_t oGatherScatterOffsetsRoll[16] = { 0, 0, 0, 0, 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11 }; /* First 4 are dummies */
|
|
550
|
+
|
|
551
|
+
/*
 * Reset all interleaved states to zero.
 *
 * states  Buffer holding the states; Xoodootimes8_statesSizeInBytes bytes
 *         starting at this address are set to 0.
 */
void Xoodootimes8_InitializeAll(void *states)
|
|
552
|
+
{
|
|
553
|
+
memset(states, 0, Xoodootimes8_statesSizeInBytes);
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
void Xoodootimes8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
|
|
557
|
+
{
|
|
558
|
+
unsigned int sizeLeft = length;
|
|
559
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
560
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
561
|
+
const unsigned char *curData = data;
|
|
562
|
+
uint32_t *statesAsLanes = (uint32_t *)states;
|
|
563
|
+
|
|
564
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
565
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
566
|
+
uint32_t lane = 0;
|
|
567
|
+
if (bytesInLane > sizeLeft)
|
|
568
|
+
bytesInLane = sizeLeft;
|
|
569
|
+
memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
|
|
570
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
571
|
+
sizeLeft -= bytesInLane;
|
|
572
|
+
lanePosition++;
|
|
573
|
+
curData += bytesInLane;
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
577
|
+
uint32_t lane = *((const uint32_t*)curData);
|
|
578
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
579
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
580
|
+
lanePosition++;
|
|
581
|
+
curData += SnP_laneLengthInBytes;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
if (sizeLeft > 0) {
|
|
585
|
+
uint32_t lane = 0;
|
|
586
|
+
memcpy(&lane, curData, sizeLeft);
|
|
587
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
void Xoodootimes8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
592
|
+
{
|
|
593
|
+
V256 *stateAsLanes = (V256 *)states;
|
|
594
|
+
unsigned int i;
|
|
595
|
+
const uint32_t *data32 = (const uint32_t *)data;
|
|
596
|
+
V256 offsets = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
|
|
597
|
+
|
|
598
|
+
#define Xor_In( argIndex ) stateAsLanes[argIndex] = XOR(stateAsLanes[argIndex], LOAD_GATHER8_32(offsets, &data32[argIndex]))
|
|
599
|
+
|
|
600
|
+
if ( laneCount == 12 ) {
|
|
601
|
+
Xor_In( 0 );
|
|
602
|
+
Xor_In( 1 );
|
|
603
|
+
Xor_In( 2 );
|
|
604
|
+
Xor_In( 3 );
|
|
605
|
+
Xor_In( 4 );
|
|
606
|
+
Xor_In( 5 );
|
|
607
|
+
Xor_In( 6 );
|
|
608
|
+
Xor_In( 7 );
|
|
609
|
+
Xor_In( 8 );
|
|
610
|
+
Xor_In( 9 );
|
|
611
|
+
Xor_In( 10 );
|
|
612
|
+
Xor_In( 11 );
|
|
613
|
+
}
|
|
614
|
+
else {
|
|
615
|
+
for(i=0; i<laneCount; i++)
|
|
616
|
+
Xor_In( i );
|
|
617
|
+
}
|
|
618
|
+
#undef Xor_In
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
void Xoodootimes8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
|
|
622
|
+
{
|
|
623
|
+
unsigned int sizeLeft = length;
|
|
624
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
625
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
626
|
+
const unsigned char *curData = data;
|
|
627
|
+
uint32_t *statesAsLanes = (uint32_t *)states;
|
|
628
|
+
|
|
629
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
630
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
631
|
+
if (bytesInLane > sizeLeft)
|
|
632
|
+
bytesInLane = sizeLeft;
|
|
633
|
+
memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
|
|
634
|
+
sizeLeft -= bytesInLane;
|
|
635
|
+
lanePosition++;
|
|
636
|
+
curData += bytesInLane;
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
640
|
+
uint32_t lane = *((const uint32_t*)curData);
|
|
641
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
|
|
642
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
643
|
+
lanePosition++;
|
|
644
|
+
curData += SnP_laneLengthInBytes;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
if (sizeLeft > 0) {
|
|
648
|
+
memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
void Xoodootimes8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
653
|
+
{
|
|
654
|
+
V256 *stateAsLanes = (V256 *)states;
|
|
655
|
+
unsigned int i;
|
|
656
|
+
const uint32_t *data32 = (const uint32_t *)data;
|
|
657
|
+
V256 offsets = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
|
|
658
|
+
|
|
659
|
+
#define OverWr( argIndex ) stateAsLanes[argIndex] = LOAD_GATHER8_32(offsets, &data32[argIndex])
|
|
660
|
+
|
|
661
|
+
if ( laneCount == 12 ) {
|
|
662
|
+
OverWr( 0 );
|
|
663
|
+
OverWr( 1 );
|
|
664
|
+
OverWr( 2 );
|
|
665
|
+
OverWr( 3 );
|
|
666
|
+
OverWr( 4 );
|
|
667
|
+
OverWr( 5 );
|
|
668
|
+
OverWr( 6 );
|
|
669
|
+
OverWr( 7 );
|
|
670
|
+
OverWr( 8 );
|
|
671
|
+
OverWr( 9 );
|
|
672
|
+
OverWr( 10 );
|
|
673
|
+
OverWr( 11 );
|
|
674
|
+
}
|
|
675
|
+
else {
|
|
676
|
+
for(i=0; i<laneCount; i++)
|
|
677
|
+
OverWr( i );
|
|
678
|
+
}
|
|
679
|
+
#undef OverWr
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
void Xoodootimes8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
|
|
683
|
+
{
|
|
684
|
+
unsigned int sizeLeft = byteCount;
|
|
685
|
+
unsigned int lanePosition = 0;
|
|
686
|
+
uint32_t *statesAsLanes = (uint32_t *)states;
|
|
687
|
+
|
|
688
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
689
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
|
|
690
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
691
|
+
lanePosition++;
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
if (sizeLeft > 0) {
|
|
695
|
+
memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
void Xoodootimes8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
|
|
700
|
+
{
|
|
701
|
+
unsigned int sizeLeft = length;
|
|
702
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
703
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
704
|
+
unsigned char *curData = data;
|
|
705
|
+
const uint32_t *statesAsLanes = (const uint32_t *)states;
|
|
706
|
+
|
|
707
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
708
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
709
|
+
if (bytesInLane > sizeLeft)
|
|
710
|
+
bytesInLane = sizeLeft;
|
|
711
|
+
memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
|
|
712
|
+
sizeLeft -= bytesInLane;
|
|
713
|
+
lanePosition++;
|
|
714
|
+
curData += bytesInLane;
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
718
|
+
*(uint32_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
719
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
720
|
+
lanePosition++;
|
|
721
|
+
curData += SnP_laneLengthInBytes;
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
if (sizeLeft > 0) {
|
|
725
|
+
memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
void Xoodootimes8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
730
|
+
{
|
|
731
|
+
const V256 *stateAsLanes = (const V256 *)states;
|
|
732
|
+
unsigned int i;
|
|
733
|
+
uint32_t *data32 = (uint32_t *)data;
|
|
734
|
+
V256 offsets = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
|
|
735
|
+
|
|
736
|
+
#define Extr( argIndex ) STORE_SCATTER8_32(offsets, stateAsLanes[argIndex], &data32[argIndex])
|
|
737
|
+
|
|
738
|
+
if ( laneCount == 12 ) {
|
|
739
|
+
Extr( 0 );
|
|
740
|
+
Extr( 1 );
|
|
741
|
+
Extr( 2 );
|
|
742
|
+
Extr( 3 );
|
|
743
|
+
Extr( 4 );
|
|
744
|
+
Extr( 5 );
|
|
745
|
+
Extr( 6 );
|
|
746
|
+
Extr( 7 );
|
|
747
|
+
Extr( 8 );
|
|
748
|
+
Extr( 9 );
|
|
749
|
+
Extr( 10 );
|
|
750
|
+
Extr( 11 );
|
|
751
|
+
}
|
|
752
|
+
else {
|
|
753
|
+
for(i=0; i<laneCount; i++)
|
|
754
|
+
Extr( i );
|
|
755
|
+
}
|
|
756
|
+
#undef Extr
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
void Xoodootimes8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
|
|
760
|
+
{
|
|
761
|
+
unsigned int sizeLeft = length;
|
|
762
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
763
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
764
|
+
const unsigned char *curInput = input;
|
|
765
|
+
unsigned char *curOutput = output;
|
|
766
|
+
const uint32_t *statesAsLanes = (const uint32_t *)states;
|
|
767
|
+
|
|
768
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
769
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
770
|
+
uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
|
|
771
|
+
if (bytesInLane > sizeLeft)
|
|
772
|
+
bytesInLane = sizeLeft;
|
|
773
|
+
sizeLeft -= bytesInLane;
|
|
774
|
+
do {
|
|
775
|
+
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
|
|
776
|
+
lane >>= 8;
|
|
777
|
+
} while ( --bytesInLane != 0);
|
|
778
|
+
lanePosition++;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
782
|
+
*((uint32_t*)curOutput) = *((uint32_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
783
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
784
|
+
lanePosition++;
|
|
785
|
+
curInput += SnP_laneLengthInBytes;
|
|
786
|
+
curOutput += SnP_laneLengthInBytes;
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
if (sizeLeft != 0) {
|
|
790
|
+
uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
791
|
+
do {
|
|
792
|
+
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
|
|
793
|
+
lane >>= 8;
|
|
794
|
+
} while ( --sizeLeft != 0);
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
void Xoodootimes8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
|
|
799
|
+
{
|
|
800
|
+
const V256 *stateAsLanes = (const V256 *)states;
|
|
801
|
+
unsigned int i;
|
|
802
|
+
const uint32_t *datai32 = (const uint32_t *)input;
|
|
803
|
+
uint32_t *datao32 = (uint32_t *)output;
|
|
804
|
+
V256 offsets = SET8_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset, 4*laneOffset, 5*laneOffset, 6*laneOffset, 7*laneOffset);
|
|
805
|
+
|
|
806
|
+
#define ExtrXor( argIndex ) STORE_SCATTER8_32(offsets, XOR( stateAsLanes[argIndex], LOAD_GATHER8_32(offsets, &datai32[argIndex])), &datao32[argIndex])
|
|
807
|
+
|
|
808
|
+
if ( laneCount == 12 ) {
|
|
809
|
+
ExtrXor( 0 );
|
|
810
|
+
ExtrXor( 1 );
|
|
811
|
+
ExtrXor( 2 );
|
|
812
|
+
ExtrXor( 3 );
|
|
813
|
+
ExtrXor( 4 );
|
|
814
|
+
ExtrXor( 5 );
|
|
815
|
+
ExtrXor( 6 );
|
|
816
|
+
ExtrXor( 7 );
|
|
817
|
+
ExtrXor( 8 );
|
|
818
|
+
ExtrXor( 9 );
|
|
819
|
+
ExtrXor( 10 );
|
|
820
|
+
ExtrXor( 11 );
|
|
821
|
+
}
|
|
822
|
+
else {
|
|
823
|
+
for(i=0; i<laneCount; i++) {
|
|
824
|
+
ExtrXor( i );
|
|
825
|
+
}
|
|
826
|
+
}
|
|
827
|
+
#undef ExtrXor
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
/* Working set of the round function: 12 state vectors a<plane><column> plus two scratch vectors. */
#define DeclareVars     V256    a00, a01, a02, a03; \
                        V256    a10, a11, a12, a13; \
                        V256    a20, a21, a22, a23; \
                        V256    v1, v2;

/*
 * The three copy macros below each expand to ONE comma expression, so that
 * `State2Vars;` is a single statement even under an unbraced if/else.
 * They expect a local `V256 *states` and the variables from DeclareVars.
 */

/* Load the state with the plane-1 variables swapped pairwise (a10<->a12, a11<->a13). */
#define State2Vars2     a00 = states[0], a01 = states[1], a02 = states[ 2], a03 = states[ 3], \
                        a12 = states[4], a13 = states[5], a10 = states[ 6], a11 = states[ 7], \
                        a20 = states[8], a21 = states[9], a22 = states[10], a23 = states[11]

/* Load the state into the variables in natural order. */
#define State2Vars      a00 = states[0], a01 = states[1], a02 = states[ 2], a03 = states[ 3], \
                        a10 = states[4], a11 = states[5], a12 = states[ 6], a13 = states[ 7], \
                        a20 = states[8], a21 = states[9], a22 = states[10], a23 = states[11]

/* Store the variables back to the state in natural order. */
#define Vars2State      states[0] = a00, states[1] = a01, states[ 2] = a02, states[ 3] = a03, \
                        states[4] = a10, states[5] = a11, states[ 6] = a12, states[ 7] = a13, \
                        states[8] = a20, states[9] = a21, states[10] = a22, states[11] = a23
|
|
846
|
+
|
|
847
|
+
/*
 * Round(...): one round applied to the eight interleaved instances held in
 * the local vectors a00..a03 (used by name) and in the plane-1 / plane-2
 * vectors passed as arguments; v1 and v2 are scratch.
 *
 *   a1Xi, a2Xi  variables holding planes 1 and 2 on entry (used by Theta;
 *               the a2Xi names are also used by all later steps)
 *   a1Xw        variables used for plane 1 from Chi onwards; callers pass
 *               the same variables as a1Xi in a rotated order (presumably
 *               this realizes a lane shift by renaming instead of moving
 *               data - confirm against the Xoodoo specification)
 *   __rc        round constant, XORed into a00
 *
 * The expansion is a sequence of statements without a trailing semicolon.
 */
#define Round(a10i, a11i, a12i, a13i, a10w, a11w, a12w, a13w, a20i, a21i, a22i, a23i, __rc) \
|
|
848
|
+
\
|
|
849
|
+
/* Theta: Column Parity Mixer */ \
|
|
850
|
+
/* Iota: round constants */ \
|
|
851
|
+
v1 = XOR3( a03, a13i, a23i ); \
|
|
852
|
+
v2 = XOR3( a00, a10i, a20i ); \
|
|
853
|
+
v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) ); \
|
|
854
|
+
a00 = XOR3( a00, v1, CONST8_32(__rc) ); /* Iota */ \
|
|
855
|
+
a10i = XOR( a10i, v1 ); \
|
|
856
|
+
a20i = XOR( a20i, v1 ); \
|
|
857
|
+
v1 = XOR3( a01, a11i, a21i ); \
|
|
858
|
+
v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) ); \
|
|
859
|
+
a01 = XOR( a01, v2 ); \
|
|
860
|
+
a11i = XOR( a11i, v2 ); \
|
|
861
|
+
a21i = XOR( a21i, v2 ); \
|
|
862
|
+
v2 = XOR3( a02, a12i, a22i ); \
|
|
863
|
+
v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) ); \
|
|
864
|
+
a02 = XOR( a02, v1 ); \
|
|
865
|
+
a12i = XOR( a12i, v1 ); \
|
|
866
|
+
a22i = XOR( a22i, v1 ); \
|
|
867
|
+
v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) ); \
|
|
868
|
+
a03 = XOR( a03, v2 ); \
|
|
869
|
+
a13i = XOR( a13i, v2 ); \
|
|
870
|
+
a23i = XOR( a23i, v2 ); \
|
|
871
|
+
Dump3("Theta",a); \
|
|
872
|
+
\
|
|
873
|
+
/* Rho-west: Plane shift */ \
|
|
874
|
+
a20i = ROL32(a20i, 11); \
|
|
875
|
+
a21i = ROL32(a21i, 11); \
|
|
876
|
+
a22i = ROL32(a22i, 11); \
|
|
877
|
+
a23i = ROL32(a23i, 11); \
|
|
878
|
+
Dump3("Rho-west",a); \
|
|
879
|
+
\
|
|
880
|
+
/* Chi: non linear step, on columns */ \
|
|
881
|
+
a00 = Chi(a00, a10w, a20i); \
|
|
882
|
+
a01 = Chi(a01, a11w, a21i); \
|
|
883
|
+
a02 = Chi(a02, a12w, a22i); \
|
|
884
|
+
a03 = Chi(a03, a13w, a23i); \
|
|
885
|
+
a10w = Chi(a10w, a20i, a00); \
|
|
886
|
+
a11w = Chi(a11w, a21i, a01); \
|
|
887
|
+
a12w = Chi(a12w, a22i, a02); \
|
|
888
|
+
a13w = Chi(a13w, a23i, a03); \
|
|
889
|
+
a20i = Chi(a20i, a00, a10w); \
|
|
890
|
+
a21i = Chi(a21i, a01, a11w); \
|
|
891
|
+
a22i = Chi(a22i, a02, a12w); \
|
|
892
|
+
a23i = Chi(a23i, a03, a13w); \
|
|
893
|
+
Dump3("Chi",a); \
|
|
894
|
+
\
|
|
895
|
+
/* Rho-east: Plane shift */ \
|
|
896
|
+
a10w = ROL32(a10w, 1); \
|
|
897
|
+
a11w = ROL32(a11w, 1); \
|
|
898
|
+
a12w = ROL32(a12w, 1); \
|
|
899
|
+
a13w = ROL32(a13w, 1); \
|
|
900
|
+
a20i = ROL32(a20i, 8); \
|
|
901
|
+
a21i = ROL32(a21i, 8); \
|
|
902
|
+
a22i = ROL32(a22i, 8); \
|
|
903
|
+
a23i = ROL32(a23i, 8); \
|
|
904
|
+
Dump3("Rho-east",a)
|
|
905
|
+
|
|
906
|
+
/*
 * Apply six rounds (round constants _rc6 down to _rc1) to all eight
 * interleaved states, in place.
 *
 * argstates  The interleaved states, accessed as twelve 256-bit vectors.
 *
 * The local must be named `states`: the State2Vars2 / Vars2State macros
 * refer to it by name. The state is loaded with State2Vars2 (plane-1
 * variables swapped) and stored back in natural order with Vars2State.
 */
void Xoodootimes8_PermuteAll_6rounds(void *argstates)
|
|
907
|
+
{
|
|
908
|
+
V256 * states = (V256 *)argstates;
|
|
909
|
+
DeclareVars;
|
|
910
|
+
|
|
911
|
+
State2Vars2;
|
|
912
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
|
|
913
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
|
|
914
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
|
|
915
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
|
|
916
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
|
|
917
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
|
|
918
|
+
Dump2("Permutation\n", a);
|
|
919
|
+
Vars2State;
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
/*
 * Apply twelve rounds (round constants _rc12 down to _rc1) to all eight
 * interleaved states, in place.
 *
 * argstates  The interleaved states, accessed as twelve 256-bit vectors.
 *
 * The local must be named `states`: the State2Vars / Vars2State macros
 * refer to it by name. The state is loaded and stored in natural order;
 * the variable names passed to Round repeat with a period of four rounds.
 */
void Xoodootimes8_PermuteAll_12rounds(void *argstates)
|
|
923
|
+
{
|
|
924
|
+
V256 * states = (V256 *)argstates;
|
|
925
|
+
DeclareVars;
|
|
926
|
+
|
|
927
|
+
State2Vars;
|
|
928
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc12 );
|
|
929
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc11 );
|
|
930
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc10 );
|
|
931
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc9 );
|
|
932
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc8 );
|
|
933
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc7 );
|
|
934
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
|
|
935
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
|
|
936
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
|
|
937
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
|
|
938
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
|
|
939
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
|
|
940
|
+
Dump2("Permutation\n", a);
|
|
941
|
+
Vars2State;
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
void Xooffftimes8_AddIs(unsigned char *output, const unsigned char *input, size_t bitLen)
|
|
945
|
+
{
|
|
946
|
+
size_t byteLen = bitLen / 8;
|
|
947
|
+
V512 lanes1, lanes2, lanes3, lanes4;
|
|
948
|
+
V256 lanesA, lanesB;
|
|
949
|
+
|
|
950
|
+
while ( byteLen >= 128 ) {
|
|
951
|
+
lanes1 = LOAD512u(input[ 0]);
|
|
952
|
+
lanes2 = LOAD512u(input[64]);
|
|
953
|
+
lanes3 = LOAD512u(output[ 0]);
|
|
954
|
+
lanes4 = LOAD512u(output[64]);
|
|
955
|
+
lanes1 = XOR512(lanes1, lanes3);
|
|
956
|
+
lanes2 = XOR512(lanes2, lanes4);
|
|
957
|
+
STORE512u(output[ 0], lanes1);
|
|
958
|
+
STORE512u(output[64], lanes2);
|
|
959
|
+
input += 128;
|
|
960
|
+
output += 128;
|
|
961
|
+
byteLen -= 128;
|
|
962
|
+
}
|
|
963
|
+
while ( byteLen >= 32 ) {
|
|
964
|
+
lanesA = LOAD256u(input[0]);
|
|
965
|
+
lanesB = LOAD256u(output[0]);
|
|
966
|
+
input += 32;
|
|
967
|
+
lanesA = XOR256(lanesA, lanesB);
|
|
968
|
+
byteLen -= 32;
|
|
969
|
+
STORE256u(output[0], lanesA);
|
|
970
|
+
output += 32;
|
|
971
|
+
}
|
|
972
|
+
while ( byteLen >= 8 ) {
|
|
973
|
+
*((uint64_t*)output) ^= *((uint64_t*)input);
|
|
974
|
+
input += 8;
|
|
975
|
+
output += 8;
|
|
976
|
+
byteLen -= 8;
|
|
977
|
+
}
|
|
978
|
+
while ( byteLen-- != 0 ) {
|
|
979
|
+
*output++ ^= *input++;
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
bitLen &= 7;
|
|
983
|
+
if (bitLen != 0)
|
|
984
|
+
{
|
|
985
|
+
*output ^= *input;
|
|
986
|
+
*output &= (1 << bitLen) - 1;
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
size_t Xooffftimes8_CompressFastLoop(unsigned char *k, unsigned char *x, const unsigned char *input, size_t length)
|
|
991
|
+
{
|
|
992
|
+
DeclareVars;
|
|
993
|
+
uint32_t *k32 = (uint32_t*)k;
|
|
994
|
+
uint32_t *x32 = (uint32_t*)x;
|
|
995
|
+
uint32_t *i32 = (uint32_t*)input;
|
|
996
|
+
size_t initialLength;
|
|
997
|
+
V256 r04815926;
|
|
998
|
+
V256 r5926a37b;
|
|
999
|
+
V256 offsets;
|
|
1000
|
+
V256 x00, x01, x02, x03, x10, x11, x12, x13, x20, x21, x22, x23;
|
|
1001
|
+
V512 x512;
|
|
1002
|
+
|
|
1003
|
+
DUMP32("k32",k32);
|
|
1004
|
+
r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), k32);
|
|
1005
|
+
r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), k32);
|
|
1006
|
+
|
|
1007
|
+
offsets = *(V256*)oGatherScatterOffsets;
|
|
1008
|
+
|
|
1009
|
+
x00 = _mm256_setzero_si256();
|
|
1010
|
+
x01 = _mm256_setzero_si256();
|
|
1011
|
+
x02 = _mm256_setzero_si256();
|
|
1012
|
+
x03 = _mm256_setzero_si256();
|
|
1013
|
+
x10 = _mm256_setzero_si256();
|
|
1014
|
+
x11 = _mm256_setzero_si256();
|
|
1015
|
+
x12 = _mm256_setzero_si256();
|
|
1016
|
+
x13 = _mm256_setzero_si256();
|
|
1017
|
+
x20 = _mm256_setzero_si256();
|
|
1018
|
+
x21 = _mm256_setzero_si256();
|
|
1019
|
+
x22 = _mm256_setzero_si256();
|
|
1020
|
+
x23 = _mm256_setzero_si256();
|
|
1021
|
+
initialLength = length;
|
|
1022
|
+
do {
|
|
1023
|
+
#define rCGKDHLEI r5926a37b
|
|
1024
|
+
|
|
1025
|
+
/* Note that a10-a12 and a11-a13 are swapped */
|
|
1026
|
+
a00 = r04815926;
|
|
1027
|
+
a02 = r5926a37b;
|
|
1028
|
+
DumpLane("r5926a37b",r5926a37b);
|
|
1029
|
+
a01 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom3_4, r5926a37b);
|
|
1030
|
+
a12 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom1_4, r5926a37b); /* 4815926 A */
|
|
1031
|
+
|
|
1032
|
+
rCGKDHLEI = XOR3(a00, SHL32(a00, 13), ROL32(a12, 3));
|
|
1033
|
+
|
|
1034
|
+
a02 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom2_0, rCGKDHLEI);
|
|
1035
|
+
a03 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom3_2, rCGKDHLEI);
|
|
1036
|
+
|
|
1037
|
+
a13 = _mm256_permutex2var_epi32(a01, *(const V256*)oAllFrom1_5, a02); /* B */
|
|
1038
|
+
a10 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom1_2, rCGKDHLEI); /* K */
|
|
1039
|
+
a11 = _mm256_permutex2var_epi32(a03, *(const V256*)oAllFrom1_5, rCGKDHLEI); /* L */
|
|
1040
|
+
|
|
1041
|
+
a20 = _mm256_permutex2var_epi32(a12, *(const V256*)oAllFrom1_0, a03); /* 3 */
|
|
1042
|
+
a21 = _mm256_permutex2var_epi32(a13, *(const V256*)oAllFrom1_0, rCGKDHLEI); /* C */
|
|
1043
|
+
a22 = _mm256_permutex2var_epi32(a10, *(const V256*)oAllFrom1_3, rCGKDHLEI); /* D */
|
|
1044
|
+
a23 = _mm256_permutex2var_epi32(a11, *(const V256*)oAllFrom1_6, rCGKDHLEI); /* E */
|
|
1045
|
+
r04815926 = a22;
|
|
1046
|
+
Dump("Roll-c", a);
|
|
1047
|
+
|
|
1048
|
+
a00 = XOR( a00, LOAD_GATHER8_32(offsets, i32+0));
|
|
1049
|
+
a01 = XOR( a01, LOAD_GATHER8_32(offsets, i32+1));
|
|
1050
|
+
a02 = XOR( a02, LOAD_GATHER8_32(offsets, i32+2));
|
|
1051
|
+
a03 = XOR( a03, LOAD_GATHER8_32(offsets, i32+3));
|
|
1052
|
+
a12 = XOR( a12, LOAD_GATHER8_32(offsets, i32+4));
|
|
1053
|
+
a13 = XOR( a13, LOAD_GATHER8_32(offsets, i32+5));
|
|
1054
|
+
a10 = XOR( a10, LOAD_GATHER8_32(offsets, i32+6));
|
|
1055
|
+
a11 = XOR( a11, LOAD_GATHER8_32(offsets, i32+7));
|
|
1056
|
+
a20 = XOR( a20, LOAD_GATHER8_32(offsets, i32+8));
|
|
1057
|
+
a21 = XOR( a21, LOAD_GATHER8_32(offsets, i32+9));
|
|
1058
|
+
a22 = XOR( a22, LOAD_GATHER8_32(offsets, i32+10));
|
|
1059
|
+
a23 = XOR( a23, LOAD_GATHER8_32(offsets, i32+11));
|
|
1060
|
+
Dump("Input Xoodoo (after add)", a);
|
|
1061
|
+
|
|
1062
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
|
|
1063
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
|
|
1064
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
|
|
1065
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
|
|
1066
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
|
|
1067
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
|
|
1068
|
+
Dump("Output Xoodoo", a);
|
|
1069
|
+
|
|
1070
|
+
x00 = XOR(x00, a00);
|
|
1071
|
+
x01 = XOR(x01, a01);
|
|
1072
|
+
x02 = XOR(x02, a02);
|
|
1073
|
+
x03 = XOR(x03, a03);
|
|
1074
|
+
x10 = XOR(x10, a10);
|
|
1075
|
+
x11 = XOR(x11, a11);
|
|
1076
|
+
x12 = XOR(x12, a12);
|
|
1077
|
+
x13 = XOR(x13, a13);
|
|
1078
|
+
x20 = XOR(x20, a20);
|
|
1079
|
+
x21 = XOR(x21, a21);
|
|
1080
|
+
x22 = XOR(x22, a22);
|
|
1081
|
+
x23 = XOR(x23, a23);
|
|
1082
|
+
Dump("Accu x", x);
|
|
1083
|
+
|
|
1084
|
+
i32 += NLANES*8;
|
|
1085
|
+
length -= NLANES*4*8;
|
|
1086
|
+
}
|
|
1087
|
+
while (length >= (NLANES*4*8));
|
|
1088
|
+
|
|
1089
|
+
/* Reduce from 8 lanes to 4 */
|
|
1090
|
+
v1 = *(V256*)oLow128;
|
|
1091
|
+
v2 = *(V256*)oHigh128;
|
|
1092
|
+
x00 = XOR(_mm256_permutex2var_epi32(x00, v1, x10), _mm256_permutex2var_epi32(x00, v2, x10));
|
|
1093
|
+
x01 = XOR(_mm256_permutex2var_epi32(x01, v1, x11), _mm256_permutex2var_epi32(x01, v2, x11));
|
|
1094
|
+
x02 = XOR(_mm256_permutex2var_epi32(x02, v1, x12), _mm256_permutex2var_epi32(x02, v2, x12));
|
|
1095
|
+
x03 = XOR(_mm256_permutex2var_epi32(x03, v1, x13), _mm256_permutex2var_epi32(x03, v2, x13));
|
|
1096
|
+
x20 = XOR(_mm256_permutex2var_epi32(x20, v1, x22), _mm256_permutex2var_epi32(x20, v2, x22));
|
|
1097
|
+
x21 = XOR(_mm256_permutex2var_epi32(x21, v1, x23), _mm256_permutex2var_epi32(x21, v2, x23));
|
|
1098
|
+
|
|
1099
|
+
/* Reduce from 4 lanes to 2 */
|
|
1100
|
+
v1 = *( V256*)oLow64;
|
|
1101
|
+
v2 = *( V256*)oHigh64;
|
|
1102
|
+
x00 = XOR(_mm256_permutex2var_epi32(x00, v1, x02), _mm256_permutex2var_epi32(x00, v2, x02));
|
|
1103
|
+
x01 = XOR(_mm256_permutex2var_epi32(x01, v1, x03), _mm256_permutex2var_epi32(x01, v2, x03));
|
|
1104
|
+
x20 = XOR(_mm256_permutex2var_epi32(x20, v1, x21), _mm256_permutex2var_epi32(x20, v2, x21));
|
|
1105
|
+
|
|
1106
|
+
/* Reduce from 2 lanes to 1 */
|
|
1107
|
+
x00 = XOR(_mm256_permutex2var_epi32(x00, *(V256*)oLow32, x01), _mm256_permutex2var_epi32(x00, *(V256*)oHigh32, x01));
|
|
1108
|
+
x20 = XOR(_mm256_permutex2var_epi32(x20, *(V256*)oLow32_4, x20), _mm256_permutex2var_epi32(x20, *(V256*)oHigh32_4, x20));
|
|
1109
|
+
|
|
1110
|
+
/* Combine x00 and x20 */
|
|
1111
|
+
x512 = _mm512_inserti64x4 (_mm512_castsi256_si512(x00), x20, 1);
|
|
1112
|
+
|
|
1113
|
+
/* load xAccu, xor and store 12 lanes */
|
|
1114
|
+
x512 = XOR512(x512, _mm512_maskz_load_epi64(0x3F, x32));
|
|
1115
|
+
_mm512_mask_store_epi64(x32, 0x3F, x512);
|
|
1116
|
+
DUMP32_12("x32",x32);
|
|
1117
|
+
|
|
1118
|
+
/* Save new k */
|
|
1119
|
+
_mm256_i32scatter_epi32(k32, LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), r04815926, 4);
|
|
1120
|
+
_mm256_mask_i32scatter_epi32(k32, 0xF0, LOAD8_32( 0, 0, 0, 0, 10, 3, 7, 11), r5926a37b, 4);
|
|
1121
|
+
DUMP32_12( "k32", k32);
|
|
1122
|
+
|
|
1123
|
+
return initialLength - length;
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
size_t Xooffftimes8_ExpandFastLoop(unsigned char *yAccu, const unsigned char *kRoll, unsigned char *output, size_t length)
|
|
1127
|
+
{
|
|
1128
|
+
DeclareVars;
|
|
1129
|
+
uint32_t *k32 = (uint32_t*)kRoll;
|
|
1130
|
+
uint32_t *y32 = (uint32_t*)yAccu;
|
|
1131
|
+
uint32_t *o32 = (uint32_t*)output;
|
|
1132
|
+
size_t initialLength;
|
|
1133
|
+
V256 r04815926;
|
|
1134
|
+
V256 r5926a37b;
|
|
1135
|
+
V256 offsets;
|
|
1136
|
+
|
|
1137
|
+
r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), y32);
|
|
1138
|
+
r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), y32);
|
|
1139
|
+
offsets = *(const V256*)oGatherScatterOffsets;
|
|
1140
|
+
|
|
1141
|
+
initialLength = length;
|
|
1142
|
+
do {
|
|
1143
|
+
#define rCGKDHLEI r5926a37b
|
|
1144
|
+
|
|
1145
|
+
/* Note that a10-a12 and a11-a13 are swapped */
|
|
1146
|
+
a00 = r04815926;
|
|
1147
|
+
a02 = r5926a37b;
|
|
1148
|
+
DumpLane("r5926a37b",r5926a37b);
|
|
1149
|
+
a01 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom3_4, r5926a37b);
|
|
1150
|
+
a12 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom1_4, r5926a37b); /* 4815926 A */
|
|
1151
|
+
a20 = _mm256_permutex2var_epi32(a00, *(const V256*)oAllFrom2_4, r5926a37b); /* 815926 A3 */
|
|
1152
|
+
|
|
1153
|
+
rCGKDHLEI = XOR3(ROL32(a00, 5), ROL32(a12, 13), AND(a20, a12));
|
|
1154
|
+
rCGKDHLEI = XOR(rCGKDHLEI, CONST8_32(7));
|
|
1155
|
+
|
|
1156
|
+
a02 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom2_0, rCGKDHLEI);
|
|
1157
|
+
a03 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom3_2, rCGKDHLEI);
|
|
1158
|
+
|
|
1159
|
+
a13 = _mm256_permutex2var_epi32(a01, *(const V256*)oAllFrom1_5, a02); /* B */
|
|
1160
|
+
a10 = _mm256_permutex2var_epi32(a02, *(const V256*)oAllFrom1_2, rCGKDHLEI); /* K */
|
|
1161
|
+
a11 = _mm256_permutex2var_epi32(a03, *(const V256*)oAllFrom1_5, rCGKDHLEI); /* L */
|
|
1162
|
+
|
|
1163
|
+
a21 = _mm256_permutex2var_epi32(a13, *(const V256*)oAllFrom1_0, rCGKDHLEI); /* C */
|
|
1164
|
+
a22 = _mm256_permutex2var_epi32(a10, *(const V256*)oAllFrom1_3, rCGKDHLEI); /* D */
|
|
1165
|
+
a23 = _mm256_permutex2var_epi32(a11, *(const V256*)oAllFrom1_6, rCGKDHLEI); /* E */
|
|
1166
|
+
r04815926 = a22;
|
|
1167
|
+
Dump("Roll-e", a);
|
|
1168
|
+
|
|
1169
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
|
|
1170
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
|
|
1171
|
+
Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
|
|
1172
|
+
Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
|
|
1173
|
+
Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
|
|
1174
|
+
Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
|
|
1175
|
+
Dump("Xoodoo(y)", a);
|
|
1176
|
+
|
|
1177
|
+
a00 = XOR(a00, CONST8_32(k32[0]));
|
|
1178
|
+
a01 = XOR(a01, CONST8_32(k32[1]));
|
|
1179
|
+
a02 = XOR(a02, CONST8_32(k32[2]));
|
|
1180
|
+
a03 = XOR(a03, CONST8_32(k32[3]));
|
|
1181
|
+
a10 = XOR(a10, CONST8_32(k32[4]));
|
|
1182
|
+
a11 = XOR(a11, CONST8_32(k32[5]));
|
|
1183
|
+
a12 = XOR(a12, CONST8_32(k32[6]));
|
|
1184
|
+
a13 = XOR(a13, CONST8_32(k32[7]));
|
|
1185
|
+
a20 = XOR(a20, CONST8_32(k32[8]));
|
|
1186
|
+
a21 = XOR(a21, CONST8_32(k32[9]));
|
|
1187
|
+
a22 = XOR(a22, CONST8_32(k32[10]));
|
|
1188
|
+
a23 = XOR(a23, CONST8_32(k32[11]));
|
|
1189
|
+
Dump("Xoodoo(y) + kRoll", a);
|
|
1190
|
+
|
|
1191
|
+
/* Extract */
|
|
1192
|
+
STORE_SCATTER8_32(offsets, a00, o32+0);
|
|
1193
|
+
STORE_SCATTER8_32(offsets, a01, o32+1);
|
|
1194
|
+
STORE_SCATTER8_32(offsets, a02, o32+2);
|
|
1195
|
+
STORE_SCATTER8_32(offsets, a03, o32+3);
|
|
1196
|
+
STORE_SCATTER8_32(offsets, a10, o32+4);
|
|
1197
|
+
STORE_SCATTER8_32(offsets, a11, o32+5);
|
|
1198
|
+
STORE_SCATTER8_32(offsets, a12, o32+6);
|
|
1199
|
+
STORE_SCATTER8_32(offsets, a13, o32+7);
|
|
1200
|
+
STORE_SCATTER8_32(offsets, a20, o32+8);
|
|
1201
|
+
STORE_SCATTER8_32(offsets, a21, o32+9);
|
|
1202
|
+
STORE_SCATTER8_32(offsets, a22, o32+10);
|
|
1203
|
+
STORE_SCATTER8_32(offsets, a23, o32+11);
|
|
1204
|
+
|
|
1205
|
+
o32 += NLANES*8;
|
|
1206
|
+
length -= NLANES*4*8;
|
|
1207
|
+
}
|
|
1208
|
+
while (length >= (NLANES*4*8));
|
|
1209
|
+
|
|
1210
|
+
/* Save new y */
|
|
1211
|
+
_mm256_i32scatter_epi32(y32, LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), r04815926, 4);
|
|
1212
|
+
_mm256_mask_i32scatter_epi32(y32, 0xF0, LOAD8_32( 0, 0, 0, 0, 10, 3, 7, 11), r5926a37b, 4);
|
|
1213
|
+
DUMP32_12( "y32", y32);
|
|
1214
|
+
|
|
1215
|
+
return initialLength - length;
|
|
1216
|
+
}
|