sleeping_kangaroo12 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +127 -0
- data/ext/Rakefile +73 -0
- data/ext/binding/sleeping_kangaroo12.c +39 -0
- data/ext/config/xkcp.build +17 -0
- data/ext/xkcp/LICENSE +1 -0
- data/ext/xkcp/Makefile +15 -0
- data/ext/xkcp/Makefile.build +200 -0
- data/ext/xkcp/README.markdown +296 -0
- data/ext/xkcp/lib/HighLevel.build +143 -0
- data/ext/xkcp/lib/LowLevel.build +757 -0
- data/ext/xkcp/lib/common/align.h +33 -0
- data/ext/xkcp/lib/common/brg_endian.h +143 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
- data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
- data/ext/xkcp/lib/high/common/Phases.h +25 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
- data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
- data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
- data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
- data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
- data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
- data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
- data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
- data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
- data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
- data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
- data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
- data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
- data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
- data/ext/xkcp/util/KeccakSum/base64.c +86 -0
- data/ext/xkcp/util/KeccakSum/base64.h +12 -0
- data/lib/sleeping_kangaroo12/binding.rb +15 -0
- data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
- data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
- data/lib/sleeping_kangaroo12/build.rb +4 -0
- data/lib/sleeping_kangaroo12/digest.rb +103 -0
- data/lib/sleeping_kangaroo12/version.rb +5 -0
- data/lib/sleeping_kangaroo12.rb +7 -0
- metadata +372 -0
|
@@ -0,0 +1,1615 @@
|
|
|
1
|
+
/*
|
|
2
|
+
The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
|
|
3
|
+
|
|
4
|
+
Implementation by Ronny Van Keer, hereby denoted as "the implementer".
|
|
5
|
+
|
|
6
|
+
For more information, feedback or questions, please refer to the Keccak Team website:
|
|
7
|
+
https://keccak.team/
|
|
8
|
+
|
|
9
|
+
To the extent possible under law, the implementer has waived all copyright
|
|
10
|
+
and related or neighboring rights to the source code in this file.
|
|
11
|
+
http://creativecommons.org/publicdomain/zero/1.0/
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
This file implements Keccak-p[1600]×8 in a PlSnP-compatible way.
|
|
16
|
+
Please refer to PlSnP-documentation.h for more details.
|
|
17
|
+
|
|
18
|
+
This implementation comes with KeccakP-1600-times8-SnP.h in the same folder.
|
|
19
|
+
Please refer to LowLevel.build for the exact list of other files it must be combined with.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
#include <stdio.h>
|
|
23
|
+
#include <stdlib.h>
|
|
24
|
+
#include <string.h>
|
|
25
|
+
#include <stdint.h>
|
|
26
|
+
#include <smmintrin.h>
|
|
27
|
+
#include <wmmintrin.h>
|
|
28
|
+
#include <immintrin.h>
|
|
29
|
+
#include <emmintrin.h>
|
|
30
|
+
#include "align.h"
|
|
31
|
+
#include "KeccakP-1600-times8-SnP.h"
|
|
32
|
+
#include "SIMD512-config.h"
|
|
33
|
+
|
|
34
|
+
#include "brg_endian.h"
|
|
35
|
+
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
|
|
36
|
+
#error Expecting a little-endian platform
|
|
37
|
+
#endif
|
|
38
|
+
|
|
39
|
+
/*
|
|
40
|
+
** Uncomment the define hereunder when compiling for a CPU without AVX-512 SIMD.
|
|
41
|
+
#define SIMULATE_AVX512
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
#define VERBOSE 0
|
|
45
|
+
|
|
46
|
+
#if defined(SIMULATE_AVX512)
|
|
47
|
+
|
|
48
|
+
/* Software stand-in for the 512-bit vector type used when SIMULATE_AVX512 is
 * defined: eight 64-bit lanes, x[0] being lane 0. */
typedef struct {
    uint64_t x[8];
} __m512i;
|
|
52
|
+
|
|
53
|
+
/* Simulated _mm512_and_si512: bitwise AND of a and b, one 64-bit lane at a time. */
static __m512i _mm512_and_si512( __m512i a, __m512i b)
{
    unsigned int lane;

    /* a was passed by value, so the copy can double as the accumulator. */
    for (lane = 0; lane < 8; lane++) {
        a.x[lane] &= b.x[lane];
    }
    return a;
}
|
|
62
|
+
|
|
63
|
+
/* Simulated _mm512_xor_si512: bitwise XOR of a and b, one 64-bit lane at a time. */
static __m512i _mm512_xor_si512( __m512i a, __m512i b)
{
    __m512i result;
    unsigned int lane = 0;

    while (lane < 8) {
        result.x[lane] = b.x[lane] ^ a.x[lane];
        lane++;
    }
    return result;
}
|
|
72
|
+
|
|
73
|
+
/*
 * Simulated _mm512_ternarylogic_epi64: bitwise three-input logic function.
 *
 * For every bit position, the bits of a, b and c form the 3-bit index
 * (a << 2) | (b << 1) | c, and the result bit is bit `index` of imm.
 * Only the low 8 bits of imm are used.
 *
 * All 256 truth tables are supported. The two values this file uses keep
 * their previous meaning: 0x96 gives a ^ b ^ c and 0xD2 gives a ^ (~b & c).
 */
static __m512i _mm512_ternarylogic_epi64(__m512i a, __m512i b, __m512i c, int imm)
{
    const unsigned int table = (unsigned int)imm & 0xFFu;
    __m512i r;
    unsigned int i;
    unsigned int k;

    for ( i = 0; i < 8; ++i ) {
        uint64_t acc = 0;

        /* OR together the minterms selected by the truth table. */
        for ( k = 0; k < 8; ++k ) {
            if ( (table >> k) & 1u ) {
                const uint64_t ta = (k & 4u) ? a.x[i] : ~a.x[i];
                const uint64_t tb = (k & 2u) ? b.x[i] : ~b.x[i];
                const uint64_t tc = (k & 1u) ? c.x[i] : ~c.x[i];

                acc |= ta & tb & tc;
            }
        }
        r.x[i] = acc;
    }
    return(r);
}
|
|
90
|
+
|
|
91
|
+
/*
 * Simulated _mm512_rol_epi64: rotates each 64-bit lane of a left by offset bits.
 *
 * The count is reduced modulo 64, and the right-shift amount is masked as
 * well, so a rotation by 0 (or by a multiple of 64) returns the input
 * unchanged instead of performing an undefined 64-bit shift.
 */
static __m512i _mm512_rol_epi64(__m512i a, int offset)
{
    const unsigned int n = (unsigned int)offset & 63u;
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 8; ++i )
        r.x[i] = (a.x[i] << n) | (a.x[i] >> ((64u - n) & 63u));
    return(r);
}
|
|
100
|
+
|
|
101
|
+
/*
 * Simulated _mm512_srli_epi64: logical right shift of each 64-bit lane of a.
 *
 * A count outside 0..63 yields all-zero lanes rather than an undefined C
 * shift. A negative offset is treated as a large unsigned count.
 */
static __m512i _mm512_srli_epi64(__m512i a, int offset)
{
    const unsigned int count = (unsigned int)offset;
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 8; ++i )
        r.x[i] = (count > 63u) ? 0 : (a.x[i] >> count);
    return(r);
}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
/*
 * Simulated _mm512_broadcast_f64x4: copies the 256 bits of a into both the
 * lower (lanes 0..3) and upper (lanes 4..7) halves of the result.
 * The bits are reinterpreted as integers; no numeric conversion takes place.
 */
static __m512i _mm512_broadcast_f64x4(__m256d a)
{
    __m512i r;
    unsigned int i;
    uint64_t t[4];

    /* t is a plain uint64_t array with no 32-byte alignment guarantee, so the
       unaligned store is required; the cast intrinsic replaces the
       compiler-specific vector cast. */
    _mm256_storeu_si256( (__m256i*)t, _mm256_castpd_si256(a) );
    for ( i = 0; i < 4; ++i )
        r.x[i+4] = r.x[i] = t[i];
    return(r);
}
|
|
123
|
+
|
|
124
|
+
/* Simulated _mm512_set_epi64: builds a vector from eight 64-bit values.
 * Arguments are given from the highest lane down: a lands in lane 7, h in lane 0. */
static __m512i _mm512_set_epi64(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f, uint64_t g, uint64_t h)
{
    const __m512i packed = { { h, g, f, e, d, c, b, a } };

    return packed;
}
|
|
138
|
+
|
|
139
|
+
/*
 * Simulated _mm512_i32gather_epi64: loads eight 64-bit values from
 * p + idx[i] * scale (byte addressing) into lanes 0..7.
 *
 * The eight 32-bit indices are treated as signed and widened to 64 bits
 * before scaling, so the byte offset cannot wrap at 32 bits.
 */
static __m512i _mm512_i32gather_epi64(__m256i idx, const void *p, int scale)
{
    __m512i r;
    unsigned int i;
    int32_t offset[8];

    /* offset[] has no 32-byte alignment guarantee: use the unaligned store. */
    _mm256_storeu_si256( (__m256i*)offset, idx );
    for ( i = 0; i < 8; ++i ) {
        /* memcpy keeps the load valid for any source alignment. */
        memcpy( &r.x[i], (const char*)p + (int64_t)offset[i] * scale, sizeof r.x[i] );
    }
    return(r);
}
|
|
150
|
+
|
|
151
|
+
/*
 * Simulated _mm512_i32scatter_epi64: stores lane i of value at
 * p + idx[i] * scale (byte addressing), for i = 0..7 in increasing order.
 *
 * The eight 32-bit indices are treated as signed and widened to 64 bits
 * before scaling, so the byte offset cannot wrap at 32 bits.
 */
static void _mm512_i32scatter_epi64( void *p, __m256i idx, __m512i value, int scale)
{
    unsigned int i;
    int32_t offset[8];

    /* offset[] has no 32-byte alignment guarantee: use the unaligned store. */
    _mm256_storeu_si256( (__m256i*)offset, idx );
    for ( i = 0; i < 8; ++i ) {
        /* memcpy keeps the store valid for any destination alignment. */
        memcpy( (char*)p + (int64_t)offset[i] * scale, &value.x[i], sizeof value.x[i] );
    }
}
|
|
160
|
+
|
|
161
|
+
/* Simulated _mm512_permutex2var_epi64: for each result lane, bit 3 of the
 * matching idx lane picks the source vector (set: b, clear: a) and the low
 * three bits pick the lane within that source. */
static __m512i _mm512_permutex2var_epi64(__m512i a, __m512i idx, __m512i b)
{
    __m512i out;
    unsigned int lane;

    for (lane = 0; lane < 8; lane++) {
        const uint64_t selector = idx.x[lane];
        const __m512i *source = (selector & 8) ? &b : &a;

        out.x[lane] = source->x[selector & 7];
    }
    return out;
}
|
|
169
|
+
|
|
170
|
+
static __m512i _mm512_maskz_loadu_epi64(uint8_t k, const void *mem_addr)
|
|
171
|
+
{
|
|
172
|
+
__m512i r;
|
|
173
|
+
const uint64_t *p64 = (const uint64_t *)mem_addr;
|
|
174
|
+
unsigned int i;
|
|
175
|
+
|
|
176
|
+
for ( i = 0; i < 8; ++i ) {
|
|
177
|
+
if ((k & (1 << i)) != 0) {
|
|
178
|
+
r.x[i] = p64[i];
|
|
179
|
+
}
|
|
180
|
+
else {
|
|
181
|
+
r.x[i] = 0;
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
return(r);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
#define _mm512_maskz_load_epi64 _mm512_maskz_loadu_epi64
|
|
188
|
+
|
|
189
|
+
/* Simulated _mm512_storeu_si512: writes the eight lanes of a to mem_addr,
 * lane 0 first. */
static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
{
    uint64_t *dst = (uint64_t *)mem_addr;
    const uint64_t *src = a.x;
    const uint64_t *const end = a.x + 8;

    while (src != end) {
        *dst++ = *src++;
    }
}

/* The simulation makes no distinction between aligned and unaligned stores. */
#define _mm512_store_si512 _mm512_storeu_si512
|
|
199
|
+
|
|
200
|
+
static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
|
|
201
|
+
{
|
|
202
|
+
__m512i r;
|
|
203
|
+
const uint64_t *p64 = (const uint64_t *)mem_addr;
|
|
204
|
+
unsigned int i;
|
|
205
|
+
|
|
206
|
+
for ( i = 0; i < 8; ++i )
|
|
207
|
+
r.x[i] = p64[i];
|
|
208
|
+
return(r);
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
#define _mm512_load_si512 _mm512_loadu_si512
|
|
212
|
+
|
|
213
|
+
/* Simulated _mm512_mask_storeu_epi64: writes lane i of a to mem_addr only
 * when bit i of k is set; other destination words are left untouched. */
static void _mm512_mask_storeu_epi64(void *mem_addr, uint8_t k, __m512i a)
{
    uint64_t *dst = (uint64_t *)mem_addr;
    unsigned int lane;

    for (lane = 0; lane < 8; lane++) {
        if (((k >> lane) & 1) == 0) {
            continue;
        }
        dst[lane] = a.x[lane];
    }
}

/* The simulation makes no distinction between aligned and unaligned stores. */
#define _mm512_mask_store_epi64 _mm512_mask_storeu_epi64
|
|
225
|
+
|
|
226
|
+
/* Simulated _mm512_setzero_si512: returns a vector whose eight lanes are all zero. */
static __m512i _mm512_setzero_si512(void)
{
    const __m512i zero = { { 0 } };

    return zero;
}
|
|
235
|
+
|
|
236
|
+
/*
 * Simulated _mm512_extracti64x4_epi64: returns the lower 256 bits of a
 * (lanes 0..3) when imm8 is 0 and the upper 256 bits (lanes 4..7) for any
 * other value of imm8.
 */
static __m256i _mm512_extracti64x4_epi64(__m512i a, int imm8)
{
    const unsigned int first = (imm8 == 0) ? 0 : 4;

    /* a.x is only guaranteed 8-byte alignment, so use the unaligned load
       instead of dereferencing a __m256i pointer into it. */
    return _mm256_loadu_si256( (const __m256i *)&a.x[first] );
}
|
|
245
|
+
|
|
246
|
+
#endif
|
|
247
|
+
|
|
248
|
+
typedef __m128i V128;
|
|
249
|
+
typedef __m256i V256;
|
|
250
|
+
typedef __m512i V512;
|
|
251
|
+
|
|
252
|
+
#if defined(KeccakP1600times8_useAVX512)

/* Lane-wise XOR helpers; XOR3 uses truth table 0x96 (a ^ b ^ c). */
#define XOR(a,b)                    _mm512_xor_si512(a,b)
#define XOR3(a,b,c)                 _mm512_ternarylogic_epi64(a,b,c,0x96)
#define XOR5(a,b,c,d,e)             XOR3(XOR3(a,b,c),d,e)
#define XOReq512(a, b)              a = XOR(a,b)

/* Rotation, and the chi step a ^ (~b & c) expressed as truth table 0xD2. */
#define ROL(a,offset)               _mm512_rol_epi64(a,offset)
#define Chi(a,b,c)                  _mm512_ternarylogic_epi64(a,b,c,0xD2)

/* Broadcasts one 64-bit constant to all eight lanes. */
#define CONST8_64(a)                _mm512_set1_epi64(a)

/* Aligned and unaligned 512-bit loads from the object a. */
#define LOAD512(a)                  _mm512_load_si512((const V512 *)&(a))
#define LOAD512u(a)                 _mm512_loadu_si512((const V512 *)&(a))
/* Eight 32-bit values, highest element first; every argument is narrowed to 32 bits. */
#define LOAD8_32(a,b,c,d,e,f,g,h)   _mm256_set_epi32((uint32_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h))
/* Eight 64-bit values, highest lane first. */
#define LOAD8_64(a,b,c,d,e,f,g,h)   _mm512_set_epi64((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d), (uint64_t)(e), (uint64_t)(f), (uint64_t)(g), (uint64_t)(h))
/* Gathers eight 64-bit words from p; indices count 8-byte units. */
#define LOAD_GATHER8_64(idx,p)      _mm512_i32gather_epi64( idx, (const void*)(p), 8)

/* Scatters the eight lanes of v to p; indices count 8-byte units. */
#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8)

#endif
|
|
273
|
+
|
|
274
|
+
/*
 * Debug tracing helpers, compiled in only when VERBOSE > 0; otherwise they
 * expand to nothing.
 *
 * 64-bit values are printed through %llx with an explicit cast to
 * unsigned long long, which is correct whether uint64_t is defined as
 * unsigned long or as unsigned long long.
 */
#if (VERBOSE > 0)
/* Prints the label __t followed by __n 64-bit words of buf in hexadecimal. */
#define DumpMem(__t, buf, __n)  { \
    uint32_t i; \
    printf("%s ", __t); \
    for (i = 0; i < (__n); ++i) { \
        printf("%016llx ", (unsigned long long)(buf)[i]); \
    } \
    printf("\n"); \
    }

/* Prints the eight lanes of the variable named by pasting __v and __i together. */
#define DumpOne(__v,__i)    { \
    uint64_t buf[8]; \
    _mm512_storeu_si512((V512*)buf, __v##__i); \
    printf("%016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", \
        (unsigned long long)buf[0], (unsigned long long)buf[1], \
        (unsigned long long)buf[2], (unsigned long long)buf[3], \
        (unsigned long long)buf[4], (unsigned long long)buf[5], \
        (unsigned long long)buf[6], (unsigned long long)buf[7]); \
    }

/* Prints the label __t, then all 25 state variables sharing the prefix __v. */
#define Dump(__t,__v)   { \
    printf("%s\n", __t); \
    DumpOne(__v, ba); \
    DumpOne(__v, be); \
    DumpOne(__v, bi); \
    DumpOne(__v, bo); \
    DumpOne(__v, bu); \
    DumpOne(__v, ga); \
    DumpOne(__v, ge); \
    DumpOne(__v, gi); \
    DumpOne(__v, go); \
    DumpOne(__v, gu); \
    DumpOne(__v, ka); \
    DumpOne(__v, ke); \
    DumpOne(__v, ki); \
    DumpOne(__v, ko); \
    DumpOne(__v, ku); \
    DumpOne(__v, ma); \
    DumpOne(__v, me); \
    DumpOne(__v, mi); \
    DumpOne(__v, mo); \
    DumpOne(__v, mu); \
    DumpOne(__v, sa); \
    DumpOne(__v, se); \
    DumpOne(__v, si); \
    DumpOne(__v, so); \
    DumpOne(__v, su); \
    printf("\n"); \
    }

/* Prints the label __t and one state variable; braced so both statements
   stay together when the macro follows an if or a loop header. */
#define DumpReg(__t,__v,__i)    { printf("%s ", __t); DumpOne(__v,__i) }

#else
#define DumpMem(__t, buf,len)
#define DumpOne(__v,__i)
#define Dump(__t,__v)
#define DumpReg(__t,__v,__i)
#endif
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
/* Index, in 64-bit words, of lane `lanePosition` of instance `instanceIndex`:
 * the eight instances are interleaved lane by lane. Both arguments are
 * parenthesized so that expression arguments expand correctly. */
#define laneIndex(instanceIndex, lanePosition)  ((lanePosition)*8 + (instanceIndex))
/* Size of one lane in bytes. */
#define SnP_laneLengthInBytes 8
|
|
334
|
+
|
|
335
|
+
void KeccakP1600times8_InitializeAll(void *states)
|
|
336
|
+
{
|
|
337
|
+
memset(states, 0, KeccakP1600times8_statesSizeInBytes);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
void KeccakP1600times8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
|
|
341
|
+
{
|
|
342
|
+
unsigned int sizeLeft = length;
|
|
343
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
344
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
345
|
+
const unsigned char *curData = data;
|
|
346
|
+
uint64_t *statesAsLanes = states;
|
|
347
|
+
|
|
348
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
349
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
350
|
+
uint64_t lane = 0;
|
|
351
|
+
if (bytesInLane > sizeLeft)
|
|
352
|
+
bytesInLane = sizeLeft;
|
|
353
|
+
memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
|
|
354
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
355
|
+
sizeLeft -= bytesInLane;
|
|
356
|
+
lanePosition++;
|
|
357
|
+
curData += bytesInLane;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
361
|
+
uint64_t lane = *((const uint64_t*)curData);
|
|
362
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
363
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
364
|
+
lanePosition++;
|
|
365
|
+
curData += SnP_laneLengthInBytes;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
if (sizeLeft > 0) {
|
|
369
|
+
uint64_t lane = 0;
|
|
370
|
+
memcpy(&lane, curData, sizeLeft);
|
|
371
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
void KeccakP1600times8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
376
|
+
{
|
|
377
|
+
V512 *stateAsLanes = states;
|
|
378
|
+
const uint64_t *dataAsLanes = (const uint64_t *)data;
|
|
379
|
+
unsigned int i;
|
|
380
|
+
V256 index;
|
|
381
|
+
|
|
382
|
+
#define Add_In( argIndex ) stateAsLanes[argIndex] = XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, dataAsLanes+argIndex))
|
|
383
|
+
index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
|
|
384
|
+
if ( laneCount >= 16 ) {
|
|
385
|
+
Add_In( 0 );
|
|
386
|
+
Add_In( 1 );
|
|
387
|
+
Add_In( 2 );
|
|
388
|
+
Add_In( 3 );
|
|
389
|
+
Add_In( 4 );
|
|
390
|
+
Add_In( 5 );
|
|
391
|
+
Add_In( 6 );
|
|
392
|
+
Add_In( 7 );
|
|
393
|
+
Add_In( 8 );
|
|
394
|
+
Add_In( 9 );
|
|
395
|
+
Add_In( 10 );
|
|
396
|
+
Add_In( 11 );
|
|
397
|
+
Add_In( 12 );
|
|
398
|
+
Add_In( 13 );
|
|
399
|
+
Add_In( 14 );
|
|
400
|
+
Add_In( 15 );
|
|
401
|
+
if ( laneCount >= 20 ) {
|
|
402
|
+
Add_In( 16 );
|
|
403
|
+
Add_In( 17 );
|
|
404
|
+
Add_In( 18 );
|
|
405
|
+
Add_In( 19 );
|
|
406
|
+
for(i=20; i<laneCount; i++)
|
|
407
|
+
Add_In( i );
|
|
408
|
+
}
|
|
409
|
+
else {
|
|
410
|
+
for(i=16; i<laneCount; i++)
|
|
411
|
+
Add_In( i );
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
else {
|
|
415
|
+
for(i=0; i<laneCount; i++)
|
|
416
|
+
Add_In( i );
|
|
417
|
+
}
|
|
418
|
+
#undef Add_In
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
void KeccakP1600times8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
|
|
422
|
+
{
|
|
423
|
+
unsigned int sizeLeft = length;
|
|
424
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
425
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
426
|
+
const unsigned char *curData = data;
|
|
427
|
+
uint64_t *statesAsLanes = states;
|
|
428
|
+
|
|
429
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
430
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
431
|
+
if (bytesInLane > sizeLeft)
|
|
432
|
+
bytesInLane = sizeLeft;
|
|
433
|
+
memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
|
|
434
|
+
sizeLeft -= bytesInLane;
|
|
435
|
+
lanePosition++;
|
|
436
|
+
curData += bytesInLane;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
440
|
+
uint64_t lane = *((const uint64_t*)curData);
|
|
441
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
|
|
442
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
443
|
+
lanePosition++;
|
|
444
|
+
curData += SnP_laneLengthInBytes;
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
if (sizeLeft > 0) {
|
|
448
|
+
memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
void KeccakP1600times8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
453
|
+
{
|
|
454
|
+
V512 *stateAsLanes = states;
|
|
455
|
+
const uint64_t *dataAsLanes = (const uint64_t *)data;
|
|
456
|
+
unsigned int i;
|
|
457
|
+
V256 index;
|
|
458
|
+
|
|
459
|
+
#define OverWr( argIndex ) stateAsLanes[argIndex] = LOAD_GATHER8_64(index, dataAsLanes+argIndex)
|
|
460
|
+
index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
|
|
461
|
+
if ( laneCount >= 16 ) {
|
|
462
|
+
OverWr( 0 );
|
|
463
|
+
OverWr( 1 );
|
|
464
|
+
OverWr( 2 );
|
|
465
|
+
OverWr( 3 );
|
|
466
|
+
OverWr( 4 );
|
|
467
|
+
OverWr( 5 );
|
|
468
|
+
OverWr( 6 );
|
|
469
|
+
OverWr( 7 );
|
|
470
|
+
OverWr( 8 );
|
|
471
|
+
OverWr( 9 );
|
|
472
|
+
OverWr( 10 );
|
|
473
|
+
OverWr( 11 );
|
|
474
|
+
OverWr( 12 );
|
|
475
|
+
OverWr( 13 );
|
|
476
|
+
OverWr( 14 );
|
|
477
|
+
OverWr( 15 );
|
|
478
|
+
if ( laneCount >= 20 ) {
|
|
479
|
+
OverWr( 16 );
|
|
480
|
+
OverWr( 17 );
|
|
481
|
+
OverWr( 18 );
|
|
482
|
+
OverWr( 19 );
|
|
483
|
+
for(i=20; i<laneCount; i++)
|
|
484
|
+
OverWr( i );
|
|
485
|
+
}
|
|
486
|
+
else {
|
|
487
|
+
for(i=16; i<laneCount; i++)
|
|
488
|
+
OverWr( i );
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
else {
|
|
492
|
+
for(i=0; i<laneCount; i++)
|
|
493
|
+
OverWr( i );
|
|
494
|
+
}
|
|
495
|
+
#undef OverWr
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
void KeccakP1600times8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
|
|
499
|
+
{
|
|
500
|
+
unsigned int sizeLeft = byteCount;
|
|
501
|
+
unsigned int lanePosition = 0;
|
|
502
|
+
uint64_t *statesAsLanes = states;
|
|
503
|
+
|
|
504
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
505
|
+
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
|
|
506
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
507
|
+
lanePosition++;
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
if (sizeLeft > 0) {
|
|
511
|
+
memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
|
|
512
|
+
}
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
void KeccakP1600times8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
|
|
516
|
+
{
|
|
517
|
+
unsigned int sizeLeft = length;
|
|
518
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
519
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
520
|
+
unsigned char *curData = data;
|
|
521
|
+
const uint64_t *statesAsLanes = states;
|
|
522
|
+
|
|
523
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
524
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
525
|
+
if (bytesInLane > sizeLeft)
|
|
526
|
+
bytesInLane = sizeLeft;
|
|
527
|
+
memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
|
|
528
|
+
sizeLeft -= bytesInLane;
|
|
529
|
+
lanePosition++;
|
|
530
|
+
curData += bytesInLane;
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
534
|
+
*(uint64_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
535
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
536
|
+
lanePosition++;
|
|
537
|
+
curData += SnP_laneLengthInBytes;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
if (sizeLeft > 0) {
|
|
541
|
+
memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
void KeccakP1600times8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
|
|
546
|
+
{
|
|
547
|
+
const V512 *stateAsLanes = states;
|
|
548
|
+
uint64_t *dataAsLanes = (uint64_t *)data;
|
|
549
|
+
unsigned int i;
|
|
550
|
+
V256 index;
|
|
551
|
+
|
|
552
|
+
#define Extr( argIndex ) STORE_SCATTER8_64(dataAsLanes+argIndex, index, stateAsLanes[argIndex])
|
|
553
|
+
index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
|
|
554
|
+
if ( laneCount >= 16 ) {
|
|
555
|
+
Extr( 0 );
|
|
556
|
+
Extr( 1 );
|
|
557
|
+
Extr( 2 );
|
|
558
|
+
Extr( 3 );
|
|
559
|
+
Extr( 4 );
|
|
560
|
+
Extr( 5 );
|
|
561
|
+
Extr( 6 );
|
|
562
|
+
Extr( 7 );
|
|
563
|
+
Extr( 8 );
|
|
564
|
+
Extr( 9 );
|
|
565
|
+
Extr( 10 );
|
|
566
|
+
Extr( 11 );
|
|
567
|
+
Extr( 12 );
|
|
568
|
+
Extr( 13 );
|
|
569
|
+
Extr( 14 );
|
|
570
|
+
Extr( 15 );
|
|
571
|
+
if ( laneCount >= 20 ) {
|
|
572
|
+
Extr( 16 );
|
|
573
|
+
Extr( 17 );
|
|
574
|
+
Extr( 18 );
|
|
575
|
+
Extr( 19 );
|
|
576
|
+
for(i=20; i<laneCount; i++)
|
|
577
|
+
Extr( i );
|
|
578
|
+
}
|
|
579
|
+
else {
|
|
580
|
+
for(i=16; i<laneCount; i++)
|
|
581
|
+
Extr( i );
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
else {
|
|
585
|
+
for(i=0; i<laneCount; i++)
|
|
586
|
+
Extr( i );
|
|
587
|
+
}
|
|
588
|
+
#undef Extr
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
void KeccakP1600times8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
|
|
592
|
+
{
|
|
593
|
+
unsigned int sizeLeft = length;
|
|
594
|
+
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
|
|
595
|
+
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
|
|
596
|
+
const unsigned char *curInput = input;
|
|
597
|
+
unsigned char *curOutput = output;
|
|
598
|
+
const uint64_t *statesAsLanes = states;
|
|
599
|
+
|
|
600
|
+
if ((sizeLeft > 0) && (offsetInLane != 0)) {
|
|
601
|
+
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
|
|
602
|
+
uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
|
|
603
|
+
if (bytesInLane > sizeLeft)
|
|
604
|
+
bytesInLane = sizeLeft;
|
|
605
|
+
sizeLeft -= bytesInLane;
|
|
606
|
+
do {
|
|
607
|
+
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
|
|
608
|
+
lane >>= 8;
|
|
609
|
+
} while ( --bytesInLane != 0);
|
|
610
|
+
lanePosition++;
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
while(sizeLeft >= SnP_laneLengthInBytes) {
|
|
614
|
+
*((uint64_t*)curOutput) = *((uint64_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
615
|
+
sizeLeft -= SnP_laneLengthInBytes;
|
|
616
|
+
lanePosition++;
|
|
617
|
+
curInput += SnP_laneLengthInBytes;
|
|
618
|
+
curOutput += SnP_laneLengthInBytes;
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
if (sizeLeft != 0) {
|
|
622
|
+
uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
|
|
623
|
+
do {
|
|
624
|
+
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
|
|
625
|
+
lane >>= 8;
|
|
626
|
+
} while ( --sizeLeft != 0);
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
void KeccakP1600times8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
|
|
631
|
+
{
|
|
632
|
+
const V512 *stateAsLanes = states;
|
|
633
|
+
const uint64_t *inAsLanes = (const uint64_t *)input;
|
|
634
|
+
uint64_t *outAsLanes = (uint64_t *)output;
|
|
635
|
+
unsigned int i;
|
|
636
|
+
V256 index;
|
|
637
|
+
|
|
638
|
+
#define ExtrAdd( argIndex ) STORE_SCATTER8_64(outAsLanes+argIndex, index, XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, inAsLanes+argIndex)))
|
|
639
|
+
index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
|
|
640
|
+
if ( laneCount >= 16 ) {
|
|
641
|
+
ExtrAdd( 0 );
|
|
642
|
+
ExtrAdd( 1 );
|
|
643
|
+
ExtrAdd( 2 );
|
|
644
|
+
ExtrAdd( 3 );
|
|
645
|
+
ExtrAdd( 4 );
|
|
646
|
+
ExtrAdd( 5 );
|
|
647
|
+
ExtrAdd( 6 );
|
|
648
|
+
ExtrAdd( 7 );
|
|
649
|
+
ExtrAdd( 8 );
|
|
650
|
+
ExtrAdd( 9 );
|
|
651
|
+
ExtrAdd( 10 );
|
|
652
|
+
ExtrAdd( 11 );
|
|
653
|
+
ExtrAdd( 12 );
|
|
654
|
+
ExtrAdd( 13 );
|
|
655
|
+
ExtrAdd( 14 );
|
|
656
|
+
ExtrAdd( 15 );
|
|
657
|
+
if ( laneCount >= 20 ) {
|
|
658
|
+
ExtrAdd( 16 );
|
|
659
|
+
ExtrAdd( 17 );
|
|
660
|
+
ExtrAdd( 18 );
|
|
661
|
+
ExtrAdd( 19 );
|
|
662
|
+
for(i=20; i<laneCount; i++)
|
|
663
|
+
ExtrAdd( i );
|
|
664
|
+
}
|
|
665
|
+
else {
|
|
666
|
+
for(i=16; i<laneCount; i++)
|
|
667
|
+
ExtrAdd( i );
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
else {
|
|
671
|
+
for(i=0; i<laneCount; i++)
|
|
672
|
+
ExtrAdd( i );
|
|
673
|
+
}
|
|
674
|
+
#undef ExtrAdd
|
|
675
|
+
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
/* The 24 64-bit round constants, indexed by round number; each is broadcast
 * with CONST8_64 and XORed into the first lane by the Iota step of the round
 * macros. */
static ALIGN(KeccakP1600times8_statesAlignment) const uint64_t KeccakP1600RoundConstants[24] = {
    /* rounds  0.. 3 */
    0x0000000000000001ULL, 0x0000000000008082ULL,
    0x800000000000808aULL, 0x8000000080008000ULL,
    /* rounds  4.. 7 */
    0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL,
    /* rounds  8..11 */
    0x000000000000008aULL, 0x0000000000000088ULL,
    0x0000000080008009ULL, 0x000000008000000aULL,
    /* rounds 12..15 */
    0x000000008000808bULL, 0x800000000000008bULL,
    0x8000000000008089ULL, 0x8000000000008003ULL,
    /* rounds 16..19 */
    0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL,
    /* rounds 20..23 */
    0x8000000080008081ULL, 0x8000000000008080ULL,
    0x0000000080000001ULL, 0x8000000080008008ULL
};
|
|
703
|
+
|
|
704
|
+
/* Declares every V512 working variable used by the round macros:
 *   _Ba.._Bu  column parities, reused as the rotated row fed to Chi,
 *   _Da.._Du  the per-column values XORed into the lanes,
 *   _ba.._su  the 25 state lanes (8 instances per vector).
 * No trailing semicolon: the use site supplies it. */
#define KeccakP_DeclareVars \
    V512 _Ba, _Be, _Bi, _Bo, _Bu; \
    V512 _Da, _De, _Di, _Do, _Du; \
    V512 _ba, _be, _bi, _bo, _bu; \
    V512 _ga, _ge, _gi, _go, _gu; \
    V512 _ka, _ke, _ki, _ko, _ku; \
    V512 _ma, _me, _mi, _mo, _mu; \
    V512 _sa, _se, _si, _so, _su
|
|
712
|
+
|
|
713
|
+
/* Processes one group of five lanes (_L1.._L5) for one round:
 *   1. XOR each lane with _Da.._Du (in that order) into _Bb1.._Bb5,
 *   2. rotate _Bb1.._Bb5 left by _Rr1.._Rr5 (the first rotation is skipped
 *      when _Rr1 is 0),
 *   3. write Chi of the fixed-name row _Ba,_Be,_Bi,_Bo,_Bu back to _L1.._L5.
 * _Bb1.._Bb5 must be a permutation of _Ba.._Bu; that permutation decides
 * which row position each incoming lane feeds.
 * The expansion ends with a semicolon. */
#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \
    _Bb1 = XOR(_L1, _Da); \
    _Bb2 = XOR(_L2, _De); \
    _Bb3 = XOR(_L3, _Di); \
    _Bb4 = XOR(_L4, _Do); \
    _Bb5 = XOR(_L5, _Du); \
    if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \
    _Bb2 = ROL(_Bb2, _Rr2); \
    _Bb3 = ROL(_Bb3, _Rr3); \
    _Bb4 = ROL(_Bb4, _Rr4); \
    _Bb5 = ROL(_Bb5, _Rr5); \
    _L1 = Chi( _Ba, _Be, _Bi); \
    _L2 = Chi( _Be, _Bi, _Bo); \
    _L3 = Chi( _Bi, _Bo, _Bu); \
    _L4 = Chi( _Bo, _Bu, _Ba); \
    _L5 = Chi( _Bu, _Ba, _Be);
|
|
729
|
+
|
|
730
|
+
/* Start of a round. Computes the five column parities _Ba.._Bu over the whole
 * state, derives _Da.._Du from them (each is a parity rotated left by 1 XORed
 * with another parity), then processes the first group of five lanes
 * (_L1.._L5) with rotation offsets 0, 44, 43, 21, 14 and finally XORs the
 * round constant _rc into _L1.
 * The expansion has no trailing semicolon. */
#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \
    _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \
    _Be = XOR5( _be, _ge, _ke, _me, _se ); \
    _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \
    _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \
    _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \
    _Da = ROL( _Be, 1 ); \
    _De = ROL( _Bi, 1 ); \
    _Di = ROL( _Bo, 1 ); \
    _Do = ROL( _Bu, 1 ); \
    _Du = ROL( _Ba, 1 ); \
    _Da = XOR( _Da, _Bu ); \
    _De = XOR( _De, _Ba ); \
    _Di = XOR( _Di, _Be ); \
    _Do = XOR( _Do, _Bi ); \
    _Du = XOR( _Du, _Bo ); \
    KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu,  0, 44, 43, 21, 14 ); \
    _L1 = XOR(_L1, _rc) /* Iota */
|
|
748
|
+
|
|
749
|
+
/* Second to fifth lane groups of a round. Each wrapper fixes the assignment
 * of the five incoming lanes to the row variables and the five rotation
 * offsets, then defers to KeccakP_ThetaRhoPiChi. They rely on _Da.._Du having
 * been computed by KeccakP_ThetaRhoPiChiIota0 earlier in the same round. */
#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \
    KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be,  3, 45, 61, 28, 20 )

#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \
    KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18,  1,  6, 25,  8 )

#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \
    KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 )

#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \
    KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41,  2, 62, 55, 39 )
|
|
760
|
+
|
|
761
|
+
/* Four consecutive rounds using round constants i, i+1, i+2 and i+3.
 * Lanes are never moved between variables; each round instead passes the
 * lane variables to the step macros in a different order. The fourth round
 * uses the natural order (_ba,_be,.. / _ga,_ge,.. / ...), so the variables
 * appear to be back in canonical positions after every group of four rounds.
 * The expansion has no trailing semicolon. */
#define KeccakP_4rounds( i ) \
    /* round i */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST8_64(KeccakP1600RoundConstants[i]) ); \
    KeccakP_ThetaRhoPiChi1(    _ka, _me, _si, _bo, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _sa, _be, _gi, _ko, _mu ); \
    KeccakP_ThetaRhoPiChi3(    _ga, _ke, _mi, _so, _bu ); \
    KeccakP_ThetaRhoPiChi4(    _ma, _se, _bi, _go, _ku ); \
    \
    /* round i+1 */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST8_64(KeccakP1600RoundConstants[i+1]) ); \
    KeccakP_ThetaRhoPiChi1(    _sa, _ke, _bi, _mo, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ma, _ge, _si, _ko, _bu ); \
    KeccakP_ThetaRhoPiChi3(    _ka, _be, _mi, _go, _su ); \
    KeccakP_ThetaRhoPiChi4(    _ga, _se, _ki, _bo, _mu ); \
    \
    /* round i+2 */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[i+2]) ); \
    KeccakP_ThetaRhoPiChi1(    _ma, _be, _ki, _so, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ga, _me, _bi, _ko, _su ); \
    KeccakP_ThetaRhoPiChi3(    _sa, _ge, _mi, _bo, _ku ); \
    KeccakP_ThetaRhoPiChi4(    _ka, _se, _gi, _mo, _bu ); \
    \
    /* round i+3 */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[i+3]) ); \
    KeccakP_ThetaRhoPiChi1(    _ga, _ge, _gi, _go, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ka, _ke, _ki, _ko, _ku ); \
    KeccakP_ThetaRhoPiChi3(    _ma, _me, _mi, _mo, _mu ); \
    KeccakP_ThetaRhoPiChi4(    _sa, _se, _si, _so, _su )
|
|
785
|
+
|
|
786
|
+
/* Two consecutive rounds using round constants i and i+1. The lane orders are
 * the same as the third and fourth rounds of KeccakP_4rounds, so the lane
 * variables must have been loaded in the matching permuted arrangement
 * (see copyFromState2rounds) before this macro is used.
 * The expansion has no trailing semicolon. */
#define KeccakP_2rounds( i ) \
    /* round i */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[i]) ); \
    KeccakP_ThetaRhoPiChi1(    _ma, _be, _ki, _so, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ga, _me, _bi, _ko, _su ); \
    KeccakP_ThetaRhoPiChi3(    _sa, _ge, _mi, _bo, _ku ); \
    KeccakP_ThetaRhoPiChi4(    _ka, _se, _gi, _mo, _bu ); \
    \
    /* round i+1 */ \
    KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[i+1]) ); \
    KeccakP_ThetaRhoPiChi1(    _ga, _ge, _gi, _go, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ka, _ke, _ki, _ko, _ku ); \
    KeccakP_ThetaRhoPiChi3(    _ma, _me, _mi, _mo, _mu ); \
    KeccakP_ThetaRhoPiChi4(    _sa, _se, _si, _so, _su )
|
|
798
|
+
|
|
799
|
+
/* rounds12 runs round constants 12..23 and rounds24 runs 0..23. How much of
 * that is unrolled is chosen at build time:
 *   KeccakP1600times8_fullUnrolling   both fully unrolled, no loop counter;
 *   KeccakP1600times8_unrolling == 4  both loop four rounds at a time;
 *   KeccakP1600times8_unrolling == 12 rounds12 unrolled, rounds24 loops twice.
 * The looping variants need an `unsigned int i` declared at the use site.
 * Any other configuration is rejected at compile time. */
#ifdef KeccakP1600times8_fullUnrolling

#define rounds12 \
    KeccakP_4rounds( 12 ); \
    KeccakP_4rounds( 16 ); \
    KeccakP_4rounds( 20 )

#define rounds24 \
    KeccakP_4rounds(  0 ); \
    KeccakP_4rounds(  4 ); \
    KeccakP_4rounds(  8 ); \
    KeccakP_4rounds( 12 ); \
    KeccakP_4rounds( 16 ); \
    KeccakP_4rounds( 20 )

#elif (KeccakP1600times8_unrolling == 4)

#define rounds12 \
    i = 12; \
    do { \
        KeccakP_4rounds( i ); \
    } while( (i += 4) < 24 )

#define rounds24 \
    i = 0; \
    do { \
        KeccakP_4rounds( i ); \
    } while( (i += 4) < 24 )

#elif (KeccakP1600times8_unrolling == 12)

#define rounds12 \
    KeccakP_4rounds( 12 ); \
    KeccakP_4rounds( 16 ); \
    KeccakP_4rounds( 20 )

#define rounds24 \
    i = 0; \
    do { \
        KeccakP_4rounds( i ); \
        KeccakP_4rounds( i+4 ); \
        KeccakP_4rounds( i+8 ); \
    } while( (i += 12) < 24 )

#else
#error "Unrolling is not correctly specified!"
#endif
|
|
846
|
+
|
|
847
|
+
/* rounds6: round constants 18..23 (two rounds, then four). Because it starts
 * with KeccakP_2rounds, the lanes must be loaded with copyFromState2rounds. */
#define rounds6 \
    KeccakP_2rounds( 18 ); \
    KeccakP_4rounds( 20 )

/* rounds4: round constants 20..23. */
#define rounds4 \
    KeccakP_4rounds( 20 )
|
|
853
|
+
|
|
854
|
+
/* Loads the 25 lane vectors pState[0..24] into the working variables
 * _ba.._su in natural order (b, g, k, m, s planes; a, e, i, o, u within each).
 * No trailing semicolon. */
#define copyFromState(pState) \
    _ba = pState[ 0]; \
    _be = pState[ 1]; \
    _bi = pState[ 2]; \
    _bo = pState[ 3]; \
    _bu = pState[ 4]; \
    _ga = pState[ 5]; \
    _ge = pState[ 6]; \
    _gi = pState[ 7]; \
    _go = pState[ 8]; \
    _gu = pState[ 9]; \
    _ka = pState[10]; \
    _ke = pState[11]; \
    _ki = pState[12]; \
    _ko = pState[13]; \
    _ku = pState[14]; \
    _ma = pState[15]; \
    _me = pState[16]; \
    _mi = pState[17]; \
    _mo = pState[18]; \
    _mu = pState[19]; \
    _sa = pState[20]; \
    _se = pState[21]; \
    _si = pState[22]; \
    _so = pState[23]; \
    _su = pState[24]
|
|
880
|
+
|
|
881
|
+
/* Loads the 25 lane vectors into the working variables in the permuted
 * arrangement used before KeccakP_2rounds. The trailing comment on a line
 * names the lane that the variable receives when it differs from its own
 * name; _ba, _gu, _ko, _mi and _se keep their natural source.
 * No trailing semicolon. */
#define copyFromState2rounds(pState) \
    _ba = pState[ 0]; \
    _be = pState[16]; /* me */ \
    _bi = pState[ 7]; /* gi */ \
    _bo = pState[23]; /* so */ \
    _bu = pState[14]; /* ku */ \
    _ga = pState[20]; /* sa */ \
    _ge = pState[11]; /* ke */ \
    _gi = pState[ 2]; /* bi */ \
    _go = pState[18]; /* mo */ \
    _gu = pState[ 9]; \
    _ka = pState[15]; /* ma */ \
    _ke = pState[ 6]; /* ge */ \
    _ki = pState[22]; /* si */ \
    _ko = pState[13]; \
    _ku = pState[ 4]; /* bu */ \
    _ma = pState[10]; /* ka */ \
    _me = pState[ 1]; /* be */ \
    _mi = pState[17]; \
    _mo = pState[ 8]; /* go */ \
    _mu = pState[24]; /* su */ \
    _sa = pState[ 5]; /* ga */ \
    _se = pState[21]; \
    _si = pState[12]; /* ki */ \
    _so = pState[ 3]; /* bo */ \
    _su = pState[19]  /* mu */
|
|
907
|
+
|
|
908
|
+
/* Stores the working variables _ba.._su back to pState[0..24] in natural
 * order; the inverse of copyFromState. No trailing semicolon. */
#define copyToState(pState) \
    pState[ 0] = _ba; \
    pState[ 1] = _be; \
    pState[ 2] = _bi; \
    pState[ 3] = _bo; \
    pState[ 4] = _bu; \
    pState[ 5] = _ga; \
    pState[ 6] = _ge; \
    pState[ 7] = _gi; \
    pState[ 8] = _go; \
    pState[ 9] = _gu; \
    pState[10] = _ka; \
    pState[11] = _ke; \
    pState[12] = _ki; \
    pState[13] = _ko; \
    pState[14] = _ku; \
    pState[15] = _ma; \
    pState[16] = _me; \
    pState[17] = _mi; \
    pState[18] = _mo; \
    pState[19] = _mu; \
    pState[20] = _sa; \
    pState[21] = _se; \
    pState[22] = _si; \
    pState[23] = _so; \
    pState[24] = _su
|
|
934
|
+
|
|
935
|
+
void KeccakP1600times8_PermuteAll_24rounds(void *states)
|
|
936
|
+
{
|
|
937
|
+
V512 *statesAsLanes = states;
|
|
938
|
+
KeccakP_DeclareVars;
|
|
939
|
+
#ifndef KeccakP1600times8_fullUnrolling
|
|
940
|
+
unsigned int i;
|
|
941
|
+
#endif
|
|
942
|
+
|
|
943
|
+
copyFromState(statesAsLanes);
|
|
944
|
+
rounds24;
|
|
945
|
+
copyToState(statesAsLanes);
|
|
946
|
+
}
|
|
947
|
+
|
|
948
|
+
void KeccakP1600times8_PermuteAll_12rounds(void *states)
|
|
949
|
+
{
|
|
950
|
+
V512 *statesAsLanes = states;
|
|
951
|
+
KeccakP_DeclareVars;
|
|
952
|
+
#if (KeccakP1600times8_unrolling < 12)
|
|
953
|
+
unsigned int i;
|
|
954
|
+
#endif
|
|
955
|
+
|
|
956
|
+
copyFromState(statesAsLanes);
|
|
957
|
+
rounds12;
|
|
958
|
+
copyToState(statesAsLanes);
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
void KeccakP1600times8_PermuteAll_6rounds(void *states)
|
|
962
|
+
{
|
|
963
|
+
V512 *statesAsLanes = states;
|
|
964
|
+
KeccakP_DeclareVars;
|
|
965
|
+
|
|
966
|
+
copyFromState2rounds(statesAsLanes);
|
|
967
|
+
rounds6;
|
|
968
|
+
copyToState(statesAsLanes);
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
void KeccakP1600times8_PermuteAll_4rounds(void *states)
|
|
972
|
+
{
|
|
973
|
+
V512 *statesAsLanes = states;
|
|
974
|
+
KeccakP_DeclareVars;
|
|
975
|
+
|
|
976
|
+
copyFromState(statesAsLanes);
|
|
977
|
+
rounds4;
|
|
978
|
+
copyToState(statesAsLanes);
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
size_t KeccakF1600times8_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
|
|
982
|
+
{
|
|
983
|
+
size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
|
|
984
|
+
|
|
985
|
+
if (laneCount == 21) {
|
|
986
|
+
#ifndef KeccakP1600times8_fullUnrolling
|
|
987
|
+
unsigned int i;
|
|
988
|
+
#endif
|
|
989
|
+
const unsigned char *dataStart = data;
|
|
990
|
+
V512 *statesAsLanes = states;
|
|
991
|
+
const uint64_t *dataAsLanes = (const uint64_t *)data;
|
|
992
|
+
KeccakP_DeclareVars;
|
|
993
|
+
V256 index;
|
|
994
|
+
|
|
995
|
+
copyFromState(statesAsLanes);
|
|
996
|
+
index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
|
|
997
|
+
while(dataByteLen >= dataMinimumSize) {
|
|
998
|
+
#define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
|
|
999
|
+
Add_In( _ba, 0 );
|
|
1000
|
+
Add_In( _be, 1 );
|
|
1001
|
+
Add_In( _bi, 2 );
|
|
1002
|
+
Add_In( _bo, 3 );
|
|
1003
|
+
Add_In( _bu, 4 );
|
|
1004
|
+
Add_In( _ga, 5 );
|
|
1005
|
+
Add_In( _ge, 6 );
|
|
1006
|
+
Add_In( _gi, 7 );
|
|
1007
|
+
Add_In( _go, 8 );
|
|
1008
|
+
Add_In( _gu, 9 );
|
|
1009
|
+
Add_In( _ka, 10 );
|
|
1010
|
+
Add_In( _ke, 11 );
|
|
1011
|
+
Add_In( _ki, 12 );
|
|
1012
|
+
Add_In( _ko, 13 );
|
|
1013
|
+
Add_In( _ku, 14 );
|
|
1014
|
+
Add_In( _ma, 15 );
|
|
1015
|
+
Add_In( _me, 16 );
|
|
1016
|
+
Add_In( _mi, 17 );
|
|
1017
|
+
Add_In( _mo, 18 );
|
|
1018
|
+
Add_In( _mu, 19 );
|
|
1019
|
+
Add_In( _sa, 20 );
|
|
1020
|
+
#undef Add_In
|
|
1021
|
+
rounds24;
|
|
1022
|
+
dataAsLanes += laneOffsetSerial;
|
|
1023
|
+
dataByteLen -= laneOffsetSerial*8;
|
|
1024
|
+
}
|
|
1025
|
+
copyToState(statesAsLanes);
|
|
1026
|
+
return (const unsigned char *)dataAsLanes - dataStart;
|
|
1027
|
+
}
|
|
1028
|
+
else {
|
|
1029
|
+
const unsigned char *dataStart = data;
|
|
1030
|
+
|
|
1031
|
+
while(dataByteLen >= dataMinimumSize) {
|
|
1032
|
+
KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
|
|
1033
|
+
KeccakP1600times8_PermuteAll_24rounds(states);
|
|
1034
|
+
data += laneOffsetSerial*8;
|
|
1035
|
+
dataByteLen -= laneOffsetSerial*8;
|
|
1036
|
+
}
|
|
1037
|
+
return data - dataStart;
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
|
|
1041
|
+
size_t KeccakP1600times8_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
|
|
1042
|
+
{
|
|
1043
|
+
size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
|
|
1044
|
+
|
|
1045
|
+
if (laneCount == 21) {
|
|
1046
|
+
#if (KeccakP1600times8_unrolling < 12)
|
|
1047
|
+
unsigned int i;
|
|
1048
|
+
#endif
|
|
1049
|
+
const unsigned char *dataStart = data;
|
|
1050
|
+
V512 *statesAsLanes = states;
|
|
1051
|
+
const uint64_t *dataAsLanes = (const uint64_t *)data;
|
|
1052
|
+
KeccakP_DeclareVars;
|
|
1053
|
+
V256 index;
|
|
1054
|
+
|
|
1055
|
+
copyFromState(statesAsLanes);
|
|
1056
|
+
index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
|
|
1057
|
+
while(dataByteLen >= dataMinimumSize) {
|
|
1058
|
+
#define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
|
|
1059
|
+
Add_In( _ba, 0 );
|
|
1060
|
+
Add_In( _be, 1 );
|
|
1061
|
+
Add_In( _bi, 2 );
|
|
1062
|
+
Add_In( _bo, 3 );
|
|
1063
|
+
Add_In( _bu, 4 );
|
|
1064
|
+
Add_In( _ga, 5 );
|
|
1065
|
+
Add_In( _ge, 6 );
|
|
1066
|
+
Add_In( _gi, 7 );
|
|
1067
|
+
Add_In( _go, 8 );
|
|
1068
|
+
Add_In( _gu, 9 );
|
|
1069
|
+
Add_In( _ka, 10 );
|
|
1070
|
+
Add_In( _ke, 11 );
|
|
1071
|
+
Add_In( _ki, 12 );
|
|
1072
|
+
Add_In( _ko, 13 );
|
|
1073
|
+
Add_In( _ku, 14 );
|
|
1074
|
+
Add_In( _ma, 15 );
|
|
1075
|
+
Add_In( _me, 16 );
|
|
1076
|
+
Add_In( _mi, 17 );
|
|
1077
|
+
Add_In( _mo, 18 );
|
|
1078
|
+
Add_In( _mu, 19 );
|
|
1079
|
+
Add_In( _sa, 20 );
|
|
1080
|
+
#undef Add_In
|
|
1081
|
+
rounds12;
|
|
1082
|
+
dataAsLanes += laneOffsetSerial;
|
|
1083
|
+
dataByteLen -= laneOffsetSerial*8;
|
|
1084
|
+
}
|
|
1085
|
+
copyToState(statesAsLanes);
|
|
1086
|
+
return (const unsigned char *)dataAsLanes - dataStart;
|
|
1087
|
+
}
|
|
1088
|
+
else {
|
|
1089
|
+
const unsigned char *dataStart = data;
|
|
1090
|
+
|
|
1091
|
+
while(dataByteLen >= dataMinimumSize) {
|
|
1092
|
+
KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
|
|
1093
|
+
KeccakP1600times8_PermuteAll_12rounds(states);
|
|
1094
|
+
data += laneOffsetSerial*8;
|
|
1095
|
+
dataByteLen -= laneOffsetSerial*8;
|
|
1096
|
+
}
|
|
1097
|
+
return data - dataStart;
|
|
1098
|
+
}
|
|
1099
|
+
}
|
|
1100
|
+
|
|
1101
|
+
/* ------------------------------------------------------------------------- */
|
|
1102
|
+
|
|
1103
|
+
/* Thin wrappers over the 512-bit intrinsics used by the leaf-processing code. */
#define LOAD(p)         _mm512_loadu_si512(p)           /* unaligned 512-bit load */
#define XOReq(a,b)      a = _mm512_xor_si512(a,b)       /* a ^= b */
#define ZERO()          _mm512_setzero_si512()          /* all-zero vector */
#define CONST_64(a)     _mm512_set1_epi64(a)            /* broadcast one 64-bit value */

/* Size in bytes of one input chunk handled per instance by the leaf code. */
#define chunkSize       8192
/* Bytes absorbed per permutation call: 21 lanes of 8 bytes. */
#define rateInBytes     168
|
|
1110
|
+
|
|
1111
|
+
/* Sets the 25 lane variables X##ba .. X##su to the all-zero vector.
 * The expansion ends with a semicolon. The last line deliberately carries no
 * line continuation, so the macro cannot absorb whatever line follows it. */
#define initializeState(X) \
    X##ba = ZERO(); \
    X##be = ZERO(); \
    X##bi = ZERO(); \
    X##bo = ZERO(); \
    X##bu = ZERO(); \
    X##ga = ZERO(); \
    X##ge = ZERO(); \
    X##gi = ZERO(); \
    X##go = ZERO(); \
    X##gu = ZERO(); \
    X##ka = ZERO(); \
    X##ke = ZERO(); \
    X##ki = ZERO(); \
    X##ko = ZERO(); \
    X##ku = ZERO(); \
    X##ma = ZERO(); \
    X##me = ZERO(); \
    X##mi = ZERO(); \
    X##mo = ZERO(); \
    X##mu = ZERO(); \
    X##sa = ZERO(); \
    X##se = ZERO(); \
    X##si = ZERO(); \
    X##so = ZERO(); \
    X##su = ZERO();
|
|
1137
|
+
|
|
1138
|
+
/* Loads eight 512-bit rows, one from each of eight inputs spaced chunkSize
 * bytes apart (chunkSize/8 lanes), starting `offset` lanes into each input,
 * and rearranges them with 64-bit unpacks and 128-bit shuffles into r0..r7.
 * As the name indicates this is an 8x8 transpose of 64-bit words: r<k> is
 * meant to hold lane offset+k of all eight inputs.
 * Requires __m512i variables t0..t7 and r0..r7 in scope at the use site.
 * The expansion ends with a semicolon. The last line deliberately carries no
 * line continuation, so the macro cannot absorb whatever line follows it. */
#define LoadAndTranspose8(dataAsLanes, offset) \
    t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \
    t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \
    t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \
    t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \
    t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \
    t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \
    t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \
    t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \
    /* interleave 64-bit words of row pairs */ \
    r0 = _mm512_unpacklo_epi64(t0, t1); \
    r1 = _mm512_unpackhi_epi64(t0, t1); \
    r2 = _mm512_unpacklo_epi64(t2, t3); \
    r3 = _mm512_unpackhi_epi64(t2, t3); \
    r4 = _mm512_unpacklo_epi64(t4, t5); \
    r5 = _mm512_unpackhi_epi64(t4, t5); \
    r6 = _mm512_unpacklo_epi64(t6, t7); \
    r7 = _mm512_unpackhi_epi64(t6, t7); \
    /* first pass of 128-bit block shuffles */ \
    t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \
    t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \
    t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \
    t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \
    t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \
    t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \
    t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \
    t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \
    /* second pass of 128-bit block shuffles */ \
    r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \
    r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \
    r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \
    r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \
    r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \
    r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \
    r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \
    r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd);
|
|
1171
|
+
|
|
1172
|
+
/*
 * XORdata16(X, index, dataAsLanes)
 *
 * XORs 16 transposed input vectors into the state variables X##ba .. X##ma,
 * in two batches of eight produced by LoadAndTranspose8 (offsets 0 and 8).
 *
 * The `index` argument is not used by this macro; it is accepted so the
 * signature matches XORdata21. Expands to plain statements and needs the
 * t0..t7 / r0..r7 temporaries of LoadAndTranspose8 in the calling scope.
 */
#define XORdata16(X, index, dataAsLanes) \
    /* elements 0..7 */ \
    LoadAndTranspose8(dataAsLanes, 0) \
    XOReq(X##ba, r0); \
    XOReq(X##be, r1); \
    XOReq(X##bi, r2); \
    XOReq(X##bo, r3); \
    XOReq(X##bu, r4); \
    XOReq(X##ga, r5); \
    XOReq(X##ge, r6); \
    XOReq(X##gi, r7); \
    /* elements 8..15 */ \
    LoadAndTranspose8(dataAsLanes, 8) \
    XOReq(X##go, r0); \
    XOReq(X##gu, r1); \
    XOReq(X##ka, r2); \
    XOReq(X##ke, r3); \
    XOReq(X##ki, r4); \
    XOReq(X##ko, r5); \
    XOReq(X##ku, r6); \
    XOReq(X##ma, r7);
|
|
1191
|
+
|
|
1192
|
+
/*
 * XORdata21(X, index, dataAsLanes)
 *
 * XORs 21 input vectors into the state: the first 16 through XORdata16
 * (X##ba .. X##ma), then five more (X##me, X##mi, X##mo, X##mu, X##sa)
 * fetched with LOAD_GATHER8_64 at (dataAsLanes) + 16 .. + 20 using `index`.
 * LOAD_GATHER8_64 is defined elsewhere in this file.
 */
#define XORdata21(X, index, dataAsLanes) \
    XORdata16(X, index, dataAsLanes) \
    XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \
    XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \
    XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \
    XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \
    XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20));
|
|
1199
|
+
|
|
1200
|
+
void KeccakP1600times8_K12ProcessLeaves(const unsigned char *input, unsigned char *output)
|
|
1201
|
+
{
|
|
1202
|
+
KeccakP_DeclareVars;
|
|
1203
|
+
unsigned int j;
|
|
1204
|
+
const uint64_t *outputAsLanes = (const uint64_t *)output;
|
|
1205
|
+
__m256i index;
|
|
1206
|
+
__m512i t0, t1, t2, t3, t4, t5, t6, t7;
|
|
1207
|
+
__m512i r0, r1, r2, r3, r4, r5, r6, r7;
|
|
1208
|
+
|
|
1209
|
+
initializeState(_);
|
|
1210
|
+
|
|
1211
|
+
index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8));
|
|
1212
|
+
for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) {
|
|
1213
|
+
XORdata21(_, index, (const uint64_t *)input);
|
|
1214
|
+
rounds12
|
|
1215
|
+
input += rateInBytes;
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
XORdata16(_, index, (const uint64_t *)input);
|
|
1219
|
+
XOReq(_me, CONST_64(0x0BULL));
|
|
1220
|
+
XOReq(_sa, CONST_64(0x8000000000000000ULL));
|
|
1221
|
+
rounds12
|
|
1222
|
+
|
|
1223
|
+
index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4);
|
|
1224
|
+
STORE_SCATTER8_64(outputAsLanes+0, index, _ba);
|
|
1225
|
+
STORE_SCATTER8_64(outputAsLanes+1, index, _be);
|
|
1226
|
+
STORE_SCATTER8_64(outputAsLanes+2, index, _bi);
|
|
1227
|
+
STORE_SCATTER8_64(outputAsLanes+3, index, _bo);
|
|
1228
|
+
}
|
|
1229
|
+
|
|
1230
|
+
/* Drop the helper macros and constants used by the K12 leaf code above. */
#undef LOAD
#undef XOReq
#undef ZERO
#undef CONST_64
#undef chunkSize
#undef rateInBytes
|
|
1236
|
+
|
|
1237
|
+
/* ------------------------------------------------------------------------- */
|
|
1238
|
+
|
|
1239
|
+
/*
 * Remap lanes to start after two rounds.
 *
 * Each I<xy> name is an alias for one of the _<xy> state variables, so code
 * that assigns through the I-names places every input lane directly into the
 * state variable given by this table.
 */
#define Iba     _ba
#define Ibe     _me
#define Ibi     _gi
#define Ibo     _so
#define Ibu     _ku

#define Iga     _sa
#define Ige     _ke
#define Igi     _bi
#define Igo     _mo
#define Igu     _gu

#define Ika     _ma
#define Ike     _ge
#define Iki     _si
#define Iko     _ko
#define Iku     _bu

#define Ima     _ka
#define Ime     _be
#define Imi     _mi
#define Imo     _go
#define Imu     _su

#define Isa     _ga
#define Ise     _se
#define Isi     _ki
#define Iso     _bo
#define Isu     _mu
|
|
1265
|
+
|
|
1266
|
+
/*
 * LoadInput(argIndex): gathers eight 64-bit values starting at
 * &in64[argIndex], using the 32-bit indices in `gather` scaled by 8 bytes.
 * Both `in64` and `gather` must exist in the calling scope.
 */
#define LoadInput(argIndex) \
    _mm512_i32gather_epi64(gather, (const long long int *)&in64[argIndex], 8)

/*
 * AddInput(argIndex): the gathered input XORed with kRoll[argIndex] via
 * CONST8_64 (defined elsewhere; presumably a broadcast to all elements).
 */
#define AddInput(argIndex) \
    XOR( LoadInput(argIndex), CONST8_64(kRoll[argIndex]))
|
|
1268
|
+
|
|
1269
|
+
|
|
1270
|
+
/*
 * Index tables for _mm512_permutex2var_epi64(a, idx, b): a value k in 0..7
 * selects element k of `a`, a value 8+k selects element k of `b`.
 */

/* Halving steps of the horizontal reduction: split in 256-bit halves ... */
ALIGN(64) static const uint64_t oLow256[]    = { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 };
ALIGN(64) static const uint64_t oHigh256[]   = { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 };

/* ... in 128-bit pairs ... */
ALIGN(64) static const uint64_t oLow128[]    = { 0, 1, 8+0, 8+1, 4, 5, 8+4, 8+5 };
ALIGN(64) static const uint64_t oHigh128[]   = { 2, 3, 8+2, 8+3, 6, 7, 8+6, 8+7 };

/* ... and in single 64-bit elements (even / odd). */
ALIGN(64) static const uint64_t oLow64[]     = { 0, 8+0, 2, 8+2, 4, 8+4, 6, 8+6 };
ALIGN(64) static const uint64_t oHigh64[]    = { 1, 8+1, 3, 8+3, 5, 8+5, 7, 8+7 };

/* Sliding-window selections; the name spells out <elements of a>_<elements of b>. */
ALIGN(64) static const uint64_t o01234_012[] = { 0, 1, 2, 3, 4, 8+0, 8+1, 8+2 };
ALIGN(64) static const uint64_t o1234_0123[] = { 1, 2, 3, 4, 8+0, 8+1, 8+2, 8+3 };
ALIGN(64) static const uint64_t o1234567_0[] = { 1, 2, 3, 4, 5, 6, 7, 8+0 };
ALIGN(64) static const uint64_t o1234567_3[] = { 1, 2, 3, 4, 5, 6, 7, 8+3 };
ALIGN(64) static const uint64_t o1234567_4[] = { 1, 2, 3, 4, 5, 6, 7, 8+4 };
ALIGN(64) static const uint64_t o234567_45[] = { 2, 3, 4, 5, 6, 7, 8+4, 8+5 };
ALIGN(64) static const uint64_t o34567_456[] = { 3, 4, 5, 6, 7, 8+4, 8+5, 8+6 };

/* 32-bit gather/scatter indices: eight blocks spaced 25 lanes apart. */
ALIGN(32) static const uint32_t oGatherScatter[] = { 0*25, 1*25, 2*25, 3*25, 4*25, 5*25, 6*25, 7*25 };
|
|
1288
|
+
|
|
1289
|
+
#if defined(__i386__) || defined(_M_IX86)
/*
 * Fallback for 32-bit x86 targets: builds the 64-bit element `index` of `a`
 * from its two 32-bit halves.
 *
 * The halves are combined with bitwise OR; a logical OR would collapse the
 * result to 0 or 1. Each half goes through uint32_t first because
 * _mm256_extract_epi32 returns a signed int, and a direct cast to uint64_t
 * would sign-extend a low half whose top bit is set into the upper 32 bits.
 */
#define _mm256_extract_epi64(a, index) \
    ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2) | \
     ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2+1) << 32))
#endif
|
|
1293
|
+
|
|
1294
|
+
size_t KeccakP1600times8_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
|
|
1295
|
+
{
|
|
1296
|
+
#if !defined(KeccakP1600times4_fullUnrolling)
|
|
1297
|
+
unsigned int i;
|
|
1298
|
+
#endif
|
|
1299
|
+
uint64_t *in64 = (uint64_t *)input;
|
|
1300
|
+
size_t nBlocks = inputByteLen / (8 * 200);
|
|
1301
|
+
KeccakP_DeclareVars;
|
|
1302
|
+
V512 x01234567, x12345678;
|
|
1303
|
+
V512 Xba, Xbe, Xbi, Xbo, Xbu;
|
|
1304
|
+
V512 Xga, Xge, Xgi, Xgo, Xgu;
|
|
1305
|
+
V512 Xka, Xke, Xki, Xko, Xku;
|
|
1306
|
+
V512 Xma, Xme, Xmi, Xmo, Xmu;
|
|
1307
|
+
V512 Xsa, Xse, Xsi, Xso, Xsu;
|
|
1308
|
+
V256 v1;
|
|
1309
|
+
V512 p1, p2;
|
|
1310
|
+
V256 gather = *(V256*)oGatherScatter;
|
|
1311
|
+
|
|
1312
|
+
/* Clear internal X accu */
|
|
1313
|
+
Xba = _mm512_setzero_si512();
|
|
1314
|
+
Xbe = _mm512_setzero_si512();
|
|
1315
|
+
Xbi = _mm512_setzero_si512();
|
|
1316
|
+
Xbo = _mm512_setzero_si512();
|
|
1317
|
+
Xbu = _mm512_setzero_si512();
|
|
1318
|
+
Xga = _mm512_setzero_si512();
|
|
1319
|
+
Xge = _mm512_setzero_si512();
|
|
1320
|
+
Xgi = _mm512_setzero_si512();
|
|
1321
|
+
Xgo = _mm512_setzero_si512();
|
|
1322
|
+
Xgu = _mm512_setzero_si512();
|
|
1323
|
+
Xka = _mm512_setzero_si512();
|
|
1324
|
+
Xke = _mm512_setzero_si512();
|
|
1325
|
+
Xki = _mm512_setzero_si512();
|
|
1326
|
+
Xko = _mm512_setzero_si512();
|
|
1327
|
+
Xku = _mm512_setzero_si512();
|
|
1328
|
+
Xma = _mm512_setzero_si512();
|
|
1329
|
+
Xme = _mm512_setzero_si512();
|
|
1330
|
+
Xmi = _mm512_setzero_si512();
|
|
1331
|
+
Xmo = _mm512_setzero_si512();
|
|
1332
|
+
Xmu = _mm512_setzero_si512();
|
|
1333
|
+
Xsa = _mm512_setzero_si512();
|
|
1334
|
+
Xse = _mm512_setzero_si512();
|
|
1335
|
+
Xsi = _mm512_setzero_si512();
|
|
1336
|
+
Xso = _mm512_setzero_si512();
|
|
1337
|
+
Xsu = _mm512_setzero_si512();
|
|
1338
|
+
|
|
1339
|
+
/* prepare 8 lanes for roll-c */
|
|
1340
|
+
x01234567 = _mm512_maskz_loadu_epi64(0x1F, &kRoll[20]); /* 5 lanes ok */
|
|
1341
|
+
_ba = _mm512_maskz_loadu_epi64(0x0F, &kRoll[21]); /* 4 lanes ok */
|
|
1342
|
+
_be = XOR3(ROL(x01234567, 7), _ba, _mm512_srli_epi64(_ba, 3));
|
|
1343
|
+
x01234567 = _mm512_permutex2var_epi64(x01234567, *(V512*)o01234_012, _be);
|
|
1344
|
+
x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234_0123, _be);
|
|
1345
|
+
|
|
1346
|
+
do {
|
|
1347
|
+
Iba = AddInput( 0);
|
|
1348
|
+
Ibe = AddInput( 1);
|
|
1349
|
+
Ibi = AddInput( 2);
|
|
1350
|
+
Ibo = AddInput( 3);
|
|
1351
|
+
Ibu = AddInput( 4);
|
|
1352
|
+
Iga = AddInput( 5);
|
|
1353
|
+
Ige = AddInput( 6);
|
|
1354
|
+
Igi = AddInput( 7);
|
|
1355
|
+
Igo = AddInput( 8);
|
|
1356
|
+
Igu = AddInput( 9);
|
|
1357
|
+
Ika = AddInput(10);
|
|
1358
|
+
Ike = AddInput(11);
|
|
1359
|
+
Iki = AddInput(12);
|
|
1360
|
+
Iko = AddInput(13);
|
|
1361
|
+
Iku = AddInput(14);
|
|
1362
|
+
Ima = AddInput(15);
|
|
1363
|
+
Ime = AddInput(16);
|
|
1364
|
+
Imi = AddInput(17);
|
|
1365
|
+
Imo = AddInput(18);
|
|
1366
|
+
Imu = AddInput(19);
|
|
1367
|
+
|
|
1368
|
+
/* Roll-c */
|
|
1369
|
+
Isa = x01234567;
|
|
1370
|
+
Ise = x12345678;
|
|
1371
|
+
Isu = XOR3(ROL(x01234567, 7), x12345678, _mm512_srli_epi64(x12345678, 3));
|
|
1372
|
+
Ise = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_3, Isu);
|
|
1373
|
+
Isi = _mm512_permutex2var_epi64(Ise, *(V512*)o1234567_4, Isu);
|
|
1374
|
+
Iso = _mm512_permutex2var_epi64(Ise, *(V512*)o234567_45, Isu);
|
|
1375
|
+
Isu = _mm512_permutex2var_epi64(Ise, *(V512*)o34567_456, Isu);
|
|
1376
|
+
|
|
1377
|
+
x01234567 = XOR3(ROL(Iso, 7), Isu, _mm512_srli_epi64(Isu, 3));
|
|
1378
|
+
x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_4, x01234567);
|
|
1379
|
+
|
|
1380
|
+
XOReq512(Isa, LoadInput(20));
|
|
1381
|
+
XOReq512(Ise, LoadInput(21));
|
|
1382
|
+
XOReq512(Isi, LoadInput(22));
|
|
1383
|
+
XOReq512(Iso, LoadInput(23));
|
|
1384
|
+
XOReq512(Isu, LoadInput(24));
|
|
1385
|
+
|
|
1386
|
+
rounds6
|
|
1387
|
+
Dump( "P-out", _);
|
|
1388
|
+
|
|
1389
|
+
/* Accumulate in X */
|
|
1390
|
+
XOReq512(Xba, _ba);
|
|
1391
|
+
XOReq512(Xbe, _be);
|
|
1392
|
+
XOReq512(Xbi, _bi);
|
|
1393
|
+
XOReq512(Xbo, _bo);
|
|
1394
|
+
XOReq512(Xbu, _bu);
|
|
1395
|
+
XOReq512(Xga, _ga);
|
|
1396
|
+
XOReq512(Xge, _ge);
|
|
1397
|
+
XOReq512(Xgi, _gi);
|
|
1398
|
+
XOReq512(Xgo, _go);
|
|
1399
|
+
XOReq512(Xgu, _gu);
|
|
1400
|
+
XOReq512(Xka, _ka);
|
|
1401
|
+
XOReq512(Xke, _ke);
|
|
1402
|
+
XOReq512(Xki, _ki);
|
|
1403
|
+
XOReq512(Xko, _ko);
|
|
1404
|
+
XOReq512(Xku, _ku);
|
|
1405
|
+
XOReq512(Xma, _ma);
|
|
1406
|
+
XOReq512(Xme, _me);
|
|
1407
|
+
XOReq512(Xmi, _mi);
|
|
1408
|
+
XOReq512(Xmo, _mo);
|
|
1409
|
+
XOReq512(Xmu, _mu);
|
|
1410
|
+
XOReq512(Xsa, _sa);
|
|
1411
|
+
XOReq512(Xse, _se);
|
|
1412
|
+
XOReq512(Xsi, _si);
|
|
1413
|
+
XOReq512(Xso, _so);
|
|
1414
|
+
XOReq512(Xsu, _su);
|
|
1415
|
+
Dump( "X", X);
|
|
1416
|
+
|
|
1417
|
+
in64 += 8 * 25;
|
|
1418
|
+
}
|
|
1419
|
+
while(--nBlocks != 0);
|
|
1420
|
+
|
|
1421
|
+
/* Add horizontally Xba ... Xgi Reduce from lanes 8 to 4 */
|
|
1422
|
+
p1 = *(V512*)oLow256;
|
|
1423
|
+
p2 = *(V512*)oHigh256;
|
|
1424
|
+
Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbu), _mm512_permutex2var_epi64(Xba, p2, Xbu));
|
|
1425
|
+
Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xga), _mm512_permutex2var_epi64(Xbe, p2, Xga));
|
|
1426
|
+
Xbi = XOR(_mm512_permutex2var_epi64(Xbi, p1, Xge), _mm512_permutex2var_epi64(Xbi, p2, Xge));
|
|
1427
|
+
Xbo = XOR(_mm512_permutex2var_epi64(Xbo, p1, Xgi), _mm512_permutex2var_epi64(Xbo, p2, Xgi));
|
|
1428
|
+
|
|
1429
|
+
/* Add horizontally Xgo ... Xma Reduce from lanes 8 to 4 */
|
|
1430
|
+
Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xki), _mm512_permutex2var_epi64(Xgo, p2, Xki));
|
|
1431
|
+
Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xko), _mm512_permutex2var_epi64(Xgu, p2, Xko));
|
|
1432
|
+
Xka = XOR(_mm512_permutex2var_epi64(Xka, p1, Xku), _mm512_permutex2var_epi64(Xka, p2, Xku));
|
|
1433
|
+
Xke = XOR(_mm512_permutex2var_epi64(Xke, p1, Xma), _mm512_permutex2var_epi64(Xke, p2, Xma));
|
|
1434
|
+
|
|
1435
|
+
/* Add horizontally Xme ... Xso Reduce from lanes 8 to 4 */
|
|
1436
|
+
Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xsa), _mm512_permutex2var_epi64(Xme, p2, Xsa));
|
|
1437
|
+
Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xse), _mm512_permutex2var_epi64(Xmi, p2, Xse));
|
|
1438
|
+
Xmo = XOR(_mm512_permutex2var_epi64(Xmo, p1, Xsi), _mm512_permutex2var_epi64(Xmo, p2, Xsi));
|
|
1439
|
+
Xmu = XOR(_mm512_permutex2var_epi64(Xmu, p1, Xso), _mm512_permutex2var_epi64(Xmu, p2, Xso));
|
|
1440
|
+
|
|
1441
|
+
/* Add horizontally Xba ... Xbo Reduce from lanes 4 to 2 */
|
|
1442
|
+
p1 = *(V512*)oLow128;
|
|
1443
|
+
p2 = *(V512*)oHigh128;
|
|
1444
|
+
Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbi), _mm512_permutex2var_epi64(Xba, p2, Xbi));
|
|
1445
|
+
Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xbo), _mm512_permutex2var_epi64(Xbe, p2, Xbo));
|
|
1446
|
+
|
|
1447
|
+
/* Add horizontally Xgo ... Xke Reduce from lanes 4 to 2 */
|
|
1448
|
+
Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xka), _mm512_permutex2var_epi64(Xgo, p2, Xka));
|
|
1449
|
+
Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xke), _mm512_permutex2var_epi64(Xgu, p2, Xke));
|
|
1450
|
+
|
|
1451
|
+
/* Add horizontally Xme ... Xmu Reduce from lanes 4 to 2 */
|
|
1452
|
+
Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmo), _mm512_permutex2var_epi64(Xme, p2, Xmo));
|
|
1453
|
+
Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xmu), _mm512_permutex2var_epi64(Xmi, p2, Xmu));
|
|
1454
|
+
|
|
1455
|
+
/* Add horizontally Xba ... Xbe Reduce from lanes 2 to 1 */
|
|
1456
|
+
p1 = *(V512*)oLow64;
|
|
1457
|
+
p2 = *(V512*)oHigh64;
|
|
1458
|
+
Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbe), _mm512_permutex2var_epi64(Xba, p2, Xbe));
|
|
1459
|
+
|
|
1460
|
+
/* Add horizontally Xgo ... Xgu Reduce from lanes 2 to 1 */
|
|
1461
|
+
Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xgu), _mm512_permutex2var_epi64(Xgo, p2, Xgu));
|
|
1462
|
+
|
|
1463
|
+
/* Add horizontally Xme ... Xmi Reduce from lanes 2 to 1 */
|
|
1464
|
+
Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmi), _mm512_permutex2var_epi64(Xme, p2, Xmi));
|
|
1465
|
+
|
|
1466
|
+
/* Add and store in xAccu */
|
|
1467
|
+
Xba = XOR( Xba, LOAD512u(xAccu[0]));
|
|
1468
|
+
Xgo = XOR( Xgo, LOAD512u(xAccu[8]));
|
|
1469
|
+
Xme = XOR( Xme, LOAD512u(xAccu[16]));
|
|
1470
|
+
_mm512_storeu_si512((V512*)&xAccu[0], Xba);
|
|
1471
|
+
_mm512_storeu_si512((V512*)&xAccu[8], Xgo);
|
|
1472
|
+
_mm512_storeu_si512((V512*)&xAccu[16], Xme);
|
|
1473
|
+
|
|
1474
|
+
/* Add horizontally Xsu */
|
|
1475
|
+
v1 = _mm256_xor_si256( _mm512_extracti64x4_epi64(Xsu, 0), _mm512_extracti64x4_epi64(Xsu, 1));
|
|
1476
|
+
v1 = _mm256_xor_si256( v1, _mm256_permute4x64_epi64(v1, 0xEE));
|
|
1477
|
+
xAccu[24] ^= _mm256_extract_epi64(v1, 0) ^ _mm256_extract_epi64(v1, 1);
|
|
1478
|
+
DumpMem("xAccu", xAccu, 5*5);
|
|
1479
|
+
|
|
1480
|
+
/* Store new kRoll */
|
|
1481
|
+
_mm512_mask_storeu_epi64(&kRoll[20], 0x1F, x01234567);
|
|
1482
|
+
DumpMem("Next kRoll", kRoll+20, 5);
|
|
1483
|
+
|
|
1484
|
+
return (size_t)in64 - (size_t)input;
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
#undef LoadInput
|
|
1488
|
+
#undef AddInput
|
|
1489
|
+
|
|
1490
|
+
ALIGN(64) static const uint64_t o1234567_6[] = { 1, 2, 3, 4, 5, 6, 7, 8+6 };
|
|
1491
|
+
ALIGN(64) static const uint64_t o234567_01[] = { 2, 3, 4, 5, 6, 7, 8+0, 8+1 };
|
|
1492
|
+
ALIGN(64) static const uint64_t o34567_012[] = { 3, 4, 5, 6, 7, 8+0, 8+1, 8+2 };
|
|
1493
|
+
ALIGN(64) static const uint64_t o4567_0123[] = { 4, 5, 6, 7, 8+0, 8+1, 8+2, 8+3 };
|
|
1494
|
+
ALIGN(64) static const uint64_t o567_01234[] = { 5, 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4 };
|
|
1495
|
+
ALIGN(64) static const uint64_t o67_012345[] = { 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5 };
|
|
1496
|
+
ALIGN(64) static const uint64_t o7_0123456[] = { 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5, 8+6 };
|
|
1497
|
+
|
|
1498
|
+
size_t KeccakP1600times8_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
|
|
1499
|
+
{
|
|
1500
|
+
uint64_t *o64 = (uint64_t *)output;
|
|
1501
|
+
size_t nBlocks = outputByteLen / (8 * 200);
|
|
1502
|
+
KeccakP_DeclareVars;
|
|
1503
|
+
#if !defined(KeccakP1600times4_fullUnrolling)
|
|
1504
|
+
unsigned int i;
|
|
1505
|
+
#endif
|
|
1506
|
+
V512 x01234567, x23456789;
|
|
1507
|
+
V256 scatter = *(V256*)oGatherScatter;
|
|
1508
|
+
|
|
1509
|
+
x01234567 = LOAD512u(yAccu[15]);
|
|
1510
|
+
x23456789 = LOAD512u(yAccu[17]);
|
|
1511
|
+
|
|
1512
|
+
do {
|
|
1513
|
+
Iba = CONST8_64(yAccu[0]);
|
|
1514
|
+
Ibe = CONST8_64(yAccu[1]);
|
|
1515
|
+
Ibi = CONST8_64(yAccu[2]);
|
|
1516
|
+
Ibo = CONST8_64(yAccu[3]);
|
|
1517
|
+
Ibu = CONST8_64(yAccu[4]);
|
|
1518
|
+
|
|
1519
|
+
Iga = CONST8_64(yAccu[5]);
|
|
1520
|
+
Ige = CONST8_64(yAccu[6]);
|
|
1521
|
+
Igi = CONST8_64(yAccu[7]);
|
|
1522
|
+
Igo = CONST8_64(yAccu[8]);
|
|
1523
|
+
Igu = CONST8_64(yAccu[9]);
|
|
1524
|
+
|
|
1525
|
+
Ika = CONST8_64(yAccu[10]);
|
|
1526
|
+
Ike = CONST8_64(yAccu[11]);
|
|
1527
|
+
Iki = CONST8_64(yAccu[12]);
|
|
1528
|
+
Iko = CONST8_64(yAccu[13]);
|
|
1529
|
+
Iku = CONST8_64(yAccu[14]);
|
|
1530
|
+
|
|
1531
|
+
/* roll-e */
|
|
1532
|
+
Ima = x01234567;
|
|
1533
|
+
Ime = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_6, x23456789);
|
|
1534
|
+
Imi = x23456789;
|
|
1535
|
+
|
|
1536
|
+
x23456789 = XOR3(ROL(Ima, 7), ROL(Ime, 18), _mm512_and_si512(Imi, _mm512_srli_epi64(Ime, 1)));
|
|
1537
|
+
Imo = _mm512_permutex2var_epi64(Imi, *(V512*)o1234567_0, x23456789);
|
|
1538
|
+
Imu = _mm512_permutex2var_epi64(Imi, *(V512*)o234567_01, x23456789);
|
|
1539
|
+
Isa = _mm512_permutex2var_epi64(Imi, *(V512*)o34567_012, x23456789);
|
|
1540
|
+
Ise = _mm512_permutex2var_epi64(Imi, *(V512*)o4567_0123, x23456789);
|
|
1541
|
+
Isi = _mm512_permutex2var_epi64(Imi, *(V512*)o567_01234, x23456789);
|
|
1542
|
+
Iso = _mm512_permutex2var_epi64(Imi, *(V512*)o67_012345, x23456789);
|
|
1543
|
+
Isu = _mm512_permutex2var_epi64(Imi, *(V512*)o7_0123456, x23456789);
|
|
1544
|
+
x01234567 = Iso;
|
|
1545
|
+
Dump( "After roll-e", I);
|
|
1546
|
+
|
|
1547
|
+
rounds6
|
|
1548
|
+
|
|
1549
|
+
/* Add kRoll */
|
|
1550
|
+
_ba = XOR(_ba, CONST8_64(kRoll[0]));
|
|
1551
|
+
_be = XOR(_be, CONST8_64(kRoll[1]));
|
|
1552
|
+
_bi = XOR(_bi, CONST8_64(kRoll[2]));
|
|
1553
|
+
_bo = XOR(_bo, CONST8_64(kRoll[3]));
|
|
1554
|
+
_bu = XOR(_bu, CONST8_64(kRoll[4]));
|
|
1555
|
+
_ga = XOR(_ga, CONST8_64(kRoll[5]));
|
|
1556
|
+
_ge = XOR(_ge, CONST8_64(kRoll[6]));
|
|
1557
|
+
_gi = XOR(_gi, CONST8_64(kRoll[7]));
|
|
1558
|
+
_go = XOR(_go, CONST8_64(kRoll[8]));
|
|
1559
|
+
_gu = XOR(_gu, CONST8_64(kRoll[9]));
|
|
1560
|
+
_ka = XOR(_ka, CONST8_64(kRoll[10]));
|
|
1561
|
+
_ke = XOR(_ke, CONST8_64(kRoll[11]));
|
|
1562
|
+
_ki = XOR(_ki, CONST8_64(kRoll[12]));
|
|
1563
|
+
_ko = XOR(_ko, CONST8_64(kRoll[13]));
|
|
1564
|
+
_ku = XOR(_ku, CONST8_64(kRoll[14]));
|
|
1565
|
+
_ma = XOR(_ma, CONST8_64(kRoll[15]));
|
|
1566
|
+
_me = XOR(_me, CONST8_64(kRoll[16]));
|
|
1567
|
+
_mi = XOR(_mi, CONST8_64(kRoll[17]));
|
|
1568
|
+
_mo = XOR(_mo, CONST8_64(kRoll[18]));
|
|
1569
|
+
_mu = XOR(_mu, CONST8_64(kRoll[19]));
|
|
1570
|
+
_sa = XOR(_sa, CONST8_64(kRoll[20]));
|
|
1571
|
+
_se = XOR(_se, CONST8_64(kRoll[21]));
|
|
1572
|
+
_si = XOR(_si, CONST8_64(kRoll[22]));
|
|
1573
|
+
_so = XOR(_so, CONST8_64(kRoll[23]));
|
|
1574
|
+
_su = XOR(_su, CONST8_64(kRoll[24]));
|
|
1575
|
+
Dump( "After add kRoll", _);
|
|
1576
|
+
|
|
1577
|
+
/* Extract */
|
|
1578
|
+
STORE_SCATTER8_64(o64+0, scatter, _ba);
|
|
1579
|
+
STORE_SCATTER8_64(o64+1, scatter, _be);
|
|
1580
|
+
STORE_SCATTER8_64(o64+2, scatter, _bi);
|
|
1581
|
+
STORE_SCATTER8_64(o64+3, scatter, _bo);
|
|
1582
|
+
STORE_SCATTER8_64(o64+4, scatter, _bu);
|
|
1583
|
+
STORE_SCATTER8_64(o64+5, scatter, _ga);
|
|
1584
|
+
STORE_SCATTER8_64(o64+6, scatter, _ge);
|
|
1585
|
+
STORE_SCATTER8_64(o64+7, scatter, _gi);
|
|
1586
|
+
STORE_SCATTER8_64(o64+8, scatter, _go);
|
|
1587
|
+
STORE_SCATTER8_64(o64+9, scatter, _gu);
|
|
1588
|
+
STORE_SCATTER8_64(o64+10, scatter, _ka);
|
|
1589
|
+
STORE_SCATTER8_64(o64+11, scatter, _ke);
|
|
1590
|
+
STORE_SCATTER8_64(o64+12, scatter, _ki);
|
|
1591
|
+
STORE_SCATTER8_64(o64+13, scatter, _ko);
|
|
1592
|
+
STORE_SCATTER8_64(o64+14, scatter, _ku);
|
|
1593
|
+
STORE_SCATTER8_64(o64+15, scatter, _ma);
|
|
1594
|
+
STORE_SCATTER8_64(o64+16, scatter, _me);
|
|
1595
|
+
STORE_SCATTER8_64(o64+17, scatter, _mi);
|
|
1596
|
+
STORE_SCATTER8_64(o64+18, scatter, _mo);
|
|
1597
|
+
STORE_SCATTER8_64(o64+19, scatter, _mu);
|
|
1598
|
+
STORE_SCATTER8_64(o64+20, scatter, _sa);
|
|
1599
|
+
STORE_SCATTER8_64(o64+21, scatter, _se);
|
|
1600
|
+
STORE_SCATTER8_64(o64+22, scatter, _si);
|
|
1601
|
+
STORE_SCATTER8_64(o64+23, scatter, _so);
|
|
1602
|
+
STORE_SCATTER8_64(o64+24, scatter, _su);
|
|
1603
|
+
DumpMem("Output", o64, 8*25);
|
|
1604
|
+
|
|
1605
|
+
o64 += 8 * 25;
|
|
1606
|
+
}
|
|
1607
|
+
while(--nBlocks != 0);
|
|
1608
|
+
|
|
1609
|
+
/* Store new yAccu */
|
|
1610
|
+
_mm512_mask_storeu_epi64(&yAccu[15], 0xFF, x01234567);
|
|
1611
|
+
_mm512_mask_storeu_epi64(&yAccu[17], 0xC0, x23456789);
|
|
1612
|
+
DumpMem("yAccu", yAccu, 25);
|
|
1613
|
+
|
|
1614
|
+
return (size_t)o64 - (size_t)output;
|
|
1615
|
+
}
|