sleeping_kangaroo12 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +127 -0
- data/ext/Rakefile +73 -0
- data/ext/binding/sleeping_kangaroo12.c +39 -0
- data/ext/config/xkcp.build +17 -0
- data/ext/xkcp/LICENSE +1 -0
- data/ext/xkcp/Makefile +15 -0
- data/ext/xkcp/Makefile.build +200 -0
- data/ext/xkcp/README.markdown +296 -0
- data/ext/xkcp/lib/HighLevel.build +143 -0
- data/ext/xkcp/lib/LowLevel.build +757 -0
- data/ext/xkcp/lib/common/align.h +33 -0
- data/ext/xkcp/lib/common/brg_endian.h +143 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
- data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
- data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
- data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
- data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
- data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
- data/ext/xkcp/lib/high/common/Phases.h +25 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
- data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
- data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
- data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
- data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
- data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
- data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
- data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
- data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
- data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
- data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
- data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
- data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
- data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
- data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
- data/ext/xkcp/util/KeccakSum/base64.c +86 -0
- data/ext/xkcp/util/KeccakSum/base64.h +12 -0
- data/lib/sleeping_kangaroo12/binding.rb +15 -0
- data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
- data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
- data/lib/sleeping_kangaroo12/build.rb +4 -0
- data/lib/sleeping_kangaroo12/digest.rb +103 -0
- data/lib/sleeping_kangaroo12/version.rb +5 -0
- data/lib/sleeping_kangaroo12.rb +7 -0
- metadata +372 -0
|
@@ -0,0 +1,1100 @@
|
|
|
1
|
+
# The eXtended Keccak Code Package (XKCP)
|
|
2
|
+
# https://github.com/XKCP/XKCP
|
|
3
|
+
#
|
|
4
|
+
# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
|
|
5
|
+
# Copyright (c) 2017 Ronny Van Keer
|
|
6
|
+
# All rights reserved.
|
|
7
|
+
#
|
|
8
|
+
# The source code in this file is licensed under the CRYPTOGAMS license.
|
|
9
|
+
# For further details see http://www.openssl.org/~appro/cryptogams/.
|
|
10
|
+
#
|
|
11
|
+
# Notes:
|
|
12
|
+
# The code for the permutation (__KeccakF1600) was generated with
|
|
13
|
+
# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project
|
|
14
|
+
# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl).
|
|
15
|
+
# The rest of the code was written by Ronny Van Keer.
|
|
16
|
+
# Adaptations for macOS by Stéphane Léon.
|
|
17
|
+
# Adaptations for mingw-w64 (changes macOS too) by Jorrit Jongma.
|
|
18
|
+
|
|
19
|
+
.text
|
|
20
|
+
|
|
21
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_Initialize(void *state);
#                                   %rdi
#
# Clears the state to all-zero: six 32-byte vector stores cover bytes 0..191
# and one 64-bit store covers bytes 192..199 (200 bytes in total).
# The stores are unaligned-safe (vmovdqu), so no alignment of `state` is
# required by this routine.
#
# Clobbers: %ymm0; vzeroupper additionally clears the upper halves of all
#           YMM registers before returning.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_Initialize
.globl _KeccakP1600_Initialize
.ifndef old_gas_syntax
.type   KeccakP1600_Initialize,@function
.endif
KeccakP1600_Initialize:
_KeccakP1600_Initialize:
    vpxor       %ymm0,%ymm0,%ymm0           # ymm0 = 0
    vmovdqu     %ymm0,0*32(%rdi)
    vmovdqu     %ymm0,1*32(%rdi)
    vmovdqu     %ymm0,2*32(%rdi)
    vmovdqu     %ymm0,3*32(%rdi)
    vmovdqu     %ymm0,4*32(%rdi)
    vmovdqu     %ymm0,5*32(%rdi)
    movq        $0,6*32(%rdi)               # last 8 bytes (offset 192)
    vzeroupper                              # avoid AVX->SSE transition penalties in the caller
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_Initialize,.-KeccakP1600_Initialize
.endif
|
|
45
|
+
|
|
46
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
#                                %rdi                 %rsi               %rdx
#
# XORs one byte into the state at logical byte position `offset`.
# The lane part of the offset (offset & ~7) selects an 8-byte entry of the
# mapState table (defined elsewhere in this file); that entry is added to the
# state pointer, and the low three bits of `offset` select the byte inside
# the lane.
#
# Clobbers: %rax, %rdx, %r9, flags.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_AddByte
.globl _KeccakP1600_AddByte
.ifndef old_gas_syntax
.type   KeccakP1600_AddByte,@function
.endif
KeccakP1600_AddByte:
_KeccakP1600_AddByte:
    mov         %edx, %eax
    and         $7, %eax                    # rax = byte position inside the lane
    and         $0xFFFFFFF8, %edx           # rdx = 8 * lane index (32-bit op zero-extends)
    lea         mapState(%rip), %r9
    mov         (%r9, %rdx), %rdx           # rdx = mapState entry for this lane
    add         %rax, %rdx                  # rdx = byte displacement from state
    xorb        %sil, (%rdi, %rdx)
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_AddByte,.-KeccakP1600_AddByte
.endif
|
|
71
|
+
|
|
72
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
#                                %rdi                         %rsi               %rdx                 %rcx
#
# XORs `length` bytes from `data` into the state, starting at logical byte
# position `offset`. Work is split into three phases:
#   1. bytes up to the next lane boundary (only when offset is not a multiple of 8),
#   2. whole 8-byte lanes,
#   3. the remaining (< 8) bytes of a final partial lane.
# For every lane, the 8-byte entry read from the mapState table (defined
# elsewhere in this file) is added to the state pointer to locate the lane.
#
# Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, flags.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_AddBytes
.globl _KeccakP1600_AddBytes
.ifndef old_gas_syntax
.type   KeccakP1600_AddBytes,@function
.endif
KeccakP1600_AddBytes:
_KeccakP1600_AddBytes:
    mov         %ecx, %ecx                  # zero-extend length: bits 63..32 of a 32-bit argument are unspecified
    test        %rcx, %rcx
    jz          KeccakP1600_AddBytes_Exit
    mov         %rdx, %rax                  # rax offset in lane (masked below)
    and         $0xFFFFFFF8, %edx           # rdx = 8 * lane index (32-bit op zero-extends)
    lea         mapState(%rip), %r9
    add         %r9, %rdx                   # rdx pointer into state index mapper
    and         $7, %rax
    jz          KeccakP1600_AddBytes_LaneAlignedCheck
    mov         $8, %r9                     # r9 is (max) length of incomplete lane
    sub         %rax, %r9
    cmp         %rcx, %r9
    cmovae      %rcx, %r9                   # r9 = min(8 - offset in lane, length)
    sub         %r9, %rcx                   # length -= length of incomplete lane
    add         (%rdx), %rax                # rax = mapped lane offset + offset in lane
    add         $8, %rdx
    add         %rdi, %rax                  # rax = pointer to first state byte to touch
KeccakP1600_AddBytes_NotAlignedLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    xorb        %r8b, (%rax)
    inc         %rax
    dec         %r9
    jnz         KeccakP1600_AddBytes_NotAlignedLoop
    jmp         KeccakP1600_AddBytes_LaneAlignedCheck
KeccakP1600_AddBytes_LaneAlignedLoop:
    mov         (%rsi), %r8
    add         $8, %rsi
    mov         (%rdx), %rax                # rax = mapped offset of this lane
    add         $8, %rdx
    xor         %r8, (%rdi, %rax)
KeccakP1600_AddBytes_LaneAlignedCheck:
    sub         $8, %rcx
    jnc         KeccakP1600_AddBytes_LaneAlignedLoop   # no borrow: at least 8 bytes were left
KeccakP1600_AddBytes_LastIncompleteLane:
    add         $8, %rcx                    # undo the last subtraction: rcx = bytes left (0..7)
    jz          KeccakP1600_AddBytes_Exit
    mov         (%rdx), %rax
    add         %rdi, %rax                  # rax = pointer to the final partial lane
KeccakP1600_AddBytes_LastIncompleteLaneLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    xorb        %r8b, (%rax)
    inc         %rax
    dec         %rcx
    jnz         KeccakP1600_AddBytes_LastIncompleteLaneLoop
KeccakP1600_AddBytes_Exit:
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_AddBytes,.-KeccakP1600_AddBytes
.endif
|
|
136
|
+
|
|
137
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
#                                      %rdi                         %rsi               %rdx                 %rcx
#
# Copies `length` bytes from `data` over the state bytes starting at logical
# byte position `offset`. Work is split into three phases:
#   1. bytes up to the next lane boundary (only when offset is not a multiple of 8),
#   2. whole 8-byte lanes,
#   3. the remaining (< 8) bytes of a final partial lane.
# For every lane, the 8-byte entry read from the mapState table (defined
# elsewhere in this file) is added to the state pointer to locate the lane.
#
# Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, flags.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_OverwriteBytes
.globl _KeccakP1600_OverwriteBytes
.ifndef old_gas_syntax
.type   KeccakP1600_OverwriteBytes,@function
.endif
KeccakP1600_OverwriteBytes:
_KeccakP1600_OverwriteBytes:
    mov         %ecx, %ecx                  # zero-extend length: bits 63..32 of a 32-bit argument are unspecified
    test        %rcx, %rcx
    jz          KeccakP1600_OverwriteBytes_Exit
    mov         %rdx, %rax                  # rax offset in lane (masked below)
    and         $0xFFFFFFF8, %edx           # rdx = 8 * lane index (32-bit op zero-extends)
    lea         mapState(%rip), %r9
    add         %r9, %rdx                   # rdx pointer into state index mapper
    and         $7, %rax
    jz          KeccakP1600_OverwriteBytes_LaneAlignedCheck
    mov         $8, %r9                     # r9 is (max) length of incomplete lane
    sub         %rax, %r9
    cmp         %rcx, %r9
    cmovae      %rcx, %r9                   # r9 = min(8 - offset in lane, length)
    sub         %r9, %rcx                   # length -= length of incomplete lane
    add         (%rdx), %rax                # rax = mapped lane offset + offset in lane
    add         $8, %rdx
    add         %rdi, %rax                  # rax = pointer to first state byte to touch
KeccakP1600_OverwriteBytes_NotAlignedLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    mov         %r8b, (%rax)
    inc         %rax
    dec         %r9
    jnz         KeccakP1600_OverwriteBytes_NotAlignedLoop
    jmp         KeccakP1600_OverwriteBytes_LaneAlignedCheck
KeccakP1600_OverwriteBytes_LaneAlignedLoop:
    mov         (%rsi), %r8
    add         $8, %rsi
    mov         (%rdx), %rax                # rax = mapped offset of this lane
    add         $8, %rdx
    mov         %r8, (%rdi, %rax)
KeccakP1600_OverwriteBytes_LaneAlignedCheck:
    sub         $8, %rcx
    jnc         KeccakP1600_OverwriteBytes_LaneAlignedLoop   # no borrow: at least 8 bytes were left
KeccakP1600_OverwriteBytes_LastIncompleteLane:
    add         $8, %rcx                    # undo the last subtraction: rcx = bytes left (0..7)
    jz          KeccakP1600_OverwriteBytes_Exit
    mov         (%rdx), %rax
    add         %rdi, %rax                  # rax = pointer to the final partial lane
KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    mov         %r8b, (%rax)
    inc         %rax
    dec         %rcx
    jnz         KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop
KeccakP1600_OverwriteBytes_Exit:
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_OverwriteBytes,.-KeccakP1600_OverwriteBytes
.endif
|
|
201
|
+
|
|
202
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
#                                           %rdi               %rsi
#
# Zeroes the first `byteCount` logical bytes of the state: whole lanes are
# cleared with 64-bit stores, then the leading bytes of one final partial
# lane (byteCount % 8 of them) are cleared one by one.
# For every lane, the 8-byte entry read from the mapState table (defined
# elsewhere in this file) is added to the state pointer to locate the lane.
#
# Clobbers: %rax, %rdx, %rsi, flags.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_OverwriteWithZeroes
.globl _KeccakP1600_OverwriteWithZeroes
.ifndef old_gas_syntax
.type   KeccakP1600_OverwriteWithZeroes,@function
.endif
KeccakP1600_OverwriteWithZeroes:
_KeccakP1600_OverwriteWithZeroes:
    mov         %esi, %esi                  # zero-extend byteCount: bits 63..32 of a 32-bit argument are unspecified
    test        %rsi, %rsi
    jz          KeccakP1600_OverwriteWithZeroes_Exit
    lea         mapState(%rip), %rdx        # rdx pointer into state index mapper
    jmp         KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck
KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop:
    mov         (%rdx), %rax                # rax = mapped offset of this lane
    add         $8, %rdx
    movq        $0, (%rdi, %rax)
KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck:
    sub         $8, %rsi
    jnc         KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop   # no borrow: at least 8 bytes were left
KeccakP1600_OverwriteWithZeroes_LastIncompleteLane:
    add         $8, %rsi                    # undo the last subtraction: rsi = bytes left (0..7)
    jz          KeccakP1600_OverwriteWithZeroes_Exit
    mov         (%rdx), %rax
    add         %rdi, %rax                  # rax = pointer to the final partial lane
KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop:
    movb        $0, (%rax)
    inc         %rax
    dec         %rsi
    jnz         KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop
KeccakP1600_OverwriteWithZeroes_Exit:
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_OverwriteWithZeroes,.-KeccakP1600_OverwriteWithZeroes
.endif
|
|
242
|
+
|
|
243
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
#                                          %rdi                 %rsi               %rdx                 %rcx
#
# Copies `length` bytes of the state, starting at logical byte position
# `offset`, into `data`. Work is split into three phases:
#   1. bytes up to the next lane boundary (only when offset is not a multiple of 8),
#   2. whole 8-byte lanes,
#   3. the remaining (< 8) bytes of a final partial lane, taken from the low
#      end of the lane by repeated 8-bit shifts.
# For every lane, the 8-byte entry read from the mapState table (defined
# elsewhere in this file) is added to the state pointer to locate the lane.
#
# Only caller-saved registers are used, so nothing is pushed on the stack.
# Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, flags.
#
.balign 32                                  # align the entry point itself, not the code after it
.globl  KeccakP1600_ExtractBytes
.globl _KeccakP1600_ExtractBytes
.ifndef old_gas_syntax
.type   KeccakP1600_ExtractBytes,@function
.endif
KeccakP1600_ExtractBytes:
_KeccakP1600_ExtractBytes:
    mov         %ecx, %ecx                  # zero-extend length: bits 63..32 of a 32-bit argument are unspecified
    test        %rcx, %rcx
    jz          KeccakP1600_ExtractBytes_Exit
    mov         %rdx, %rax                  # rax offset in lane (masked below)
    and         $0xFFFFFFF8, %edx           # rdx = 8 * lane index (32-bit op zero-extends)
    lea         mapState(%rip), %r9
    add         %r9, %rdx                   # rdx pointer into state index mapper
    and         $7, %rax
    jz          KeccakP1600_ExtractBytes_LaneAlignedCheck
    mov         $8, %r10                    # r10 is (max) length of incomplete lane
    sub         %rax, %r10
    cmp         %rcx, %r10
    cmovae      %rcx, %r10                  # r10 = min(8 - offset in lane, length)
    sub         %r10, %rcx                  # length -= length of incomplete lane
    mov         (%rdx), %r9
    add         $8, %rdx
    add         %rdi, %r9
    add         %rax, %r9                   # r9 = pointer to first state byte to read
KeccakP1600_ExtractBytes_NotAlignedLoop:
    mov         (%r9), %r8b
    inc         %r9
    mov         %r8b, (%rsi)
    inc         %rsi
    dec         %r10
    jnz         KeccakP1600_ExtractBytes_NotAlignedLoop
    jmp         KeccakP1600_ExtractBytes_LaneAlignedCheck
KeccakP1600_ExtractBytes_LaneAlignedLoop:
    mov         (%rdx), %rax                # rax = mapped offset of this lane
    add         $8, %rdx
    mov         (%rdi, %rax), %r8
    mov         %r8, (%rsi)
    add         $8, %rsi
KeccakP1600_ExtractBytes_LaneAlignedCheck:
    sub         $8, %rcx
    jnc         KeccakP1600_ExtractBytes_LaneAlignedLoop   # no borrow: at least 8 bytes were left
KeccakP1600_ExtractBytes_LastIncompleteLane:
    add         $8, %rcx                    # undo the last subtraction: rcx = bytes left (0..7)
    jz          KeccakP1600_ExtractBytes_Exit
    mov         (%rdx), %rax
    mov         (%rdi, %rax), %r8           # r8 = whole final lane; bytes are peeled off the low end
KeccakP1600_ExtractBytes_LastIncompleteLaneLoop:
    mov         %r8b, (%rsi)
    shr         $8, %r8
    inc         %rsi
    dec         %rcx
    jnz         KeccakP1600_ExtractBytes_LastIncompleteLaneLoop
KeccakP1600_ExtractBytes_Exit:
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_ExtractBytes,.-KeccakP1600_ExtractBytes
.endif
|
|
310
|
+
|
|
311
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
#   %rdi = state   %rsi = input   %rdx = output   %ecx = offset   %r8d = length
#
# Writes output[i] = input[i] XOR (state byte at offset + i) for i in [0, length).
# Lanes are not stored in natural order: mapState[lane] holds the byte offset
# of logical lane `lane` inside the state.
# Clobbers %rax, %rcx, %rdx, %rsi, %r8, %r9 and flags; %rbx and %r10 are
# saved/restored.
#
.globl  KeccakP1600_ExtractAndAddBytes
.globl _KeccakP1600_ExtractAndAddBytes
.ifndef old_gas_syntax
.type   KeccakP1600_ExtractAndAddBytes,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakP1600_ExtractAndAddBytes:
_KeccakP1600_ExtractAndAddBytes:
    push        %rbx
    push        %r10
    mov         %r8d, %r8d                  # zero-extend length: upper half of %r8 is unspecified for an unsigned int
    test        %r8, %r8
    jz          KeccakP1600_ExtractAndAddBytes_Exit
    mov         %rcx, %rax                  # rax = offset (reduced to offset-in-lane below)
    and         $0xFFFFFFF8, %ecx           # rcx = lane index * 8 (also zero-extends)
    lea         mapState(%rip), %r9
    add         %r9, %rcx                   # rcx -> mapState entry of the first lane
    and         $7, %rax                    # rax = byte offset inside that lane
    jz          KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck

    # Leading partial lane: process min(8 - offsetInLane, length) bytes.
    mov         $8, %rbx
    sub         %rax, %rbx
    cmp         %r8, %rbx
    cmovae      %r8, %rbx                   # rbx = min(rbx, length)
    sub         %rbx, %r8                   # length -= bytes taken from the partial lane
    mov         (%rcx), %r9
    add         $8, %rcx
    add         %rdi, %r9
    add         %rax, %r9                   # r9 -> first state byte to use
KeccakP1600_ExtractAndAddBytes_NotAlignedLoop:
    mov         (%r9), %r10b
    inc         %r9
    xor         (%rsi), %r10b
    inc         %rsi
    mov         %r10b, (%rdx)
    inc         %rdx
    dec         %rbx
    jnz         KeccakP1600_ExtractAndAddBytes_NotAlignedLoop
    jmp         KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck

    # Whole lanes: 8 bytes per iteration.
KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop:
    mov         (%rcx), %rax
    add         $8, %rcx
    add         %rdi, %rax
    mov         (%rax), %r10
    xor         (%rsi), %r10
    add         $8, %rsi
    mov         %r10, (%rdx)
    add         $8, %rdx
KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck:
    sub         $8, %r8
    jnc         KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop

    # Trailing partial lane: load the lane once, consume its low bytes one by one.
KeccakP1600_ExtractAndAddBytes_LastIncompleteLane:
    add         $8, %r8                     # undo the last subtraction: r8 = bytes left (0..7)
    jz          KeccakP1600_ExtractAndAddBytes_Exit
    mov         (%rcx), %rax
    add         %rdi, %rax
    mov         (%rax), %r10
KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop:
    xor         (%rsi), %r10b
    inc         %rsi
    mov         %r10b, (%rdx)
    inc         %rdx
    shr         $8, %r10
    dec         %r8
    jnz         KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop
KeccakP1600_ExtractAndAddBytes_Exit:
    pop         %r10
    pop         %rbx
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_ExtractAndAddBytes,.-KeccakP1600_ExtractAndAddBytes
.endif
|
|
386
|
+
|
|
387
|
+
# -----------------------------------------------------------------------------
#
# internal
#
# __KeccakF1600: round loop shared by the public entry points of this file.
# It operates purely on registers; the callers load and store the state.
#
# Register contract, as set up by every caller in this file:
#   %ymm0        lane 0 (callers load it broadcast to all four quadwords)
#   %ymm1-%ymm6  the remaining 24 lanes, four per register
#   %r8          rhotates_left  + 96   (per-lane left shift counts)
#   %r9          rhotates_right + 96   (per-lane right shift counts)
#   %r10         first 32-byte round-constant row to apply; advanced by 32 per round
#   %eax         number of rounds; decremented before it is tested
# Scratch: %ymm7-%ymm15.  Also modifies %r10, %eax and flags.  Uses no stack.
#
.ifndef old_gas_syntax
.type   __KeccakF1600,@function
.endif
.balign 32
__KeccakF1600:
.Loop_avx2:
    ######################################### Theta
    vpshufd     $0b01001110,%ymm2,%ymm13
    vpxor       %ymm3,%ymm5,%ymm12
    vpxor       %ymm6,%ymm4,%ymm9
    vpxor       %ymm1,%ymm12,%ymm12
    vpxor       %ymm9,%ymm12,%ymm12         # C[1..4]

    vpermq      $0b10010011,%ymm12,%ymm11
    vpxor       %ymm2,%ymm13,%ymm13
    vpermq      $0b01001110,%ymm13,%ymm7

    vpsrlq      $63,%ymm12,%ymm8
    vpaddq      %ymm12,%ymm12,%ymm9
    vpor        %ymm9,%ymm8,%ymm8           # ROL64(C[1..4],1)

    vpermq      $0b00111001,%ymm8,%ymm15
    vpxor       %ymm11,%ymm8,%ymm14
    vpermq      $0b00000000,%ymm14,%ymm14   # D[0..0] = ROL64(C[1],1) ^ C[4]

    vpxor       %ymm0,%ymm13,%ymm13
    vpxor       %ymm7,%ymm13,%ymm13         # C[0..0]

    vpsrlq      $63,%ymm13,%ymm7
    vpaddq      %ymm13,%ymm13,%ymm8
    vpor        %ymm7,%ymm8,%ymm8           # ROL64(C[0..0],1)

    vpxor       %ymm14,%ymm2,%ymm2          # ^= D[0..0]
    vpxor       %ymm14,%ymm0,%ymm0          # ^= D[0..0]

    vpblendd    $0b11000000,%ymm8,%ymm15,%ymm15
    vpblendd    $0b00000011,%ymm13,%ymm11,%ymm11
    vpxor       %ymm11,%ymm15,%ymm15        # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

    ######################################### Rho + Pi + pre-Chi shuffle
    # Each rotation is (x << left) | (x >> right), with the per-lane counts
    # taken from the two tables addressed through %r8 and %r9.
    vpsllvq     0*32-96(%r8),%ymm2,%ymm10
    vpsrlvq     0*32-96(%r9),%ymm2,%ymm2
    vpor        %ymm10,%ymm2,%ymm2

    vpxor       %ymm15,%ymm3,%ymm3          # ^= D[1..4] from Theta
    vpsllvq     2*32-96(%r8),%ymm3,%ymm11
    vpsrlvq     2*32-96(%r9),%ymm3,%ymm3
    vpor        %ymm11,%ymm3,%ymm3

    vpxor       %ymm15,%ymm4,%ymm4          # ^= D[1..4] from Theta
    vpsllvq     3*32-96(%r8),%ymm4,%ymm12
    vpsrlvq     3*32-96(%r9),%ymm4,%ymm4
    vpor        %ymm12,%ymm4,%ymm4

    vpxor       %ymm15,%ymm5,%ymm5          # ^= D[1..4] from Theta
    vpsllvq     4*32-96(%r8),%ymm5,%ymm13
    vpsrlvq     4*32-96(%r9),%ymm5,%ymm5
    vpor        %ymm13,%ymm5,%ymm5

    vpxor       %ymm15,%ymm6,%ymm6          # ^= D[1..4] from Theta
    vpermq      $0b10001101,%ymm2,%ymm10    # %ymm2 -> future %ymm3
    vpermq      $0b10001101,%ymm3,%ymm11    # %ymm3 -> future %ymm4
    vpsllvq     5*32-96(%r8),%ymm6,%ymm14
    vpsrlvq     5*32-96(%r9),%ymm6,%ymm8
    vpor        %ymm14,%ymm8,%ymm8          # %ymm6 -> future %ymm1

    vpxor       %ymm15,%ymm1,%ymm1          # ^= D[1..4] from Theta
    vpermq      $0b00011011,%ymm4,%ymm12    # %ymm4 -> future %ymm5
    vpermq      $0b01110010,%ymm5,%ymm13    # %ymm5 -> future %ymm6
    vpsllvq     1*32-96(%r8),%ymm1,%ymm15
    vpsrlvq     1*32-96(%r9),%ymm1,%ymm9
    vpor        %ymm15,%ymm9,%ymm9          # %ymm1 -> future %ymm2

    ######################################### Chi
    # The bracketed pairs list the lanes gathered into each register, highest
    # quadword first; "targeting" names the lanes the vpandn result belongs to.
    vpsrldq     $8,%ymm8,%ymm14
    vpandn      %ymm14,%ymm8,%ymm7          # targeting [0][0] [0][0] [0][0] [0][0]

    vpblendd    $0b00001100,%ymm13,%ymm9,%ymm3      #               [4][4] [2][0]
    vpblendd    $0b00001100,%ymm9,%ymm11,%ymm15     #               [4][0] [2][1]
    vpblendd    $0b00001100,%ymm11,%ymm10,%ymm5     #               [4][2] [2][4]
    vpblendd    $0b00001100,%ymm10,%ymm9,%ymm14     #               [4][3] [2][0]
    vpblendd    $0b00110000,%ymm11,%ymm3,%ymm3      #        [1][3] [4][4] [2][0]
    vpblendd    $0b00110000,%ymm12,%ymm15,%ymm15    #        [1][4] [4][0] [2][1]
    vpblendd    $0b00110000,%ymm9,%ymm5,%ymm5       #        [1][0] [4][2] [2][4]
    vpblendd    $0b00110000,%ymm13,%ymm14,%ymm14    #        [1][1] [4][3] [2][0]
    vpblendd    $0b11000000,%ymm12,%ymm3,%ymm3      # [3][2] [1][3] [4][4] [2][0]
    vpblendd    $0b11000000,%ymm13,%ymm15,%ymm15    # [3][3] [1][4] [4][0] [2][1]
    vpblendd    $0b11000000,%ymm13,%ymm5,%ymm5      # [3][3] [1][0] [4][2] [2][4]
    vpblendd    $0b11000000,%ymm11,%ymm14,%ymm14    # [3][4] [1][1] [4][3] [2][0]
    vpandn      %ymm15,%ymm3,%ymm3          # targeting [3][1] [1][2] [4][3] [2][4]
    vpandn      %ymm14,%ymm5,%ymm5          # targeting [3][2] [1][4] [4][1] [2][3]

    vpblendd    $0b00001100,%ymm9,%ymm12,%ymm6      #               [4][0] [2][3]
    vpblendd    $0b00001100,%ymm12,%ymm10,%ymm15    #               [4][1] [2][4]
    vpxor       %ymm10,%ymm3,%ymm3
    vpblendd    $0b00110000,%ymm10,%ymm6,%ymm6      #        [1][2] [4][0] [2][3]
    vpblendd    $0b00110000,%ymm11,%ymm15,%ymm15    #        [1][3] [4][1] [2][4]
    vpxor       %ymm12,%ymm5,%ymm5
    vpblendd    $0b11000000,%ymm11,%ymm6,%ymm6      # [3][4] [1][2] [4][0] [2][3]
    vpblendd    $0b11000000,%ymm9,%ymm15,%ymm15     # [3][0] [1][3] [4][1] [2][4]
    vpandn      %ymm15,%ymm6,%ymm6          # targeting [3][3] [1][1] [4][4] [2][2]
    vpxor       %ymm13,%ymm6,%ymm6

    vpermq      $0b00011110,%ymm8,%ymm4             # [0][1] [0][2] [0][4] [0][3]
    vpblendd    $0b00110000,%ymm0,%ymm4,%ymm15      # [0][1] [0][0] [0][4] [0][3]
    vpermq      $0b00111001,%ymm8,%ymm1             # [0][1] [0][4] [0][3] [0][2]
    vpblendd    $0b11000000,%ymm0,%ymm1,%ymm1       # [0][0] [0][4] [0][3] [0][2]
    vpandn      %ymm15,%ymm1,%ymm1          # targeting [0][4] [0][3] [0][2] [0][1]

    vpblendd    $0b00001100,%ymm12,%ymm11,%ymm2     #               [4][1] [2][1]
    vpblendd    $0b00001100,%ymm11,%ymm13,%ymm14    #               [4][2] [2][2]
    vpblendd    $0b00110000,%ymm13,%ymm2,%ymm2      #        [1][1] [4][1] [2][1]
    vpblendd    $0b00110000,%ymm10,%ymm14,%ymm14    #        [1][2] [4][2] [2][2]
    vpblendd    $0b11000000,%ymm10,%ymm2,%ymm2      # [3][1] [1][1] [4][1] [2][1]
    vpblendd    $0b11000000,%ymm12,%ymm14,%ymm14    # [3][2] [1][2] [4][2] [2][2]
    vpandn      %ymm14,%ymm2,%ymm2          # targeting [3][0] [1][0] [4][0] [2][0]
    vpxor       %ymm9,%ymm2,%ymm2

    vpermq      $0b00000000,%ymm7,%ymm7             # [0][0] [0][0] [0][0] [0][0]
    vpermq      $0b00011011,%ymm3,%ymm3             # post-Chi shuffle
    vpermq      $0b10001101,%ymm5,%ymm5
    vpermq      $0b01110010,%ymm6,%ymm6

    vpblendd    $0b00001100,%ymm10,%ymm13,%ymm4     #               [4][3] [2][2]
    vpblendd    $0b00001100,%ymm13,%ymm12,%ymm14    #               [4][4] [2][3]
    vpblendd    $0b00110000,%ymm12,%ymm4,%ymm4      #        [1][4] [4][3] [2][2]
    vpblendd    $0b00110000,%ymm9,%ymm14,%ymm14     #        [1][0] [4][4] [2][3]
    vpblendd    $0b11000000,%ymm9,%ymm4,%ymm4       # [3][0] [1][4] [4][3] [2][2]
    vpblendd    $0b11000000,%ymm10,%ymm14,%ymm14    # [3][1] [1][0] [4][4] [2][3]
    vpandn      %ymm14,%ymm4,%ymm4          # targeting [3][4] [1][3] [4][2] [2][1]

    vpxor       %ymm7,%ymm0,%ymm0
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm11,%ymm4,%ymm4

    ######################################### Iota
    vpxor       (%r10),%ymm0,%ymm0          # XOR this round's constant row into lane 0
    lea         32(%r10),%r10               # step to the next row without touching flags

    dec         %eax
    jnz         .Loop_avx2
    ret
.ifndef old_gas_syntax
.size   __KeccakF1600,.-__KeccakF1600
.endif
|
|
536
|
+
|
|
537
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_Permute_24rounds(void *state);
#   %rdi = state
#
# Loads the state into %ymm0-%ymm6, runs 24 rounds through __KeccakF1600
# starting at the first row of `iotas`, and stores the result back in place.
# Clobbers %rax, %rdi, %r8, %r9, %r10, %ymm0-%ymm15 and flags.
#
.globl  KeccakP1600_Permute_24rounds
.globl _KeccakP1600_Permute_24rounds
.ifndef old_gas_syntax
.type   KeccakP1600_Permute_24rounds,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakP1600_Permute_24rounds:
_KeccakP1600_Permute_24rounds:
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         iotas(%rip),%r10            # first round constant
    mov         $24,%eax                    # round count
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vzeroupper
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
    call        __KeccakF1600
    vmovq       %xmm0,-96(%rdi)             # only the low quadword of ymm0 is lane 0
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
    vzeroupper
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_Permute_24rounds,.-KeccakP1600_Permute_24rounds
.endif
|
|
576
|
+
|
|
577
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_Permute_12rounds(void *state);
#   %rdi = state
#
# Loads the state into %ymm0-%ymm6, runs 12 rounds through __KeccakF1600 using
# the last 12 rows of `iotas`, and stores the result back in place.
# Clobbers %rax, %rdi, %r8, %r9, %r10, %ymm0-%ymm15 and flags.
#
.globl  KeccakP1600_Permute_12rounds
.globl _KeccakP1600_Permute_12rounds
.ifndef old_gas_syntax
.type   KeccakP1600_Permute_12rounds,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakP1600_Permute_12rounds:
_KeccakP1600_Permute_12rounds:
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         iotas+12*4*8(%rip),%r10     # skip 12 rows of 4 quadwords: start at the 13th constant
    mov         $12,%eax                    # round count
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vzeroupper
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
    call        __KeccakF1600
    vmovq       %xmm0,-96(%rdi)             # only the low quadword of ymm0 is lane 0
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
    vzeroupper
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_Permute_12rounds,.-KeccakP1600_Permute_12rounds
.endif
|
|
616
|
+
|
|
617
|
+
# -----------------------------------------------------------------------------
#
# void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds);
#   %rdi = state   %esi = nrounds
#
# Loads the state into %ymm0-%ymm6, runs `nrounds` rounds through
# __KeccakF1600 using the LAST `nrounds` rows of `iotas`, and stores the result
# back in place.
# The round counter is decremented before it is tested and the constant
# pointer is derived from it, so nrounds is expected to be in 1..24.
# Clobbers %rax, %rsi, %rdi, %r8, %r9, %r10, %ymm0-%ymm15 and flags.
#
.globl  KeccakP1600_Permute_Nrounds
.globl _KeccakP1600_Permute_Nrounds
.ifndef old_gas_syntax
.type   KeccakP1600_Permute_Nrounds,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakP1600_Permute_Nrounds:
_KeccakP1600_Permute_Nrounds:
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         iotas+24*4*8(%rip),%r10     # one past the last round-constant row
    mov         %esi,%esi                   # zero-extend nrounds: upper half of %rsi is unspecified for an unsigned int
    mov         %rsi,%rax                   # eax = round count for __KeccakF1600
    shl         $2+3,%rsi                   # nrounds * 32 bytes per row
    sub         %rsi, %r10                  # r10 -> first of the last nrounds rows
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vzeroupper
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
    call        __KeccakF1600
    vmovq       %xmm0,-96(%rdi)             # only the low quadword of ymm0 is lane 0
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
    vzeroupper
    ret
.ifndef old_gas_syntax
.size   KeccakP1600_Permute_Nrounds,.-KeccakP1600_Permute_Nrounds
.endif
|
|
658
|
+
|
|
659
|
+
# -----------------------------------------------------------------------------
#
# size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
#   %rdi = state   %esi = laneCount   %rdx = data   %rcx = dataByteLen
#
# While at least laneCount whole 8-byte lanes of input remain: XOR one block of
# laneCount lanes into the state and apply 24 rounds.  Returns in %rax the
# number of input bytes consumed.
#
# laneCount 21 and 17 have dedicated AVX2 loops that keep the state in
# %ymm0-%ymm6 across blocks.  Any other laneCount takes a scalar path that XORs
# lane by lane through mapState and calls KeccakP1600_Permute_24rounds.
# The scalar path's lane loop decrements before testing, so laneCount is
# expected to be non-zero.
#
# In the AVX2 loops the input lanes for %ymm2-%ymm6 are fetched with
# vpgatherdq: map2..map6 give the byte offsets inside the block, and the
# mask*_21 / mask*_17 rows suppress lanes that lie beyond the block.  Each
# destination is zeroed first so a suppressed lane XORs in as zero, and the
# mask is reloaded every time because the gather consumes its mask register.
#
# %rbx and %r10 are saved/restored; other caller-saved registers are clobbered.
#
.globl  KeccakF1600_FastLoop_Absorb
.globl _KeccakF1600_FastLoop_Absorb
.ifndef old_gas_syntax
.type   KeccakF1600_FastLoop_Absorb,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakF1600_FastLoop_Absorb:
_KeccakF1600_FastLoop_Absorb:
    push        %rbx
    push        %r10
    mov         %esi, %esi                  # zero-extend laneCount: upper half of %rsi is unspecified for an unsigned int
    shr         $3, %rcx                    # rcx = data length in lanes
    mov         %rdx, %rbx                  # rbx = initial data pointer
    cmp         %rsi, %rcx
    jb          KeccakF1600_FastLoop_Absorb_Exit
    vzeroupper
    cmp         $21, %rsi
    jnz         KeccakF1600_FastLoop_Absorb_Not21Lanes

    # ---- laneCount == 21 -----------------------------------------------------
    sub         $21, %rcx                   # rcx = lanes left after the block about to be absorbed
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakF1600_FastLoop_Absorb_Loop21Lanes:
    vpbroadcastq (%rdx),%ymm7               # input lane 0
    vmovdqu     8(%rdx),%ymm8               # input lanes 1..4

    vmovdqa     map2(%rip), %xmm15
    vpcmpeqd    %ymm14, %ymm14, %ymm14      # all four map2 lanes are inside a 21-lane block
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9

    vmovdqa     mask3_21(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10

    vmovdqa     mask4_21(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11

    vmovdqa     mask5_21(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12

    vmovdqa     mask6_21(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13

    vpxor       %ymm7,%ymm0,%ymm0           # state ^= block
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $21*8, %rdx
    lea         iotas(%rip),%r10
    mov         $24,%eax
    call        __KeccakF1600               # internal helper: uses no stack
    sub         $21, %rcx
    jnc         KeccakF1600_FastLoop_Absorb_Loop21Lanes
KeccakF1600_FastLoop_Absorb_SaveAndExit:
    vmovq       %xmm0,-96(%rdi)
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
KeccakF1600_FastLoop_Absorb_Exit:
    vzeroupper
    mov         %rdx, %rax                  # return number of bytes processed
    sub         %rbx, %rax
    pop         %r10
    pop         %rbx
    ret

    # ---- laneCount == 17 -----------------------------------------------------
KeccakF1600_FastLoop_Absorb_Not21Lanes:
    cmp         $17, %rsi
    jnz         KeccakF1600_FastLoop_Absorb_Not17Lanes
    sub         $17, %rcx                   # rcx = lanes left after the block about to be absorbed
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakF1600_FastLoop_Absorb_Loop17Lanes:
    vpbroadcastq (%rdx),%ymm7               # input lane 0
    vmovdqu     8(%rdx),%ymm8               # input lanes 1..4

    vmovdqa     mask2_17(%rip), %ymm14
    vpxor       %ymm9, %ymm9, %ymm9
    vmovdqa     map2(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9

    vmovdqa     mask3_17(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10

    vmovdqa     mask4_17(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11

    vmovdqa     mask5_17(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12

    vmovdqa     mask6_17(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13

    vpxor       %ymm7,%ymm0,%ymm0           # state ^= block
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $17*8, %rdx
    lea         iotas(%rip),%r10
    mov         $24,%eax
    call        __KeccakF1600               # internal helper: uses no stack
    sub         $17, %rcx
    jnc         KeccakF1600_FastLoop_Absorb_Loop17Lanes
    jmp         KeccakF1600_FastLoop_Absorb_SaveAndExit

    # ---- any other laneCount: scalar XOR + full permutation call -------------
KeccakF1600_FastLoop_Absorb_Not17Lanes:
    lea         mapState(%rip), %r9
    mov         %rsi, %rax                  # rax = lanes still to XOR in this block
KeccakF1600_FastLoop_Absorb_LanesAddLoop:
    mov         (%rdx), %r8
    add         $8, %rdx
    mov         (%r9), %r10                 # byte offset of this lane inside the state
    add         $8, %r9
    add         %rdi, %r10
    xor         %r8, (%r10)
    sub         $1, %rax
    jnz         KeccakF1600_FastLoop_Absorb_LanesAddLoop
    sub         %rsi, %rcx
    push        %rdi
    push        %rsi
    push        %rdx
    push        %rcx
    sub         $8, %rsp                    # six pushes since entry leave %rsp at 8 mod 16; pad so the call sees 16-byte alignment
.ifdef no_plt
    call        KeccakP1600_Permute_24rounds
.else
    call        KeccakP1600_Permute_24rounds@PLT
.endif
    add         $8, %rsp
    pop         %rcx
    pop         %rdx
    pop         %rsi
    pop         %rdi
    cmp         %rsi, %rcx
    jae         KeccakF1600_FastLoop_Absorb_Not17Lanes
    jmp         KeccakF1600_FastLoop_Absorb_Exit
.ifndef old_gas_syntax
.size   KeccakF1600_FastLoop_Absorb,.-KeccakF1600_FastLoop_Absorb
.endif
|
|
837
|
+
|
|
838
|
+
# -----------------------------------------------------------------------------
#
# size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
#   %rdi = state   %esi = laneCount   %rdx = data   %rcx = dataByteLen
#
# While at least laneCount whole 8-byte lanes of input remain: XOR one block of
# laneCount lanes into the state and apply 12 rounds (the last 12 rows of
# `iotas`).  Returns in %rax the number of input bytes consumed.
#
# laneCount 21 and 17 have dedicated AVX2 loops that keep the state in
# %ymm0-%ymm6 across blocks.  Any other laneCount takes a scalar path that XORs
# lane by lane through mapState and calls KeccakP1600_Permute_12rounds.
# The scalar path's lane loop decrements before testing, so laneCount is
# expected to be non-zero.
#
# In the AVX2 loops the input lanes for %ymm2-%ymm6 are fetched with
# vpgatherdq: map2..map6 give the byte offsets inside the block, and the
# mask*_21 / mask*_17 rows suppress lanes that lie beyond the block.  Each
# destination is zeroed first so a suppressed lane XORs in as zero, and the
# mask is reloaded every time because the gather consumes its mask register.
#
# %rbx and %r10 are saved/restored; other caller-saved registers are clobbered.
#
.globl  KeccakP1600_12rounds_FastLoop_Absorb
.globl _KeccakP1600_12rounds_FastLoop_Absorb
.ifndef old_gas_syntax
.type   KeccakP1600_12rounds_FastLoop_Absorb,@function
.endif
.balign 32                                  # align the entry itself so no padding is executed per call
KeccakP1600_12rounds_FastLoop_Absorb:
_KeccakP1600_12rounds_FastLoop_Absorb:
    push        %rbx
    push        %r10
    mov         %esi, %esi                  # zero-extend laneCount: upper half of %rsi is unspecified for an unsigned int
    shr         $3, %rcx                    # rcx = data length in lanes
    mov         %rdx, %rbx                  # rbx = initial data pointer
    cmp         %rsi, %rcx
    jb          KeccakP1600_12rounds_FastLoop_Absorb_Exit
    vzeroupper
    cmp         $21, %rsi
    jnz         KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes

    # ---- laneCount == 21 -----------------------------------------------------
    sub         $21, %rcx                   # rcx = lanes left after the block about to be absorbed
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes:
    vpbroadcastq (%rdx),%ymm7               # input lane 0
    vmovdqu     8(%rdx),%ymm8               # input lanes 1..4

    vmovdqa     map2(%rip), %xmm15
    vpcmpeqd    %ymm14, %ymm14, %ymm14      # all four map2 lanes are inside a 21-lane block
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9

    vmovdqa     mask3_21(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10

    vmovdqa     mask4_21(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11

    vmovdqa     mask5_21(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12

    vmovdqa     mask6_21(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13

    vpxor       %ymm7,%ymm0,%ymm0           # state ^= block
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $21*8, %rdx
    lea         iotas+12*4*8(%rip),%r10     # skip 12 rows of 4 quadwords: start at the 13th constant
    mov         $12,%eax
    call        __KeccakF1600               # internal helper: uses no stack
    sub         $21, %rcx
    jnc         KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes
KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit:
    vmovq       %xmm0,-96(%rdi)
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
KeccakP1600_12rounds_FastLoop_Absorb_Exit:
    vzeroupper
    mov         %rdx, %rax                  # return number of bytes processed
    sub         %rbx, %rax
    pop         %r10
    pop         %rbx
    ret

    # ---- laneCount == 17 -----------------------------------------------------
KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes:
    cmp         $17, %rsi
    jnz         KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes
    sub         $17, %rcx                   # rcx = lanes left after the block about to be absorbed
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi               # biased state pointer; accesses below are -96 relative
    vpbroadcastq -96(%rdi),%ymm0            # lane 0, replicated into all four quadwords
    vmovdqu     8+32*0-96(%rdi),%ymm1       # remaining 24 lanes, four per register
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes:
    vpbroadcastq (%rdx),%ymm7               # input lane 0
    vmovdqu     8(%rdx),%ymm8               # input lanes 1..4

    vmovdqa     mask2_17(%rip), %ymm14
    vpxor       %ymm9, %ymm9, %ymm9
    vmovdqa     map2(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9

    vmovdqa     mask3_17(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10

    vmovdqa     mask4_17(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11

    vmovdqa     mask5_17(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12

    vmovdqa     mask6_17(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13

    vpxor       %ymm7,%ymm0,%ymm0           # state ^= block
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $17*8, %rdx
    lea         iotas+12*4*8(%rip),%r10     # skip 12 rows of 4 quadwords: start at the 13th constant
    mov         $12,%eax
    call        __KeccakF1600               # internal helper: uses no stack
    sub         $17, %rcx
    jnc         KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes
    jmp         KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit

    # ---- any other laneCount: scalar XOR + full permutation call -------------
KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes:
    lea         mapState(%rip), %r9
    mov         %rsi, %rax                  # rax = lanes still to XOR in this block
KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop:
    mov         (%rdx), %r8
    add         $8, %rdx
    mov         (%r9), %r10                 # byte offset of this lane inside the state
    add         $8, %r9
    add         %rdi, %r10
    xor         %r8, (%r10)
    sub         $1, %rax
    jnz         KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop
    sub         %rsi, %rcx
    push        %rdi
    push        %rsi
    push        %rdx
    push        %rcx
    sub         $8, %rsp                    # six pushes since entry leave %rsp at 8 mod 16; pad so the call sees 16-byte alignment
.ifdef no_plt
    call        KeccakP1600_Permute_12rounds
.else
    call        KeccakP1600_Permute_12rounds@PLT
.endif
    add         $8, %rsp
    pop         %rcx
    pop         %rdx
    pop         %rsi
    pop         %rdi
    cmp         %rsi, %rcx
    jae         KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes
    jmp         KeccakP1600_12rounds_FastLoop_Absorb_Exit
.ifndef old_gas_syntax
.size   KeccakP1600_12rounds_FastLoop_Absorb,.-KeccakP1600_12rounds_FastLoop_Absorb
.endif
|
|
1016
|
+
|
|
1017
|
+
# All-ones quadword: an enabled element in the gather mask rows below.
.equ    ALLON,      0xFFFFFFFFFFFFFFFF

# Per-lane left-rotation counts, one 32-byte row per state register; rows are
# indexed as k*32 from the table start by __KeccakF1600 (through %r8).
.balign 64
rhotates_left:
    .quad    3, 18, 36, 41                  # [2][0] [4][0] [1][0] [3][0]
    .quad    1, 62, 28, 27                  # [0][1] [0][2] [0][3] [0][4]
    .quad   45,  6, 56, 39                  # [3][1] [1][2] [4][3] [2][4]
    .quad   10, 61, 55,  8                  # [2][1] [4][2] [1][3] [3][4]
    .quad    2, 15, 25, 20                  # [4][1] [3][2] [2][3] [1][4]
    .quad   44, 43, 21, 14                  # [1][1] [2][2] [3][3] [4][4]

# Complementary right-shift counts (64 - left), same layout, reached via %r9.
rhotates_right:
    .quad   64-3,  64-18, 64-36, 64-41
    .quad   64-1,  64-62, 64-28, 64-27
    .quad   64-45, 64-6,  64-56, 64-39
    .quad   64-10, 64-61, 64-55, 64-8
    .quad   64-2,  64-15, 64-25, 64-20
    .quad   64-44, 64-43, 64-21, 64-14

# Round constants: 24 rows, each constant repeated in all four quadwords so a
# row can be XORed straight into %ymm0.  12-round callers start at row 12.
iotas:
    .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
    .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
    .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
    .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
    .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
    .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
    .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
    .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
    .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
    .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
    .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
    .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
    .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
    .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
    .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
    .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
    .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
    .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
    .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
    .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

# mapState[i] = byte offset, inside the stored state, of logical lane i.
# The scalar byte/lane routines index it with (offset & ~7).
mapState:
    .quad    0*8,  1*8,  2*8,  3*8,  4*8
    .quad    7*8, 21*8, 10*8, 15*8, 20*8
    .quad    5*8, 13*8, 22*8, 19*8, 12*8
    .quad    8*8,  9*8, 18*8, 23*8, 16*8
    .quad    6*8, 17*8, 14*8, 11*8, 24*8

# Gather index vectors for the fast absorb loops: mapN holds the byte offsets,
# within an input block, of the four lanes XORed into %ymmN.
# Loaded with vmovdqa, hence the 16-byte alignment.
.balign 16
map2:
    .long   10*8, 20*8,  5*8, 15*8
map3:
    .long   16*8,  7*8, 23*8, 14*8
map4:
    .long   11*8, 22*8,  8*8, 19*8
map5:
    .long   21*8, 17*8, 13*8,  9*8
map6:
    .long    6*8, 12*8, 18*8, 24*8

# Gather masks for a 21-lane block: element k is ALLON when the lane index in
# the matching mapN slot is below 21, and 0 (lane not fetched) otherwise.
# map2 needs no row here; the 21-lane loops build an all-ones mask instead.
# Loaded with vmovdqa, hence the 32-byte alignment.
.balign 32
mask3_21:
    .quad   ALLON, ALLON, 0,     ALLON
mask4_21:
    .quad   ALLON, 0,     ALLON, ALLON
mask5_21:
    .quad   0,     ALLON, ALLON, ALLON
mask6_21:
    .quad   ALLON, ALLON, ALLON, 0

# Same for a 17-lane block (lane index below 17).
mask2_17:
    .quad   ALLON, 0,     ALLON, ALLON
mask3_17:
    .quad   ALLON, ALLON, 0,     ALLON
mask4_17:
    .quad   ALLON, 0,     ALLON, 0
mask5_17:
    .quad   0,     0,     ALLON, ALLON
mask6_17:
    .quad   ALLON, ALLON, 0,     0

# Identification string embedded in the object file.
.asciz  "Keccak-1600 for AVX2, CRYPTOGAMS by <appro@openssl.org>"
|