sleeping_kangaroo12 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +34 -67
- data/ext/Rakefile +12 -37
- data/ext/binding/sleeping_kangaroo12.c +1 -16
- data/ext/{xkcp → k12}/Makefile +0 -0
- data/ext/k12/Makefile.build +118 -0
- data/ext/k12/README.markdown +86 -0
- data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S +623 -0
- data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-SnP.h +65 -0
- data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-opt64.c +227 -0
- data/ext/{xkcp/lib/low/KeccakP-1600/compact → k12/lib/Inplace32BI}/KeccakP-1600-SnP.h +4 -9
- data/ext/{xkcp/lib/low/KeccakP-1600/plain-32bits-inplace → k12/lib/Inplace32BI}/KeccakP-1600-inplace32BI.c +65 -160
- data/ext/k12/lib/KangarooTwelve.c +332 -0
- data/ext/{xkcp/lib/high/KangarooTwelve → k12/lib}/KangarooTwelve.h +53 -16
- data/ext/{xkcp/lib/low/KeccakP-1600/AVX2 → k12/lib/Optimized64}/KeccakP-1600-AVX2.s +122 -558
- data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c +241 -0
- data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512.s +551 -0
- data/ext/k12/lib/Optimized64/KeccakP-1600-SnP.h +74 -0
- data/ext/{xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros → k12/lib/Optimized64/KeccakP-1600-opt64.c} +447 -169
- data/ext/k12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c +406 -0
- data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +419 -0
- data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +458 -0
- data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +438 -0
- data/ext/{xkcp/lib/low/KeccakP-1600/plain-64bits → k12/lib/Plain64}/KeccakP-1600-SnP.h +14 -20
- data/ext/{xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h → k12/lib/Plain64/KeccakP-1600-plain64.c} +9 -8
- data/ext/{xkcp/lib/common → k12/lib}/align.h +3 -2
- data/ext/{xkcp/lib/common → k12/lib}/brg_endian.h +0 -0
- data/ext/{xkcp → k12}/support/Build/ExpandProducts.xsl +0 -0
- data/ext/{xkcp → k12}/support/Build/ToGlobalMakefile.xsl +0 -0
- data/ext/{xkcp → k12}/support/Build/ToOneTarget.xsl +0 -0
- data/ext/{xkcp → k12}/support/Build/ToTargetConfigFile.xsl +0 -0
- data/ext/{xkcp → k12}/support/Build/ToTargetMakefile.xsl +10 -16
- data/ext/{xkcp → k12}/support/Build/ToVCXProj.xsl +0 -0
- data/lib/sleeping_kangaroo12/version.rb +1 -1
- metadata +33 -276
- data/ext/config/xkcp.build +0 -17
- data/ext/xkcp/LICENSE +0 -1
- data/ext/xkcp/Makefile.build +0 -200
- data/ext/xkcp/README.markdown +0 -296
- data/ext/xkcp/lib/HighLevel.build +0 -143
- data/ext/xkcp/lib/LowLevel.build +0 -757
- data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +0 -301
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +0 -81
- data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +0 -125
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +0 -48
- data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +0 -79
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +0 -81
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +0 -73
- data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +0 -195
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +0 -111
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +0 -76
- data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +0 -314
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +0 -61
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +0 -67
- data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +0 -128
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +0 -93
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +0 -599
- data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +0 -573
- data/ext/xkcp/lib/high/Ketje/Ketjev2.c +0 -87
- data/ext/xkcp/lib/high/Ketje/Ketjev2.h +0 -88
- data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +0 -274
- data/ext/xkcp/lib/high/Keyak/Keyakv2.c +0 -132
- data/ext/xkcp/lib/high/Keyak/Keyakv2.h +0 -217
- data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +0 -81
- data/ext/xkcp/lib/high/Keyak/Motorist.inc +0 -953
- data/ext/xkcp/lib/high/Kravatte/Kravatte.c +0 -533
- data/ext/xkcp/lib/high/Kravatte/Kravatte.h +0 -115
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +0 -557
- data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +0 -247
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +0 -66
- data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +0 -336
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +0 -26
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +0 -55
- data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +0 -35
- data/ext/xkcp/lib/high/Xoofff/Xoofff.c +0 -634
- data/ext/xkcp/lib/high/Xoofff/Xoofff.h +0 -147
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +0 -483
- data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +0 -241
- data/ext/xkcp/lib/high/common/Phases.h +0 -25
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +0 -41
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +0 -1666
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +0 -1655
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +0 -1268
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +0 -1264
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +0 -1178
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +0 -1175
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +0 -1338
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +0 -1336
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +0 -1343
- data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +0 -1339
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +0 -823
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +0 -831
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +0 -31
- data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +0 -540
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +0 -733
- data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +0 -1121
- data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +0 -52
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +0 -623
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +0 -47
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +0 -1031
- data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +0 -53
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +0 -44
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +0 -476
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +0 -305
- data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +0 -420
- data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +0 -43
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +0 -565
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +0 -8
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +0 -6
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +0 -44
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +0 -23
- data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +0 -625
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +0 -44
- data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +0 -440
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +0 -1196
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +0 -1124
- data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +0 -1196
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +0 -1392
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +0 -1394
- data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +0 -850
- data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +0 -51
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +0 -957
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +0 -49
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +0 -8
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +0 -8
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +0 -9
- data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +0 -9
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +0 -37
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +0 -1321
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +0 -55
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +0 -881
- data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +0 -51
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +0 -37
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +0 -38
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +0 -1615
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +0 -57
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +0 -37
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +0 -38
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +0 -45
- data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +0 -38
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +0 -41
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +0 -442
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +0 -446
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +0 -419
- data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +0 -427
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +0 -41
- data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +0 -647
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +0 -39
- data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +0 -190
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +0 -43
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +0 -412
- data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +0 -23
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +0 -41
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +0 -454
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +0 -458
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +0 -455
- data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +0 -458
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +0 -41
- data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +0 -728
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +0 -43
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +0 -414
- data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +0 -23
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +0 -527
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +0 -533
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +0 -528
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +0 -534
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +0 -521
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +0 -527
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +0 -517
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +0 -523
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +0 -550
- data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +0 -556
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +0 -32
- data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +0 -432
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +0 -42
- data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +0 -929
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +0 -40
- data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +0 -244
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +0 -46
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +0 -184
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +0 -454
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +0 -459
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +0 -83
- data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +0 -88
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +0 -7
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +0 -44
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +0 -437
- data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +0 -23
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +0 -57
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +0 -475
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +0 -480
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +0 -590
- data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +0 -590
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +0 -126
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +0 -68
- data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +0 -174
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +0 -80
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +0 -68
- data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +0 -142
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +0 -55
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +0 -1086
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +0 -1092
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +0 -721
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +0 -726
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +0 -723
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +0 -729
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +0 -1164
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +0 -1165
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +0 -562
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +0 -563
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +0 -563
- data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +0 -565
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +0 -55
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +0 -476
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +0 -485
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +0 -362
- data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +0 -367
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +0 -43
- data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +0 -1341
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +0 -581
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +0 -58
- data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +0 -332
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +0 -329
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +0 -53
- data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +0 -355
- data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +0 -79
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +0 -56
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +0 -399
- data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +0 -127
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +0 -43
- data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +0 -253
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +0 -1044
- data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +0 -49
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +0 -45
- data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +0 -37
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +0 -1587
- data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +0 -48
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +0 -1202
- data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +0 -48
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +0 -484
- data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +0 -44
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +0 -45
- data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +0 -37
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +0 -939
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +0 -49
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +0 -1216
- data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +0 -48
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +0 -45
- data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +0 -37
- data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +0 -290
- data/ext/xkcp/lib/low/common/SnP-Relaned.h +0 -141
- data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +0 -133
- data/ext/xkcp/support/Kernel-PMU/Makefile +0 -8
- data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +0 -129
- data/ext/xkcp/support/Kernel-PMU/load-module +0 -1
- data/ext/xkcp/util/KeccakSum/KeccakSum.c +0 -394
- data/ext/xkcp/util/KeccakSum/base64.c +0 -86
- data/ext/xkcp/util/KeccakSum/base64.h +0 -12
@@ -1,1587 +0,0 @@
|
|
1
|
-
@
|
2
|
-
@ The eXtended Keccak Code Package (XKCP)
|
3
|
-
@ https://github.com/XKCP/XKCP
|
4
|
-
@
|
5
|
-
@ The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
|
6
|
-
@
|
7
|
-
@ Implementation by Conno Boel, hereby denoted as "the implementer".
|
8
|
-
@
|
9
|
-
@ For more information, feedback or questions, please refer to the Keccak Team website:
|
10
|
-
@ https://keccak.team/
|
11
|
-
@
|
12
|
-
@ To the extent possible under law, the implementer has waived all copyright
|
13
|
-
@ and related or neighboring rights to the source code in this file.
|
14
|
-
@ http://creativecommons.org/publicdomain/zero/1.0/
|
15
|
-
@
|
16
|
-
|
17
|
-
@ WARNING: These functions work only on little-endian CPUs with the ARMv7A + NEON architecture (Cortex-A8, ...).
|
18
|
-
|
19
|
-
.text
|
20
|
-
|
21
|
-
@ Xoodootimes4_InitializeAll: void * states -> void
@
@ Zeroes the interleaved state area: 12 stores of 16 bytes each (192 bytes).
@ In:    r0 = states
@ Clobb: r0 (left pointing just past the cleared area), q0
    .p2align 8                              @ same 2^8-byte alignment as ".align 8" on ARM
    .globl  Xoodootimes4_InitializeAll
    .type   Xoodootimes4_InitializeAll, %function
Xoodootimes4_InitializeAll:
    vmov.i32    q0, #0                      @ q0 = 16 zero bytes
    .rept   12                              @ expands to 12 consecutive stores
    vstm    r0!, {d0-d1}
    .endr
    bx      lr
|
43
|
-
|
44
|
-
|
45
|
-
@ Xoodootimes4_AddByte: void * states -> uint instanceIndex -> const uchar byte -> uint offset -> void
@
@ XORs one byte into the state of a single instance.
@ The target address is states + 4*instanceIndex + 4*(offset & ~3) + (offset & 3).
@ In:    r0 = states, r1 = instanceIndex, r2 = byte, r3 = offset (in bytes)
@ Clobb: r0, r1, r3, r12
    .p2align 8
    .globl  Xoodootimes4_AddByte
    .type   Xoodootimes4_AddByte, %function
Xoodootimes4_AddByte:
    bic     r12, r3, #3                     @ r12 = offset rounded down to a multiple of 4
    and     r3, r3, #3                      @ r3  = byte position inside the 32-bit word
    add     r1, r1, r12                     @ r1  = instanceIndex + (offset & ~3)
    add     r0, r0, r1, lsl #2              @ r0  = states + 4*r1 (word holding the byte)
    ldrb    r12, [r0, r3]
    eor     r12, r12, r2
    strb    r12, [r0, r3]
    bx      lr
|
59
|
-
|
60
|
-
@ Xoodootimes4_AddBytes: void * states -> uint instanceIndex -> const uchar * data -> uint offset -> uint length -> void
@
@ XORs `length` bytes from data into the state of one instance, starting at
@ byte `offset` of that instance.
@ In:    r0 = states, r1 = instanceIndex, r2 = data, r3 = offset, [sp] = length
@ Clobb: r0-r3, r12, flags
    .p2align 8
    .globl  Xoodootimes4_AddBytes
    .type   Xoodootimes4_AddBytes, %function
Xoodootimes4_AddBytes:
    bic     r12, r3, #3                     @ r12 = offset rounded down to a multiple of 4
    and     r3, r3, #3                      @ r3  = byte position inside the word
    add     r1, r1, r12
    add     r0, r0, r1, lsl #2              @ states + 4*(instanceIndex + (offset & ~3))
    add     r0, r0, r3                      @ r0  = address of the first state byte

    ldr     r1, [sp]                        @ r1  = length (fifth argument, on the stack)
    cmp     r1, #0
    bxeq    lr                              @ nothing to do

    @ r0 = current state byte, r1 = bytes remaining (> 0), r2 = current data byte
Xt4_AddBytes_Loop:
    ldrb    r3, [r0]
    ldrb    r12, [r2], #1
    eor     r3, r3, r12
    strb    r3, [r0], #1
    tst     r0, #3                          @ did the pointer just cross a word boundary?
    addeq   r0, r0, #12                     @ yes: skip the 12 bytes of the other three instances
    subs    r1, r1, #1
    bne     Xt4_AddBytes_Loop
    bx      lr
|
92
|
-
|
93
|
-
@ Xoodootimes4_AddLanesAll: void * states -> const uchar * data -> uint laneCount -> uint laneOffset -> void
@
@ XORs laneCount 32-bit lanes into each of the four interleaved states.
@ Lane w of instance i is read from data + 4*(w + i*laneOffset) and XORed into
@ the word at states + 16*w + 4*i.
@
@ In:    r0 = states, r1 = data, r2 = laneCount, r3 = laneOffset (in lanes)
@ Paths: - laneCount == laneOffset == 12 and data word-aligned: whole-state
@          NEON transpose path (Xt4_AddLanesAll_Full).
@        - otherwise one lane of all four instances per iteration, using word
@          loads when data is word-aligned and byte loads when it is not.
@ Clobb: r0-r3, flags, q0-q1 (generic paths) or q0-q3 and q8-q15 (full path).
@        d8-d15 are callee-saved under the AAPCS, so the full path saves and
@        restores them around its use of q4-q7.
    .align 8
    .global Xoodootimes4_AddLanesAll
    .type   Xoodootimes4_AddLanesAll, %function
Xoodootimes4_AddLanesAll:
    cmp     r2, r3
    cmpeq   r2, #12
    tsteq   r1, #3
    beq     Xt4_AddLanesAll_Full            @ laneCount == laneOffset == 12, data aligned

    push    {r4-r7,lr}

    add     r4, r1, r3, lsl #2              @ r4 = data of instance 1
    add     r5, r4, r3, lsl #2              @ r5 = data of instance 2
    add     r6, r5, r3, lsl #2              @ r6 = data of instance 3

    subs    r2, r2, #1                      @ r2 = laneCount - 1; borrow means laneCount == 0
    popcc   {r4-r7,pc}

    and     r3, r1, #3
    cmp     r3, #0

    bne     Xt4_AddLanesAll_Unaligned_Loop
Xt4_AddLanesAll_Aligned_Loop:
    vldm    r0, {d0, d1}                    @ q0 = current lane of all four states
    ldr     r3, [r1], #4
    vmov    s4, r3
    ldr     r3, [r4], #4
    vmov    s5, r3
    ldr     r3, [r5], #4
    vmov    s6, r3
    ldr     r3, [r6], #4
    vmov    s7, r3                          @ q1 = same lane of the four inputs
    veor    q0, q0, q1
    vstm    r0!, {d0, d1}
    subs    r2, r2, #1
    bcs     Xt4_AddLanesAll_Aligned_Loop
    pop     {r4-r7,pc}
Xt4_AddLanesAll_Unaligned_Loop:
    vldm    r0, {d0, d1}                    @ q0 = current lane of all four states

    ldrb    r3, [r1], #1                    @ assemble a little-endian word byte by byte
    ldrb    r7, [r1], #1
    eor     r3, r3, r7, lsl #8
    ldrb    r7, [r1], #1
    eor     r3, r3, r7, lsl #16
    ldrb    r7, [r1], #1
    eor     r3, r3, r7, lsl #24
    vmov    s4, r3

    ldrb    r3, [r4], #1
    ldrb    r7, [r4], #1
    eor     r3, r3, r7, lsl #8
    ldrb    r7, [r4], #1
    eor     r3, r3, r7, lsl #16
    ldrb    r7, [r4], #1
    eor     r3, r3, r7, lsl #24
    vmov    s5, r3

    ldrb    r3, [r5], #1
    ldrb    r7, [r5], #1
    eor     r3, r3, r7, lsl #8
    ldrb    r7, [r5], #1
    eor     r3, r3, r7, lsl #16
    ldrb    r7, [r5], #1
    eor     r3, r3, r7, lsl #24
    vmov    s6, r3

    ldrb    r3, [r6], #1
    ldrb    r7, [r6], #1
    eor     r3, r3, r7, lsl #8
    ldrb    r7, [r6], #1
    eor     r3, r3, r7, lsl #16
    ldrb    r7, [r6], #1
    eor     r3, r3, r7, lsl #24
    vmov    s7, r3

    veor    q0, q0, q1
    vstm    r0!, {d0, d1}
    subs    r2, r2, #1
    bcs     Xt4_AddLanesAll_Unaligned_Loop
    pop     {r4-r7,pc}

    @ Whole-state path: nothing has been pushed yet and lr is still the return address.
Xt4_AddLanesAll_Full:
    vpush   {d8-d15}                        @ q4-q7 are used below; callee-saved per AAPCS
    vldm    r1!, {d0-d15}                   @ q0-q2 = instance 0, q3-q5 = instance 1, q6-q7 = start of instance 2
    vuzp.32 q0, q6
    vldm    r1, {d16-d23}                   @ q8 = rest of instance 2, q9-q11 = instance 3
    vuzp.32 q3, q9
    vtrn.32 q0, q3                          @ q0, q6, q3, q9 = lanes 0, 1, 2, 3 interleaved
    vtrn.32 q6, q9

    vuzp.32 q1, q7
    vuzp.32 q4, q10
    vtrn.32 q1, q4                          @ q1, q7, q4, q10 = lanes 4, 5, 6, 7 interleaved
    vtrn.32 q7, q10

    vuzp.32 q2, q8
    vuzp.32 q5, q11
    vtrn.32 q2, q5                          @ q2, q8, q5, q11 = lanes 8, 9, 10, 11 interleaved
    vtrn.32 q8, q11

    vldm    r0, {d24-d31}
    veor    q12, q0, q12
    veor    q13, q6, q13
    veor    q14, q3, q14
    veor    q15, q9, q15
    vstm    r0!, {d24-d31}
    vldm    r0, {d24-d31}
    veor    q12, q1, q12
    veor    q13, q7, q13
    veor    q14, q4, q14
    veor    q15, q10, q15
    vstm    r0!, {d24-d31}
    vldm    r0, {d24-d31}
    veor    q12, q2, q12
    veor    q13, q8, q13
    veor    q14, q5, q14
    veor    q15, q11, q15
    vstm    r0, {d24-d31}
    vpop    {d8-d15}
    bx      lr
|
213
|
-
|
214
|
-
@ Xoodootimes4_OverwriteBytes: void * states -> uint instanceIndex -> const uchar * data -> uint offset -> uint length -> void
@
@ Copies `length` bytes from data over the state of one instance, starting at
@ byte `offset` of that instance.
@ In:    r0 = states, r1 = instanceIndex, r2 = data, r3 = offset, [sp] = length
@ Paths: length == 48 with word-aligned data takes the word-copy path, which
@        starts at byte 0 of the instance and does not read `offset`;
@        everything else goes through the byte loop.
@ Clobb: r0-r3, r12, flags
    .p2align 8
    .globl  Xoodootimes4_OverwriteBytes
    .type   Xoodootimes4_OverwriteBytes, %function
Xoodootimes4_OverwriteBytes:
    push    {r4, lr}
    ldr     r4, [sp, #8]                    @ r4 = length; +8 skips the two registers just pushed
    cmp     r4, #48
    tsteq   r2, #3
    beq     Xt4_OverwriteBytes_Full

    bic     r12, r3, #3                     @ r12 = offset rounded down to a multiple of 4
    and     r3, r3, #3                      @ r3  = byte position inside the word
    add     r1, r1, r12
    add     r0, r0, r1, lsl #2              @ states + 4*(instanceIndex + (offset & ~3))
    add     r0, r0, r3                      @ r0  = address of the first state byte

    cmp     r4, #0
    popeq   {r4, pc}                        @ length == 0: nothing to copy
    mov     r1, r4                          @ r1  = bytes remaining (> 0)

    @ r0 = current state byte, r1 = bytes remaining, r2 = current data byte
Xt4_OverwriteBytes_Loop:
    ldrb    r4, [r2], #1
    strb    r4, [r0], #1
    tst     r0, #3                          @ crossed a word boundary?
    addeq   r0, r0, #12                     @ yes: skip the other three instances
    subs    r1, r1, #1
    bne     Xt4_OverwriteBytes_Loop
    pop     {r4, pc}

    @ 12 words, four per ldmia, each stored 16 bytes apart.
Xt4_OverwriteBytes_Full:
    add     r0, r0, r1, lsl #2              @ r0 = states + 4*instanceIndex
    ldmia   r2!, {r1, r3, r4, lr}
    str     r1, [r0], #16
    str     r3, [r0], #16
    str     r4, [r0], #16
    str     lr, [r0], #16
    ldmia   r2!, {r1, r3, r4, lr}
    str     r1, [r0], #16
    str     r3, [r0], #16
    str     r4, [r0], #16
    str     lr, [r0], #16
    ldmia   r2, {r1, r3, r4, lr}
    str     r1, [r0], #16
    str     r3, [r0], #16
    str     r4, [r0], #16
    str     lr, [r0], #16
    pop     {r4, pc}
|
266
|
-
|
267
|
-
@ Xoodootimes4_OverwriteLanesAll: void * states -> uchar * data -> uint lanecount -> uint laneOffset -> void
@
@ Overwrites the first `lanecount` 32-bit lanes of all four interleaved states.
@ Lane w of instance i is read from data + 4*(w + i*laneOffset); the four words
@ of a lane are stored together as one 16-byte row of states.
@ In:    r0 = states, r1 = data, r2 = lanecount, r3 = laneOffset (in lanes)
@ Clobb: r0-r3, r12, q0, flags
    .p2align 8
    .globl  Xoodootimes4_OverwriteLanesAll
    .type   Xoodootimes4_OverwriteLanesAll, %function
Xoodootimes4_OverwriteLanesAll:
    push    {r4-r6,lr}

    add     r4, r1, r3, lsl #2              @ r4 = data of instance 1
    add     r5, r4, r3, lsl #2              @ r5 = data of instance 2
    add     r6, r5, r3, lsl #2              @ r6 = data of instance 3

    cmp     r2, #0
    popeq   {r4-r6,pc}                      @ lanecount == 0: nothing to do

    tst     r1, #3                          @ word loads only when data is word-aligned
    bne     Xt4_OverwriteLanesAll_Unaligned_Loop

Xt4_OverwriteLanesAll_Aligned_Loop:
    ldr     r3, [r1], #4
    vmov    s0, r3
    ldr     r3, [r4], #4
    vmov    s1, r3
    ldr     r3, [r5], #4
    vmov    s2, r3
    ldr     r3, [r6], #4
    vmov    s3, r3
    vstm    r0!, {d0-d1}                    @ one lane of all four instances
    subs    r2, r2, #1
    bne     Xt4_OverwriteLanesAll_Aligned_Loop
    pop     {r4-r6,pc}

    @ Each word is assembled little-endian from four byte loads.
Xt4_OverwriteLanesAll_Unaligned_Loop:
    ldrb    r3, [r1], #1
    ldrb    r12, [r1], #1
    orr     r3, r3, r12, lsl #8
    ldrb    r12, [r1], #1
    orr     r3, r3, r12, lsl #16
    ldrb    r12, [r1], #1
    orr     r3, r3, r12, lsl #24
    vmov    s0, r3

    ldrb    r3, [r4], #1
    ldrb    r12, [r4], #1
    orr     r3, r3, r12, lsl #8
    ldrb    r12, [r4], #1
    orr     r3, r3, r12, lsl #16
    ldrb    r12, [r4], #1
    orr     r3, r3, r12, lsl #24
    vmov    s1, r3

    ldrb    r3, [r5], #1
    ldrb    r12, [r5], #1
    orr     r3, r3, r12, lsl #8
    ldrb    r12, [r5], #1
    orr     r3, r3, r12, lsl #16
    ldrb    r12, [r5], #1
    orr     r3, r3, r12, lsl #24
    vmov    s2, r3

    ldrb    r3, [r6], #1
    ldrb    r12, [r6], #1
    orr     r3, r3, r12, lsl #8
    ldrb    r12, [r6], #1
    orr     r3, r3, r12, lsl #16
    ldrb    r12, [r6], #1
    orr     r3, r3, r12, lsl #24
    vmov    s3, r3

    vstm    r0!, {d0-d1}
    subs    r2, r2, #1
    bne     Xt4_OverwriteLanesAll_Unaligned_Loop
    pop     {r4-r6,pc}
|
342
|
-
|
343
|
-
|
344
|
-
@ Xoodootimes4_OverwriteWithZeroes: void * states -> uint instanceIndex -> uint byteCount -> void
@
@ Zeroes the first byteCount bytes of one instance: byteCount/4 whole words,
@ each 16 bytes apart, then the remaining byteCount%4 bytes of the next word.
@ In:    r0 = states, r1 = instanceIndex, r2 = byteCount
@ Clobb: r0, r2, r3, r12, flags
    .p2align 8
    .globl  Xoodootimes4_OverwriteWithZeroes
    .type   Xoodootimes4_OverwriteWithZeroes, %function
Xoodootimes4_OverwriteWithZeroes:
    add     r0, r0, r1, lsl #2              @ states + 4*instance = state start
    mov     r12, #0                         @ the value being stored
    lsrs    r3, r2, #2                      @ r3 = number of whole words
    beq     Xt4_OverwriteWithZeroes_Tail
Xt4_OverwriteWithZeroes_Aligned:
    str     r12, [r0], #16                  @ next word of this instance is 16 bytes on
    subs    r3, r3, #1
    bne     Xt4_OverwriteWithZeroes_Aligned
Xt4_OverwriteWithZeroes_Tail:
    ands    r2, r2, #3                      @ r2 = leftover bytes (0..3)
    bxeq    lr
Xt4_OverwriteWithZeroes_Leftovers:
    strb    r12, [r0], #1
    subs    r2, r2, #1
    bne     Xt4_OverwriteWithZeroes_Leftovers
    bx      lr
|
364
|
-
|
365
|
-
|
366
|
-
@ Xoodootimes4_ExtractBytes: void * states -> uint instanceIndex -> uchar * data -> uint offset -> uint length -> void
@
@ Copies `length` bytes of one instance, starting at byte `offset` of that
@ instance, out of the 4-way interleaved state into `data`.
@ In:    r0 = states, r1 = instanceIndex, r2 = data (written),
@        r3 = offset, [sp] = length
@ Clobb: r0-r3, flags. Leaf routine; r3 serves as the byte temporary once the
@        start address is formed, so nothing is pushed.
@ NOTE(review): the lane-boundary test below uses the low two bits of the
@ state address, so `states` is assumed to be 4-byte aligned - confirm.
.align 8
.global Xoodootimes4_ExtractBytes
.type Xoodootimes4_ExtractBytes, %function
Xoodootimes4_ExtractBytes:
    add     r1, r1, r3                  @ r1 = instanceIndex + offset
    and     r3, r3, #3                  @ r3 = offset % 4 (byte inside the lane)
    sub     r1, r1, r3                  @ r1 = instanceIndex + 4*(offset/4)
    add     r0, r0, r1, lsl #2          @ r0 = states + 4*instanceIndex + 16*(offset/4)
    add     r0, r0, r3                  @ ... + offset % 4

    ldr     r1, [sp]                    @ fifth argument: length
    subs    r1, r1, #1
    bxcc    lr                          @ length == 0

Xt4_ExtractBytes_Loop:
    ldrb    r3, [r0], #1
    strb    r3, [r2], #1
    tst     r0, #3                      @ stepped past the end of this instance's lane?
    addeq   r0, r0, #12                 @ yes: skip the other three instances' lanes
    subs    r1, r1, #1
    bcs     Xt4_ExtractBytes_Loop
    bx      lr
|
391
|
-
|
392
|
-
@ Xoodootimes4_ExtractLanesAll: void * states -> uchar * data -> uint lanecount -> uint laneoffset -> void
@
@ De-interleaves the first `lanecount` 32-bit lanes of all four instances.
@ Instance k's lanes are written to data + 4*k*laneoffset (laneoffset is
@ counted in 32-bit lanes, see the `lsl #2` below).
@ In:    r0 = states, r1 = data, r2 = lanecount, r3 = laneoffset
@ Clobb: r0-r3, d0-d1, flags; r4-r6 are saved and restored.

@ Helper: store the word in r3 at [\ptr] byte by byte, least significant byte
@ first. Advances \ptr by 4 and leaves r3 shifted right by 24.
.macro Xt4_ExtractLanesAll_StoreBytes ptr
    strb    r3, [\ptr], #1
    lsr     r3, r3, #8
    strb    r3, [\ptr], #1
    lsr     r3, r3, #8
    strb    r3, [\ptr], #1
    lsr     r3, r3, #8
    strb    r3, [\ptr], #1
.endm

.align 8
.global Xoodootimes4_ExtractLanesAll
.type Xoodootimes4_ExtractLanesAll, %function
Xoodootimes4_ExtractLanesAll:
    push    {r4-r6,lr}

    add     r4, r1, r3, lsl #2          @ r4 = output of instance 1
    add     r5, r4, r3, lsl #2          @ r5 = output of instance 2
    add     r6, r5, r3, lsl #2          @ r6 = output of instance 3

    subs    r2, r2, #1
    popcc   {r4-r6,pc}                  @ lanecount == 0

    tst     r1, #3                      @ r4-r6 differ from r1 by multiples of 4,
    bne     Xt4_ExtractLanesAll_Unaligned_Loop @ so testing r1 covers all four outputs
Xt4_ExtractLanesAll_Aligned_Loop:
    vldm    r0!, {d0-d1}                @ one lane of all four instances
    vmov    r3, s0
    str     r3, [r1], #4
    vmov    r3, s1
    str     r3, [r4], #4
    vmov    r3, s2
    str     r3, [r5], #4
    vmov    r3, s3
    str     r3, [r6], #4
    subs    r2, r2, #1
    bcs     Xt4_ExtractLanesAll_Aligned_Loop
    pop     {r4-r6,pc}
Xt4_ExtractLanesAll_Unaligned_Loop:
    @ Only r3 is needed as scratch here, so no extra register is saved per
    @ iteration.
    vldm    r0!, {d0-d1}
    vmov    r3, s0
    Xt4_ExtractLanesAll_StoreBytes r1

    vmov    r3, s1
    Xt4_ExtractLanesAll_StoreBytes r4

    vmov    r3, s2
    Xt4_ExtractLanesAll_StoreBytes r5

    vmov    r3, s3
    Xt4_ExtractLanesAll_StoreBytes r6

    subs    r2, r2, #1
    bcs     Xt4_ExtractLanesAll_Unaligned_Loop
    pop     {r4-r6,pc}
|
465
|
-
|
466
|
-
@ Xoodootimes4_ExtractAndAddBytes: void * states -> uint instanceIndex -> uchar * input -> uchar * output -> uint offset -> uint length -> void
@
@ output[i] = input[i] ^ (byte offset+i of the selected instance), for
@ i in [0, length).
@ In:    r0 = states, r1 = instanceIndex, r2 = input, r3 = output,
@        [sp] = offset, [sp, #4] = length
@ Clobb: r0-r3, flags; r4, r5 and lr are saved and restored.
.align 8
.global Xoodootimes4_ExtractAndAddBytes
.type Xoodootimes4_ExtractAndAddBytes, %function
Xoodootimes4_ExtractAndAddBytes:
    push    {r4, r5, lr}
    ldr     r4, [sp, #12]               @ offset (first stack argument, above the 3 pushed words)
    ldr     r5, [sp, #16]               @ length
    @ Word-wise fast path only when length == 48 and both buffers are
    @ 4-byte aligned; each tsteq runs only while the previous test was "equal".
    @ NOTE(review): offset is not tested here - the fast path assumes it is 0
    @ whenever length is 48; confirm against callers.
    cmp     r5, #48
    tsteq   r2, #3
    tsteq   r3, #3
    beq     Xt4_ExtractAndAddBytes_Full

    add     r1, r1, r4                  @ r1 = instanceIndex + offset
    and     r4, r4, #3                  @ r4 = offset % 4
    sub     r1, r1, r4                  @ r1 = instanceIndex + 4*(offset/4)
    add     r0, r0, r1, lsl #2          @ states+(WORD instanceIndex)
    add     r0, r0, r4

    subs    r1, r5, #1
    popcc   {r4, r5, pc}                @ length == 0

Xt4_ExtractAndAddBytes_Loop:
    ldrb    r4, [r0], #1
    ldrb    r5, [r2], #1
    eor     r4, r4, r5
    strb    r4, [r3], #1
    tst     r0, #3                      @ end of this instance's lane reached?
    addeq   r0, r0, #12                 @ Skip state
    subs    r1, r1, #1
    bcs     Xt4_ExtractAndAddBytes_Loop
    pop     {r4, r5, pc}

Xt4_ExtractAndAddBytes_Full:
    add     r0, r0, r1, lsl #2          @ lane 0 of the selected instance
    @ Twelve lanes, three per group; lr is the scratch for the state word.
    .rept   3
    ldmia   r2!, {r1, r4, r5}
    ldr     lr, [r0], #16
    eor     r1, r1, lr
    ldr     lr, [r0], #16
    eor     r4, r4, lr
    ldr     lr, [r0], #16
    eor     r5, r5, lr
    stmia   r3!, {r1, r4, r5}
    .endr
    @ Last group: same work, without write-back on the buffer pointers.
    ldmia   r2, {r1, r4, r5}
    ldr     lr, [r0], #16
    eor     r1, r1, lr
    ldr     lr, [r0], #16
    eor     r4, r4, lr
    ldr     lr, [r0], #16
    eor     r5, r5, lr
    stmia   r3, {r1, r4, r5}
    pop     {r4, r5, pc}
|
534
|
-
|
535
|
-
|
536
|
-
@ Xoodootimes4_ExtractAndAddLanesAll: void * states -> uchar * input -> uchar * output -> uint laneCount -> uint laneOffset
@
@ For each of the first laneCount lanes and each of the four instances:
@ output lane = input lane ^ state lane. Instance k reads from
@ input + 4*k*laneOffset and writes to output + 4*k*laneOffset. All buffer
@ accesses are byte-wise, so no alignment is required of input/output.
@ In:    r0 = states, r1 = input, r2 = output, r3 = laneCount, [sp] = laneOffset
@ Clobb: r0-r3, q0-q1, flags; r4-r11 are saved and restored.

@ Helper: assemble a little-endian 32-bit word from four bytes at [\ptr]
@ into \sreg. Advances \ptr by 4; uses r10 and r11 as scratch.
.macro Xt4_ExtractAndAddLanesAll_Load32 ptr, sreg
    ldrb    r11, [\ptr], #1
    ldrb    r10, [\ptr], #1
    eor     r11, r11, r10, lsl #8
    ldrb    r10, [\ptr], #1
    eor     r11, r11, r10, lsl #16
    ldrb    r10, [\ptr], #1
    eor     r11, r11, r10, lsl #24
    vmov    \sreg, r11
.endm

@ Helper: write \sreg to [\ptr] byte by byte, least significant byte first.
@ Advances \ptr by 4; uses r10 as scratch.
.macro Xt4_ExtractAndAddLanesAll_Store32 sreg, ptr
    vmov    r10, \sreg
    strb    r10, [\ptr], #1
    lsr     r10, r10, #8
    strb    r10, [\ptr], #1
    lsr     r10, r10, #8
    strb    r10, [\ptr], #1
    lsr     r10, r10, #8
    strb    r10, [\ptr], #1
.endm

.align 8
.global Xoodootimes4_ExtractAndAddLanesAll
.type Xoodootimes4_ExtractAndAddLanesAll, %function
Xoodootimes4_ExtractAndAddLanesAll:
    subs    r3, r3, #1
    bxcc    lr                          @ laneCount == 0

    push    {r4-r11,lr}
    ldr     r9, [sp, #36]               @ laneOffset (above the nine pushed words)

    @ Per-instance input pointers (the offsets are 48/96/144 bytes when
    @ laneOffset is 12).
    add     r4, r1, r9, lsl #2          @ r4 = r1 + 4*laneOffset
    add     r5, r4, r9, lsl #2          @ r5 = r1 + 8*laneOffset
    add     r6, r5, r9, lsl #2          @ r6 = r1 + 12*laneOffset

    @ Per-instance output pointers; r9 is consumed last because it still
    @ holds laneOffset.
    add     r7, r2, r9, lsl #2          @ r7 = r2 + 4*laneOffset
    add     r8, r7, r9, lsl #2          @ r8 = r2 + 8*laneOffset
    add     r9, r8, r9, lsl #2          @ r9 = r2 + 12*laneOffset

Xt4_ExtractAndAddLanesAll_Unaligned_Loop:
    vldm    r0!, {d2-d3}                @ q1 = one state lane of all four instances

    Xt4_ExtractAndAddLanesAll_Load32 r1, s0
    Xt4_ExtractAndAddLanesAll_Load32 r4, s1
    Xt4_ExtractAndAddLanesAll_Load32 r5, s2
    Xt4_ExtractAndAddLanesAll_Load32 r6, s3
    veor    q0, q0, q1                  @ q0 = input lanes ^ state lanes

    Xt4_ExtractAndAddLanesAll_Store32 s0, r2
    Xt4_ExtractAndAddLanesAll_Store32 s1, r7
    Xt4_ExtractAndAddLanesAll_Store32 s2, r8
    Xt4_ExtractAndAddLanesAll_Store32 s3, r9

    subs    r3, r3, #1
    bcs     Xt4_ExtractAndAddLanesAll_Unaligned_Loop
    pop     {r4-r11,pc}
|
634
|
-
|
635
|
-
@ q0: a00 -> q1: a01 -> q2: a02 -> q3: a03 ->
|
636
|
-
@ q4: a10 -> q5: a11 -> q6: a12 -> q7: a13 ->
|
637
|
-
@ q8: a20 -> q9: a21 -> q10: a22 -> q11: a23
|
638
|
-
|
639
|
-
@ theta: column-parity mixing step on the state held in q0-q11 (register map
@ in the comment above: q0-q3 = plane 0, q4-q7 = plane 1, q8-q11 = plane 2,
@ each q register holding one lane of four instances).
@ For each column x the macro forms P = a0x ^ a1x ^ a2x, computes
@ E = rotl(P, 5) ^ rotl(P, 14) in core registers (ror #27 followed by
@ "x ^ (x ror #23)"), and XORs E into the three lanes of the NEXT column
@ (column 3 feeds column 0).
@ Clobbers: r1-r4, q14, q15, flags untouched.
@ The NEON veor instructions are interleaved with the core-register rotates;
@ each parity is formed before the lanes it reads are updated, so the
@ statement order must be kept.
.macro theta
@ P3 = parity of column 3
veor q15, q3, q7
veor q15, q15, q11

@ E3 in q15; meanwhile P0 = parity of column 0 is built in q14
vmov r3, r4, d30
vmov r1, r2, d31
ror r3, r3, #27
veor q14, q0, q4
ror r4, r4, #27
veor q14, q14, q8
ror r1, r1, #27
ror r2, r2, #27
eor r3, r3, r3, ror #23
eor r4, r4, r4, ror #23
eor r1, r1, r1, ror #23
vmov d30, r3, r4
eor r2, r2, r2, ror #23
vmov d31, r1, r2

@ E0 in q14; column 0 ^= E3; then q15 is reused for P1 = parity of column 1
vmov r3, r4, d28
vmov r1, r2, d29
ror r3, r3, #27
veor q0, q0, q15
ror r4, r4, #27
veor q4, q4, q15
ror r1, r1, #27
veor q8, q8, q15
ror r2, r2, #27
veor q15, q1, q5
eor r3, r3, r3, ror #23
veor q15, q15, q9
eor r4, r4, r4, ror #23
eor r1, r1, r1, ror #23
vmov d28, r3, r4
eor r2, r2, r2, ror #23
vmov d29, r1, r2

@ E1 in q15; column 1 ^= E0; then q14 is reused for P2 = parity of column 2
vmov r3, r4, d30
vmov r1, r2, d31
ror r3, r3, #27
veor q1, q1, q14
ror r4, r4, #27
veor q5, q5, q14
ror r1, r1, #27
veor q9, q9, q14
ror r2, r2, #27
veor q14, q2, q6
eor r3, r3, r3, ror #23
veor q14, q14, q10
eor r4, r4, r4, ror #23
eor r1, r1, r1, ror #23
vmov d30, r3, r4
eor r2, r2, r2, ror #23
vmov d31, r1, r2

@ E2 in q14; column 2 ^= E1; finally column 3 ^= E2
vmov r3, r4, d28
vmov r1, r2, d29
ror r3, r3, #27
veor q2, q2, q15
ror r4, r4, #27
veor q6, q6, q15
ror r1, r1, #27
veor q10, q10, q15
ror r2, r2, #27
eor r3, r3, r3, ror #23
eor r4, r4, r4, ror #23
eor r1, r1, r1, ror #23
vmov d28, r3, r4
eor r2, r2, r2, ror #23
vmov d29, r1, r2
veor q3, q3, q14
veor q7, q7, q14
veor q11, q11, q14
.endm
|
713
|
-
|
714
|
-
@ rho_w: rotates every lane of plane 2 (q8-q11) left by 11 bits and leaves
@ the result in q12-q15 (q12 from q8, q13 from q9, q14 from q10, q15 from q11).
@ q9-q11 are rotated on NEON (vshl #11 + vsri #21); q8 is rotated in core
@ registers (ror #21 == rotate left by 11) and moved into q12 at the end.
@ q8-q11 themselves are left unchanged. No register is moved for plane 1 here.
@ Clobbers: r1-r4, q12-q15. Note that r3 is overwritten, so a round constant
@ for chi has to be loaded into r3 after this macro.
.macro rho_w
@ NEON form of the q8 rotation, replaced by the core-register path below:
@ vshl.U32 q12, q8, #11
@ vsri.U32 q12, q8, #21
vmov r1, r2, d16
vshl.U32 q13, q9, #11
vmov r3, r4, d17
vsri.U32 q13, q9, #21
ror r1, r1, #21
vshl.U32 q14, q10, #11
ror r2, r2, #21
vsri.U32 q14, q10, #21
ror r3, r3, #21
vshl.U32 q15, q11, #11
ror r4, r4, #21
vsri.U32 q15, q11, #21
vmov d24, r1, r2
vmov d25, r3, r4
@ NOTE (original author): the core-register ROR instructions are interleaved
@ with the vector instructions so that they execute "for free" in slots that
@ would otherwise be idle (a NOP).
.endm
|
733
|
-
|
734
|
-
@ chi (with iota folded in): first XORs the round constant in r3 into lane 0
@ of plane 0 for all four instances, then applies the non-linear step
@   a0 ^= a2 & ~a1,  a1 ^= a0 & ~a2,  a2 ^= a1 & ~a0
@ per column (vbic qd, qn, qm computes qn & ~qm).
@ Inputs:  plane 0 in q0-q3; plane 1 in q4-q7, read one column off
@          (q7 pairs with q0, q4 with q1, q5 with q2, q6 with q3);
@          plane 2 in q12-q15 (as produced by rho_w); r3 = round constant.
@ Outputs: plane 0 in q0-q3; plane 1 in q12-q15;
@          plane 2 in q8, q9, q10, q4 (columns 0, 1, 2, 3).
@ Scratch registers are recycled from lanes that were just consumed, so the
@ statement order must be kept.
.macro chi
@ NOTE: Iota
vdup.32 q8, r3
veor q0, q0, q8

@ NOTE: Probably this is optimal. (Prove?)
@ Column 0: q0, q7, q12  ->  a0 in q0, a1 in q12, a2 in q8
vbic q11, q12, q7
vbic q9, q0, q12
vbic q10, q7, q0
veor q8, q10, q12
veor q12, q7, q9
veor q0, q0, q11

@ Column 1: q1, q4, q13  ->  a0 in q1, a1 in q13, a2 in q9
vbic q7, q13, q4
vbic q10, q1, q13
vbic q11, q4, q1
veor q9, q11, q13
veor q13, q4, q10
veor q1, q1, q7

@ Column 2: q2, q5, q14  ->  a0 in q2, a1 in q14, a2 in q10
vbic q4, q14, q5
vbic q11, q2, q14
vbic q7, q5, q2
veor q10, q7, q14
veor q14, q5, q11
veor q2, q2, q4

@ Column 3: q3, q6, q15  ->  a0 in q3, a1 in q15, a2 in q4
vbic q5, q15, q6
vbic q7, q3, q15
vbic q4, q6, q3
veor q4, q4, q15
veor q15, q6, q7
veor q3, q3, q5
.endm
|
768
|
-
|
769
|
-
@ rho_e: takes the register layout left by chi and restores the canonical one
@ (plane 1 in q4-q7, plane 2 in q8-q11).
@   plane 2: every lane is rotated left by 8 bits and moved two columns over:
@            q11 <- rotl8(q9), q9 <- rotl8(q4), q8 <- rotl8(q10),
@            q10 <- rotl8(q8) (done in core registers: ror #24).
@   plane 1: q4-q7 <- rotl1(q12-q15).
@ Clobbers: r1-r4. Every source register is fully read before it is
@ overwritten (e.g. q8 is copied to r1-r4 before "vshl q8", and q10 is read by
@ "vsri q8, q10" before d20/d21 are written), so the order must be kept.
.macro rho_e
vshl.U32 q11, q9, #8
vsri.U32 q11, q9, #24

vshl.U32 q9, q4, #8
vsri.U32 q9, q4, #24

@ q8 goes through r1-r4 so that q8 and q10 can swap roles without a spare q register
vmov r1, r2, d16
vmov r3, r4, d17
ror r1, r1, #24
vshl.U32 q8, q10, #8
ror r2, r2, #24
vsri.U32 q8, q10, #24
ror r3, r3, #24
vmov d20, r1, r2
ror r4, r4, #24
vmov d21, r3, r4

@ plane 1: rotate left by 1 while moving q12-q15 back into q4-q7
vshl.U32 q4, q12, #1
vsri.U32 q4, q12, #31

vshl.U32 q5, q13, #1
vsri.U32 q5, q13, #31

vshl.U32 q6, q14, #1
vsri.U32 q6, q14, #31

vshl.U32 q7, q15, #1
vsri.U32 q7, q15, #31
.endm
|
799
|
-
|
800
|
-
@ NOTE: The idea was to partially merge rho_e and theta. However, because P depends on the same registers it is XORed into, moving to the core registers saves no cycles: at no point can we use the barrel shifter, which is the only reason to prefer the core registers over the vector registers.
|
801
|
-
|
802
|
-
@ Xoodootimes4_PermuteAll_6rounds: void * argStates -> void
@
@ Loads the 192-byte interleaved state at r0 into q0-q11, applies six rounds
@ (theta, rho_w, iota+chi, rho_e) with the round constants listed below, and
@ stores the state back in place.
@ In:    r0 = argStates
@ Clobb: r1-r3, q0-q3, q12-q15, flags untouched by the macros' vector work;
@        r4 and d8-d15 are saved and restored. r0 ends 128 bytes past entry.
.align 8
.global Xoodootimes4_PermuteAll_6rounds
.type Xoodootimes4_PermuteAll_6rounds, %function
Xoodootimes4_PermuteAll_6rounds:
    vpush   {d8-d15}
    push    {r4}
    vldm    r0!, {d0-d15}               @ first 128 bytes of the state
    vldm    r0, {d16-d23}               @ remaining 64 bytes
    sub     r0, r0, #128                @ undo the write-back: 16 d-registers * 8 bytes

    @ One expansion per round. The constant is loaded after rho_w because
    @ rho_w overwrites r3.
    .irp    rc, 0x00000060, 0x0000002C, 0x00000380, 0x000000F0, 0x000001A0, 0x00000012
    theta
    rho_w
    mov     r3, #\rc
    chi
    rho_e
    .endr

    vstm    r0!, {d0-d15}
    vstm    r0, {d16-d23}
    pop     {r4}
    vpop    {d8-d15}
    bx      lr
|
854
|
-
|
855
|
-
@ Xoodootimes4_PermuteAll_12rounds: void * argStates -> void
@
@ Loads the 192-byte interleaved state at r0 into q0-q11, applies twelve
@ rounds (theta, rho_w, iota+chi, rho_e) with the round constants listed
@ below, and stores the state back in place.
@ In:    r0 = argStates
@ Clobb: r1-r3, q0-q3, q12-q15; r4-r5 and d8-d15 are saved and restored.
@        r0 ends 128 bytes past its entry value.
.align 8
.global Xoodootimes4_PermuteAll_12rounds
.type Xoodootimes4_PermuteAll_12rounds, %function
Xoodootimes4_PermuteAll_12rounds:
    vpush   {d8-d15}
    push    {r4-r5}
    vldm    r0!, {d0-d15}               @ first 128 bytes of the state
    vldm    r0, {d16-d23}               @ remaining 64 bytes
    sub     r0, r0, #128                @ undo the write-back of the first vldm

    @ One expansion per round. The constant is loaded after rho_w because
    @ rho_w overwrites r3.
    .irp    rc, 0x00000058, 0x00000038, 0x000003C0, 0x000000D0, 0x00000120, 0x00000014, 0x00000060, 0x0000002C, 0x00000380, 0x000000F0, 0x000001A0, 0x00000012
    theta
    rho_w
    mov     r3, #\rc
    chi
    rho_e
    .endr

    vstm    r0!, {d0-d15}
    vstm    r0, {d16-d23}
    pop     {r4-r5}
    vpop    {d8-d15}
    bx      lr
|
942
|
-
|
943
|
-
@
|
944
|
-
@ FASTLOOP SUPPORT
|
945
|
-
@
|
946
|
-
|
947
|
-
@ Xooffftimes4_AddIs: uchar * output -> uchar * input -> size_t bitLen -> void
@
@ output ^= input over bitLen bits. Whole 384-bit and 128-bit chunks are done
@ on NEON, then 32-bit words, then bytes. For a final partial byte the XOR
@ result is masked to its low (bitLen % 8) bits before being stored, i.e. the
@ upper bits of that output byte are written as zero.
@ In:    r0 = output, r1 = input, r2 = bitLen
@ Clobb: r0-r3, r12, q0-q2, q8-q10, flags. Leaf routine: only caller-saved
@        registers are used, so nothing is pushed.
.align 8
.global Xooffftimes4_AddIs
.type Xooffftimes4_AddIs, %function
Xooffftimes4_AddIs:
    @ When unaligned always skip to 32 (the vldm/vstm paths are only taken
    @ for word-aligned pointers).
    @ NOTE(review): the 32-bit path still uses ldr/str on the possibly
    @ unaligned pointers, as the original did - this relies on unaligned word
    @ access being enabled on the target; confirm.
    tst     r0, #3
    bne     Xft4_AddIs_32
    tst     r1, #3
    bne     Xft4_AddIs_32
Xft4_AddIs_384:                         @ Test core registers and interleaving.
    cmp     r2, #384
    bcc     Xft4_AddIs_128
    vldm    r0, {d0-d5}
    vldm    r1!, {d16-d19}
    veor    q0, q0, q8
    vldm    r1!, {d20-d21}
    veor    q1, q1, q9
    veor    q2, q2, q10
    vstm    r0!, {d0-d5}
    subs    r2, #384
    beq     Xft4_AddIs_0
    b       Xft4_AddIs_384
Xft4_AddIs_128:                         @ Test if core registers are faster here...
    cmp     r2, #128
    bcc     Xft4_AddIs_32
    vldm    r0, {d0-d1}
    vldm    r1!, {d2-d3}
    veor    q0, q0, q1
    vstm    r0!, {d0-d1}
    subs    r2, #128
    beq     Xft4_AddIs_0
    b       Xft4_AddIs_128
Xft4_AddIs_32:                          @ Add 64-support
    cmp     r2, #32
    bcc     Xft4_AddIs_8
    ldr     r3, [r0]
    ldr     r12, [r1], #4
    eor     r3, r3, r12
    str     r3, [r0], #4
    sub     r2, r2, #32
    b       Xft4_AddIs_32
Xft4_AddIs_8:
    cmp     r2, #8
    bcc     Xft4_AddIs_7
    ldrb    r3, [r0]
    ldrb    r12, [r1], #1
    eor     r3, r3, r12
    strb    r3, [r0], #1
    sub     r2, r2, #8
    b       Xft4_AddIs_8
Xft4_AddIs_7:
    cmp     r2, #0
    beq     Xft4_AddIs_0
    ldrb    r3, [r0]
    ldrb    r12, [r1]
    eor     r3, r3, r12
    mov     r12, #1
    lsl     r12, r12, r2
    sub     r12, r12, #1                @ r12 = (1 << remaining bits) - 1
    and     r3, r3, r12                 @ keep only the remaining low bits
    strb    r3, [r0]
Xft4_AddIs_0:
    bx      lr
|
1012
|
-
|
1013
|
-
@ theta_star: same instruction sequence as the theta macro, but with r5-r8 as
@ the core scratch registers instead of r1-r4.
@ State in q0-q11 (q0-q3 = plane 0, q4-q7 = plane 1, q8-q11 = plane 2).
@ For each column x: P = a0x ^ a1x ^ a2x, E = rotl(P, 5) ^ rotl(P, 14)
@ (ror #27 followed by "x ^ (x ror #23)"), and E is XORed into the three
@ lanes of the next column (column 3 feeds column 0).
@ Clobbers: r5-r8, q14, q15. Each parity is formed before the lanes it reads
@ are updated, so the statement order must be kept.
.macro theta_star
@ P3 = parity of column 3
veor q15, q3, q7
veor q15, q15, q11

@ E3 in q15; meanwhile P0 = parity of column 0 is built in q14
vmov r7, r8, d30
vmov r5, r6, d31
ror r7, r7, #27
veor q14, q0, q4
ror r8, r8, #27
veor q14, q14, q8
ror r5, r5, #27
ror r6, r6, #27
eor r7, r7, r7, ror #23
eor r8, r8, r8, ror #23
eor r5, r5, r5, ror #23
vmov d30, r7, r8
eor r6, r6, r6, ror #23
vmov d31, r5, r6

@ E0 in q14; column 0 ^= E3; then q15 is reused for P1 = parity of column 1
vmov r7, r8, d28
vmov r5, r6, d29
ror r7, r7, #27
veor q0, q0, q15
ror r8, r8, #27
veor q4, q4, q15
ror r5, r5, #27
veor q8, q8, q15
ror r6, r6, #27
veor q15, q1, q5
eor r7, r7, r7, ror #23
veor q15, q15, q9
eor r8, r8, r8, ror #23
eor r5, r5, r5, ror #23
vmov d28, r7, r8
eor r6, r6, r6, ror #23
vmov d29, r5, r6

@ E1 in q15; column 1 ^= E0; then q14 is reused for P2 = parity of column 2
vmov r7, r8, d30
vmov r5, r6, d31
ror r7, r7, #27
veor q1, q1, q14
ror r8, r8, #27
veor q5, q5, q14
ror r5, r5, #27
veor q9, q9, q14
ror r6, r6, #27
veor q14, q2, q6
eor r7, r7, r7, ror #23
veor q14, q14, q10
eor r8, r8, r8, ror #23
eor r5, r5, r5, ror #23
vmov d30, r7, r8
eor r6, r6, r6, ror #23
vmov d31, r5, r6

@ E2 in q14; column 2 ^= E1; finally column 3 ^= E2
vmov r7, r8, d28
vmov r5, r6, d29
ror r7, r7, #27
veor q2, q2, q15
ror r8, r8, #27
veor q6, q6, q15
ror r5, r5, #27
veor q10, q10, q15
ror r6, r6, #27
eor r7, r7, r7, ror #23
eor r8, r8, r8, ror #23
eor r5, r5, r5, ror #23
vmov d28, r7, r8
eor r6, r6, r6, ror #23
vmov d29, r5, r6
veor q3, q3, q14
veor q7, q7, q14
veor q11, q11, q14
.endm
|
1087
|
-
|
1088
|
-
@ rho_w_star: same as rho_w, using r5-r8 as core scratch registers.
@ Rotates every lane of plane 2 (q8-q11) left by 11 bits into q12-q15:
@ q13-q15 on NEON (vshl #11 + vsri #21), q12 via core registers
@ (ror #21 == rotate left by 11). q8-q11 are left unchanged.
@ Clobbers: r5-r8, q12-q15. r7 is overwritten, so a round constant for
@ chi_star has to be loaded into r7 after this macro.
.macro rho_w_star
vmov r5, r6, d16
vshl.U32 q13, q9, #11
vmov r7, r8, d17
vsri.U32 q13, q9, #21
ror r5, r5, #21
vshl.U32 q14, q10, #11
ror r6, r6, #21
vsri.U32 q14, q10, #21
ror r7, r7, #21
vshl.U32 q15, q11, #11
ror r8, r8, #21
vsri.U32 q15, q11, #21
vmov d24, r5, r6
vmov d25, r7, r8
.endm
|
1104
|
-
|
1105
|
-
@ chi_star: same as chi, but the round constant is taken from r7.
@ First XORs the constant into lane 0 of plane 0 for all four instances
@ (iota), then applies a0 ^= a2 & ~a1, a1 ^= a0 & ~a2, a2 ^= a1 & ~a0 per
@ column (vbic qd, qn, qm computes qn & ~qm).
@ Inputs:  plane 0 in q0-q3; plane 1 in q4-q7, read one column off
@          (q7 pairs with q0, q4 with q1, q5 with q2, q6 with q3);
@          plane 2 in q12-q15; r7 = round constant.
@ Outputs: plane 0 in q0-q3; plane 1 in q12-q15;
@          plane 2 in q8, q9, q10, q4 (columns 0, 1, 2, 3).
@ Scratch registers are recycled from lanes that were just consumed, so the
@ statement order must be kept.
.macro chi_star
@ NOTE: Iota
vdup.32 q8, r7
veor q0, q0, q8

@ Column 0: q0, q7, q12  ->  a0 in q0, a1 in q12, a2 in q8
vbic q11, q12, q7
vbic q9, q0, q12
vbic q10, q7, q0
veor q8, q10, q12
veor q12, q7, q9
veor q0, q0, q11

@ Column 1: q1, q4, q13  ->  a0 in q1, a1 in q13, a2 in q9
vbic q7, q13, q4
vbic q10, q1, q13
vbic q11, q4, q1
veor q9, q11, q13
veor q13, q4, q10
veor q1, q1, q7

@ Column 2: q2, q5, q14  ->  a0 in q2, a1 in q14, a2 in q10
vbic q4, q14, q5
vbic q11, q2, q14
vbic q7, q5, q2
veor q10, q7, q14
veor q14, q5, q11
veor q2, q2, q4

@ Column 3: q3, q6, q15  ->  a0 in q3, a1 in q15, a2 in q4
vbic q5, q15, q6
vbic q7, q3, q15
vbic q4, q6, q3
veor q4, q4, q15
veor q15, q6, q7
veor q3, q3, q5
.endm
|
1138
|
-
|
1139
|
-
@ rho_e_star: same as rho_e, using r5-r8 as core scratch registers.
@ Takes the layout left by chi_star and restores the canonical one
@ (plane 1 in q4-q7, plane 2 in q8-q11).
@   plane 2: every lane is rotated left by 8 bits and moved two columns over:
@            q11 <- rotl8(q9), q9 <- rotl8(q4), q8 <- rotl8(q10),
@            q10 <- rotl8(q8) (done in core registers: ror #24).
@   plane 1: q4-q7 <- rotl1(q12-q15).
@ Clobbers: r5-r8. Every source register is fully read before it is
@ overwritten, so the order must be kept.
.macro rho_e_star
vshl.U32 q11, q9, #8
vsri.U32 q11, q9, #24

vshl.U32 q9, q4, #8
vsri.U32 q9, q4, #24

@ q8 goes through r5-r8 so that q8 and q10 can swap roles without a spare q register
vmov r5, r6, d16
vmov r7, r8, d17
ror r5, r5, #24
vshl.U32 q8, q10, #8
ror r6, r6, #24
vsri.U32 q8, q10, #24
ror r7, r7, #24
vmov d20, r5, r6
ror r8, r8, #24
vmov d21, r7, r8

@ plane 1: rotate left by 1 while moving q12-q15 back into q4-q7
vshl.U32 q4, q12, #1
vsri.U32 q4, q12, #31

vshl.U32 q5, q13, #1
vsri.U32 q5, q13, #31

vshl.U32 q6, q14, #1
vsri.U32 q6, q14, #31

vshl.U32 q7, q15, #1
vsri.U32 q7, q15, #31
.endm
|
1169
|
-
|
1170
|
-
@ One round built from the *_star macros: theta, rho_w, iota+chi, rho_e.
@ \rc is the round constant; it is loaded into r7 after rho_w_star because
@ that macro overwrites r7.
.macro xoodoo_6_star_round rc
    theta_star
    rho_w_star
    mov     r7, #\rc
    chi_star
    rho_e_star
.endm

@ xoodoo_6_star: six rounds on the state in q0-q11, using only r5-r8 as core
@ scratch registers (plus q12-q15 as vector scratch).
.macro xoodoo_6_star
    xoodoo_6_star_round 0x00000060
    xoodoo_6_star_round 0x0000002C
    xoodoo_6_star_round 0x00000380
    xoodoo_6_star_round 0x000000F0
    xoodoo_6_star_round 0x000001A0
    xoodoo_6_star_round 0x00000012
.endm
|
1207
|
-
|
1208
|
-
@ focus_c: loads 192 bytes from [r2] into d8-d31 (q4-q15) and advances r2 by
@ 192. If r2 is 4-byte aligned this is two vldm instructions; otherwise the
@ same bytes are fetched with ldmia through r4-r9 and moved into the d
@ registers pair by pair.
@ Clobbers: r4-r9 (fallback path only), q4-q15, flags.
@ The labels carry the \@ expansion counter so the macro can be expanded more
@ than once in a file without redefining symbols.
@ NOTE(review): the fallback path still uses ldmia on the unaligned pointer;
@ whether that behaves as intended depends on the target - confirm before
@ relying on it.
.macro focus_c
    tst     r2, #3
    beq     focus_c_focused_\@
focus_c_unfocused_\@:
    ldmia   r2!, {r4-r9}
    vmov    d8, r4, r5
    vmov    d9, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d10, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d11, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d12, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d13, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d14, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d15, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d16, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d17, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d18, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d19, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d20, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d21, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d22, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d23, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d24, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d25, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d26, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d27, r6, r7
    ldmia   r2!, {r4-r5}
    vmov    d28, r8, r9
    ldmia   r2!, {r6-r7}
    vmov    d29, r4, r5
    ldmia   r2!, {r8-r9}
    vmov    d30, r6, r7
    vmov    d31, r8, r9
    b       focus_c_snapped_\@
focus_c_focused_\@:
    vldm    r2!, {d8-d23}
    vldm    r2!, {d24-d31}
focus_c_snapped_\@:
.endm
|
1264
|
-
|
1265
|
-
@ zip_x: transposes four sequential 12-word blocks into lane-sliced form.
@ In:  q4-q6 = block A, q7-q9 = block B, q10-q12 = block C, q13-q15 = block D
@      (each block is twelve consecutive 32-bit words).
@ Out: q0-q11, where q<i> holds word i of blocks A, B, C, D in its four
@      32-bit elements.
@ Each vuzp/vuzp/vtrn/vtrn group transposes four words of all four blocks;
@ the comment after a group lists which registers then hold words
@ 4g, 4g+1, 4g+2, 4g+3. The vmov chain afterwards only renames registers.
@ q12-q15 are left holding stale copies.
.macro zip_x
@ Shatter
vuzp.32 q4, q10
vuzp.32 q7, q13
vtrn.32 q4, q7
vtrn.32 q10, q13
@ q4, q10, q7, q13

vuzp.32 q5, q11
vuzp.32 q8, q14
vtrn.32 q5, q8
vtrn.32 q11, q14
@ q5, q11, q8, q14

vuzp.32 q6, q12
vuzp.32 q9, q15
vtrn.32 q6, q9
vtrn.32 q12, q15
@ q6, q12, q9, q15

@ Reordering (merge later, this is for convenience) (try merge up first!)
@ Each destination is written only after its old contents have been copied
@ elsewhere (or are no longer needed), so the order of these moves matters.
vmov q0, q4
vmov q4, q5
vmov q5, q11
vmov q11, q15

vmov q1, q10
vmov q10, q9
vmov q9, q12

vmov q2, q7
vmov q7, q14

vmov q3, q13

vswp q6, q8
.endm
|
1302
|
-
|
1303
|
-
@ roll_zip_c: produces four successive rolled key states from the 12-word
@ state at [r0], XORs them into the four input blocks already held in q4-q15,
@ writes the state advanced by four rolls back to [r0], and finishes with
@ zip_x. Word numbers in the comments: 0-11 are the words loaded from [r0],
@ 12-15 are the newly computed words.
@ Each new word is  w ^ (w << 13) ^ rotl(v, 3)  (ror #29 == rotate left 3),
@ with (w, v) = (0, 4), (4, 8), (8, 1), (1, 5) for words 12, 13, 14, 15.
@ In:  r0 = key state (48 bytes, read and rewritten), q4-q15 = input blocks.
@ Out: q0-q11 = lane-sliced XOR results (see zip_x).
@ Clobbers: r4-r8, q0-q3, q12-q15.
@ NOTE(review): vldm/vstm on r0 presumably need a 4-byte aligned pointer;
@ nothing here checks it.
.macro roll_zip_c
@ Key seed bytes
vldm r0, {d0-d5}

@ Get keystream generation inputs
vmov r4, r5, d0 @ 0,1
vmov r6, r7, d2 @ 4,5
vmov r8, s8 @ 8

@ Every register is consumed as the "v" operand of the previous word before
@ it is overwritten with its own new word, hence the order 12, 13, 14, 15.
eor r4, r4, r4, lsl #13
eor r4, r4, r6, ror #29
@ r4 = 12
eor r6, r6, r6, lsl #13
eor r6, r6, r8, ror #29
@ r6 = 13
eor r8, r8, r8, lsl #13
eor r8, r8, r5, ror #29
@ r8 = 14
eor r5, r5, r5, lsl #13
eor r5, r5, r7, ror #29
@ r5 = 15

@ XORs that use the key words as loaded (before q0-q2 are rearranged below).
@ 0,1,2,3
veor q4, q0, q4
@ 4,5,6,7
veor q5, q1, q5
veor q7, q1, q7
@ 8,9,10,11
veor q6, q2, q6
veor q8, q2, q8
veor q10, q2, q10

@ q3 = words 1,2,3,12
vmov s12, s1
vmov s13, s2
vmov s14, s3
vmov s15, r4

@ q0 = words 5,6,7,13
vmov s0, s5
vmov s1, s6
vmov s2, s7
vmov s3, r6

@ q1 = words 9,10,11,14
vmov s4, s9
vmov s5, s10
vmov s6, s11
vmov s7, r8

@ q2 = words 2,3,12,15; q0-q2 now form the state stored back for the next call
vmov s8, s13
vmov s9, s14
vmov d5, r4, r5
vstm r0, {d0-d5}

@ 1,2,3,12
veor q9, q3, q9
veor q11, q3, q11
veor q13, q3, q13

@ 5,6,7,13
veor q12, q0, q12
veor q14, q0, q14

@ 9,10,11,14
veor q15, q1, q15

zip_x
.endm
|
1369
|
-
|
1370
|
-
@ accumulate: XORs the four lane-sliced states in q0-q11 into the 12-word
@ accumulator at [r1].
@ Each vtrn/vtrn/vzip/vzip group turns four lane-sliced registers back into
@ one register per instance (in the order A, C, B, D); the four are then
@ XORed together and into the matching accumulator quarter:
@ q12 = words 0-3, q13 = words 4-7, q14 = words 8-11.
@ In:  r1 = accumulator (48 bytes, read and rewritten), q0-q11 = state.
@ Clobbers: q0-q14 (q0-q11 no longer hold the state afterwards).
.macro accumulate
vldm r1, {d24-d29}

@ words 0-3
vtrn.32 q0, q2
vtrn.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3

veor q0, q0, q1
veor q2, q2, q3
veor q12, q12, q0
veor q12, q12, q2

@ words 4-7
vtrn.32 q4, q6
vtrn.32 q5, q7
vzip.32 q4, q5
vzip.32 q6, q7

veor q4, q4, q5
veor q6, q6, q7
veor q13, q13, q4
veor q13, q13, q6

@ words 8-11
vtrn.32 q8, q10
vtrn.32 q9, q11
vzip.32 q8, q9
vzip.32 q10, q11

veor q8, q8, q9
veor q10, q10, q11
veor q14, q14, q8
veor q14, q14, q10

vstm r1, {d24-d29}
.endm
|
1405
|
-
|
1406
|
-
@ Xooffftimes4_CompressFastLoop: uchar * k -> uchar * x -> uchar * input -> size_t length -> size_t
@
@ Processes the input in groups of 192 bytes (four 48-byte blocks per
@ iteration): load, roll the key and add the message, six rounds, then
@ accumulate into x. Returns the number of bytes consumed, a multiple of 192.
@ Returns 0 without touching memory when input is not 4-byte aligned or when
@ length is below 192.
@ In:    r0 = k (rolled in place), r1 = x (accumulator), r2 = input,
@        r3 = length
@ Out:   r0 = bytes processed
@ Clobb: r1-r3, q0-q3, q12-q15, flags; r4-r10, lr and d8-d15 are saved and
@        restored.
@ NOTE(review): k and x are accessed with vldm/vstm and are assumed to be
@ 4-byte aligned; only `input` is checked here.
.align 8
.global Xooffftimes4_CompressFastLoop
.type Xooffftimes4_CompressFastLoop, %function
Xooffftimes4_CompressFastLoop:
    @ Do not use this function for unaligned access (for now).
    tst     r2, #3
    movne   r0, #0
    bxne    lr

    @ r3 = length - 192. A borrow means there is not even one full group;
    @ without this check the counter would wrap and the loop would run past
    @ the end of the buffers.
    subs    r3, r3, #192
    movcc   r0, #0
    bxcc    lr

    push    {r4-r10, lr}                @ Save LR, macros might branch.
    vpush   {d8-d15}
    mov     r10, #0                     @ r10 = bytes processed so far
Xft4_CompressFast:
    focus_c                             @ Load 192 input bytes into q4-q15 (input is aligned here)
    roll_zip_c                          @ Roll_c with message addition (XOR)
    xoodoo_6_star                       @ Same as Xoodoo_6; different registers
    accumulate                          @ Add up the four states we processed
    add     r10, #192
    subs    r3, #192                    @ another full group left?
    bcs     Xft4_CompressFast
    mov     r0, r10
    vpop    {d8-d15}
    pop     {r4-r10, pc}
|
1431
|
-
|
1432
|
-
@ roll_zip_e: produces four successive rolled states from the 12-word state
@ at [r0], copies them into q4-q15 (no message is added), writes the state
@ advanced by four rolls back to [r0], and finishes with zip_x.
@ Word numbers in the comments: 0-11 are the words loaded from [r0], 12-15
@ are the newly computed words.
@ Each new word is  (v & u) ^ rotl(w, 5) ^ rotl(v, 13) ^ 7
@ (ror #27 == rotate left 5, ror #19 == rotate left 13), with
@ (w, v, u) = (0, 4, 8), (4, 8, 1), (8, 1, 5), (1, 5, 9) for words 12-15.
@ In:  r0 = state (48 bytes, read and rewritten).
@ Out: q0-q11 = the four states in lane-sliced form (see zip_x).
@ Clobbers: r4-r10, q0-q3, q12-q15. r10 is used as scratch here, so a caller
@ cannot keep a live value in r10 across this macro.
.macro roll_zip_e
vldm r0, {d0-d5}

@ Get keystream generation inputs
vmov r4, r5, d0 @ 0,1
vmov r6, r7, d2 @ 4,5
vmov r8, r9, d4 @ 8,9

@ Every register is consumed as an operand of the previous word before it is
@ overwritten with its own new word, hence the order 12, 13, 14, 15.
and r10, r6, r8
eor r4, r10, r4, ror #27
eor r4, r4, r6, ror #19
eor r4, r4, #7
@ r4 = 12

and r10, r8, r5
eor r6, r10, r6, ror #27
eor r6, r6, r8, ror #19
eor r6, r6, #7
@r6 = 13

and r10, r5, r7
eor r8, r10, r8, ror #27
eor r8, r8, r5, ror #19
eor r8, r8, #7
@r8 = 14

and r10, r7, r9
eor r5, r10, r5, ror #27
eor r5, r5, r7, ror #19
eor r5, r5, #7
@r5 = 15

@ Copies taken from the words as loaded (before q0-q2 are rearranged below).
@ 0,1,2,3
vmov q4, q0
@ 4,5,6,7
vmov q5, q1
vmov q7, q1
@ 8,9,10,11
vmov q6, q2
vmov q8, q2
vmov q10, q2

@ Optimize movement here. Merge into zip_x or VLDM.
@ q3 = words 1,2,3,12
vmov s12, s1
vmov s13, s2
vmov s14, s3
vmov s15, r4

@ q0 = words 5,6,7,13
vmov s0, s5
vmov s1, s6
vmov s2, s7
vmov s3, r6

@ q1 = words 9,10,11,14
vmov s4, s9
vmov s5, s10
vmov s6, s11
vmov s7, r8

@ q2 = words 2,3,12,15; q0-q2 now form the state stored back for the next call
vmov s8, s13
vmov s9, s14
vmov d5, r4, r5
vstm r0, {d0-d5}

@ 1,2,3,12
vmov q9, q3
vmov q11, q3
vmov q13, q3

@ 5,6,7,13
vmov q12, q0
vmov q14, q0

@ 9,10,11,14
vmov q15, q1

zip_x
.endm
|
1509
|
-
|
1510
|
-
@ sequentiate: converts the four lane-sliced states in q0-q11 back into four
@ sequential 48-byte blocks, XORs the 48 bytes at [r1] into each block, and
@ writes the blocks to r2, r2+48, r2+96 and r2+144. r2 is advanced by 192.
@ After each vtrn/vtrn/vzip/vzip group the four registers hold one instance
@ each, in the order A, C, B, D - which is why the second and third registers
@ of a group go to r5 (r2+96) and r4 (r2+48) respectively.
@ In:  r1 = 48-byte value XORed into every block (read only),
@      r2 = output pointer, q0-q11 = state.
@ Clobbers: r4-r6, q0-q14.
.macro sequentiate
@ Roll_e_n -> Pe + kRoll = Zn
vldm r1, {d24-d29}

@ r4, r5, r6 = output pointers for the second, third and fourth block
add r4, r2, #48
add r5, r4, #48
add r6, r5, #48

vtrn.32 q0, q2
vtrn.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
@ 0 1 2 3 for A C B D

veor q0, q0, q12
veor q1, q1, q12
veor q2, q2, q12
veor q3, q3, q12

@ words 0-3 of every block; write-back leaves each pointer at word 4
vstm r2!, {d0-d1}
vstm r4!, {d4-d5}
vstm r5!, {d2-d3}
vstm r6!, {d6-d7}

vtrn.32 q4, q6
vtrn.32 q5, q7
vzip.32 q4, q5
vzip.32 q6, q7
@ 4 5 6 7 for A C B D

@ Results are placed so that each block's words 4-7 and 8-11 end up in
@ adjacent q registers (q0/q1, q2/q3, q4/q5, q6/q7). Each destination is only
@ written after its old value has been read.
veor q0, q4, q13
veor q2, q5, q13
veor q4, q6, q13
veor q6, q7, q13

vtrn.32 q8, q10
vtrn.32 q9, q11
vzip.32 q8, q9
vzip.32 q10, q11
@ 8 9 10 11 for A C B D

veor q1, q8, q14
veor q3, q9, q14
veor q5, q10, q14
veor q7, q11, q14

@ words 4-11 of every block; only r6 is written back, it then points just
@ past the fourth block
vstm r2, {d0-d3}
vstm r4, {d8-d11}
vstm r5, {d4-d7}
vstm r6!, {d12-d15}

@ next output position = end of the fourth block
mov r2, r6
.endm
|
1563
|
-
|
1564
|
-
@ Xooffftimes4_ExpandFastLoop: uchar * yAccu -> uchar * kRoll -> uchar * output -> size_t length -> size_t
@
@ Produces output in groups of 192 bytes (four 48-byte blocks per iteration):
@ roll yAccu four times, six rounds on the four states, then XOR kRoll into
@ each block and store. Returns the number of bytes written, a multiple of
@ 192. Returns 0 without touching memory when output is not 4-byte aligned or
@ when length is below 192.
@ In:    r0 = yAccu (rolled in place), r1 = kRoll (read only), r2 = output,
@        r3 = length
@ Out:   r0 = bytes produced
@ Clobb: r1-r3, q0-q3, q12-q15, flags; r4-r11, lr and d8-d15 are saved and
@        restored.
@ NOTE(review): yAccu and kRoll are accessed with vldm/vstm and are assumed to
@ be 4-byte aligned; only `output` is checked here.
.align 8
.global Xooffftimes4_ExpandFastLoop
.type Xooffftimes4_ExpandFastLoop, %function
Xooffftimes4_ExpandFastLoop:
    @ Do not use this function for unaligned access (for now).
    tst     r2, #3
    movne   r0, #0
    bxne    lr

    @ r3 = length - 192. A borrow means there is not even one full group;
    @ without this check the counter would wrap and the loop would run past
    @ the end of the output buffer.
    subs    r3, r3, #192
    movcc   r0, #0
    bxcc    lr

    push    {r4-r11, lr}                @ Save LR, macros might branch.
    vpush   {d8-d15}
    mov     r11, #0                     @ r11 = bytes produced so far (r10 is scratch in roll_zip_e)
    @ NOTE(review): the original author left the remark "The second loop
    @ breaks something." on this label; no cause is visible in this routine -
    @ verify multi-iteration output against a reference implementation.
Xft4_ExpandFast:
    roll_zip_e
    xoodoo_6_star
    sequentiate
    add     r11, #192
    subs    r3, #192                    @ another full group left?
    bcs     Xft4_ExpandFast
    mov     r0, r11
    vpop    {d8-d15}
    pop     {r4-r11, pc}
|