@leocuvee/wrkzcoin-multi-hashing 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.travis.yml +5 -0
- package/LICENSE +674 -0
- package/README.md +87 -0
- package/appveyor.yml +12 -0
- package/argon2/.gitattributes +10 -0
- package/argon2/.travis.yml +25 -0
- package/argon2/Argon2.sln +160 -0
- package/argon2/CHANGELOG.md +32 -0
- package/argon2/CMakeLists.txt +87 -0
- package/argon2/LICENSE +314 -0
- package/argon2/Makefile +196 -0
- package/argon2/README.md +297 -0
- package/argon2/appveyor.yml +40 -0
- package/argon2/argon2-specs.pdf +0 -0
- package/argon2/export.sh +7 -0
- package/argon2/include/argon2.h +427 -0
- package/argon2/latex/CMakeLists.txt +34 -0
- package/argon2/latex/IEEEtran.cls +6347 -0
- package/argon2/latex/Makefile +18 -0
- package/argon2/latex/argon2-specs.tex +920 -0
- package/argon2/latex/pics/argon2-par.pdf +0 -0
- package/argon2/latex/pics/compression.pdf +0 -0
- package/argon2/latex/pics/generic.pdf +0 -0
- package/argon2/latex/pics/power-distribution.jpg +0 -0
- package/argon2/latex/tradeoff.bib +822 -0
- package/argon2/libargon2.pc +16 -0
- package/argon2/man/CMakeLists.txt +8 -0
- package/argon2/man/argon2.1 +57 -0
- package/argon2/meson.build +16 -0
- package/argon2/meson_options.txt +1 -0
- package/argon2/src/CMakeLists.txt +147 -0
- package/argon2/src/argon2.c +452 -0
- package/argon2/src/argon2.pc.in +11 -0
- package/argon2/src/blake2/blake2-impl.h +156 -0
- package/argon2/src/blake2/blake2.h +89 -0
- package/argon2/src/blake2/blake2b.c +390 -0
- package/argon2/src/blake2/blamka-round-opt.h +471 -0
- package/argon2/src/blake2/blamka-round-ref.h +56 -0
- package/argon2/src/core.c +634 -0
- package/argon2/src/core.h +228 -0
- package/argon2/src/encoding.c +467 -0
- package/argon2/src/encoding.h +57 -0
- package/argon2/src/genkat.h +51 -0
- package/argon2/src/meson.build +68 -0
- package/argon2/src/opt.c +283 -0
- package/argon2/src/optimization/CMakeLists.txt +10 -0
- package/argon2/src/ref.c +194 -0
- package/argon2/src/thread.c +57 -0
- package/argon2/src/thread.h +67 -0
- package/argon2/tests/CMakeLists.txt +43 -0
- package/argon2/tests/bench.c +111 -0
- package/argon2/tests/genkat.c +207 -0
- package/argon2/tests/kats/argon2d +12304 -0
- package/argon2/tests/kats/argon2d.shasum +1 -0
- package/argon2/tests/kats/argon2d_v16 +12304 -0
- package/argon2/tests/kats/argon2d_v16.shasum +1 -0
- package/argon2/tests/kats/argon2i +12304 -0
- package/argon2/tests/kats/argon2i.shasum +1 -0
- package/argon2/tests/kats/argon2i_v16 +12304 -0
- package/argon2/tests/kats/argon2i_v16.shasum +1 -0
- package/argon2/tests/kats/argon2id +12304 -0
- package/argon2/tests/kats/argon2id.shasum +1 -0
- package/argon2/tests/kats/argon2id_v16 +12304 -0
- package/argon2/tests/kats/argon2id_v16.shasum +1 -0
- package/argon2/tests/kats/check-sums.ps1 +48 -0
- package/argon2/tests/kats/check-sums.sh +16 -0
- package/argon2/tests/kats/test.ps1 +132 -0
- package/argon2/tests/kats/test.sh +117 -0
- package/argon2/tests/meson.build +34 -0
- package/argon2/tests/test.c +289 -0
- package/argon2/tool/CMakeLists.txt +7 -0
- package/argon2/tool/main.c +339 -0
- package/argon2/tool/meson.build +8 -0
- package/argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj +226 -0
- package/argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj.filters +69 -0
- package/argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj +226 -0
- package/argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj.filters +69 -0
- package/argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj +225 -0
- package/argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj.filters +66 -0
- package/argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj +239 -0
- package/argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj.filters +72 -0
- package/argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj +227 -0
- package/argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj.filters +69 -0
- package/argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj +226 -0
- package/argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj.filters +69 -0
- package/argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj +226 -0
- package/argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj.filters +69 -0
- package/argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj +225 -0
- package/argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj.filters +66 -0
- package/argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj +227 -0
- package/argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj.filters +72 -0
- package/argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj +226 -0
- package/argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj.filters +69 -0
- package/bcrypt.c +566 -0
- package/bcrypt.h +14 -0
- package/binding.gyp +93 -0
- package/blake.c +17 -0
- package/blake.h +16 -0
- package/boolberry.cc +11 -0
- package/boolberry.h +6 -0
- package/build/Makefile +354 -0
- package/build/Release/.deps/Release/multihashing.node.d +1 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/argon2.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/blake2/blake2b.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/core.o.d +10 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/encoding.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/ref.o.d +14 -0
- package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/thread.o.d +5 -0
- package/build/Release/.deps/Release/obj.target/multihashing/bcrypt.o.d +4 -0
- package/build/Release/.deps/Release/obj.target/multihashing/blake.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/boolberry.o.d +12 -0
- package/build/Release/.deps/Release/obj.target/multihashing/c11.o.d +20 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/aesb.o.d +3 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_blake256.o.d +5 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_groestl.o.d +10 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_jh.o.d +9 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_keccak.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_skein.o.d +10 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/hash.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/oaes_lib.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/crypto/wild_keccak.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_dark.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_dark_lite.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_fast.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_lite.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_soft_shell.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_turtle.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_turtle_lite.o.d +18 -0
- package/build/Release/.deps/Release/obj.target/multihashing/fresh.o.d +10 -0
- package/build/Release/.deps/Release/obj.target/multihashing/fugue.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/groestl.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/hefty1.o.d +12 -0
- package/build/Release/.deps/Release/obj.target/multihashing/keccak.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/multihashing.o.d +155 -0
- package/build/Release/.deps/Release/obj.target/multihashing/nist5.o.d +12 -0
- package/build/Release/.deps/Release/obj.target/multihashing/quark.o.d +14 -0
- package/build/Release/.deps/Release/obj.target/multihashing/qubit.o.d +12 -0
- package/build/Release/.deps/Release/obj.target/multihashing/scryptjane.o.d +30 -0
- package/build/Release/.deps/Release/obj.target/multihashing/scryptn.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha1.o.d +24 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/aes_helper.o.d +5 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/hamsi.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_blake.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_bmw.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_cubehash.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_echo.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_fugue.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_groestl.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_hefty1.o.d +5 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_jh.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_keccak.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_luffa.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_shabal.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_shavite.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_simd.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_skein.o.d +6 -0
- package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_whirlpool.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/shavite3.o.d +7 -0
- package/build/Release/.deps/Release/obj.target/multihashing/skein.o.d +8 -0
- package/build/Release/.deps/Release/obj.target/multihashing/x11.o.d +20 -0
- package/build/Release/.deps/Release/obj.target/multihashing/x13.o.d +23 -0
- package/build/Release/.deps/Release/obj.target/multihashing/x15.o.d +26 -0
- package/build/Release/.deps/Release/obj.target/multihashing.node.d +1 -0
- package/build/Release/multihashing.node +0 -0
- package/build/binding.Makefile +6 -0
- package/build/multihashing.target.mk +255 -0
- package/c11.c +85 -0
- package/c11.h +17 -0
- package/crypto/aesb.c +177 -0
- package/crypto/c_blake256.c +326 -0
- package/crypto/c_blake256.h +43 -0
- package/crypto/c_groestl.c +360 -0
- package/crypto/c_groestl.h +56 -0
- package/crypto/c_jh.c +367 -0
- package/crypto/c_jh.h +20 -0
- package/crypto/c_keccak.c +112 -0
- package/crypto/c_keccak.h +26 -0
- package/crypto/c_skein.c +2036 -0
- package/crypto/c_skein.h +45 -0
- package/crypto/crypto.h +186 -0
- package/crypto/cryptonote_core/account.cpp +50 -0
- package/crypto/cryptonote_core/account.h +61 -0
- package/crypto/cryptonote_core/cryptonote_basic_impl.cpp +186 -0
- package/crypto/cryptonote_core/cryptonote_basic_impl.h +65 -0
- package/crypto/cryptonote_core/cryptonote_format_utils.cpp +766 -0
- package/crypto/cryptonote_core/cryptonote_format_utils.h +30 -0
- package/crypto/cryptonote_protocol/cryptonote_protocol_defs.h +152 -0
- package/crypto/groestl_tables.h +38 -0
- package/crypto/hash-ops.h +57 -0
- package/crypto/hash.c +24 -0
- package/crypto/hash.h +22 -0
- package/crypto/int-util.h +230 -0
- package/crypto/oaes_config.h +50 -0
- package/crypto/oaes_lib.c +1468 -0
- package/crypto/oaes_lib.h +215 -0
- package/crypto/skein_port.h +190 -0
- package/crypto/variant2_int_sqrt.h +168 -0
- package/crypto/wild_keccak.cpp +119 -0
- package/crypto/wild_keccak.h +168 -0
- package/cryptonight.c +300 -0
- package/cryptonight.h +17 -0
- package/cryptonight_dark.c +300 -0
- package/cryptonight_dark.h +17 -0
- package/cryptonight_dark_lite.c +300 -0
- package/cryptonight_dark_lite.h +17 -0
- package/cryptonight_fast.c +300 -0
- package/cryptonight_fast.h +17 -0
- package/cryptonight_lite.c +300 -0
- package/cryptonight_lite.h +17 -0
- package/cryptonight_soft_shell.c +298 -0
- package/cryptonight_soft_shell.h +17 -0
- package/cryptonight_turtle.c +300 -0
- package/cryptonight_turtle.h +17 -0
- package/cryptonight_turtle_lite.c +300 -0
- package/cryptonight_turtle_lite.h +17 -0
- package/fresh.c +42 -0
- package/fresh.h +16 -0
- package/fugue.c +12 -0
- package/fugue.h +16 -0
- package/groestl.c +40 -0
- package/groestl.h +17 -0
- package/hefty1.c +63 -0
- package/hefty1.h +16 -0
- package/index.js +1 -0
- package/keccak.c +14 -0
- package/keccak.h +16 -0
- package/leocuvee-wrkzcoin-multi-hashing-0.0.20.tgz +0 -0
- package/multihashing.cc +699 -0
- package/nist5.c +46 -0
- package/nist5.h +16 -0
- package/package.json +56 -0
- package/quark.c +210 -0
- package/quark.h +16 -0
- package/qubit.c +45 -0
- package/qubit.h +16 -0
- package/scryptjane/scrypt-jane-chacha.h +132 -0
- package/scryptjane/scrypt-jane-hash.h +48 -0
- package/scryptjane/scrypt-jane-hash_keccak.h +168 -0
- package/scryptjane/scrypt-jane-hash_sha256.h +135 -0
- package/scryptjane/scrypt-jane-mix_chacha-avx.h +340 -0
- package/scryptjane/scrypt-jane-mix_chacha-sse2.h +371 -0
- package/scryptjane/scrypt-jane-mix_chacha-ssse3.h +348 -0
- package/scryptjane/scrypt-jane-mix_chacha.h +69 -0
- package/scryptjane/scrypt-jane-mix_salsa-avx.h +381 -0
- package/scryptjane/scrypt-jane-mix_salsa-sse2.h +443 -0
- package/scryptjane/scrypt-jane-mix_salsa.h +70 -0
- package/scryptjane/scrypt-jane-pbkdf2.h +112 -0
- package/scryptjane/scrypt-jane-portable-x86.h +364 -0
- package/scryptjane/scrypt-jane-portable.h +281 -0
- package/scryptjane/scrypt-jane-romix-basic.h +67 -0
- package/scryptjane/scrypt-jane-romix-template.h +118 -0
- package/scryptjane/scrypt-jane-romix.h +27 -0
- package/scryptjane/scrypt-jane-salsa.h +106 -0
- package/scryptjane/scrypt-jane-test-vectors.h +261 -0
- package/scryptjane.c +223 -0
- package/scryptjane.h +36 -0
- package/scryptn.c +258 -0
- package/scryptn.h +16 -0
- package/sha1.c +65 -0
- package/sha1.h +16 -0
- package/sha256.h +440 -0
- package/sha3/aes_helper.c +392 -0
- package/sha3/hamsi.c +867 -0
- package/sha3/hamsi_helper.c +39648 -0
- package/sha3/md_helper.c +347 -0
- package/sha3/sph_blake.c +1114 -0
- package/sha3/sph_blake.h +327 -0
- package/sha3/sph_bmw.c +965 -0
- package/sha3/sph_bmw.h +328 -0
- package/sha3/sph_cubehash.c +723 -0
- package/sha3/sph_cubehash.h +292 -0
- package/sha3/sph_echo.c +1031 -0
- package/sha3/sph_echo.h +320 -0
- package/sha3/sph_fugue.c +1208 -0
- package/sha3/sph_fugue.h +81 -0
- package/sha3/sph_groestl.c +3119 -0
- package/sha3/sph_groestl.h +329 -0
- package/sha3/sph_hamsi.h +321 -0
- package/sha3/sph_hefty1.c +378 -0
- package/sha3/sph_hefty1.h +66 -0
- package/sha3/sph_jh.c +1116 -0
- package/sha3/sph_jh.h +298 -0
- package/sha3/sph_keccak.c +1824 -0
- package/sha3/sph_keccak.h +293 -0
- package/sha3/sph_luffa.c +1426 -0
- package/sha3/sph_luffa.h +296 -0
- package/sha3/sph_shabal.c +806 -0
- package/sha3/sph_shabal.h +344 -0
- package/sha3/sph_shavite.c +1764 -0
- package/sha3/sph_shavite.h +314 -0
- package/sha3/sph_simd.c +1799 -0
- package/sha3/sph_simd.h +309 -0
- package/sha3/sph_skein.c +1254 -0
- package/sha3/sph_skein.h +298 -0
- package/sha3/sph_types.h +1976 -0
- package/sha3/sph_whirlpool.c +3480 -0
- package/sha3/sph_whirlpool.h +209 -0
- package/shavite3.c +24 -0
- package/shavite3.h +16 -0
- package/skein.c +26 -0
- package/skein.h +16 -0
- package/stdint.h +259 -0
- package/tests/argon2-tests.js +16 -0
- package/tests/benchmark.js +36 -0
- package/tests/cryptonight-tests.js +189 -0
- package/tests/cryptonight_monero.js +53 -0
- package/tests/test.js +16 -0
- package/x11.c +85 -0
- package/x11.h +16 -0
- package/x13.c +97 -0
- package/x13.h +5 -0
- package/x15.c +106 -0
- package/x15.h +16 -0
package/sha3/sph_simd.c
ADDED
|
@@ -0,0 +1,1799 @@
|
|
|
1
|
+
/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
|
|
2
|
+
/*
|
|
3
|
+
* SIMD implementation.
|
|
4
|
+
*
|
|
5
|
+
* ==========================(LICENSE BEGIN)============================
|
|
6
|
+
*
|
|
7
|
+
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
|
8
|
+
*
|
|
9
|
+
* Permission is hereby granted, free of charge, to any person obtaining
|
|
10
|
+
* a copy of this software and associated documentation files (the
|
|
11
|
+
* "Software"), to deal in the Software without restriction, including
|
|
12
|
+
* without limitation the rights to use, copy, modify, merge, publish,
|
|
13
|
+
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
14
|
+
* permit persons to whom the Software is furnished to do so, subject to
|
|
15
|
+
* the following conditions:
|
|
16
|
+
*
|
|
17
|
+
* The above copyright notice and this permission notice shall be
|
|
18
|
+
* included in all copies or substantial portions of the Software.
|
|
19
|
+
*
|
|
20
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
21
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
22
|
+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
23
|
+
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
24
|
+
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
25
|
+
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
26
|
+
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
27
|
+
*
|
|
28
|
+
* ===========================(LICENSE END)=============================
|
|
29
|
+
*
|
|
30
|
+
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
#include <stddef.h>
|
|
34
|
+
#include <string.h>
|
|
35
|
+
#include <limits.h>
|
|
36
|
+
|
|
37
|
+
#include "sph_simd.h"
|
|
38
|
+
|
|
39
|
+
#ifdef __cplusplus
|
|
40
|
+
extern "C"{
|
|
41
|
+
#endif
|
|
42
|
+
|
|
43
|
+
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
|
|
44
|
+
#define SPH_SMALL_FOOTPRINT_SIMD 1
|
|
45
|
+
#endif
|
|
46
|
+
|
|
47
|
+
#ifdef _MSC_VER
|
|
48
|
+
#pragma warning (disable: 4146)
|
|
49
|
+
#endif
|
|
50
|
+
|
|
51
|
+
/* File-local short names for sph_u32 / sph_s32 and for the SPH_C32, SPH_T32 and SPH_ROTL32 macros. */
typedef sph_u32 u32;
|
|
52
|
+
typedef sph_s32 s32;
|
|
53
|
+
#define C32 SPH_C32
|
|
54
|
+
#define T32 SPH_T32
|
|
55
|
+
#define ROL32 SPH_ROTL32
|
|
56
|
+
|
|
|
57
|
+
/* Two-level token paste: XCAT macro-expands its arguments before XCAT_ joins them with ##. */
#define XCAT(x, y) XCAT_(x, y)
|
|
58
|
+
#define XCAT_(x, y) x ## y
|
|
59
|
+
|
|
60
|
+
/*
|
|
61
|
+
* The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
|
|
62
|
+
*/
|
|
63
|
+
/*
 * alpha_tab[i] = 41^i mod 257, for i = 0..255.
 *
 * Layout: 16 entries per row. Because 41^16 = 2 (mod 257), row r is row 0
 * multiplied by 2^r modulo 257, which makes the table easy to audit.
 */
static const s32 alpha_tab[] = {
	/*   0 */   1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130, 190,  80, 196,  69,
	/*  16 */   2,  82,  21,  90,  92, 174, 195,  28, 120,  37, 232,   3, 123, 160, 135, 138,
	/*  32 */   4, 164,  42, 180, 184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
	/*  48 */   8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12, 235, 126,  26,  38,
	/*  64 */  16, 142, 168, 206, 222, 107,  18, 224, 189,  39,  57,  24, 213, 252,  52,  76,
	/*  80 */  32,  27,  79, 155, 187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
	/*  96 */  64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,  81, 237, 208,  47,
	/* 112 */ 128, 108,  59, 106, 234,  85, 144, 250, 227,  55, 199, 192, 162, 217, 159,  94,
	/* 128 */ 256, 216, 118, 212, 211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
	/* 144 */ 255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254, 134,  97, 122, 119,
	/* 160 */ 253,  93, 215,  77,  73, 166, 124, 201,  17, 183,  50, 251,  11, 194, 244, 238,
	/* 176 */ 249, 186, 173, 154, 146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
	/* 192 */ 241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,  44,   5, 205, 181,
	/* 208 */ 225, 230, 178, 102,  70,  43, 221,  66, 136, 179, 143, 209,  88,  10, 153, 105,
	/* 224 */ 193, 203,  99, 204, 140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
	/* 240 */ 129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,  95,  40,  98, 163
};
|
|
87
|
+
|
|
88
|
+
/*
|
|
89
|
+
* Ranges:
|
|
90
|
+
* REDS1: from -32768..98302 to -383..383
|
|
91
|
+
* REDS2: from -2^31..2^31-1 to -32768..98302
|
|
92
|
+
*/
|
|
93
|
+
/*
 * Partial reductions modulo 257 (each argument is evaluated twice).
 *
 * REDS1 splits x at bit 8: x = 256 * (x >> 8) + (x & 0xFF), and since
 * 256 = -1 (mod 257) the result (low - high) is congruent to x.
 * REDS2 splits x at bit 16: 65536 = 1 (mod 257), so (low + high) is
 * congruent to x.
 * Both rely on ">>" of a negative value being an arithmetic shift.
 */
#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
|
|
95
|
+
|
|
96
|
+
/*
|
|
97
|
+
* If, upon entry, the values of q[] are all in the -N..N range (where
|
|
98
|
+
* N >= 98302) then the new values of q[] are in the -2N..2N range.
|
|
99
|
+
*
|
|
100
|
+
* Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
|
|
101
|
+
*/
|
|
102
|
+
/*
 * One radix-2 butterfly pass over q[rb .. rb + 2*hk - 1].
 *
 * For each u in 0..hk-1, q[rb + u] and q[rb + u + hk] are replaced by
 * (m + t) and (m - t), where m is the old q[rb + u] and t is the old
 * q[rb + u + hk] multiplied by alpha_tab[u * as] and folded with REDS2.
 *
 *   rb  index in q[] of the first element of the block
 *   hk  half-length of the block (the loop advances four pairs at a time)
 *   as  stride into alpha_tab[] between consecutive pairs
 *   id  goto label; labels have function scope, so it must be unique
 *       within the enclosing function
 *
 * The pair at u == 0 uses alpha_tab[0] == 1, so it is handled up front
 * without a multiplication; "goto id" then enters the 4-way unrolled loop
 * body just after its first butterfly.
 *
 * Expects "q" to be in scope; rb, hk and as are evaluated several times.
 */
#define FFT_LOOP(rb, hk, as, id)   do { \
		size_t u = 0, v = 0; \
		s32 m, n, t; \
		\
		m = q[(rb)]; \
		n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)
|
|
135
|
+
|
|
136
|
+
/*
|
|
137
|
+
* Output ranges:
|
|
138
|
+
* d0: min= 0 max= 1020
|
|
139
|
+
* d1: min= -67 max= 4587
|
|
140
|
+
* d2: min=-4335 max= 4335
|
|
141
|
+
* d3: min=-4147 max= 507
|
|
142
|
+
* d4: min= -510 max= 510
|
|
143
|
+
* d5: min= -252 max= 4402
|
|
144
|
+
* d6: min=-4335 max= 4335
|
|
145
|
+
* d7: min=-4332 max= 322
|
|
146
|
+
*/
|
|
147
|
+
/*
 * 8-point transform of the four input values x[xb], x[xb + xs],
 * x[xb + 2*xs] and x[xb + 3*xs].
 *
 * The eight results are assigned to the variables d0 .. d7, where "d" is
 * the identifier prefix given as third argument (it is token-pasted with
 * the digit, so the caller must have declared those eight variables).
 * Scaling by 4, 16 and 64 is done with shifts; the two mixed odd-input
 * terms are folded with REDS1.
 *
 * Expects "x" to be in scope; xb and xs are evaluated several times.
 */
#define FFT8(xb, xs, d)   do { \
		s32 p0 = x[(xb)]; \
		s32 p1 = x[(xb) + (xs)]; \
		s32 p2 = x[(xb) + 2 * (xs)]; \
		s32 p3 = x[(xb) + 3 * (xs)]; \
		/* contributions of the even-position inputs p0, p2 */ \
		s32 ev0 = p0 + p2; \
		s32 ev1 = p0 + (p2 << 4); \
		s32 ev2 = p0 - p2; \
		s32 ev3 = p0 - (p2 << 4); \
		/* contributions of the odd-position inputs p1, p3 */ \
		s32 od0 = p1 + p3; \
		s32 od1 = REDS1((p1 << 2) + (p3 << 6)); \
		s32 od2 = (p1 << 4) - (p3 << 4); \
		s32 od3 = REDS1((p1 << 6) + (p3 << 2)); \
		d ## 0 = ev0 + od0; \
		d ## 1 = ev1 + od1; \
		d ## 2 = ev2 + od2; \
		d ## 3 = ev3 + od3; \
		d ## 4 = ev0 - od0; \
		d ## 5 = ev1 - od1; \
		d ## 6 = ev2 - od2; \
		d ## 7 = ev3 - od3; \
	} while (0)
|
|
169
|
+
|
|
170
|
+
/*
|
|
171
|
+
* When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
|
|
172
|
+
* to some shifting.
|
|
173
|
+
*
|
|
174
|
+
* Output: within -591471..591723
|
|
175
|
+
*/
|
|
176
|
+
/*
 * 16-point transform of the eight input values x[xb + j*xs], j = 0..7.
 * Results are stored in q[rb] .. q[rb + 15].
 *
 * The even-position inputs go through one FFT8 (results d1_k), the
 * odd-position inputs through another (results d2_k). They are then combined
 * as q[rb + k] = d1_k + 2^k * d2_k and q[rb + k + 8] = d1_k - 2^k * d2_k.
 *
 * The scaling by 2^k is written as a multiplication rather than "<<":
 * d2_k can be negative, and left-shifting a negative signed value is
 * undefined behaviour in C. The products stay far below the s32 limit
 * (|d2_k| < 2^13, k <= 7).
 *
 * Expects "x" and "q" to be in scope; xb, xs and rb are evaluated several
 * times.
 */
#define FFT16(xb, xs, rb)   do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) +  0] = d1_0 + d2_0; \
		q[(rb) +  1] = d1_1 + d2_1 * (1 << 1); \
		q[(rb) +  2] = d1_2 + d2_2 * (1 << 2); \
		q[(rb) +  3] = d1_3 + d2_3 * (1 << 3); \
		q[(rb) +  4] = d1_4 + d2_4 * (1 << 4); \
		q[(rb) +  5] = d1_5 + d2_5 * (1 << 5); \
		q[(rb) +  6] = d1_6 + d2_6 * (1 << 6); \
		q[(rb) +  7] = d1_7 + d2_7 * (1 << 7); \
		q[(rb) +  8] = d1_0 - d2_0; \
		q[(rb) +  9] = d1_1 - d2_1 * (1 << 1); \
		q[(rb) + 10] = d1_2 - d2_2 * (1 << 2); \
		q[(rb) + 11] = d1_3 - d2_3 * (1 << 3); \
		q[(rb) + 12] = d1_4 - d2_4 * (1 << 4); \
		q[(rb) + 13] = d1_5 - d2_5 * (1 << 5); \
		q[(rb) + 14] = d1_6 - d2_6 * (1 << 6); \
		q[(rb) + 15] = d1_7 - d2_7 * (1 << 7); \
	} while (0)
|
|
198
|
+
|
|
199
|
+
/*
|
|
200
|
+
* Output range: |q| <= 1183446
|
|
201
|
+
*/
|
|
202
|
+
/*
 * 32-point transform of the 16 input values x[xb + j*xs], j = 0..15.
 * Results are stored in q[rb] .. q[rb + 31].
 *
 * Two FFT16 passes with doubled input stride handle the even- and
 * odd-position inputs and fill the lower and upper 16 entries. FFT_LOOP then
 * merges both halves using every 8th entry of alpha_tab[].
 *
 * "id" is forwarded to FFT_LOOP as its goto label and must therefore be
 * unique within the enclosing function.
 */
#define FFT32(xb, xs, rb, id)   do { \
		FFT16((xb),        2 * (xs), (rb)); \
		FFT16((xb) + (xs), 2 * (xs), (rb) + 16); \
		FFT_LOOP((rb), 16, 8, id); \
	} while (0)
|
|
207
|
+
|
|
208
|
+
/*
|
|
209
|
+
* Output range: |q| <= 2366892
|
|
210
|
+
*/
|
|
211
|
+
/*
 * 64-point transform of the 32 input values x[xb + j*xs], j = 0..31.
 * Results are stored in q[rb] .. q[rb + 63].
 *
 * Built from two FFT32 passes (even- and odd-position inputs, doubled input
 * stride) followed by an FFT_LOOP merge that uses every 4th entry of
 * alpha_tab[]. The nested passes get the labels <id>a and <id>b; the final
 * merge uses <id> itself.
 */
#define FFT64(xb, xs, rb, id)   do { \
		FFT32((xb),        2 * (xs), (rb),      XCAT(id, a)); \
		FFT32((xb) + (xs), 2 * (xs), (rb) + 32, XCAT(id, b)); \
		FFT_LOOP((rb), 32, 4, id); \
	} while (0)
|
|
216
|
+
|
|
217
|
+
#if SPH_SMALL_FOOTPRINT_SIMD
|
|
218
|
+
|
|
219
|
+
static void
|
|
220
|
+
fft32(unsigned char *x, size_t xs, s32 *q)
|
|
221
|
+
{
|
|
222
|
+
size_t xd;
|
|
223
|
+
|
|
224
|
+
xd = xs << 1;
|
|
225
|
+
FFT16(0, xd, 0);
|
|
226
|
+
FFT16(xs, xd, 16);
|
|
227
|
+
FFT_LOOP(0, 16, 8, label_);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/*
 * 128-point transform, small-footprint variant. Results are stored in
 * q[rb] .. q[rb + 127].
 *
 * The inputs are split by position modulo 4. Four calls to fft32() (input
 * stride 4 * xs, start offsets 0, 2, 1 and 3 times xs) fill the four
 * 32-entry quarters of the output block. The quarters are merged pairwise
 * with FFT_LOOP (alpha_tab stride 4), then the two 64-entry halves are
 * merged (alpha_tab stride 2).
 *
 * Labels used: <id>aa, <id>ab and <id>a. Expects "x" and "q" in scope.
 */
#define FFT128(xb, xs, rb, id)   do { \
		fft32(x + (xb),            4 * (xs), &q[(rb)]); \
		fft32(x + (xb) + 2 * (xs), 4 * (xs), &q[(rb) + 32]); \
		FFT_LOOP((rb), 32, 4, XCAT(id, aa)); \
		fft32(x + (xb) + (xs),     4 * (xs), &q[(rb) + 64]); \
		fft32(x + (xb) + 3 * (xs), 4 * (xs), &q[(rb) + 96]); \
		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
		FFT_LOOP((rb), 64, 2, XCAT(id, a)); \
	} while (0)
|
|
239
|
+
|
|
240
|
+
#else
|
|
241
|
+
|
|
242
|
+
/*
|
|
243
|
+
* Output range: |q| <= 4733784
|
|
244
|
+
*/
|
|
245
|
+
/*
 * 128-point transform, fully unrolled variant. Results are stored in
 * q[rb] .. q[rb + 127].
 *
 * Two FFT64 passes (even- and odd-position inputs, doubled input stride)
 * fill the lower and upper 64 entries; FFT_LOOP merges them using every 2nd
 * entry of alpha_tab[]. The nested passes get the labels <id>a and <id>b.
 */
#define FFT128(xb, xs, rb, id)   do { \
		FFT64((xb),        2 * (xs), (rb),      XCAT(id, a)); \
		FFT64((xb) + (xs), 2 * (xs), (rb) + 64, XCAT(id, b)); \
		FFT_LOOP((rb), 64, 2, id); \
	} while (0)
|
|
250
|
+
|
|
251
|
+
#endif
|
|
252
|
+
|
|
253
|
+
/*
|
|
254
|
+
* For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
|
|
255
|
+
* function which does not fit in the 32 kB L1 cache of a typical x86
|
|
256
|
+
* Intel CPU. We therefore add a function call layer at the FFT64 level.
|
|
257
|
+
*/
|
|
258
|
+
|
|
259
|
+
static void
|
|
260
|
+
fft64(unsigned char *x, size_t xs, s32 *q)
|
|
261
|
+
{
|
|
262
|
+
size_t xd;
|
|
263
|
+
|
|
264
|
+
xd = xs << 1;
|
|
265
|
+
FFT32(0, xd, 0, label_a);
|
|
266
|
+
FFT32(xs, xd, 32, label_b);
|
|
267
|
+
FFT_LOOP(0, 32, 4, label_);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
/*
|
|
271
|
+
* Output range: |q| <= 9467568
|
|
272
|
+
*/
|
|
273
|
+
/*
 * 256-point transform. Results are stored in q[rb] .. q[rb + 255].
 *
 * The inputs are split by position modulo 4. Four calls to fft64() (input
 * stride 4 * xs, start offsets 0, 2, 1 and 3 times xs) fill the four
 * 64-entry quarters of the output block. The quarters are merged pairwise
 * with FFT_LOOP (alpha_tab stride 2), then the two 128-entry halves are
 * merged (alpha_tab stride 1).
 *
 * Labels used: <id>aa, <id>ab and <id>a. Expects "x" and "q" in scope.
 */
#define FFT256(xb, xs, rb, id)   do { \
		fft64(x + (xb),            4 * (xs), &q[(rb)]); \
		fft64(x + (xb) + 2 * (xs), 4 * (xs), &q[(rb) + 64]); \
		FFT_LOOP((rb), 64, 2, XCAT(id, aa)); \
		fft64(x + (xb) + (xs),     4 * (xs), &q[(rb) + 128]); \
		fft64(x + (xb) + 3 * (xs), 4 * (xs), &q[(rb) + 192]); \
		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
		FFT_LOOP((rb), 128, 1, XCAT(id, a)); \
	} while (0)
|
|
282
|
+
|
|
283
|
+
/*
|
|
284
|
+
* alpha^(127*i) mod 257
|
|
285
|
+
*/
|
|
286
|
+
/*
 * yoff_s_n[i] = 98^i mod 257, for i = 0..127.
 * 16 entries per row; entry 64 is 256 (= -1 mod 257), so rows 4..7 are the
 * negations modulo 257 of rows 0..3.
 */
static const unsigned short yoff_s_n[] = {
	/*   0 */   1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,  15, 185, 140,  99,
	/*  16 */ 193, 153,  88, 143, 136, 221,  70, 178, 225, 205,  44, 200,  68, 239,  35,  89,
	/*  32 */ 241, 231,  22, 100,  34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
	/*  48 */ 253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141, 197,  31, 211, 118,
	/*  64 */ 256, 159, 162, 199, 227, 144, 234,  59, 128, 208,  81, 228, 242,  72, 117, 158,
	/*  80 */  64, 104, 169, 114, 121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
	/*  96 */  16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207, 240, 133, 184,  42,
	/* 112 */   4, 135, 123, 232, 120, 195,  92,  21,   2, 196, 190, 116,  60, 226,  46, 139
};
|
|
299
|
+
|
|
300
|
+
/*
|
|
301
|
+
* alpha^(127*i) + alpha^(125*i) mod 257
|
|
302
|
+
*/
|
|
303
|
+
/*
 * yoff_s_f[i] = (98^i + 58^i) mod 257, for i = 0..127.
 * 16 entries per row.
 */
static const unsigned short yoff_s_f[] = {
	/*   0 */   2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,  49, 101, 151, 223,
	/*  16 */ 189, 178, 253, 204,  76,  82, 232,  65,  96, 176, 161,  47, 189,  61, 248, 107,
	/*  32 */   0, 131, 133, 113,  17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
	/*  48 */ 189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,  77,  10,  21, 149,
	/*  64 */ 255, 101, 139, 150, 212,  45, 146,  95, 160,   8,  46, 254, 208, 156, 106,  34,
	/*  80 */  68,  79,   4,  53, 181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
	/*  96 */   0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109, 210, 192,   8, 114,
	/* 112 */  68, 249,  53,  27,  52, 106,  70,  30,  10, 146, 117, 251, 180, 247, 236, 108
};
|
|
316
|
+
|
|
317
|
+
/*
|
|
318
|
+
* beta^(255*i) mod 257
|
|
319
|
+
*/
|
|
320
|
+
/*
 * yoff_b_n[i] = 163^i mod 257, for i = 0..255.
 * 16 entries per row; entry 128 is 256 (= -1 mod 257), so rows 8..15 are
 * the negations modulo 257 of rows 0..7.
 */
static const unsigned short yoff_b_n[] = {
	/*   0 */   1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,  23, 151, 198, 149,
	/*  16 */ 129, 210,  49,  20, 176, 161,  29, 101,  15, 132, 185,  86, 140, 204,  99, 203,
	/*  32 */ 193, 105, 153,  10,  88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
	/*  48 */ 225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,  35,  51,  89, 115,
	/*  64 */ 241, 219, 231, 131,  22, 245, 100, 109,  34, 145, 248,  75, 146, 154, 173, 186,
	/*  80 */ 249, 238, 244, 194,  11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
	/*  96 */ 253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83, 165, 167, 236, 175,
	/* 112 */ 255, 188,  61, 177,  67, 127, 141, 110, 197, 243,  31, 170, 211, 212, 118, 216,
	/* 128 */ 256,  94, 159, 217, 162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
	/* 144 */ 128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171, 117,  53, 158,  54,
	/* 160 */  64, 152, 104, 247, 169,  48, 114,  78, 121, 191,  36, 214, 187, 155,  79,  27,
	/* 176 */  32,  76,  52, 252, 213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
	/* 192 */  16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182, 111, 103,  84,  71,
	/* 208 */   8,  19,  13,  63, 246,   6, 207,  74, 240,  56, 133,  91, 184, 180,  42, 164,
	/* 224 */   4, 138, 135, 160, 123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
	/* 240 */   2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,  46,  45, 139,  41
};
|
|
344
|
+
|
|
345
|
+
/*
|
|
346
|
+
* beta^(255*i) + beta^(253*i) mod 257
|
|
347
|
+
*/
|
|
348
|
+
/*
 * yoff_b_f[i] = (163^i + 40^i) mod 257, for i = 0..255.
 * 16 entries per row.
 */
static const unsigned short yoff_b_f[] = {
	/*   0 */   2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20, 111,  73, 162, 251,
	/*  16 */  97, 215, 249,  53, 211,  19,   3,  89,  49, 207, 101,  67, 151, 130, 223,  23,
	/*  32 */ 189, 202, 178, 239, 253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
	/*  48 */  96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226, 248,  90, 107,  64,
	/*  64 */   0,  88, 131, 243, 133,  59, 113, 115,  17, 236,  33, 213,  12, 191, 111,  19,
	/*  80 */ 251,  61, 103, 208,  57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
	/*  96 */ 189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45, 187,  19, 227,  72,
	/* 112 */ 247, 125, 111, 121, 140, 220,   6, 107,  77,  69,  10, 101,  21,  65, 149, 171,
	/* 128 */ 255,  54, 101, 210, 139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
	/* 144 */ 160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190, 106, 127,  34, 234,
	/* 160 */  68,  55,  79,  18,   4, 130,  53, 208, 181,  21, 175, 120,  25, 100, 192, 178,
	/* 176 */ 161,  96,  81, 127,  96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
	/* 192 */   0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44, 245,  66, 146, 238,
	/* 208 */   6, 196, 154,  49, 200, 222, 109,   9, 210, 141, 192, 138,   8,  79, 114, 217,
	/* 224 */  68, 128, 249,  94,  53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
	/* 240 */  10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156, 236, 192, 108,  86
};
|
|
372
|
+
|
|
373
|
+
/*
 * Build one 32-bit word from two products: the low 16 bits of
 * (l) * (mm) form bits 0..15, and (h) * (mm), converted to u32, is
 * shifted left by 16 and added on top.
 */
#define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) \
                          + ((u32)((h) * (mm)) << 16))
|
|
375
|
+
|
|
376
|
+
/*
 * Expands to an OPENING parenthesis followed by four comma-separated
 * INNER() words read from q[], starting at q[8 * sb] and using the
 * element offsets o1 (low half) and o2 (high half).
 *
 * The parenthesis is deliberately left unbalanced: STEP_SMALL_()
 * appends the remaining arguments and the closing parenthesis, so the
 * result becomes a complete STEP_SMALL(...) invocation.
 */
#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
|
|
381
|
+
|
|
382
|
+
/*
 * WS_<round>_<step>: message-word groups for the 16-word state.
 * Each entry names a sub-block index and the q[] offsets / multiplier
 * handed to W_SMALL(). Across the 32 entries every sub-block index
 * 0..31 appears exactly once.
 */
#define WS_0_0   W_SMALL( 4,    0,    1, 185)
#define WS_0_1   W_SMALL( 6,    0,    1, 185)
#define WS_0_2   W_SMALL( 0,    0,    1, 185)
#define WS_0_3   W_SMALL( 2,    0,    1, 185)
#define WS_0_4   W_SMALL( 7,    0,    1, 185)
#define WS_0_5   W_SMALL( 5,    0,    1, 185)
#define WS_0_6   W_SMALL( 3,    0,    1, 185)
#define WS_0_7   W_SMALL( 1,    0,    1, 185)
#define WS_1_0   W_SMALL(15,    0,    1, 185)
#define WS_1_1   W_SMALL(11,    0,    1, 185)
#define WS_1_2   W_SMALL(12,    0,    1, 185)
#define WS_1_3   W_SMALL( 8,    0,    1, 185)
#define WS_1_4   W_SMALL( 9,    0,    1, 185)
#define WS_1_5   W_SMALL(13,    0,    1, 185)
#define WS_1_6   W_SMALL(10,    0,    1, 185)
#define WS_1_7   W_SMALL(14,    0,    1, 185)
#define WS_2_0   W_SMALL(17, -128,  -64, 233)
#define WS_2_1   W_SMALL(18, -128,  -64, 233)
#define WS_2_2   W_SMALL(23, -128,  -64, 233)
#define WS_2_3   W_SMALL(20, -128,  -64, 233)
#define WS_2_4   W_SMALL(22, -128,  -64, 233)
#define WS_2_5   W_SMALL(21, -128,  -64, 233)
#define WS_2_6   W_SMALL(16, -128,  -64, 233)
#define WS_2_7   W_SMALL(19, -128,  -64, 233)
#define WS_3_0   W_SMALL(30, -191, -127, 233)
#define WS_3_1   W_SMALL(24, -191, -127, 233)
#define WS_3_2   W_SMALL(25, -191, -127, 233)
#define WS_3_3   W_SMALL(31, -191, -127, 233)
#define WS_3_4   W_SMALL(27, -191, -127, 233)
#define WS_3_5   W_SMALL(29, -191, -127, 233)
#define WS_3_6   W_SMALL(28, -191, -127, 233)
#define WS_3_7   W_SMALL(26, -191, -127, 233)
|
|
414
|
+
|
|
415
|
+
/*
 * Expands to an OPENING parenthesis followed by eight comma-separated
 * INNER() words read from q[], starting at q[16 * sb] and using the
 * element offsets o1 (low half) and o2 (high half).
 *
 * The parenthesis is deliberately left unbalanced: STEP_BIG_() appends
 * the remaining arguments and the closing parenthesis, so the result
 * becomes a complete STEP_BIG(...) invocation.
 */
#define W_BIG(sb, o1, o2, mm) \
	(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
|
|
424
|
+
|
|
425
|
+
/*
 * WB_<round>_<step>: message-word groups for the 32-word state.
 * Each entry names a sub-block index and the q[] offsets / multiplier
 * handed to W_BIG(). Across the 32 entries every sub-block index
 * 0..31 appears exactly once.
 */
#define WB_0_0   W_BIG( 4,    0,    1, 185)
#define WB_0_1   W_BIG( 6,    0,    1, 185)
#define WB_0_2   W_BIG( 0,    0,    1, 185)
#define WB_0_3   W_BIG( 2,    0,    1, 185)
#define WB_0_4   W_BIG( 7,    0,    1, 185)
#define WB_0_5   W_BIG( 5,    0,    1, 185)
#define WB_0_6   W_BIG( 3,    0,    1, 185)
#define WB_0_7   W_BIG( 1,    0,    1, 185)
#define WB_1_0   W_BIG(15,    0,    1, 185)
#define WB_1_1   W_BIG(11,    0,    1, 185)
#define WB_1_2   W_BIG(12,    0,    1, 185)
#define WB_1_3   W_BIG( 8,    0,    1, 185)
#define WB_1_4   W_BIG( 9,    0,    1, 185)
#define WB_1_5   W_BIG(13,    0,    1, 185)
#define WB_1_6   W_BIG(10,    0,    1, 185)
#define WB_1_7   W_BIG(14,    0,    1, 185)
#define WB_2_0   W_BIG(17, -256, -128, 233)
#define WB_2_1   W_BIG(18, -256, -128, 233)
#define WB_2_2   W_BIG(23, -256, -128, 233)
#define WB_2_3   W_BIG(20, -256, -128, 233)
#define WB_2_4   W_BIG(22, -256, -128, 233)
#define WB_2_5   W_BIG(21, -256, -128, 233)
#define WB_2_6   W_BIG(16, -256, -128, 233)
#define WB_2_7   W_BIG(19, -256, -128, 233)
#define WB_3_0   W_BIG(30, -383, -255, 233)
#define WB_3_1   W_BIG(24, -383, -255, 233)
#define WB_3_2   W_BIG(25, -383, -255, 233)
#define WB_3_3   W_BIG(31, -383, -255, 233)
#define WB_3_4   W_BIG(27, -383, -255, 233)
#define WB_3_5   W_BIG(29, -383, -255, 233)
#define WB_3_6   W_BIG(28, -383, -255, 233)
#define WB_3_7   W_BIG(26, -383, -255, 233)
|
|
457
|
+
|
|
458
|
+
/* Bitwise select: each result bit is taken from y where x is 1, from z where x is 0. */
#define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))
|
|
459
|
+
/* Bitwise majority: each result bit is 1 when at least two of x, y, z have it set. */
#define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))
|
|
460
|
+
|
|
461
|
+
/*
 * PP4_<j>_<n>: lane permutations for the 4-lane (small) state.
 * STEP_ELT() pastes one of these indices onto "tA" to pick the rotated
 * A word added to lane n. Row j maps n to n XOR (j + 1).
 */
#define PP4_0_0   1
#define PP4_0_1   0
#define PP4_0_2   3
#define PP4_0_3   2

#define PP4_1_0   2
#define PP4_1_1   3
#define PP4_1_2   0
#define PP4_1_3   1

#define PP4_2_0   3
#define PP4_2_1   2
#define PP4_2_2   1
#define PP4_2_3   0
|
|
473
|
+
|
|
474
|
+
/*
 * PP8_<j>_<n>: lane permutations for the 8-lane (big) state.
 * STEP_ELT() pastes one of these indices onto "tA" to pick the rotated
 * A word added to lane n. Row j maps n to n XOR k, with
 * k = 1, 6, 2, 3, 5, 7, 4 for j = 0..6.
 */
#define PP8_0_0   1
#define PP8_0_1   0
#define PP8_0_2   3
#define PP8_0_3   2
#define PP8_0_4   5
#define PP8_0_5   4
#define PP8_0_6   7
#define PP8_0_7   6

#define PP8_1_0   6
#define PP8_1_1   7
#define PP8_1_2   4
#define PP8_1_3   5
#define PP8_1_4   2
#define PP8_1_5   3
#define PP8_1_6   0
#define PP8_1_7   1

#define PP8_2_0   2
#define PP8_2_1   3
#define PP8_2_2   0
#define PP8_2_3   1
#define PP8_2_4   6
#define PP8_2_5   7
#define PP8_2_6   4
#define PP8_2_7   5

#define PP8_3_0   3
#define PP8_3_1   2
#define PP8_3_2   1
#define PP8_3_3   0
#define PP8_3_4   7
#define PP8_3_5   6
#define PP8_3_6   5
#define PP8_3_7   4

#define PP8_4_0   5
#define PP8_4_1   4
#define PP8_4_2   7
#define PP8_4_3   6
#define PP8_4_4   1
#define PP8_4_5   0
#define PP8_4_6   3
#define PP8_4_7   2

#define PP8_5_0   7
#define PP8_5_1   6
#define PP8_5_2   5
#define PP8_5_3   4
#define PP8_5_4   3
#define PP8_5_5   2
#define PP8_5_6   1
#define PP8_5_7   0

#define PP8_6_0   4
#define PP8_6_1   5
#define PP8_6_2   6
#define PP8_6_3   7
#define PP8_6_4   0
#define PP8_6_5   1
#define PP8_6_6   2
#define PP8_6_7   3
|
|
536
|
+
|
|
537
|
+
/*
 * Working-copy helpers for the 16-word (SMALL) and 32-word (BIG) state.
 *
 * When SPH_SIMD_NOCOPY is set, all six macros expand to nothing.
 * Otherwise DECL_STATE_* declares the local variables (the trailing ';'
 * is part of the macro, so it is used without one), READ_STATE_* loads
 * them from (sc)->state[] and WRITE_STATE_* stores them back.
 */
#if SPH_SIMD_NOCOPY

#define DECL_STATE_SMALL
#define READ_STATE_SMALL(sc)
#define WRITE_STATE_SMALL(sc)
#define DECL_STATE_BIG
#define READ_STATE_BIG(sc)
#define WRITE_STATE_BIG(sc)

#else

#define DECL_STATE_SMALL   \
	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;

#define READ_STATE_SMALL(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		B0 = (sc)->state[ 4]; \
		B1 = (sc)->state[ 5]; \
		B2 = (sc)->state[ 6]; \
		B3 = (sc)->state[ 7]; \
		C0 = (sc)->state[ 8]; \
		C1 = (sc)->state[ 9]; \
		C2 = (sc)->state[10]; \
		C3 = (sc)->state[11]; \
		D0 = (sc)->state[12]; \
		D1 = (sc)->state[13]; \
		D2 = (sc)->state[14]; \
		D3 = (sc)->state[15]; \
	} while (0)

#define WRITE_STATE_SMALL(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = B0; \
		(sc)->state[ 5] = B1; \
		(sc)->state[ 6] = B2; \
		(sc)->state[ 7] = B3; \
		(sc)->state[ 8] = C0; \
		(sc)->state[ 9] = C1; \
		(sc)->state[10] = C2; \
		(sc)->state[11] = C3; \
		(sc)->state[12] = D0; \
		(sc)->state[13] = D1; \
		(sc)->state[14] = D2; \
		(sc)->state[15] = D3; \
	} while (0)

#define DECL_STATE_BIG   \
	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
	u32 D0, D1, D2, D3, D4, D5, D6, D7;

#define READ_STATE_BIG(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		A4 = (sc)->state[ 4]; \
		A5 = (sc)->state[ 5]; \
		A6 = (sc)->state[ 6]; \
		A7 = (sc)->state[ 7]; \
		B0 = (sc)->state[ 8]; \
		B1 = (sc)->state[ 9]; \
		B2 = (sc)->state[10]; \
		B3 = (sc)->state[11]; \
		B4 = (sc)->state[12]; \
		B5 = (sc)->state[13]; \
		B6 = (sc)->state[14]; \
		B7 = (sc)->state[15]; \
		C0 = (sc)->state[16]; \
		C1 = (sc)->state[17]; \
		C2 = (sc)->state[18]; \
		C3 = (sc)->state[19]; \
		C4 = (sc)->state[20]; \
		C5 = (sc)->state[21]; \
		C6 = (sc)->state[22]; \
		C7 = (sc)->state[23]; \
		D0 = (sc)->state[24]; \
		D1 = (sc)->state[25]; \
		D2 = (sc)->state[26]; \
		D3 = (sc)->state[27]; \
		D4 = (sc)->state[28]; \
		D5 = (sc)->state[29]; \
		D6 = (sc)->state[30]; \
		D7 = (sc)->state[31]; \
	} while (0)

#define WRITE_STATE_BIG(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = A4; \
		(sc)->state[ 5] = A5; \
		(sc)->state[ 6] = A6; \
		(sc)->state[ 7] = A7; \
		(sc)->state[ 8] = B0; \
		(sc)->state[ 9] = B1; \
		(sc)->state[10] = B2; \
		(sc)->state[11] = B3; \
		(sc)->state[12] = B4; \
		(sc)->state[13] = B5; \
		(sc)->state[14] = B6; \
		(sc)->state[15] = B7; \
		(sc)->state[16] = C0; \
		(sc)->state[17] = C1; \
		(sc)->state[18] = C2; \
		(sc)->state[19] = C3; \
		(sc)->state[20] = C4; \
		(sc)->state[21] = C5; \
		(sc)->state[22] = C6; \
		(sc)->state[23] = C7; \
		(sc)->state[24] = D0; \
		(sc)->state[25] = D1; \
		(sc)->state[26] = D2; \
		(sc)->state[27] = D3; \
		(sc)->state[28] = D4; \
		(sc)->state[29] = D5; \
		(sc)->state[30] = D6; \
		(sc)->state[31] = D7; \
	} while (0)

#endif
|
|
666
|
+
|
|
667
|
+
/*
 * One lane of a step, operating on the variables A<n>, B<n>, C<n>, D<n>.
 *
 *   n    lane number (pasted into the variable names)
 *   w    message word added into the lane
 *   fun  three-input boolean function (e.g. IF or MAJ)
 *   s    left-rotation applied to the intermediate sum
 *   ppb  permutation prefix (e.g. PP4_0_); ppb<n> must expand to the
 *        index k of the pre-rotated value tA<k> to add
 *
 * The caller must have tA<i> in scope for every lane: tA<n> becomes
 * the new B<n>, and tA<ppb<n>> is added into the new A<n>.
 */
#define STEP_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)
|
|
674
|
+
|
|
675
|
+
/*
 * One step over the four lanes of the small state.
 *
 * A0..A3 are first rotated left by r into tA0..tA3 (all four are
 * captured before any lane is updated), then STEP_ELT() is applied to
 * lanes 0..3 with message words w0..w3, boolean function fun,
 * rotation s and lane permutation prefix pp4b.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)
|
|
685
|
+
|
|
686
|
+
/*
 * One step over the eight lanes of the big state.
 *
 * A0..A7 are first rotated left by r into tA0..tA7 (all eight are
 * captured before any lane is updated), then STEP_ELT() is applied to
 * lanes 0..7 with message words w0..w7, boolean function fun,
 * rotation s and lane permutation prefix pp8b.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)
|
|
704
|
+
|
|
705
|
+
/*
 * M3_<k>_<isp> expands to the token "<(k + isp) mod 3>_".
 * ONE_ROUND_SMALL() pastes it after "PP4_" to select the lane
 * permutation prefix for step k of a round whose start offset is isp.
 */
#define M3_0_0   0_
#define M3_1_0   1_
#define M3_2_0   2_
#define M3_3_0   0_
#define M3_4_0   1_
#define M3_5_0   2_
#define M3_6_0   0_
#define M3_7_0   1_

#define M3_0_1   1_
#define M3_1_1   2_
#define M3_2_1   0_
#define M3_3_1   1_
#define M3_4_1   2_
#define M3_5_1   0_
#define M3_6_1   1_
#define M3_7_1   2_

#define M3_0_2   2_
#define M3_1_2   0_
#define M3_2_2   1_
#define M3_3_2   2_
#define M3_4_2   0_
#define M3_5_2   1_
#define M3_6_2   2_
#define M3_7_2   0_
|
|
731
|
+
|
|
732
|
+
/*
 * Glue for STEP_SMALL(): w must expand to an opening parenthesis plus
 * four comma-separated words (see W_SMALL). This macro appends the
 * remaining arguments and the closing parenthesis, which is why the
 * parentheses below look unbalanced.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)
|
|
733
|
+
|
|
734
|
+
/*
 * One round (eight steps) on the small state.
 *
 *   ri           round token with trailing underscore (e.g. 0_); pasted
 *                to form the word-group names WS_<ri>0 .. WS_<ri>7
 *   isp          start offset (0..2) selecting the M3_<k>_<isp> column
 *   p0..p3       rotation amounts
 *
 * Steps 0..3 use IF and steps 4..7 use MAJ. In both halves the (r, s)
 * rotation pairs are (p0,p1), (p1,p2), (p2,p3), (p3,p0).
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)
|
|
752
|
+
|
|
753
|
+
/*
 * M7_<k>_<isp> expands to the token "<(k + isp) mod 7>_".
 * ONE_ROUND_BIG() pastes it after "PP8_" to select the lane
 * permutation prefix for step k of a round whose start offset is isp.
 */
#define M7_0_0   0_
#define M7_1_0   1_
#define M7_2_0   2_
#define M7_3_0   3_
#define M7_4_0   4_
#define M7_5_0   5_
#define M7_6_0   6_
#define M7_7_0   0_

#define M7_0_1   1_
#define M7_1_1   2_
#define M7_2_1   3_
#define M7_3_1   4_
#define M7_4_1   5_
#define M7_5_1   6_
#define M7_6_1   0_
#define M7_7_1   1_

#define M7_0_2   2_
#define M7_1_2   3_
#define M7_2_2   4_
#define M7_3_2   5_
#define M7_4_2   6_
#define M7_5_2   0_
#define M7_6_2   1_
#define M7_7_2   2_

#define M7_0_3   3_
#define M7_1_3   4_
#define M7_2_3   5_
#define M7_3_3   6_
#define M7_4_3   0_
#define M7_5_3   1_
#define M7_6_3   2_
#define M7_7_3   3_
|
|
788
|
+
|
|
789
|
+
/*
 * Glue for STEP_BIG(): w must expand to an opening parenthesis plus
 * eight comma-separated words (see W_BIG). This macro appends the
 * remaining arguments and the closing parenthesis, which is why the
 * parentheses below look unbalanced.
 */
#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)
|
|
790
|
+
|
|
791
|
+
/*
 * One round (eight steps) on the big state.
 *
 *   ri           round token with trailing underscore (e.g. 0_); pasted
 *                to form the word-group names WB_<ri>0 .. WB_<ri>7
 *   isp          start offset (0..3) selecting the M7_<k>_<isp> column
 *   p0..p3       rotation amounts
 *
 * Steps 0..3 use IF and steps 4..7 use MAJ. In both halves the (r, s)
 * rotation pairs are (p0,p1), (p1,p2), (p2,p3), (p3,p0).
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)
|
|
809
|
+
|
|
810
|
+
#if SPH_SMALL_FOOTPRINT_SIMD
|
|
811
|
+
|
|
812
|
+
/*
 * Small-footprint build: A0..D3 alias the 16 entries of whatever
 * array named "state" is in scope, so the step macros operate on it
 * directly. The aliases are removed again with #undef further down.
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define B0   state[ 4]
#define B1   state[ 5]
#define B2   state[ 6]
#define B3   state[ 7]
#define C0   state[ 8]
#define C1   state[ 9]
#define C2   state[10]
#define C3   state[11]
#define D0   state[12]
#define D1   state[13]
#define D2   state[14]
#define D3   state[15]
|
|
828
|
+
|
|
829
|
+
/*
 * Array-based variant of STEP_ELT() used by the small-footprint code.
 *
 * Same update as STEP_ELT(), except that the pre-rotated A values live
 * in an array tA[] and the permuted lane is computed at run time as
 * tA[ppb ^ n] instead of being selected by token pasting. ppb is
 * therefore an integer XOR mask, not a PP4_/PP8_ prefix.
 */
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)
|
|
836
|
+
|
|
837
|
+
/*
 * Small-footprint counterpart of STEP_SMALL(): rotates A0..A3 left by
 * r into a local array tA[4], then applies STEP2_ELT() to lanes 0..3.
 * pp4b is the XOR mask used to pick the permuted lane.
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)
|
|
848
|
+
|
|
849
|
+
static void
|
|
850
|
+
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
|
|
851
|
+
{
|
|
852
|
+
static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
|
|
853
|
+
|
|
854
|
+
STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
|
|
855
|
+
STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
|
|
856
|
+
STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
|
|
857
|
+
STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
|
|
858
|
+
STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
|
|
859
|
+
STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
|
|
860
|
+
STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
|
|
861
|
+
STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
|
|
862
|
+
}
|
|
863
|
+
|
|
864
|
+
static void
|
|
865
|
+
compress_small(sph_simd_small_context *sc, int last)
|
|
866
|
+
{
|
|
867
|
+
unsigned char *x;
|
|
868
|
+
s32 q[128];
|
|
869
|
+
int i;
|
|
870
|
+
u32 w[32];
|
|
871
|
+
u32 state[16];
|
|
872
|
+
size_t u;
|
|
873
|
+
|
|
874
|
+
static const size_t wsp[32] = {
|
|
875
|
+
4 << 3, 6 << 3, 0 << 3, 2 << 3,
|
|
876
|
+
7 << 3, 5 << 3, 3 << 3, 1 << 3,
|
|
877
|
+
15 << 3, 11 << 3, 12 << 3, 8 << 3,
|
|
878
|
+
9 << 3, 13 << 3, 10 << 3, 14 << 3,
|
|
879
|
+
17 << 3, 18 << 3, 23 << 3, 20 << 3,
|
|
880
|
+
22 << 3, 21 << 3, 16 << 3, 19 << 3,
|
|
881
|
+
30 << 3, 24 << 3, 25 << 3, 31 << 3,
|
|
882
|
+
27 << 3, 29 << 3, 28 << 3, 26 << 3
|
|
883
|
+
};
|
|
884
|
+
|
|
885
|
+
x = sc->buf;
|
|
886
|
+
FFT128(0, 1, 0, ll);
|
|
887
|
+
if (last) {
|
|
888
|
+
for (i = 0; i < 128; i ++) {
|
|
889
|
+
s32 tq;
|
|
890
|
+
|
|
891
|
+
tq = q[i] + yoff_s_f[i];
|
|
892
|
+
tq = REDS2(tq);
|
|
893
|
+
tq = REDS1(tq);
|
|
894
|
+
tq = REDS1(tq);
|
|
895
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
896
|
+
}
|
|
897
|
+
} else {
|
|
898
|
+
for (i = 0; i < 128; i ++) {
|
|
899
|
+
s32 tq;
|
|
900
|
+
|
|
901
|
+
tq = q[i] + yoff_s_n[i];
|
|
902
|
+
tq = REDS2(tq);
|
|
903
|
+
tq = REDS1(tq);
|
|
904
|
+
tq = REDS1(tq);
|
|
905
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
906
|
+
}
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
for (i = 0; i < 16; i += 4) {
|
|
910
|
+
state[i + 0] = sc->state[i + 0]
|
|
911
|
+
^ sph_dec32le_aligned(x + 4 * (i + 0));
|
|
912
|
+
state[i + 1] = sc->state[i + 1]
|
|
913
|
+
^ sph_dec32le_aligned(x + 4 * (i + 1));
|
|
914
|
+
state[i + 2] = sc->state[i + 2]
|
|
915
|
+
^ sph_dec32le_aligned(x + 4 * (i + 2));
|
|
916
|
+
state[i + 3] = sc->state[i + 3]
|
|
917
|
+
^ sph_dec32le_aligned(x + 4 * (i + 3));
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
#define WSREAD(sb, o1, o2, mm) do { \
|
|
921
|
+
for (u = 0; u < 32; u += 4) { \
|
|
922
|
+
size_t v = wsp[(u >> 2) + (sb)]; \
|
|
923
|
+
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
|
|
924
|
+
q[v + 2 * 0 + (o2)], mm); \
|
|
925
|
+
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
|
|
926
|
+
q[v + 2 * 1 + (o2)], mm); \
|
|
927
|
+
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
|
|
928
|
+
q[v + 2 * 2 + (o2)], mm); \
|
|
929
|
+
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
|
|
930
|
+
q[v + 2 * 3 + (o2)], mm); \
|
|
931
|
+
} \
|
|
932
|
+
} while (0)
|
|
933
|
+
|
|
934
|
+
WSREAD( 0, 0, 1, 185);
|
|
935
|
+
one_round_small(state, w, 0, 3, 23, 17, 27);
|
|
936
|
+
WSREAD( 8, 0, 1, 185);
|
|
937
|
+
one_round_small(state, w, 2, 28, 19, 22, 7);
|
|
938
|
+
WSREAD(16, -128, -64, 233);
|
|
939
|
+
one_round_small(state, w, 1, 29, 9, 15, 5);
|
|
940
|
+
WSREAD(24, -191, -127, 233);
|
|
941
|
+
one_round_small(state, w, 0, 4, 13, 10, 25);
|
|
942
|
+
|
|
943
|
+
#undef WSREAD
|
|
944
|
+
|
|
945
|
+
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
946
|
+
IF, 4, 13, PP4_2_);
|
|
947
|
+
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
948
|
+
IF, 13, 10, PP4_0_);
|
|
949
|
+
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
950
|
+
IF, 10, 25, PP4_1_);
|
|
951
|
+
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
952
|
+
IF, 25, 4, PP4_2_);
|
|
953
|
+
|
|
954
|
+
memcpy(sc->state, state, sizeof state);
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
/* Drop the state[] aliases so the names can be redefined later in the file. */
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
|
|
973
|
+
|
|
974
|
+
#else
|
|
975
|
+
|
|
976
|
+
/*
 * With SPH_SIMD_NOCOPY the step macros work directly on the context:
 * A0..D3 alias sc->state[0..15] (a pointer named "sc" must be in
 * scope). Without it, nothing is defined here.
 */
#if SPH_SIMD_NOCOPY
#define A0   (sc->state[ 0])
#define A1   (sc->state[ 1])
#define A2   (sc->state[ 2])
#define A3   (sc->state[ 3])
#define B0   (sc->state[ 4])
#define B1   (sc->state[ 5])
#define B2   (sc->state[ 6])
#define B3   (sc->state[ 7])
#define C0   (sc->state[ 8])
#define C1   (sc->state[ 9])
#define C2   (sc->state[10])
#define C3   (sc->state[11])
#define D0   (sc->state[12])
#define D1   (sc->state[13])
#define D2   (sc->state[14])
#define D3   (sc->state[15])
#endif
|
|
994
|
+
|
|
995
|
+
static void
|
|
996
|
+
compress_small(sph_simd_small_context *sc, int last)
|
|
997
|
+
{
|
|
998
|
+
unsigned char *x;
|
|
999
|
+
s32 q[128];
|
|
1000
|
+
int i;
|
|
1001
|
+
DECL_STATE_SMALL
|
|
1002
|
+
#if SPH_SIMD_NOCOPY
|
|
1003
|
+
sph_u32 saved[16];
|
|
1004
|
+
#endif
|
|
1005
|
+
|
|
1006
|
+
#if SPH_SIMD_NOCOPY
|
|
1007
|
+
memcpy(saved, sc->state, sizeof saved);
|
|
1008
|
+
#endif
|
|
1009
|
+
x = sc->buf;
|
|
1010
|
+
FFT128(0, 1, 0, ll);
|
|
1011
|
+
if (last) {
|
|
1012
|
+
for (i = 0; i < 128; i ++) {
|
|
1013
|
+
s32 tq;
|
|
1014
|
+
|
|
1015
|
+
tq = q[i] + yoff_s_f[i];
|
|
1016
|
+
tq = REDS2(tq);
|
|
1017
|
+
tq = REDS1(tq);
|
|
1018
|
+
tq = REDS1(tq);
|
|
1019
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1020
|
+
}
|
|
1021
|
+
} else {
|
|
1022
|
+
for (i = 0; i < 128; i ++) {
|
|
1023
|
+
s32 tq;
|
|
1024
|
+
|
|
1025
|
+
tq = q[i] + yoff_s_n[i];
|
|
1026
|
+
tq = REDS2(tq);
|
|
1027
|
+
tq = REDS1(tq);
|
|
1028
|
+
tq = REDS1(tq);
|
|
1029
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
READ_STATE_SMALL(sc);
|
|
1033
|
+
A0 ^= sph_dec32le_aligned(x + 0);
|
|
1034
|
+
A1 ^= sph_dec32le_aligned(x + 4);
|
|
1035
|
+
A2 ^= sph_dec32le_aligned(x + 8);
|
|
1036
|
+
A3 ^= sph_dec32le_aligned(x + 12);
|
|
1037
|
+
B0 ^= sph_dec32le_aligned(x + 16);
|
|
1038
|
+
B1 ^= sph_dec32le_aligned(x + 20);
|
|
1039
|
+
B2 ^= sph_dec32le_aligned(x + 24);
|
|
1040
|
+
B3 ^= sph_dec32le_aligned(x + 28);
|
|
1041
|
+
C0 ^= sph_dec32le_aligned(x + 32);
|
|
1042
|
+
C1 ^= sph_dec32le_aligned(x + 36);
|
|
1043
|
+
C2 ^= sph_dec32le_aligned(x + 40);
|
|
1044
|
+
C3 ^= sph_dec32le_aligned(x + 44);
|
|
1045
|
+
D0 ^= sph_dec32le_aligned(x + 48);
|
|
1046
|
+
D1 ^= sph_dec32le_aligned(x + 52);
|
|
1047
|
+
D2 ^= sph_dec32le_aligned(x + 56);
|
|
1048
|
+
D3 ^= sph_dec32le_aligned(x + 60);
|
|
1049
|
+
ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
|
|
1050
|
+
ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
|
|
1051
|
+
ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
|
|
1052
|
+
ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
|
|
1053
|
+
#if SPH_SIMD_NOCOPY
|
|
1054
|
+
STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
|
|
1055
|
+
IF, 4, 13, PP4_2_);
|
|
1056
|
+
STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
|
|
1057
|
+
IF, 13, 10, PP4_0_);
|
|
1058
|
+
STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
|
|
1059
|
+
IF, 10, 25, PP4_1_);
|
|
1060
|
+
STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
|
|
1061
|
+
IF, 25, 4, PP4_2_);
|
|
1062
|
+
#else
|
|
1063
|
+
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
1064
|
+
IF, 4, 13, PP4_2_);
|
|
1065
|
+
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
1066
|
+
IF, 13, 10, PP4_0_);
|
|
1067
|
+
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
1068
|
+
IF, 10, 25, PP4_1_);
|
|
1069
|
+
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
1070
|
+
IF, 25, 4, PP4_2_);
|
|
1071
|
+
WRITE_STATE_SMALL(sc);
|
|
1072
|
+
#endif
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
/* Remove the sc->state[] aliases that were defined for SPH_SIMD_NOCOPY. */
#if SPH_SIMD_NOCOPY
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
#endif
|
|
1093
|
+
|
|
1094
|
+
#endif
|
|
1095
|
+
|
|
1096
|
+
#if SPH_SMALL_FOOTPRINT_SIMD
|
|
1097
|
+
|
|
1098
|
+
/*
 * Small-footprint build, big state: A0..D7 alias the 32 entries of
 * whatever array named "state" is in scope (eight lanes per letter).
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define A4   state[ 4]
#define A5   state[ 5]
#define A6   state[ 6]
#define A7   state[ 7]
#define B0   state[ 8]
#define B1   state[ 9]
#define B2   state[10]
#define B3   state[11]
#define B4   state[12]
#define B5   state[13]
#define B6   state[14]
#define B7   state[15]
#define C0   state[16]
#define C1   state[17]
#define C2   state[18]
#define C3   state[19]
#define C4   state[20]
#define C5   state[21]
#define C6   state[22]
#define C7   state[23]
#define D0   state[24]
#define D1   state[25]
#define D2   state[26]
#define D3   state[27]
#define D4   state[28]
#define D5   state[29]
#define D6   state[30]
#define D7   state[31]
|
|
1130
|
+
|
|
1131
|
+
/*
 * STEP2_ELT is not redefined here: the definition given above for
 * SIMD-224 / SIMD-256 is reused unchanged. For reference, it reads:
 *
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)
 */
|
|
1142
|
+
|
|
1143
|
+
/*
 * Small-footprint counterpart of STEP_BIG(): rotates A0..A7 left by r
 * into a local array tA[8], then applies STEP2_ELT() to lanes 0..7.
 * pp8b is the XOR mask used to pick the permuted lane.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA[8]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		tA[4] = ROL32(A4, r); \
		tA[5] = ROL32(A5, r); \
		tA[6] = ROL32(A6, r); \
		tA[7] = ROL32(A7, r); \
		STEP2_ELT(0, w0, fun, s, pp8b); \
		STEP2_ELT(1, w1, fun, s, pp8b); \
		STEP2_ELT(2, w2, fun, s, pp8b); \
		STEP2_ELT(3, w3, fun, s, pp8b); \
		STEP2_ELT(4, w4, fun, s, pp8b); \
		STEP2_ELT(5, w5, fun, s, pp8b); \
		STEP2_ELT(6, w6, fun, s, pp8b); \
		STEP2_ELT(7, w7, fun, s, pp8b); \
	} while (0)
|
|
1162
|
+
|
|
1163
|
+
static void
|
|
1164
|
+
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
|
|
1165
|
+
{
|
|
1166
|
+
static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
|
|
1167
|
+
|
|
1168
|
+
STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
|
|
1169
|
+
IF, p0, p1, pp8k[isp + 0]);
|
|
1170
|
+
STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
|
|
1171
|
+
IF, p1, p2, pp8k[isp + 1]);
|
|
1172
|
+
STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
|
|
1173
|
+
IF, p2, p3, pp8k[isp + 2]);
|
|
1174
|
+
STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
|
|
1175
|
+
IF, p3, p0, pp8k[isp + 3]);
|
|
1176
|
+
STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
|
|
1177
|
+
MAJ, p0, p1, pp8k[isp + 4]);
|
|
1178
|
+
STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
|
|
1179
|
+
MAJ, p1, p2, pp8k[isp + 5]);
|
|
1180
|
+
STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
|
|
1181
|
+
MAJ, p2, p3, pp8k[isp + 6]);
|
|
1182
|
+
STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
|
|
1183
|
+
MAJ, p3, p0, pp8k[isp + 7]);
|
|
1184
|
+
}
|
|
1185
|
+
|
|
1186
|
+
static void
|
|
1187
|
+
compress_big(sph_simd_big_context *sc, int last)
|
|
1188
|
+
{
|
|
1189
|
+
unsigned char *x;
|
|
1190
|
+
s32 q[256];
|
|
1191
|
+
int i;
|
|
1192
|
+
u32 w[64];
|
|
1193
|
+
u32 state[32];
|
|
1194
|
+
size_t u;
|
|
1195
|
+
|
|
1196
|
+
static const size_t wbp[32] = {
|
|
1197
|
+
4 << 4, 6 << 4, 0 << 4, 2 << 4,
|
|
1198
|
+
7 << 4, 5 << 4, 3 << 4, 1 << 4,
|
|
1199
|
+
15 << 4, 11 << 4, 12 << 4, 8 << 4,
|
|
1200
|
+
9 << 4, 13 << 4, 10 << 4, 14 << 4,
|
|
1201
|
+
17 << 4, 18 << 4, 23 << 4, 20 << 4,
|
|
1202
|
+
22 << 4, 21 << 4, 16 << 4, 19 << 4,
|
|
1203
|
+
30 << 4, 24 << 4, 25 << 4, 31 << 4,
|
|
1204
|
+
27 << 4, 29 << 4, 28 << 4, 26 << 4
|
|
1205
|
+
};
|
|
1206
|
+
|
|
1207
|
+
x = sc->buf;
|
|
1208
|
+
FFT256(0, 1, 0, ll);
|
|
1209
|
+
if (last) {
|
|
1210
|
+
for (i = 0; i < 256; i ++) {
|
|
1211
|
+
s32 tq;
|
|
1212
|
+
|
|
1213
|
+
tq = q[i] + yoff_b_f[i];
|
|
1214
|
+
tq = REDS2(tq);
|
|
1215
|
+
tq = REDS1(tq);
|
|
1216
|
+
tq = REDS1(tq);
|
|
1217
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1218
|
+
}
|
|
1219
|
+
} else {
|
|
1220
|
+
for (i = 0; i < 256; i ++) {
|
|
1221
|
+
s32 tq;
|
|
1222
|
+
|
|
1223
|
+
tq = q[i] + yoff_b_n[i];
|
|
1224
|
+
tq = REDS2(tq);
|
|
1225
|
+
tq = REDS1(tq);
|
|
1226
|
+
tq = REDS1(tq);
|
|
1227
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
for (i = 0; i < 32; i += 8) {
|
|
1232
|
+
state[i + 0] = sc->state[i + 0]
|
|
1233
|
+
^ sph_dec32le_aligned(x + 4 * (i + 0));
|
|
1234
|
+
state[i + 1] = sc->state[i + 1]
|
|
1235
|
+
^ sph_dec32le_aligned(x + 4 * (i + 1));
|
|
1236
|
+
state[i + 2] = sc->state[i + 2]
|
|
1237
|
+
^ sph_dec32le_aligned(x + 4 * (i + 2));
|
|
1238
|
+
state[i + 3] = sc->state[i + 3]
|
|
1239
|
+
^ sph_dec32le_aligned(x + 4 * (i + 3));
|
|
1240
|
+
state[i + 4] = sc->state[i + 4]
|
|
1241
|
+
^ sph_dec32le_aligned(x + 4 * (i + 4));
|
|
1242
|
+
state[i + 5] = sc->state[i + 5]
|
|
1243
|
+
^ sph_dec32le_aligned(x + 4 * (i + 5));
|
|
1244
|
+
state[i + 6] = sc->state[i + 6]
|
|
1245
|
+
^ sph_dec32le_aligned(x + 4 * (i + 6));
|
|
1246
|
+
state[i + 7] = sc->state[i + 7]
|
|
1247
|
+
^ sph_dec32le_aligned(x + 4 * (i + 7));
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
#define WBREAD(sb, o1, o2, mm) do { \
|
|
1251
|
+
for (u = 0; u < 64; u += 8) { \
|
|
1252
|
+
size_t v = wbp[(u >> 3) + (sb)]; \
|
|
1253
|
+
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
|
|
1254
|
+
q[v + 2 * 0 + (o2)], mm); \
|
|
1255
|
+
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
|
|
1256
|
+
q[v + 2 * 1 + (o2)], mm); \
|
|
1257
|
+
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
|
|
1258
|
+
q[v + 2 * 2 + (o2)], mm); \
|
|
1259
|
+
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
|
|
1260
|
+
q[v + 2 * 3 + (o2)], mm); \
|
|
1261
|
+
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
|
|
1262
|
+
q[v + 2 * 4 + (o2)], mm); \
|
|
1263
|
+
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
|
|
1264
|
+
q[v + 2 * 5 + (o2)], mm); \
|
|
1265
|
+
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
|
|
1266
|
+
q[v + 2 * 6 + (o2)], mm); \
|
|
1267
|
+
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
|
|
1268
|
+
q[v + 2 * 7 + (o2)], mm); \
|
|
1269
|
+
} \
|
|
1270
|
+
} while (0)
|
|
1271
|
+
|
|
1272
|
+
WBREAD( 0, 0, 1, 185);
|
|
1273
|
+
one_round_big(state, w, 0, 3, 23, 17, 27);
|
|
1274
|
+
WBREAD( 8, 0, 1, 185);
|
|
1275
|
+
one_round_big(state, w, 1, 28, 19, 22, 7);
|
|
1276
|
+
WBREAD(16, -256, -128, 233);
|
|
1277
|
+
one_round_big(state, w, 2, 29, 9, 15, 5);
|
|
1278
|
+
WBREAD(24, -383, -255, 233);
|
|
1279
|
+
one_round_big(state, w, 3, 4, 13, 10, 25);
|
|
1280
|
+
|
|
1281
|
+
#undef WBREAD
|
|
1282
|
+
|
|
1283
|
+
STEP_BIG(
|
|
1284
|
+
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
1285
|
+
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
1286
|
+
IF, 4, 13, PP8_4_);
|
|
1287
|
+
STEP_BIG(
|
|
1288
|
+
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
1289
|
+
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
1290
|
+
IF, 13, 10, PP8_5_);
|
|
1291
|
+
STEP_BIG(
|
|
1292
|
+
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
|
|
1293
|
+
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
|
|
1294
|
+
IF, 10, 25, PP8_6_);
|
|
1295
|
+
STEP_BIG(
|
|
1296
|
+
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
|
|
1297
|
+
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
|
|
1298
|
+
IF, 25, 4, PP8_0_);
|
|
1299
|
+
|
|
1300
|
+
memcpy(sc->state, state, sizeof state);
|
|
1301
|
+
}
|
|
1302
|
+
|
|
1303
|
+
#undef A0
|
|
1304
|
+
#undef A1
|
|
1305
|
+
#undef A2
|
|
1306
|
+
#undef A3
|
|
1307
|
+
#undef A4
|
|
1308
|
+
#undef A5
|
|
1309
|
+
#undef A6
|
|
1310
|
+
#undef A7
|
|
1311
|
+
#undef B0
|
|
1312
|
+
#undef B1
|
|
1313
|
+
#undef B2
|
|
1314
|
+
#undef B3
|
|
1315
|
+
#undef B4
|
|
1316
|
+
#undef B5
|
|
1317
|
+
#undef B6
|
|
1318
|
+
#undef B7
|
|
1319
|
+
#undef C0
|
|
1320
|
+
#undef C1
|
|
1321
|
+
#undef C2
|
|
1322
|
+
#undef C3
|
|
1323
|
+
#undef C4
|
|
1324
|
+
#undef C5
|
|
1325
|
+
#undef C6
|
|
1326
|
+
#undef C7
|
|
1327
|
+
#undef D0
|
|
1328
|
+
#undef D1
|
|
1329
|
+
#undef D2
|
|
1330
|
+
#undef D3
|
|
1331
|
+
#undef D4
|
|
1332
|
+
#undef D5
|
|
1333
|
+
#undef D6
|
|
1334
|
+
#undef D7
|
|
1335
|
+
|
|
1336
|
+
#else
|
|
1337
|
+
|
|
1338
|
+
#if SPH_SIMD_NOCOPY
|
|
1339
|
+
#define A0 (sc->state[ 0])
|
|
1340
|
+
#define A1 (sc->state[ 1])
|
|
1341
|
+
#define A2 (sc->state[ 2])
|
|
1342
|
+
#define A3 (sc->state[ 3])
|
|
1343
|
+
#define A4 (sc->state[ 4])
|
|
1344
|
+
#define A5 (sc->state[ 5])
|
|
1345
|
+
#define A6 (sc->state[ 6])
|
|
1346
|
+
#define A7 (sc->state[ 7])
|
|
1347
|
+
#define B0 (sc->state[ 8])
|
|
1348
|
+
#define B1 (sc->state[ 9])
|
|
1349
|
+
#define B2 (sc->state[10])
|
|
1350
|
+
#define B3 (sc->state[11])
|
|
1351
|
+
#define B4 (sc->state[12])
|
|
1352
|
+
#define B5 (sc->state[13])
|
|
1353
|
+
#define B6 (sc->state[14])
|
|
1354
|
+
#define B7 (sc->state[15])
|
|
1355
|
+
#define C0 (sc->state[16])
|
|
1356
|
+
#define C1 (sc->state[17])
|
|
1357
|
+
#define C2 (sc->state[18])
|
|
1358
|
+
#define C3 (sc->state[19])
|
|
1359
|
+
#define C4 (sc->state[20])
|
|
1360
|
+
#define C5 (sc->state[21])
|
|
1361
|
+
#define C6 (sc->state[22])
|
|
1362
|
+
#define C7 (sc->state[23])
|
|
1363
|
+
#define D0 (sc->state[24])
|
|
1364
|
+
#define D1 (sc->state[25])
|
|
1365
|
+
#define D2 (sc->state[26])
|
|
1366
|
+
#define D3 (sc->state[27])
|
|
1367
|
+
#define D4 (sc->state[28])
|
|
1368
|
+
#define D5 (sc->state[29])
|
|
1369
|
+
#define D6 (sc->state[30])
|
|
1370
|
+
#define D7 (sc->state[31])
|
|
1371
|
+
#endif
|
|
1372
|
+
|
|
1373
|
+
static void
|
|
1374
|
+
compress_big(sph_simd_big_context *sc, int last)
|
|
1375
|
+
{
|
|
1376
|
+
unsigned char *x;
|
|
1377
|
+
s32 q[256];
|
|
1378
|
+
int i;
|
|
1379
|
+
DECL_STATE_BIG
|
|
1380
|
+
#if SPH_SIMD_NOCOPY
|
|
1381
|
+
sph_u32 saved[32];
|
|
1382
|
+
#endif
|
|
1383
|
+
|
|
1384
|
+
#if SPH_SIMD_NOCOPY
|
|
1385
|
+
memcpy(saved, sc->state, sizeof saved);
|
|
1386
|
+
#endif
|
|
1387
|
+
|
|
1388
|
+
x = sc->buf;
|
|
1389
|
+
FFT256(0, 1, 0, ll);
|
|
1390
|
+
if (last) {
|
|
1391
|
+
for (i = 0; i < 256; i ++) {
|
|
1392
|
+
s32 tq;
|
|
1393
|
+
|
|
1394
|
+
tq = q[i] + yoff_b_f[i];
|
|
1395
|
+
tq = REDS2(tq);
|
|
1396
|
+
tq = REDS1(tq);
|
|
1397
|
+
tq = REDS1(tq);
|
|
1398
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1399
|
+
}
|
|
1400
|
+
} else {
|
|
1401
|
+
for (i = 0; i < 256; i ++) {
|
|
1402
|
+
s32 tq;
|
|
1403
|
+
|
|
1404
|
+
tq = q[i] + yoff_b_n[i];
|
|
1405
|
+
tq = REDS2(tq);
|
|
1406
|
+
tq = REDS1(tq);
|
|
1407
|
+
tq = REDS1(tq);
|
|
1408
|
+
q[i] = (tq <= 128 ? tq : tq - 257);
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
READ_STATE_BIG(sc);
|
|
1412
|
+
A0 ^= sph_dec32le_aligned(x + 0);
|
|
1413
|
+
A1 ^= sph_dec32le_aligned(x + 4);
|
|
1414
|
+
A2 ^= sph_dec32le_aligned(x + 8);
|
|
1415
|
+
A3 ^= sph_dec32le_aligned(x + 12);
|
|
1416
|
+
A4 ^= sph_dec32le_aligned(x + 16);
|
|
1417
|
+
A5 ^= sph_dec32le_aligned(x + 20);
|
|
1418
|
+
A6 ^= sph_dec32le_aligned(x + 24);
|
|
1419
|
+
A7 ^= sph_dec32le_aligned(x + 28);
|
|
1420
|
+
B0 ^= sph_dec32le_aligned(x + 32);
|
|
1421
|
+
B1 ^= sph_dec32le_aligned(x + 36);
|
|
1422
|
+
B2 ^= sph_dec32le_aligned(x + 40);
|
|
1423
|
+
B3 ^= sph_dec32le_aligned(x + 44);
|
|
1424
|
+
B4 ^= sph_dec32le_aligned(x + 48);
|
|
1425
|
+
B5 ^= sph_dec32le_aligned(x + 52);
|
|
1426
|
+
B6 ^= sph_dec32le_aligned(x + 56);
|
|
1427
|
+
B7 ^= sph_dec32le_aligned(x + 60);
|
|
1428
|
+
C0 ^= sph_dec32le_aligned(x + 64);
|
|
1429
|
+
C1 ^= sph_dec32le_aligned(x + 68);
|
|
1430
|
+
C2 ^= sph_dec32le_aligned(x + 72);
|
|
1431
|
+
C3 ^= sph_dec32le_aligned(x + 76);
|
|
1432
|
+
C4 ^= sph_dec32le_aligned(x + 80);
|
|
1433
|
+
C5 ^= sph_dec32le_aligned(x + 84);
|
|
1434
|
+
C6 ^= sph_dec32le_aligned(x + 88);
|
|
1435
|
+
C7 ^= sph_dec32le_aligned(x + 92);
|
|
1436
|
+
D0 ^= sph_dec32le_aligned(x + 96);
|
|
1437
|
+
D1 ^= sph_dec32le_aligned(x + 100);
|
|
1438
|
+
D2 ^= sph_dec32le_aligned(x + 104);
|
|
1439
|
+
D3 ^= sph_dec32le_aligned(x + 108);
|
|
1440
|
+
D4 ^= sph_dec32le_aligned(x + 112);
|
|
1441
|
+
D5 ^= sph_dec32le_aligned(x + 116);
|
|
1442
|
+
D6 ^= sph_dec32le_aligned(x + 120);
|
|
1443
|
+
D7 ^= sph_dec32le_aligned(x + 124);
|
|
1444
|
+
|
|
1445
|
+
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
|
|
1446
|
+
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
|
|
1447
|
+
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
|
|
1448
|
+
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
|
|
1449
|
+
#if SPH_SIMD_NOCOPY
|
|
1450
|
+
STEP_BIG(
|
|
1451
|
+
saved[ 0], saved[ 1], saved[ 2], saved[ 3],
|
|
1452
|
+
saved[ 4], saved[ 5], saved[ 6], saved[ 7],
|
|
1453
|
+
IF, 4, 13, PP8_4_);
|
|
1454
|
+
STEP_BIG(
|
|
1455
|
+
saved[ 8], saved[ 9], saved[10], saved[11],
|
|
1456
|
+
saved[12], saved[13], saved[14], saved[15],
|
|
1457
|
+
IF, 13, 10, PP8_5_);
|
|
1458
|
+
STEP_BIG(
|
|
1459
|
+
saved[16], saved[17], saved[18], saved[19],
|
|
1460
|
+
saved[20], saved[21], saved[22], saved[23],
|
|
1461
|
+
IF, 10, 25, PP8_6_);
|
|
1462
|
+
STEP_BIG(
|
|
1463
|
+
saved[24], saved[25], saved[26], saved[27],
|
|
1464
|
+
saved[28], saved[29], saved[30], saved[31],
|
|
1465
|
+
IF, 25, 4, PP8_0_);
|
|
1466
|
+
#else
|
|
1467
|
+
STEP_BIG(
|
|
1468
|
+
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
|
|
1469
|
+
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
|
|
1470
|
+
IF, 4, 13, PP8_4_);
|
|
1471
|
+
STEP_BIG(
|
|
1472
|
+
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
|
|
1473
|
+
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
|
|
1474
|
+
IF, 13, 10, PP8_5_);
|
|
1475
|
+
STEP_BIG(
|
|
1476
|
+
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
|
|
1477
|
+
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
|
|
1478
|
+
IF, 10, 25, PP8_6_);
|
|
1479
|
+
STEP_BIG(
|
|
1480
|
+
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
|
|
1481
|
+
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
|
|
1482
|
+
IF, 25, 4, PP8_0_);
|
|
1483
|
+
WRITE_STATE_BIG(sc);
|
|
1484
|
+
#endif
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
#if SPH_SIMD_NOCOPY
|
|
1488
|
+
#undef A0
|
|
1489
|
+
#undef A1
|
|
1490
|
+
#undef A2
|
|
1491
|
+
#undef A3
|
|
1492
|
+
#undef A4
|
|
1493
|
+
#undef A5
|
|
1494
|
+
#undef A6
|
|
1495
|
+
#undef A7
|
|
1496
|
+
#undef B0
|
|
1497
|
+
#undef B1
|
|
1498
|
+
#undef B2
|
|
1499
|
+
#undef B3
|
|
1500
|
+
#undef B4
|
|
1501
|
+
#undef B5
|
|
1502
|
+
#undef B6
|
|
1503
|
+
#undef B7
|
|
1504
|
+
#undef C0
|
|
1505
|
+
#undef C1
|
|
1506
|
+
#undef C2
|
|
1507
|
+
#undef C3
|
|
1508
|
+
#undef C4
|
|
1509
|
+
#undef C5
|
|
1510
|
+
#undef C6
|
|
1511
|
+
#undef C7
|
|
1512
|
+
#undef D0
|
|
1513
|
+
#undef D1
|
|
1514
|
+
#undef D2
|
|
1515
|
+
#undef D3
|
|
1516
|
+
#undef D4
|
|
1517
|
+
#undef D5
|
|
1518
|
+
#undef D6
|
|
1519
|
+
#undef D7
|
|
1520
|
+
#endif
|
|
1521
|
+
|
|
1522
|
+
#endif
|
|
1523
|
+
|
|
1524
|
+
static const u32 IV224[] = {
|
|
1525
|
+
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
|
|
1526
|
+
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
|
|
1527
|
+
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
|
|
1528
|
+
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
|
|
1529
|
+
};
|
|
1530
|
+
|
|
1531
|
+
static const u32 IV256[] = {
|
|
1532
|
+
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
|
|
1533
|
+
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
|
|
1534
|
+
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
|
|
1535
|
+
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
|
|
1536
|
+
};
|
|
1537
|
+
|
|
1538
|
+
static const u32 IV384[] = {
|
|
1539
|
+
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
|
|
1540
|
+
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
|
|
1541
|
+
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
|
|
1542
|
+
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
|
|
1543
|
+
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
|
|
1544
|
+
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
|
|
1545
|
+
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
|
|
1546
|
+
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
|
|
1547
|
+
};
|
|
1548
|
+
|
|
1549
|
+
static const u32 IV512[] = {
|
|
1550
|
+
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
|
|
1551
|
+
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
|
|
1552
|
+
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
|
|
1553
|
+
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
|
|
1554
|
+
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
|
|
1555
|
+
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
|
|
1556
|
+
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
|
|
1557
|
+
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
|
|
1558
|
+
};
|
|
1559
|
+
|
|
1560
|
+
static void
|
|
1561
|
+
init_small(void *cc, const u32 *iv)
|
|
1562
|
+
{
|
|
1563
|
+
sph_simd_small_context *sc;
|
|
1564
|
+
|
|
1565
|
+
sc = cc;
|
|
1566
|
+
memcpy(sc->state, iv, sizeof sc->state);
|
|
1567
|
+
sc->count_low = sc->count_high = 0;
|
|
1568
|
+
sc->ptr = 0;
|
|
1569
|
+
}
|
|
1570
|
+
|
|
1571
|
+
static void
|
|
1572
|
+
init_big(void *cc, const u32 *iv)
|
|
1573
|
+
{
|
|
1574
|
+
sph_simd_big_context *sc;
|
|
1575
|
+
|
|
1576
|
+
sc = cc;
|
|
1577
|
+
memcpy(sc->state, iv, sizeof sc->state);
|
|
1578
|
+
sc->count_low = sc->count_high = 0;
|
|
1579
|
+
sc->ptr = 0;
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1582
|
+
static void
|
|
1583
|
+
update_small(void *cc, const void *data, size_t len)
|
|
1584
|
+
{
|
|
1585
|
+
sph_simd_small_context *sc;
|
|
1586
|
+
|
|
1587
|
+
sc = cc;
|
|
1588
|
+
while (len > 0) {
|
|
1589
|
+
size_t clen;
|
|
1590
|
+
|
|
1591
|
+
clen = (sizeof sc->buf) - sc->ptr;
|
|
1592
|
+
if (clen > len)
|
|
1593
|
+
clen = len;
|
|
1594
|
+
memcpy(sc->buf + sc->ptr, data, clen);
|
|
1595
|
+
data = (const unsigned char *)data + clen;
|
|
1596
|
+
len -= clen;
|
|
1597
|
+
if ((sc->ptr += clen) == sizeof sc->buf) {
|
|
1598
|
+
compress_small(sc, 0);
|
|
1599
|
+
sc->ptr = 0;
|
|
1600
|
+
sc->count_low = T32(sc->count_low + 1);
|
|
1601
|
+
if (sc->count_low == 0)
|
|
1602
|
+
sc->count_high ++;
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1605
|
+
}
|
|
1606
|
+
|
|
1607
|
+
static void
|
|
1608
|
+
update_big(void *cc, const void *data, size_t len)
|
|
1609
|
+
{
|
|
1610
|
+
sph_simd_big_context *sc;
|
|
1611
|
+
|
|
1612
|
+
sc = cc;
|
|
1613
|
+
while (len > 0) {
|
|
1614
|
+
size_t clen;
|
|
1615
|
+
|
|
1616
|
+
clen = (sizeof sc->buf) - sc->ptr;
|
|
1617
|
+
if (clen > len)
|
|
1618
|
+
clen = len;
|
|
1619
|
+
memcpy(sc->buf + sc->ptr, data, clen);
|
|
1620
|
+
data = (const unsigned char *)data + clen;
|
|
1621
|
+
len -= clen;
|
|
1622
|
+
if ((sc->ptr += clen) == sizeof sc->buf) {
|
|
1623
|
+
compress_big(sc, 0);
|
|
1624
|
+
sc->ptr = 0;
|
|
1625
|
+
sc->count_low = T32(sc->count_low + 1);
|
|
1626
|
+
if (sc->count_low == 0)
|
|
1627
|
+
sc->count_high ++;
|
|
1628
|
+
}
|
|
1629
|
+
}
|
|
1630
|
+
}
|
|
1631
|
+
|
|
1632
|
+
static void
|
|
1633
|
+
encode_count_small(unsigned char *dst,
|
|
1634
|
+
u32 low, u32 high, size_t ptr, unsigned n)
|
|
1635
|
+
{
|
|
1636
|
+
low = T32(low << 9);
|
|
1637
|
+
high = T32(high << 9) + (low >> 23);
|
|
1638
|
+
low += (ptr << 3) + n;
|
|
1639
|
+
sph_enc32le(dst, low);
|
|
1640
|
+
sph_enc32le(dst + 4, high);
|
|
1641
|
+
}
|
|
1642
|
+
|
|
1643
|
+
static void
|
|
1644
|
+
encode_count_big(unsigned char *dst,
|
|
1645
|
+
u32 low, u32 high, size_t ptr, unsigned n)
|
|
1646
|
+
{
|
|
1647
|
+
low = T32(low << 10);
|
|
1648
|
+
high = T32(high << 10) + (low >> 22);
|
|
1649
|
+
low += (ptr << 3) + n;
|
|
1650
|
+
sph_enc32le(dst, low);
|
|
1651
|
+
sph_enc32le(dst + 4, high);
|
|
1652
|
+
}
|
|
1653
|
+
|
|
1654
|
+
static void
|
|
1655
|
+
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
|
|
1656
|
+
{
|
|
1657
|
+
sph_simd_small_context *sc;
|
|
1658
|
+
unsigned char *d;
|
|
1659
|
+
size_t u;
|
|
1660
|
+
|
|
1661
|
+
sc = cc;
|
|
1662
|
+
if (sc->ptr > 0 || n > 0) {
|
|
1663
|
+
memset(sc->buf + sc->ptr, 0,
|
|
1664
|
+
(sizeof sc->buf) - sc->ptr);
|
|
1665
|
+
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
|
|
1666
|
+
compress_small(sc, 0);
|
|
1667
|
+
}
|
|
1668
|
+
memset(sc->buf, 0, sizeof sc->buf);
|
|
1669
|
+
encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
|
|
1670
|
+
compress_small(sc, 1);
|
|
1671
|
+
d = dst;
|
|
1672
|
+
for (d = dst, u = 0; u < dst_len; u ++)
|
|
1673
|
+
sph_enc32le(d + (u << 2), sc->state[u]);
|
|
1674
|
+
}
|
|
1675
|
+
|
|
1676
|
+
static void
|
|
1677
|
+
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
|
|
1678
|
+
{
|
|
1679
|
+
sph_simd_big_context *sc;
|
|
1680
|
+
unsigned char *d;
|
|
1681
|
+
size_t u;
|
|
1682
|
+
|
|
1683
|
+
sc = cc;
|
|
1684
|
+
if (sc->ptr > 0 || n > 0) {
|
|
1685
|
+
memset(sc->buf + sc->ptr, 0,
|
|
1686
|
+
(sizeof sc->buf) - sc->ptr);
|
|
1687
|
+
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
|
|
1688
|
+
compress_big(sc, 0);
|
|
1689
|
+
}
|
|
1690
|
+
memset(sc->buf, 0, sizeof sc->buf);
|
|
1691
|
+
encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
|
|
1692
|
+
compress_big(sc, 1);
|
|
1693
|
+
d = dst;
|
|
1694
|
+
for (d = dst, u = 0; u < dst_len; u ++)
|
|
1695
|
+
sph_enc32le(d + (u << 2), sc->state[u]);
|
|
1696
|
+
}
|
|
1697
|
+
|
|
1698
|
+
void
|
|
1699
|
+
sph_simd224_init(void *cc)
|
|
1700
|
+
{
|
|
1701
|
+
init_small(cc, IV224);
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
/*
 * Feed len bytes from data into a running SIMD-224 computation.
 */
void
sph_simd224(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
|
|
1709
|
+
|
|
1710
|
+
/*
 * Finish a SIMD-224 computation with no extra trailing bits; the
 * digest is written to dst. Delegates to the addbits variant.
 */
void
sph_simd224_close(void *cc, void *dst)
{
	sph_simd224_addbits_and_close(cc, 0, 0, dst);
}
|
|
1715
|
+
|
|
1716
|
+
void
|
|
1717
|
+
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
1718
|
+
{
|
|
1719
|
+
finalize_small(cc, ub, n, dst, 7);
|
|
1720
|
+
sph_simd224_init(cc);
|
|
1721
|
+
}
|
|
1722
|
+
|
|
1723
|
+
void
|
|
1724
|
+
sph_simd256_init(void *cc)
|
|
1725
|
+
{
|
|
1726
|
+
init_small(cc, IV256);
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
/*
 * Feed len bytes from data into a running SIMD-256 computation.
 */
void
sph_simd256(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
|
|
1734
|
+
|
|
1735
|
+
/*
 * Finish a SIMD-256 computation with no extra trailing bits; the
 * digest is written to dst. Delegates to the addbits variant.
 */
void
sph_simd256_close(void *cc, void *dst)
{
	sph_simd256_addbits_and_close(cc, 0, 0, dst);
}
|
|
1740
|
+
|
|
1741
|
+
void
|
|
1742
|
+
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
1743
|
+
{
|
|
1744
|
+
finalize_small(cc, ub, n, dst, 8);
|
|
1745
|
+
sph_simd256_init(cc);
|
|
1746
|
+
}
|
|
1747
|
+
|
|
1748
|
+
void
|
|
1749
|
+
sph_simd384_init(void *cc)
|
|
1750
|
+
{
|
|
1751
|
+
init_big(cc, IV384);
|
|
1752
|
+
}
|
|
1753
|
+
|
|
1754
|
+
/*
 * Feed len bytes from data into a running SIMD-384 computation.
 */
void
sph_simd384(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
|
|
1759
|
+
|
|
1760
|
+
/*
 * Finish a SIMD-384 computation with no extra trailing bits; the
 * digest is written to dst. Delegates to the addbits variant.
 */
void
sph_simd384_close(void *cc, void *dst)
{
	sph_simd384_addbits_and_close(cc, 0, 0, dst);
}
|
|
1765
|
+
|
|
1766
|
+
void
|
|
1767
|
+
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
1768
|
+
{
|
|
1769
|
+
finalize_big(cc, ub, n, dst, 12);
|
|
1770
|
+
sph_simd384_init(cc);
|
|
1771
|
+
}
|
|
1772
|
+
|
|
1773
|
+
void
|
|
1774
|
+
sph_simd512_init(void *cc)
|
|
1775
|
+
{
|
|
1776
|
+
init_big(cc, IV512);
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1779
|
+
/*
 * Feed len bytes from data into a running SIMD-512 computation.
 */
void
sph_simd512(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
|
|
1784
|
+
|
|
1785
|
+
/*
 * Finish a SIMD-512 computation with no extra trailing bits; the
 * digest is written to dst. Delegates to the addbits variant.
 */
void
sph_simd512_close(void *cc, void *dst)
{
	sph_simd512_addbits_and_close(cc, 0, 0, dst);
}
|
|
1790
|
+
|
|
1791
|
+
void
|
|
1792
|
+
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|
1793
|
+
{
|
|
1794
|
+
finalize_big(cc, ub, n, dst, 16);
|
|
1795
|
+
sph_simd512_init(cc);
|
|
1796
|
+
}
|
|
1797
|
+
#ifdef __cplusplus
|
|
1798
|
+
}
|
|
1799
|
+
#endif
|