@leocuvee/wrkzcoin-multi-hashing 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314)
  1. package/.travis.yml +5 -0
  2. package/LICENSE +674 -0
  3. package/README.md +87 -0
  4. package/appveyor.yml +12 -0
  5. package/argon2/.gitattributes +10 -0
  6. package/argon2/.travis.yml +25 -0
  7. package/argon2/Argon2.sln +160 -0
  8. package/argon2/CHANGELOG.md +32 -0
  9. package/argon2/CMakeLists.txt +87 -0
  10. package/argon2/LICENSE +314 -0
  11. package/argon2/Makefile +196 -0
  12. package/argon2/README.md +297 -0
  13. package/argon2/appveyor.yml +40 -0
  14. package/argon2/argon2-specs.pdf +0 -0
  15. package/argon2/export.sh +7 -0
  16. package/argon2/include/argon2.h +427 -0
  17. package/argon2/latex/CMakeLists.txt +34 -0
  18. package/argon2/latex/IEEEtran.cls +6347 -0
  19. package/argon2/latex/Makefile +18 -0
  20. package/argon2/latex/argon2-specs.tex +920 -0
  21. package/argon2/latex/pics/argon2-par.pdf +0 -0
  22. package/argon2/latex/pics/compression.pdf +0 -0
  23. package/argon2/latex/pics/generic.pdf +0 -0
  24. package/argon2/latex/pics/power-distribution.jpg +0 -0
  25. package/argon2/latex/tradeoff.bib +822 -0
  26. package/argon2/libargon2.pc +16 -0
  27. package/argon2/man/CMakeLists.txt +8 -0
  28. package/argon2/man/argon2.1 +57 -0
  29. package/argon2/meson.build +16 -0
  30. package/argon2/meson_options.txt +1 -0
  31. package/argon2/src/CMakeLists.txt +147 -0
  32. package/argon2/src/argon2.c +452 -0
  33. package/argon2/src/argon2.pc.in +11 -0
  34. package/argon2/src/blake2/blake2-impl.h +156 -0
  35. package/argon2/src/blake2/blake2.h +89 -0
  36. package/argon2/src/blake2/blake2b.c +390 -0
  37. package/argon2/src/blake2/blamka-round-opt.h +471 -0
  38. package/argon2/src/blake2/blamka-round-ref.h +56 -0
  39. package/argon2/src/core.c +634 -0
  40. package/argon2/src/core.h +228 -0
  41. package/argon2/src/encoding.c +467 -0
  42. package/argon2/src/encoding.h +57 -0
  43. package/argon2/src/genkat.h +51 -0
  44. package/argon2/src/meson.build +68 -0
  45. package/argon2/src/opt.c +283 -0
  46. package/argon2/src/optimization/CMakeLists.txt +10 -0
  47. package/argon2/src/ref.c +194 -0
  48. package/argon2/src/thread.c +57 -0
  49. package/argon2/src/thread.h +67 -0
  50. package/argon2/tests/CMakeLists.txt +43 -0
  51. package/argon2/tests/bench.c +111 -0
  52. package/argon2/tests/genkat.c +207 -0
  53. package/argon2/tests/kats/argon2d +12304 -0
  54. package/argon2/tests/kats/argon2d.shasum +1 -0
  55. package/argon2/tests/kats/argon2d_v16 +12304 -0
  56. package/argon2/tests/kats/argon2d_v16.shasum +1 -0
  57. package/argon2/tests/kats/argon2i +12304 -0
  58. package/argon2/tests/kats/argon2i.shasum +1 -0
  59. package/argon2/tests/kats/argon2i_v16 +12304 -0
  60. package/argon2/tests/kats/argon2i_v16.shasum +1 -0
  61. package/argon2/tests/kats/argon2id +12304 -0
  62. package/argon2/tests/kats/argon2id.shasum +1 -0
  63. package/argon2/tests/kats/argon2id_v16 +12304 -0
  64. package/argon2/tests/kats/argon2id_v16.shasum +1 -0
  65. package/argon2/tests/kats/check-sums.ps1 +48 -0
  66. package/argon2/tests/kats/check-sums.sh +16 -0
  67. package/argon2/tests/kats/test.ps1 +132 -0
  68. package/argon2/tests/kats/test.sh +117 -0
  69. package/argon2/tests/meson.build +34 -0
  70. package/argon2/tests/test.c +289 -0
  71. package/argon2/tool/CMakeLists.txt +7 -0
  72. package/argon2/tool/main.c +339 -0
  73. package/argon2/tool/meson.build +8 -0
  74. package/argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj +226 -0
  75. package/argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj.filters +69 -0
  76. package/argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj +226 -0
  77. package/argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj.filters +69 -0
  78. package/argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj +225 -0
  79. package/argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj.filters +66 -0
  80. package/argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj +239 -0
  81. package/argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj.filters +72 -0
  82. package/argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj +227 -0
  83. package/argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj.filters +69 -0
  84. package/argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj +226 -0
  85. package/argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj.filters +69 -0
  86. package/argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj +226 -0
  87. package/argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj.filters +69 -0
  88. package/argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj +225 -0
  89. package/argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj.filters +66 -0
  90. package/argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj +227 -0
  91. package/argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj.filters +72 -0
  92. package/argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj +226 -0
  93. package/argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj.filters +69 -0
  94. package/bcrypt.c +566 -0
  95. package/bcrypt.h +14 -0
  96. package/binding.gyp +93 -0
  97. package/blake.c +17 -0
  98. package/blake.h +16 -0
  99. package/boolberry.cc +11 -0
  100. package/boolberry.h +6 -0
  101. package/build/Makefile +354 -0
  102. package/build/Release/.deps/Release/multihashing.node.d +1 -0
  103. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/argon2.o.d +8 -0
  104. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/blake2/blake2b.o.d +8 -0
  105. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/core.o.d +10 -0
  106. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/encoding.o.d +8 -0
  107. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/ref.o.d +14 -0
  108. package/build/Release/.deps/Release/obj.target/multihashing/argon2/src/thread.o.d +5 -0
  109. package/build/Release/.deps/Release/obj.target/multihashing/bcrypt.o.d +4 -0
  110. package/build/Release/.deps/Release/obj.target/multihashing/blake.o.d +7 -0
  111. package/build/Release/.deps/Release/obj.target/multihashing/boolberry.o.d +12 -0
  112. package/build/Release/.deps/Release/obj.target/multihashing/c11.o.d +20 -0
  113. package/build/Release/.deps/Release/obj.target/multihashing/crypto/aesb.o.d +3 -0
  114. package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_blake256.o.d +5 -0
  115. package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_groestl.o.d +10 -0
  116. package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_jh.o.d +9 -0
  117. package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_keccak.o.d +7 -0
  118. package/build/Release/.deps/Release/obj.target/multihashing/crypto/c_skein.o.d +10 -0
  119. package/build/Release/.deps/Release/obj.target/multihashing/crypto/hash.o.d +7 -0
  120. package/build/Release/.deps/Release/obj.target/multihashing/crypto/oaes_lib.o.d +6 -0
  121. package/build/Release/.deps/Release/obj.target/multihashing/crypto/wild_keccak.o.d +8 -0
  122. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight.o.d +18 -0
  123. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_dark.o.d +18 -0
  124. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_dark_lite.o.d +18 -0
  125. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_fast.o.d +18 -0
  126. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_lite.o.d +18 -0
  127. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_soft_shell.o.d +18 -0
  128. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_turtle.o.d +18 -0
  129. package/build/Release/.deps/Release/obj.target/multihashing/cryptonight_turtle_lite.o.d +18 -0
  130. package/build/Release/.deps/Release/obj.target/multihashing/fresh.o.d +10 -0
  131. package/build/Release/.deps/Release/obj.target/multihashing/fugue.o.d +7 -0
  132. package/build/Release/.deps/Release/obj.target/multihashing/groestl.o.d +8 -0
  133. package/build/Release/.deps/Release/obj.target/multihashing/hefty1.o.d +12 -0
  134. package/build/Release/.deps/Release/obj.target/multihashing/keccak.o.d +8 -0
  135. package/build/Release/.deps/Release/obj.target/multihashing/multihashing.o.d +155 -0
  136. package/build/Release/.deps/Release/obj.target/multihashing/nist5.o.d +12 -0
  137. package/build/Release/.deps/Release/obj.target/multihashing/quark.o.d +14 -0
  138. package/build/Release/.deps/Release/obj.target/multihashing/qubit.o.d +12 -0
  139. package/build/Release/.deps/Release/obj.target/multihashing/scryptjane.o.d +30 -0
  140. package/build/Release/.deps/Release/obj.target/multihashing/scryptn.o.d +6 -0
  141. package/build/Release/.deps/Release/obj.target/multihashing/sha1.o.d +24 -0
  142. package/build/Release/.deps/Release/obj.target/multihashing/sha3/aes_helper.o.d +5 -0
  143. package/build/Release/.deps/Release/obj.target/multihashing/sha3/hamsi.o.d +7 -0
  144. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_blake.o.d +6 -0
  145. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_bmw.o.d +6 -0
  146. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_cubehash.o.d +6 -0
  147. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_echo.o.d +7 -0
  148. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_fugue.o.d +6 -0
  149. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_groestl.o.d +6 -0
  150. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_hefty1.o.d +5 -0
  151. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_jh.o.d +6 -0
  152. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_keccak.o.d +6 -0
  153. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_luffa.o.d +6 -0
  154. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_shabal.o.d +6 -0
  155. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_shavite.o.d +7 -0
  156. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_simd.o.d +6 -0
  157. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_skein.o.d +6 -0
  158. package/build/Release/.deps/Release/obj.target/multihashing/sha3/sph_whirlpool.o.d +8 -0
  159. package/build/Release/.deps/Release/obj.target/multihashing/shavite3.o.d +7 -0
  160. package/build/Release/.deps/Release/obj.target/multihashing/skein.o.d +8 -0
  161. package/build/Release/.deps/Release/obj.target/multihashing/x11.o.d +20 -0
  162. package/build/Release/.deps/Release/obj.target/multihashing/x13.o.d +23 -0
  163. package/build/Release/.deps/Release/obj.target/multihashing/x15.o.d +26 -0
  164. package/build/Release/.deps/Release/obj.target/multihashing.node.d +1 -0
  165. package/build/Release/multihashing.node +0 -0
  166. package/build/binding.Makefile +6 -0
  167. package/build/multihashing.target.mk +255 -0
  168. package/c11.c +85 -0
  169. package/c11.h +17 -0
  170. package/crypto/aesb.c +177 -0
  171. package/crypto/c_blake256.c +326 -0
  172. package/crypto/c_blake256.h +43 -0
  173. package/crypto/c_groestl.c +360 -0
  174. package/crypto/c_groestl.h +56 -0
  175. package/crypto/c_jh.c +367 -0
  176. package/crypto/c_jh.h +20 -0
  177. package/crypto/c_keccak.c +112 -0
  178. package/crypto/c_keccak.h +26 -0
  179. package/crypto/c_skein.c +2036 -0
  180. package/crypto/c_skein.h +45 -0
  181. package/crypto/crypto.h +186 -0
  182. package/crypto/cryptonote_core/account.cpp +50 -0
  183. package/crypto/cryptonote_core/account.h +61 -0
  184. package/crypto/cryptonote_core/cryptonote_basic_impl.cpp +186 -0
  185. package/crypto/cryptonote_core/cryptonote_basic_impl.h +65 -0
  186. package/crypto/cryptonote_core/cryptonote_format_utils.cpp +766 -0
  187. package/crypto/cryptonote_core/cryptonote_format_utils.h +30 -0
  188. package/crypto/cryptonote_protocol/cryptonote_protocol_defs.h +152 -0
  189. package/crypto/groestl_tables.h +38 -0
  190. package/crypto/hash-ops.h +57 -0
  191. package/crypto/hash.c +24 -0
  192. package/crypto/hash.h +22 -0
  193. package/crypto/int-util.h +230 -0
  194. package/crypto/oaes_config.h +50 -0
  195. package/crypto/oaes_lib.c +1468 -0
  196. package/crypto/oaes_lib.h +215 -0
  197. package/crypto/skein_port.h +190 -0
  198. package/crypto/variant2_int_sqrt.h +168 -0
  199. package/crypto/wild_keccak.cpp +119 -0
  200. package/crypto/wild_keccak.h +168 -0
  201. package/cryptonight.c +300 -0
  202. package/cryptonight.h +17 -0
  203. package/cryptonight_dark.c +300 -0
  204. package/cryptonight_dark.h +17 -0
  205. package/cryptonight_dark_lite.c +300 -0
  206. package/cryptonight_dark_lite.h +17 -0
  207. package/cryptonight_fast.c +300 -0
  208. package/cryptonight_fast.h +17 -0
  209. package/cryptonight_lite.c +300 -0
  210. package/cryptonight_lite.h +17 -0
  211. package/cryptonight_soft_shell.c +298 -0
  212. package/cryptonight_soft_shell.h +17 -0
  213. package/cryptonight_turtle.c +300 -0
  214. package/cryptonight_turtle.h +17 -0
  215. package/cryptonight_turtle_lite.c +300 -0
  216. package/cryptonight_turtle_lite.h +17 -0
  217. package/fresh.c +42 -0
  218. package/fresh.h +16 -0
  219. package/fugue.c +12 -0
  220. package/fugue.h +16 -0
  221. package/groestl.c +40 -0
  222. package/groestl.h +17 -0
  223. package/hefty1.c +63 -0
  224. package/hefty1.h +16 -0
  225. package/index.js +1 -0
  226. package/keccak.c +14 -0
  227. package/keccak.h +16 -0
  228. package/leocuvee-wrkzcoin-multi-hashing-0.0.20.tgz +0 -0
  229. package/multihashing.cc +699 -0
  230. package/nist5.c +46 -0
  231. package/nist5.h +16 -0
  232. package/package.json +56 -0
  233. package/quark.c +210 -0
  234. package/quark.h +16 -0
  235. package/qubit.c +45 -0
  236. package/qubit.h +16 -0
  237. package/scryptjane/scrypt-jane-chacha.h +132 -0
  238. package/scryptjane/scrypt-jane-hash.h +48 -0
  239. package/scryptjane/scrypt-jane-hash_keccak.h +168 -0
  240. package/scryptjane/scrypt-jane-hash_sha256.h +135 -0
  241. package/scryptjane/scrypt-jane-mix_chacha-avx.h +340 -0
  242. package/scryptjane/scrypt-jane-mix_chacha-sse2.h +371 -0
  243. package/scryptjane/scrypt-jane-mix_chacha-ssse3.h +348 -0
  244. package/scryptjane/scrypt-jane-mix_chacha.h +69 -0
  245. package/scryptjane/scrypt-jane-mix_salsa-avx.h +381 -0
  246. package/scryptjane/scrypt-jane-mix_salsa-sse2.h +443 -0
  247. package/scryptjane/scrypt-jane-mix_salsa.h +70 -0
  248. package/scryptjane/scrypt-jane-pbkdf2.h +112 -0
  249. package/scryptjane/scrypt-jane-portable-x86.h +364 -0
  250. package/scryptjane/scrypt-jane-portable.h +281 -0
  251. package/scryptjane/scrypt-jane-romix-basic.h +67 -0
  252. package/scryptjane/scrypt-jane-romix-template.h +118 -0
  253. package/scryptjane/scrypt-jane-romix.h +27 -0
  254. package/scryptjane/scrypt-jane-salsa.h +106 -0
  255. package/scryptjane/scrypt-jane-test-vectors.h +261 -0
  256. package/scryptjane.c +223 -0
  257. package/scryptjane.h +36 -0
  258. package/scryptn.c +258 -0
  259. package/scryptn.h +16 -0
  260. package/sha1.c +65 -0
  261. package/sha1.h +16 -0
  262. package/sha256.h +440 -0
  263. package/sha3/aes_helper.c +392 -0
  264. package/sha3/hamsi.c +867 -0
  265. package/sha3/hamsi_helper.c +39648 -0
  266. package/sha3/md_helper.c +347 -0
  267. package/sha3/sph_blake.c +1114 -0
  268. package/sha3/sph_blake.h +327 -0
  269. package/sha3/sph_bmw.c +965 -0
  270. package/sha3/sph_bmw.h +328 -0
  271. package/sha3/sph_cubehash.c +723 -0
  272. package/sha3/sph_cubehash.h +292 -0
  273. package/sha3/sph_echo.c +1031 -0
  274. package/sha3/sph_echo.h +320 -0
  275. package/sha3/sph_fugue.c +1208 -0
  276. package/sha3/sph_fugue.h +81 -0
  277. package/sha3/sph_groestl.c +3119 -0
  278. package/sha3/sph_groestl.h +329 -0
  279. package/sha3/sph_hamsi.h +321 -0
  280. package/sha3/sph_hefty1.c +378 -0
  281. package/sha3/sph_hefty1.h +66 -0
  282. package/sha3/sph_jh.c +1116 -0
  283. package/sha3/sph_jh.h +298 -0
  284. package/sha3/sph_keccak.c +1824 -0
  285. package/sha3/sph_keccak.h +293 -0
  286. package/sha3/sph_luffa.c +1426 -0
  287. package/sha3/sph_luffa.h +296 -0
  288. package/sha3/sph_shabal.c +806 -0
  289. package/sha3/sph_shabal.h +344 -0
  290. package/sha3/sph_shavite.c +1764 -0
  291. package/sha3/sph_shavite.h +314 -0
  292. package/sha3/sph_simd.c +1799 -0
  293. package/sha3/sph_simd.h +309 -0
  294. package/sha3/sph_skein.c +1254 -0
  295. package/sha3/sph_skein.h +298 -0
  296. package/sha3/sph_types.h +1976 -0
  297. package/sha3/sph_whirlpool.c +3480 -0
  298. package/sha3/sph_whirlpool.h +209 -0
  299. package/shavite3.c +24 -0
  300. package/shavite3.h +16 -0
  301. package/skein.c +26 -0
  302. package/skein.h +16 -0
  303. package/stdint.h +259 -0
  304. package/tests/argon2-tests.js +16 -0
  305. package/tests/benchmark.js +36 -0
  306. package/tests/cryptonight-tests.js +189 -0
  307. package/tests/cryptonight_monero.js +53 -0
  308. package/tests/test.js +16 -0
  309. package/x11.c +85 -0
  310. package/x11.h +16 -0
  311. package/x13.c +97 -0
  312. package/x13.h +5 -0
  313. package/x15.c +106 -0
  314. package/x15.h +16 -0
@@ -0,0 +1,1799 @@
1
+ /* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
2
+ /*
3
+ * SIMD implementation.
4
+ *
5
+ * ==========================(LICENSE BEGIN)============================
6
+ *
7
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
8
+ *
9
+ * Permission is hereby granted, free of charge, to any person obtaining
10
+ * a copy of this software and associated documentation files (the
11
+ * "Software"), to deal in the Software without restriction, including
12
+ * without limitation the rights to use, copy, modify, merge, publish,
13
+ * distribute, sublicense, and/or sell copies of the Software, and to
14
+ * permit persons to whom the Software is furnished to do so, subject to
15
+ * the following conditions:
16
+ *
17
+ * The above copyright notice and this permission notice shall be
18
+ * included in all copies or substantial portions of the Software.
19
+ *
20
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+ *
28
+ * ===========================(LICENSE END)=============================
29
+ *
30
+ * @author Thomas Pornin <thomas.pornin@cryptolog.com>
31
+ */
32
+
33
+ #include <stddef.h>
34
+ #include <string.h>
35
+ #include <limits.h>
36
+
37
+ #include "sph_simd.h"
38
+
39
+ #ifdef __cplusplus
40
+ extern "C"{
41
+ #endif
42
+
43
+ #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
44
+ #define SPH_SMALL_FOOTPRINT_SIMD 1 /* the general small-footprint setting turns on the SIMD-specific one unless it was set explicitly */
45
+ #endif
46
+ /* MSVC only: silence warning C4146 (unary minus applied to an unsigned type). */
47
+ #ifdef _MSC_VER
48
+ #pragma warning (disable: 4146)
49
+ #endif
50
+ /* Short local aliases for the sph_* integer types and SPH_* helper macros. */
51
+ typedef sph_u32 u32;
52
+ typedef sph_s32 s32;
53
+ #define C32 SPH_C32
54
+ #define T32 SPH_T32
55
+ #define ROL32 SPH_ROTL32
56
+ /* Two-level concatenation: XCAT macro-expands its arguments first, then XCAT_ pastes them with ##. */
57
+ #define XCAT(x, y) XCAT_(x, y)
58
+ #define XCAT_(x, y) x ## y
59
+
60
+ /*
61
+ * alpha_tab[i] = 41^i mod 257 for exponents i = 0..255, inclusive (256 entries). FFT_LOOP indexes this table to fetch its multipliers.
62
+ */
63
+ static const s32 alpha_tab[] = {
64
+ 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
65
+ 190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
66
+ 120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
67
+ 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
68
+ 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
69
+ 235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
70
+ 189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
71
+ 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
72
+ 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
73
+ 81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
74
+ 227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
75
+ 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
76
+ 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
77
+ 134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
78
+ 17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
79
+ 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
80
+ 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
81
+ 44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
82
+ 136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
83
+ 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
84
+ 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
85
+ 95, 40, 98, 163
86
+ };
87
+
88
+ /*
89
+ * Partial reductions modulo 257: the result is congruent to x but only range-limited, not canonical. Ranges:
90
+ * REDS1: from -32768..98302 to -383..383 (low byte minus the remaining bits, since 256 = -1 mod 257)
91
+ * REDS2: from -2^31..2^31-1 to -32768..98302 (low 16 bits plus the remaining bits, since 65536 = 1 mod 257)
92
+ * Each macro evaluates x twice. For negative x the stated ranges assume >> is an arithmetic shift, which C leaves implementation-defined. */
93
+ #define REDS1(x) (((x) & 0xFF) - ((x) >> 8))
94
+ #define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16))
95
+
96
+ /* FFT_LOOP(rb, hk, as, id): one butterfly pass over q[rb .. rb + 2*hk - 1], where q is taken from the scope of the expansion. Element q[rb + i] (0 <= i < hk) is paired with q[rb + i + hk]; for i > 0 the upper element is multiplied by alpha_tab[i * as] and reduced with REDS2 before the sum and difference are stored. id is the name of the label defined inside the expansion, so it must be unique within the enclosing function.
97
+ * If, upon entry, the values of q[] are all in the -N..N range (where
98
+ * N >= 98302) then the new values of q[] are in the -2N..2N range.
99
+ *
100
+ * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
101
+ */
102
+ #define FFT_LOOP(rb, hk, as, id) do { \
103
+ size_t u, v; \
104
+ s32 m = q[(rb)]; \
105
+ s32 n = q[(rb) + (hk)]; \
106
+ q[(rb)] = m + n; /* pair 0 is combined directly: its multiplier alpha_tab[0] is 1 */ \
107
+ q[(rb) + (hk)] = m - n; \
108
+ u = v = 0; \
109
+ goto id; /* pair 0 is already done, so enter the 4-way unrolled loop body at pair 1 */ \
110
+ for (; u < (hk); u += 4, v += 4 * (as)) { \
111
+ s32 t; \
112
+ m = q[(rb) + u + 0]; \
113
+ n = q[(rb) + u + 0 + (hk)]; \
114
+ t = REDS2(n * alpha_tab[v + 0 * (as)]); \
115
+ q[(rb) + u + 0] = m + t; \
116
+ q[(rb) + u + 0 + (hk)] = m - t; \
117
+ id: \
118
+ m = q[(rb) + u + 1]; \
119
+ n = q[(rb) + u + 1 + (hk)]; \
120
+ t = REDS2(n * alpha_tab[v + 1 * (as)]); \
121
+ q[(rb) + u + 1] = m + t; \
122
+ q[(rb) + u + 1 + (hk)] = m - t; \
123
+ m = q[(rb) + u + 2]; \
124
+ n = q[(rb) + u + 2 + (hk)]; \
125
+ t = REDS2(n * alpha_tab[v + 2 * (as)]); \
126
+ q[(rb) + u + 2] = m + t; \
127
+ q[(rb) + u + 2 + (hk)] = m - t; \
128
+ m = q[(rb) + u + 3]; \
129
+ n = q[(rb) + u + 3 + (hk)]; \
130
+ t = REDS2(n * alpha_tab[v + 3 * (as)]); \
131
+ q[(rb) + u + 3] = m + t; \
132
+ q[(rb) + u + 3 + (hk)] = m - t; \
133
+ } \
134
+ } while (0)
135
+
136
+ /* FFT8(xb, xs, d): reads the four inputs x[xb + k*xs], k = 0..3 (x is taken from the scope of the expansion), and assigns eight results to the variables named d0..d7; d is a name prefix pasted with ##, so the caller must declare those variables. Multiplications by powers of two are written as left shifts, and b1 and b3 are reduced with REDS1.
137
+ * Output ranges:
138
+ * d0: min= 0 max= 1020
139
+ * d1: min= -67 max= 4587
140
+ * d2: min=-4335 max= 4335
141
+ * d3: min=-4147 max= 507
142
+ * d4: min= -510 max= 510
143
+ * d5: min= -252 max= 4402
144
+ * d6: min=-4335 max= 4335
145
+ * d7: min=-4332 max= 322
146
+ */
147
+ #define FFT8(xb, xs, d) do { \
148
+ s32 x0 = x[(xb)]; \
149
+ s32 x1 = x[(xb) + (xs)]; \
150
+ s32 x2 = x[(xb) + 2 * (xs)]; \
151
+ s32 x3 = x[(xb) + 3 * (xs)]; \
152
+ s32 a0 = x0 + x2; \
153
+ s32 a1 = x0 + (x2 << 4); \
154
+ s32 a2 = x0 - x2; \
155
+ s32 a3 = x0 - (x2 << 4); \
156
+ s32 b0 = x1 + x3; \
157
+ s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
158
+ s32 b2 = (x1 << 4) - (x3 << 4); \
159
+ s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
160
+ d ## 0 = a0 + b0; \
161
+ d ## 1 = a1 + b1; \
162
+ d ## 2 = a2 + b2; \
163
+ d ## 3 = a3 + b3; \
164
+ d ## 4 = a0 - b0; \
165
+ d ## 5 = a1 - b1; \
166
+ d ## 6 = a2 - b2; \
167
+ d ## 7 = a3 - b3; \
168
+ } while (0)
169
+
170
+ /* FFT16(xb, xs, rb): runs FFT8 on the inputs starting at xb and at xb + xs (each with step 2*xs) and combines the two result sets into q[rb .. rb + 15].
171
+ * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
172
+ * to some shifting.
173
+ * NOTE(review): d2_i can be negative (see the FFT8 output ranges) and << on a negative signed value is undefined in ISO C; this code presumably expects it to act as multiplication by 2^i - confirm for the target compilers.
174
+ * Output: within -591471..591723
175
+ */
176
+ #define FFT16(xb, xs, rb) do { \
177
+ s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
178
+ s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
179
+ FFT8(xb, (xs) << 1, d1_); \
180
+ FFT8((xb) + (xs), (xs) << 1, d2_); \
181
+ q[(rb) + 0] = d1_0 + d2_0; /* lower half: q[rb + i] = d1_i + (d2_i << i) */ \
182
+ q[(rb) + 1] = d1_1 + (d2_1 << 1); \
183
+ q[(rb) + 2] = d1_2 + (d2_2 << 2); \
184
+ q[(rb) + 3] = d1_3 + (d2_3 << 3); \
185
+ q[(rb) + 4] = d1_4 + (d2_4 << 4); \
186
+ q[(rb) + 5] = d1_5 + (d2_5 << 5); \
187
+ q[(rb) + 6] = d1_6 + (d2_6 << 6); \
188
+ q[(rb) + 7] = d1_7 + (d2_7 << 7); \
189
+ q[(rb) + 8] = d1_0 - d2_0; /* upper half: q[rb + 8 + i] = d1_i - (d2_i << i) */ \
190
+ q[(rb) + 9] = d1_1 - (d2_1 << 1); \
191
+ q[(rb) + 10] = d1_2 - (d2_2 << 2); \
192
+ q[(rb) + 11] = d1_3 - (d2_3 << 3); \
193
+ q[(rb) + 12] = d1_4 - (d2_4 << 4); \
194
+ q[(rb) + 13] = d1_5 - (d2_5 << 5); \
195
+ q[(rb) + 14] = d1_6 - (d2_6 << 6); \
196
+ q[(rb) + 15] = d1_7 - (d2_7 << 7); \
197
+ } while (0)
198
+
199
+ /* FFT32(xb, xs, rb, id): FFT16(xb, 2*xs, rb) and FFT16(xb + xs, 2*xs, rb + 16) fill q[rb .. rb + 31], then FFT_LOOP with half-size 16 and alpha_tab step 8 merges the two halves in place. id is passed through as that FFT_LOOP's label name.
200
+ * Output range: |q| <= 1183446
201
+ */
202
+ #define FFT32(xb, xs, rb, id) do { \
203
+ FFT16(xb, (xs) << 1, rb); \
204
+ FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
205
+ FFT_LOOP(rb, 16, 8, id); \
206
+ } while (0)
207
+
208
+ /* FFT64(xb, xs, rb, id): FFT32(xb, 2*xs, rb) and FFT32(xb + xs, 2*xs, rb + 32) fill q[rb .. rb + 63], then FFT_LOOP with half-size 32 and alpha_tab step 4 merges them. The nested FFT32 expansions get the labels id##a and id##b (built with XCAT) so that all labels in the expansion are distinct.
209
+ * Output range: |q| <= 2366892
210
+ */
211
+ #define FFT64(xb, xs, rb, id) do { \
212
+ FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
213
+ FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
214
+ FFT_LOOP(rb, 32, 4, id); \
215
+ } while (0)
216
+
217
+ #if SPH_SMALL_FOOTPRINT_SIMD
218
+
219
+ static void
220
+ fft32(unsigned char *x, size_t xs, s32 *q)
221
+ {
222
+ size_t xd;
223
+ /* Function form of the 32-point transform (small-footprint build only): FFT16 from base 0 and from base xs, both with step xd = 2*xs, fills q[0..15] and q[16..31]; FFT_LOOP then merges the two halves in place. The parameters must be named x and q because the FFT macros refer to those names. */
224
+ xd = xs << 1;
225
+ FFT16(0, xd, 0);
226
+ FFT16(xs, xd, 16);
227
+ FFT_LOOP(0, 16, 8, label_);
228
+ }
229
+
230
+ #define FFT128(xb, xs, rb, id) do { /* small-footprint variant: four fft32() calls on inputs starting at xb + 0, 2, 1 and 3 times xs (step 4*xs) fill q[rb .. rb + 127]; the 32-entry results are merged pairwise, then the two 64-entry halves are merged */ \
231
+ fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
232
+ fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
233
+ FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
234
+ fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
235
+ fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
236
+ FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
237
+ FFT_LOOP(rb, 64, 2, XCAT(id, a)); /* labels id##aa, id##ab and id##a keep the three FFT_LOOP expansions distinct */ \
238
+ } while (0)
239
+
240
+ #else
241
+
242
+ /* FFT128(xb, xs, rb, id), fully macro-expanded variant: FFT64(xb, 2*xs, rb) and FFT64(xb + xs, 2*xs, rb + 64) fill q[rb .. rb + 127], then FFT_LOOP with half-size 64 and alpha_tab step 2 merges them. Nested labels are derived from id with XCAT so they do not collide.
243
+ * Output range: |q| <= 4733784
244
+ */
245
+ #define FFT128(xb, xs, rb, id) do { \
246
+ FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
247
+ FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
248
+ FFT_LOOP(rb, 64, 2, id); \
249
+ } while (0)
250
+
251
+ #endif
252
+
253
+ /*
254
+ * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
255
+ * function which does not fit in the 32 kB L1 cache of a typical Intel
256
+ * x86 CPU. We therefore add a function call layer at the FFT64 level.
257
+ */
258
+
259
+ static void
260
+ fft64(unsigned char *x, size_t xs, s32 *q)
261
+ {
262
+ size_t xd;
263
+ /* FFT32 from base 0 and from base xs, both with step xd = 2*xs, fills q[0..31] and q[32..63]; FFT_LOOP then merges the two halves in place. The parameters must be named x and q because the FFT macros refer to those names. */
264
+ xd = xs << 1;
265
+ FFT32(0, xd, 0, label_a);
266
+ FFT32(xs, xd, 32, label_b);
267
+ FFT_LOOP(0, 32, 4, label_);
268
+ }
269
+
270
+ /* FFT256(xb, xs, rb, id): four fft64() calls on inputs starting at xb + 0, 2, 1 and 3 times xs (step 4*xs) fill q[rb .. rb + 255]; the 64-entry results are merged pairwise with alpha_tab step 2, then the two 128-entry halves are merged with step 1. Labels id##aa, id##ab and id##a keep the three FFT_LOOP expansions distinct.
271
+ * Output range: |q| <= 9467568
272
+ */
273
+ #define FFT256(xb, xs, rb, id) do { \
274
+ fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
275
+ fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \
276
+ FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
277
+ fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
278
+ fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
279
+ FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
280
+ FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
281
+ } while (0)
282
+
283
+ /*
284
+ * yoff_s_n[i] = alpha^(127*i) mod 257 for i = 0..127 (128 entries), with alpha = 41^2 = 139; entry i therefore equals alpha_tab[(254*i) mod 256].
285
+ */
286
+ static const unsigned short yoff_s_n[] = {
287
+ 1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
288
+ 15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
289
+ 225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
290
+ 34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
291
+ 253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
292
+ 197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
293
+ 128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
294
+ 121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
295
+ 16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
296
+ 240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
297
+ 2, 196, 190, 116, 60, 226, 46, 139
298
+ };
299
+
300
+ /*
301
+ * yoff_s_f[i] = (alpha^(127*i) + alpha^(125*i)) mod 257 for i = 0..127 (128 entries), with alpha = 41^2 = 139.
302
+ */
303
+ static const unsigned short yoff_s_f[] = {
304
+ 2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
305
+ 49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
306
+ 96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
307
+ 17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
308
+ 189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
309
+ 77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
310
+ 160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
311
+ 181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
312
+ 0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
313
+ 210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
314
+ 10, 146, 117, 251, 180, 247, 236, 108
315
+ };
316
+
317
+ /*
318
+ * yoff_b_n[i] = beta^(255*i) mod 257 for i = 0..255 (256 entries), with beta = 41; entry i therefore equals alpha_tab[(255*i) mod 256].
319
+ */
320
+ static const unsigned short yoff_b_n[] = {
321
+ 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
322
+ 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
323
+ 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
324
+ 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
325
+ 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
326
+ 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
327
+ 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
328
+ 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
329
+ 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
330
+ 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
331
+ 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
332
+ 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
333
+ 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
334
+ 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
335
+ 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
336
+ 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
337
+ 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
338
+ 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
339
+ 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
340
+ 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
341
+ 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
342
+ 46, 45, 139, 41
343
+ };
344
+
345
+ /*
346
+ * yoff_b_f[i] = (beta^(255*i) + beta^(253*i)) mod 257 for i = 0..255 (256 entries), with beta = 41.
347
+ */
348
+ static const unsigned short yoff_b_f[] = {
349
+ 2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
350
+ 111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
351
+ 49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
352
+ 253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
353
+ 96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
354
+ 248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
355
+ 17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
356
+ 57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
357
+ 189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
358
+ 187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
359
+ 77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
360
+ 139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
361
+ 160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
362
+ 106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
363
+ 181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
364
+ 96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
365
+ 0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
366
+ 245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
367
+ 210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
368
+ 53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
369
+ 10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
370
+ 236, 192, 108, 86
371
+ };
372
+
373
+ #define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) /* low half of the packed word: the low 16 bits of l*mm */ \
374
+ + ((u32)((h) * (mm)) << 16)) /* high half: h*mm converted to u32 and shifted up by 16 bits */
375
+
376
+ #define W_SMALL(sb, o1, o2, mm) /* four INNER words from the 8-entry sub-block of q starting at 8*sb: word k packs q[8*sb + 2*k + o1] (low half) with q[8*sb + 2*k + o2] (high half), both multiplied by mm. NOTE(review): the expansion opens a parenthesis that it never closes; presumably the use site, outside this view, supplies the remaining arguments and the closing parenthesis - confirm. */ \
377
+ (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
378
+ INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
379
+ INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
380
+ INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
381
+
382
+ #define WS_0_0 W_SMALL( 4, 0, 1, 185)
383
+ #define WS_0_1 W_SMALL( 6, 0, 1, 185)
384
+ #define WS_0_2 W_SMALL( 0, 0, 1, 185)
385
+ #define WS_0_3 W_SMALL( 2, 0, 1, 185)
386
+ #define WS_0_4 W_SMALL( 7, 0, 1, 185)
387
+ #define WS_0_5 W_SMALL( 5, 0, 1, 185)
388
+ #define WS_0_6 W_SMALL( 3, 0, 1, 185)
389
+ #define WS_0_7 W_SMALL( 1, 0, 1, 185)
390
+ #define WS_1_0 W_SMALL(15, 0, 1, 185)
391
+ #define WS_1_1 W_SMALL(11, 0, 1, 185)
392
+ #define WS_1_2 W_SMALL(12, 0, 1, 185)
393
+ #define WS_1_3 W_SMALL( 8, 0, 1, 185)
394
+ #define WS_1_4 W_SMALL( 9, 0, 1, 185)
395
+ #define WS_1_5 W_SMALL(13, 0, 1, 185)
396
+ #define WS_1_6 W_SMALL(10, 0, 1, 185)
397
+ #define WS_1_7 W_SMALL(14, 0, 1, 185)
398
+ #define WS_2_0 W_SMALL(17, -128, -64, 233)
399
+ #define WS_2_1 W_SMALL(18, -128, -64, 233)
400
+ #define WS_2_2 W_SMALL(23, -128, -64, 233)
401
+ #define WS_2_3 W_SMALL(20, -128, -64, 233)
402
+ #define WS_2_4 W_SMALL(22, -128, -64, 233)
403
+ #define WS_2_5 W_SMALL(21, -128, -64, 233)
404
+ #define WS_2_6 W_SMALL(16, -128, -64, 233)
405
+ #define WS_2_7 W_SMALL(19, -128, -64, 233)
406
+ #define WS_3_0 W_SMALL(30, -191, -127, 233)
407
+ #define WS_3_1 W_SMALL(24, -191, -127, 233)
408
+ #define WS_3_2 W_SMALL(25, -191, -127, 233)
409
+ #define WS_3_3 W_SMALL(31, -191, -127, 233)
410
+ #define WS_3_4 W_SMALL(27, -191, -127, 233)
411
+ #define WS_3_5 W_SMALL(29, -191, -127, 233)
412
+ #define WS_3_6 W_SMALL(28, -191, -127, 233)
413
+ #define WS_3_7 W_SMALL(26, -191, -127, 233)
414
+ /* W_BIG(sb, o1, o2, mm): expands to '(' followed by eight INNER() words read from q[16 * sb ...]; the closing ')' is deliberately missing and is supplied by STEP_BIG_(). */
415
+ #define W_BIG(sb, o1, o2, mm) \
416
+ (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
417
+ INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
418
+ INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
419
+ INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
420
+ INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
421
+ INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
422
+ INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
423
+ INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
424
+ /* WB_r_k: word group used by step k of round r (pasted together by ONE_ROUND_BIG); arguments are the q[] slice number, the two index offsets and the INNER() multiplier. */
425
+ #define WB_0_0 W_BIG( 4, 0, 1, 185)
426
+ #define WB_0_1 W_BIG( 6, 0, 1, 185)
427
+ #define WB_0_2 W_BIG( 0, 0, 1, 185)
428
+ #define WB_0_3 W_BIG( 2, 0, 1, 185)
429
+ #define WB_0_4 W_BIG( 7, 0, 1, 185)
430
+ #define WB_0_5 W_BIG( 5, 0, 1, 185)
431
+ #define WB_0_6 W_BIG( 3, 0, 1, 185)
432
+ #define WB_0_7 W_BIG( 1, 0, 1, 185)
433
+ #define WB_1_0 W_BIG(15, 0, 1, 185)
434
+ #define WB_1_1 W_BIG(11, 0, 1, 185)
435
+ #define WB_1_2 W_BIG(12, 0, 1, 185)
436
+ #define WB_1_3 W_BIG( 8, 0, 1, 185)
437
+ #define WB_1_4 W_BIG( 9, 0, 1, 185)
438
+ #define WB_1_5 W_BIG(13, 0, 1, 185)
439
+ #define WB_1_6 W_BIG(10, 0, 1, 185)
440
+ #define WB_1_7 W_BIG(14, 0, 1, 185)
441
+ #define WB_2_0 W_BIG(17, -256, -128, 233)
442
+ #define WB_2_1 W_BIG(18, -256, -128, 233)
443
+ #define WB_2_2 W_BIG(23, -256, -128, 233)
444
+ #define WB_2_3 W_BIG(20, -256, -128, 233)
445
+ #define WB_2_4 W_BIG(22, -256, -128, 233)
446
+ #define WB_2_5 W_BIG(21, -256, -128, 233)
447
+ #define WB_2_6 W_BIG(16, -256, -128, 233)
448
+ #define WB_2_7 W_BIG(19, -256, -128, 233)
449
+ #define WB_3_0 W_BIG(30, -383, -255, 233)
450
+ #define WB_3_1 W_BIG(24, -383, -255, 233)
451
+ #define WB_3_2 W_BIG(25, -383, -255, 233)
452
+ #define WB_3_3 W_BIG(31, -383, -255, 233)
453
+ #define WB_3_4 W_BIG(27, -383, -255, 233)
454
+ #define WB_3_5 W_BIG(29, -383, -255, 233)
455
+ #define WB_3_6 W_BIG(28, -383, -255, 233)
456
+ #define WB_3_7 W_BIG(26, -383, -255, 233)
457
+ /* Bitwise step functions: IF takes each bit from y where x has a 1 and from z where x has a 0; MAJ yields the per-bit majority of x, y and z. */
458
+ #define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
459
+ #define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
460
+ /* Lane permutations pasted onto "tA" by STEP_ELT: PP4_k_j equals j XOR c with c = 1, 2, 3 for k = 0, 1, 2. */
461
+ #define PP4_0_0 1
462
+ #define PP4_0_1 0
463
+ #define PP4_0_2 3
464
+ #define PP4_0_3 2
465
+ #define PP4_1_0 2
466
+ #define PP4_1_1 3
467
+ #define PP4_1_2 0
468
+ #define PP4_1_3 1
469
+ #define PP4_2_0 3
470
+ #define PP4_2_1 2
471
+ #define PP4_2_2 1
472
+ #define PP4_2_3 0
473
+ /* PP8_k_j equals j XOR c with c = 1, 6, 2, 3, 5, 7, 4 for k = 0 .. 6. */
474
+ #define PP8_0_0 1
475
+ #define PP8_0_1 0
476
+ #define PP8_0_2 3
477
+ #define PP8_0_3 2
478
+ #define PP8_0_4 5
479
+ #define PP8_0_5 4
480
+ #define PP8_0_6 7
481
+ #define PP8_0_7 6
482
+
483
+ #define PP8_1_0 6
484
+ #define PP8_1_1 7
485
+ #define PP8_1_2 4
486
+ #define PP8_1_3 5
487
+ #define PP8_1_4 2
488
+ #define PP8_1_5 3
489
+ #define PP8_1_6 0
490
+ #define PP8_1_7 1
491
+
492
+ #define PP8_2_0 2
493
+ #define PP8_2_1 3
494
+ #define PP8_2_2 0
495
+ #define PP8_2_3 1
496
+ #define PP8_2_4 6
497
+ #define PP8_2_5 7
498
+ #define PP8_2_6 4
499
+ #define PP8_2_7 5
500
+
501
+ #define PP8_3_0 3
502
+ #define PP8_3_1 2
503
+ #define PP8_3_2 1
504
+ #define PP8_3_3 0
505
+ #define PP8_3_4 7
506
+ #define PP8_3_5 6
507
+ #define PP8_3_6 5
508
+ #define PP8_3_7 4
509
+
510
+ #define PP8_4_0 5
511
+ #define PP8_4_1 4
512
+ #define PP8_4_2 7
513
+ #define PP8_4_3 6
514
+ #define PP8_4_4 1
515
+ #define PP8_4_5 0
516
+ #define PP8_4_6 3
517
+ #define PP8_4_7 2
518
+
519
+ #define PP8_5_0 7
520
+ #define PP8_5_1 6
521
+ #define PP8_5_2 5
522
+ #define PP8_5_3 4
523
+ #define PP8_5_4 3
524
+ #define PP8_5_5 2
525
+ #define PP8_5_6 1
526
+ #define PP8_5_7 0
527
+
528
+ #define PP8_6_0 4
529
+ #define PP8_6_1 5
530
+ #define PP8_6_2 6
531
+ #define PP8_6_3 7
532
+ #define PP8_6_4 0
533
+ #define PP8_6_5 1
534
+ #define PP8_6_6 2
535
+ #define PP8_6_7 3
536
+ /* State access helpers used by the unrolled compress functions: DECL_* declares the working words, READ_* loads them from (sc)->state[], WRITE_* stores them back. */
537
+ #if SPH_SIMD_NOCOPY
538
+ /* With SPH_SIMD_NOCOPY all six helpers expand to nothing; the compress functions then #define A0.. directly onto sc->state[]. */
539
+ #define DECL_STATE_SMALL
540
+ #define READ_STATE_SMALL(sc)
541
+ #define WRITE_STATE_SMALL(sc)
542
+ #define DECL_STATE_BIG
543
+ #define READ_STATE_BIG(sc)
544
+ #define WRITE_STATE_BIG(sc)
545
+
546
+ #else
547
+ /* Otherwise the state lives in local u32 variables: 16 words (A0..D3) for the small variant, 32 words (A0..D7) for the big one. */
548
+ #define DECL_STATE_SMALL \
549
+ u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
550
+
551
+ #define READ_STATE_SMALL(sc) do { \
552
+ A0 = (sc)->state[ 0]; \
553
+ A1 = (sc)->state[ 1]; \
554
+ A2 = (sc)->state[ 2]; \
555
+ A3 = (sc)->state[ 3]; \
556
+ B0 = (sc)->state[ 4]; \
557
+ B1 = (sc)->state[ 5]; \
558
+ B2 = (sc)->state[ 6]; \
559
+ B3 = (sc)->state[ 7]; \
560
+ C0 = (sc)->state[ 8]; \
561
+ C1 = (sc)->state[ 9]; \
562
+ C2 = (sc)->state[10]; \
563
+ C3 = (sc)->state[11]; \
564
+ D0 = (sc)->state[12]; \
565
+ D1 = (sc)->state[13]; \
566
+ D2 = (sc)->state[14]; \
567
+ D3 = (sc)->state[15]; \
568
+ } while (0)
569
+
570
+ #define WRITE_STATE_SMALL(sc) do { \
571
+ (sc)->state[ 0] = A0; \
572
+ (sc)->state[ 1] = A1; \
573
+ (sc)->state[ 2] = A2; \
574
+ (sc)->state[ 3] = A3; \
575
+ (sc)->state[ 4] = B0; \
576
+ (sc)->state[ 5] = B1; \
577
+ (sc)->state[ 6] = B2; \
578
+ (sc)->state[ 7] = B3; \
579
+ (sc)->state[ 8] = C0; \
580
+ (sc)->state[ 9] = C1; \
581
+ (sc)->state[10] = C2; \
582
+ (sc)->state[11] = C3; \
583
+ (sc)->state[12] = D0; \
584
+ (sc)->state[13] = D1; \
585
+ (sc)->state[14] = D2; \
586
+ (sc)->state[15] = D3; \
587
+ } while (0)
588
+
589
+ #define DECL_STATE_BIG \
590
+ u32 A0, A1, A2, A3, A4, A5, A6, A7; \
591
+ u32 B0, B1, B2, B3, B4, B5, B6, B7; \
592
+ u32 C0, C1, C2, C3, C4, C5, C6, C7; \
593
+ u32 D0, D1, D2, D3, D4, D5, D6, D7;
594
+
595
+ #define READ_STATE_BIG(sc) do { \
596
+ A0 = (sc)->state[ 0]; \
597
+ A1 = (sc)->state[ 1]; \
598
+ A2 = (sc)->state[ 2]; \
599
+ A3 = (sc)->state[ 3]; \
600
+ A4 = (sc)->state[ 4]; \
601
+ A5 = (sc)->state[ 5]; \
602
+ A6 = (sc)->state[ 6]; \
603
+ A7 = (sc)->state[ 7]; \
604
+ B0 = (sc)->state[ 8]; \
605
+ B1 = (sc)->state[ 9]; \
606
+ B2 = (sc)->state[10]; \
607
+ B3 = (sc)->state[11]; \
608
+ B4 = (sc)->state[12]; \
609
+ B5 = (sc)->state[13]; \
610
+ B6 = (sc)->state[14]; \
611
+ B7 = (sc)->state[15]; \
612
+ C0 = (sc)->state[16]; \
613
+ C1 = (sc)->state[17]; \
614
+ C2 = (sc)->state[18]; \
615
+ C3 = (sc)->state[19]; \
616
+ C4 = (sc)->state[20]; \
617
+ C5 = (sc)->state[21]; \
618
+ C6 = (sc)->state[22]; \
619
+ C7 = (sc)->state[23]; \
620
+ D0 = (sc)->state[24]; \
621
+ D1 = (sc)->state[25]; \
622
+ D2 = (sc)->state[26]; \
623
+ D3 = (sc)->state[27]; \
624
+ D4 = (sc)->state[28]; \
625
+ D5 = (sc)->state[29]; \
626
+ D6 = (sc)->state[30]; \
627
+ D7 = (sc)->state[31]; \
628
+ } while (0)
629
+
630
+ #define WRITE_STATE_BIG(sc) do { \
631
+ (sc)->state[ 0] = A0; \
632
+ (sc)->state[ 1] = A1; \
633
+ (sc)->state[ 2] = A2; \
634
+ (sc)->state[ 3] = A3; \
635
+ (sc)->state[ 4] = A4; \
636
+ (sc)->state[ 5] = A5; \
637
+ (sc)->state[ 6] = A6; \
638
+ (sc)->state[ 7] = A7; \
639
+ (sc)->state[ 8] = B0; \
640
+ (sc)->state[ 9] = B1; \
641
+ (sc)->state[10] = B2; \
642
+ (sc)->state[11] = B3; \
643
+ (sc)->state[12] = B4; \
644
+ (sc)->state[13] = B5; \
645
+ (sc)->state[14] = B6; \
646
+ (sc)->state[15] = B7; \
647
+ (sc)->state[16] = C0; \
648
+ (sc)->state[17] = C1; \
649
+ (sc)->state[18] = C2; \
650
+ (sc)->state[19] = C3; \
651
+ (sc)->state[20] = C4; \
652
+ (sc)->state[21] = C5; \
653
+ (sc)->state[22] = C6; \
654
+ (sc)->state[23] = C7; \
655
+ (sc)->state[24] = D0; \
656
+ (sc)->state[25] = D1; \
657
+ (sc)->state[26] = D2; \
658
+ (sc)->state[27] = D3; \
659
+ (sc)->state[28] = D4; \
660
+ (sc)->state[29] = D5; \
661
+ (sc)->state[30] = D6; \
662
+ (sc)->state[31] = D7; \
663
+ } while (0)
664
+
665
+ #endif
666
+ /* STEP_ELT(n, w, fun, s, ppb): updates lane n. tt = D + w + fun(A, B, C); new A = ROL32(tt, s) + tA<ppb n> (variable name built by token pasting); then D <- C, C <- B, B <- tA<n>. Requires tA0.. to be declared by the enclosing STEP_SMALL / STEP_BIG. */
667
+ #define STEP_ELT(n, w, fun, s, ppb) do { \
668
+ u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
669
+ A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
670
+ D ## n = C ## n; \
671
+ C ## n = B ## n; \
672
+ B ## n = tA ## n; \
673
+ } while (0)
674
+ /* STEP_SMALL: one step over four lanes. Captures tAk = ROL32(Ak, r) for all lanes first, so every STEP_ELT sees the pre-step A values; pp4b is a PP4_k_ prefix choosing the lane permutation. */
675
+ #define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
676
+ u32 tA0 = ROL32(A0, r); \
677
+ u32 tA1 = ROL32(A1, r); \
678
+ u32 tA2 = ROL32(A2, r); \
679
+ u32 tA3 = ROL32(A3, r); \
680
+ STEP_ELT(0, w0, fun, s, pp4b); \
681
+ STEP_ELT(1, w1, fun, s, pp4b); \
682
+ STEP_ELT(2, w2, fun, s, pp4b); \
683
+ STEP_ELT(3, w3, fun, s, pp4b); \
684
+ } while (0)
685
+ /* STEP_BIG: same as STEP_SMALL but over eight lanes; pp8b is a PP8_k_ prefix. */
686
+ #define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
687
+ u32 tA0 = ROL32(A0, r); \
688
+ u32 tA1 = ROL32(A1, r); \
689
+ u32 tA2 = ROL32(A2, r); \
690
+ u32 tA3 = ROL32(A3, r); \
691
+ u32 tA4 = ROL32(A4, r); \
692
+ u32 tA5 = ROL32(A5, r); \
693
+ u32 tA6 = ROL32(A6, r); \
694
+ u32 tA7 = ROL32(A7, r); \
695
+ STEP_ELT(0, w0, fun, s, pp8b); \
696
+ STEP_ELT(1, w1, fun, s, pp8b); \
697
+ STEP_ELT(2, w2, fun, s, pp8b); \
698
+ STEP_ELT(3, w3, fun, s, pp8b); \
699
+ STEP_ELT(4, w4, fun, s, pp8b); \
700
+ STEP_ELT(5, w5, fun, s, pp8b); \
701
+ STEP_ELT(6, w6, fun, s, pp8b); \
702
+ STEP_ELT(7, w7, fun, s, pp8b); \
703
+ } while (0)
704
+ /* M3_k_i is ((k + i) mod 3) followed by '_': it selects which PP4_ permutation step k uses when the round's starting index is i. */
705
+ #define M3_0_0 0_
706
+ #define M3_1_0 1_
707
+ #define M3_2_0 2_
708
+ #define M3_3_0 0_
709
+ #define M3_4_0 1_
710
+ #define M3_5_0 2_
711
+ #define M3_6_0 0_
712
+ #define M3_7_0 1_
713
+
714
+ #define M3_0_1 1_
715
+ #define M3_1_1 2_
716
+ #define M3_2_1 0_
717
+ #define M3_3_1 1_
718
+ #define M3_4_1 2_
719
+ #define M3_5_1 0_
720
+ #define M3_6_1 1_
721
+ #define M3_7_1 2_
722
+
723
+ #define M3_0_2 2_
724
+ #define M3_1_2 0_
725
+ #define M3_2_2 1_
726
+ #define M3_3_2 2_
727
+ #define M3_4_2 0_
728
+ #define M3_5_2 1_
729
+ #define M3_6_2 2_
730
+ #define M3_7_2 0_
731
+ /* STEP_SMALL_: 'w' is a WS_r_k macro whose expansion begins with '(' and is left unclosed; the ')' written here completes the STEP_SMALL(...) call. */
732
+ #define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b)
733
+ /* ONE_ROUND_SMALL(ri, isp, p0..p3): eight steps on word groups WS_<ri>0 .. WS_<ri>7 -- four with IF then four with MAJ -- cycling the rotation pairs (p0,p1), (p1,p2), (p2,p3), (p3,p0). 'ri' must carry its trailing '_' (e.g. 0_). */
734
+ #define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \
735
+ STEP_SMALL_(WS_ ## ri ## 0, \
736
+ IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
737
+ STEP_SMALL_(WS_ ## ri ## 1, \
738
+ IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
739
+ STEP_SMALL_(WS_ ## ri ## 2, \
740
+ IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
741
+ STEP_SMALL_(WS_ ## ri ## 3, \
742
+ IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
743
+ STEP_SMALL_(WS_ ## ri ## 4, \
744
+ MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
745
+ STEP_SMALL_(WS_ ## ri ## 5, \
746
+ MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
747
+ STEP_SMALL_(WS_ ## ri ## 6, \
748
+ MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
749
+ STEP_SMALL_(WS_ ## ri ## 7, \
750
+ MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
751
+ } while (0)
752
+ /* M7_k_i is ((k + i) mod 7) followed by '_': it selects which PP8_ permutation step k uses when the round's starting index is i. */
753
+ #define M7_0_0 0_
754
+ #define M7_1_0 1_
755
+ #define M7_2_0 2_
756
+ #define M7_3_0 3_
757
+ #define M7_4_0 4_
758
+ #define M7_5_0 5_
759
+ #define M7_6_0 6_
760
+ #define M7_7_0 0_
761
+
762
+ #define M7_0_1 1_
763
+ #define M7_1_1 2_
764
+ #define M7_2_1 3_
765
+ #define M7_3_1 4_
766
+ #define M7_4_1 5_
767
+ #define M7_5_1 6_
768
+ #define M7_6_1 0_
769
+ #define M7_7_1 1_
770
+
771
+ #define M7_0_2 2_
772
+ #define M7_1_2 3_
773
+ #define M7_2_2 4_
774
+ #define M7_3_2 5_
775
+ #define M7_4_2 6_
776
+ #define M7_5_2 0_
777
+ #define M7_6_2 1_
778
+ #define M7_7_2 2_
779
+
780
+ #define M7_0_3 3_
781
+ #define M7_1_3 4_
782
+ #define M7_2_3 5_
783
+ #define M7_3_3 6_
784
+ #define M7_4_3 0_
785
+ #define M7_5_3 1_
786
+ #define M7_6_3 2_
787
+ #define M7_7_3 3_
788
+ /* STEP_BIG_: 'w' is a WB_r_k macro whose expansion begins with '(' and is left unclosed; the ')' written here completes the STEP_BIG(...) call. */
789
+ #define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)
790
+ /* ONE_ROUND_BIG(ri, isp, p0..p3): eight steps on word groups WB_<ri>0 .. WB_<ri>7 -- four with IF then four with MAJ -- cycling the rotation pairs (p0,p1), (p1,p2), (p2,p3), (p3,p0). 'ri' must carry its trailing '_' (e.g. 0_). */
791
+ #define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
792
+ STEP_BIG_(WB_ ## ri ## 0, \
793
+ IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
794
+ STEP_BIG_(WB_ ## ri ## 1, \
795
+ IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
796
+ STEP_BIG_(WB_ ## ri ## 2, \
797
+ IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
798
+ STEP_BIG_(WB_ ## ri ## 3, \
799
+ IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
800
+ STEP_BIG_(WB_ ## ri ## 4, \
801
+ MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
802
+ STEP_BIG_(WB_ ## ri ## 5, \
803
+ MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
804
+ STEP_BIG_(WB_ ## ri ## 6, \
805
+ MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
806
+ STEP_BIG_(WB_ ## ri ## 7, \
807
+ MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
808
+ } while (0)
809
+
810
+ #if SPH_SMALL_FOOTPRINT_SIMD
811
+ /* Small-footprint build: A0..D3 alias the 16 entries of an array named 'state' that must be in scope wherever the step macros are expanded. */
812
+ #define A0 state[ 0]
813
+ #define A1 state[ 1]
814
+ #define A2 state[ 2]
815
+ #define A3 state[ 3]
816
+ #define B0 state[ 4]
817
+ #define B1 state[ 5]
818
+ #define B2 state[ 6]
819
+ #define B3 state[ 7]
820
+ #define C0 state[ 8]
821
+ #define C1 state[ 9]
822
+ #define C2 state[10]
823
+ #define C3 state[11]
824
+ #define D0 state[12]
825
+ #define D1 state[13]
826
+ #define D2 state[14]
827
+ #define D3 state[15]
828
+ /* STEP2_ELT: same lane update as STEP_ELT, but the permuted partner is looked up at run time as tA[ppb ^ n] instead of being selected by token pasting. */
829
+ #define STEP2_ELT(n, w, fun, s, ppb) do { \
830
+ u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
831
+ A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
832
+ D ## n = C ## n; \
833
+ C ## n = B ## n; \
834
+ B ## n = tA[n]; \
835
+ } while (0)
836
+ /* STEP2_SMALL: one four-lane step; tA[] holds the A words rotated by r before any lane is updated, pp4b is the XOR mask applied to the lane index. */
837
+ #define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
838
+ u32 tA[4]; \
839
+ tA[0] = ROL32(A0, r); \
840
+ tA[1] = ROL32(A1, r); \
841
+ tA[2] = ROL32(A2, r); \
842
+ tA[3] = ROL32(A3, r); \
843
+ STEP2_ELT(0, w0, fun, s, pp4b); \
844
+ STEP2_ELT(1, w1, fun, s, pp4b); \
845
+ STEP2_ELT(2, w2, fun, s, pp4b); \
846
+ STEP2_ELT(3, w3, fun, s, pp4b); \
847
+ } while (0)
848
+ /* one_round_small: runs eight STEP2_SMALL steps on state[0..15] using w[0..31] (four words per step): four IF steps then four MAJ steps, with rotation pairs cycling through (p0,p1), (p1,p2), (p2,p3), (p3,p0). isp is the starting offset into pp4k[]; callers in this file pass 0, 1 or 2. */
849
+ static void
850
+ one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
851
+ {
852
+ static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
853
+ /* pp4k[] repeats the XOR masks 1, 2, 3 so that pp4k[isp + k] needs no modulo. */
854
+ STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
855
+ STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
856
+ STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
857
+ STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
858
+ STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
859
+ STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
860
+ STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
861
+ STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
862
+ }
863
+ /* compress_small (small-footprint variant): processes the block held in sc->buf and replaces sc->state[0..15] with the result. 'last' selects the offset table added to q[]: yoff_s_f when nonzero, yoff_s_n otherwise. */
864
+ static void
865
+ compress_small(sph_simd_small_context *sc, int last)
866
+ {
867
+ unsigned char *x;
868
+ s32 q[128];
869
+ int i;
870
+ u32 w[32];
871
+ u32 state[16];
872
+ size_t u;
873
+ /* wsp[]: start index into q[] (slice number << 3) of the word group consumed by each of the 32 steps. */
874
+ static const size_t wsp[32] = {
875
+ 4 << 3, 6 << 3, 0 << 3, 2 << 3,
876
+ 7 << 3, 5 << 3, 3 << 3, 1 << 3,
877
+ 15 << 3, 11 << 3, 12 << 3, 8 << 3,
878
+ 9 << 3, 13 << 3, 10 << 3, 14 << 3,
879
+ 17 << 3, 18 << 3, 23 << 3, 20 << 3,
880
+ 22 << 3, 21 << 3, 16 << 3, 19 << 3,
881
+ 30 << 3, 24 << 3, 25 << 3, 31 << 3,
882
+ 27 << 3, 29 << 3, 28 << 3, 26 << 3
883
+ };
884
+ /* FFT128 is a macro defined elsewhere in this file; presumably it fills q[] from the bytes at x -- TODO confirm. */
885
+ x = sc->buf;
886
+ FFT128(0, 1, 0, ll);
887
+ if (last) {
888
+ for (i = 0; i < 128; i ++) {
889
+ s32 tq;
890
+ /* Add the offset, reduce with REDS2/REDS1 (defined elsewhere), then map values above 128 to tq - 257. */
891
+ tq = q[i] + yoff_s_f[i];
892
+ tq = REDS2(tq);
893
+ tq = REDS1(tq);
894
+ tq = REDS1(tq);
895
+ q[i] = (tq <= 128 ? tq : tq - 257);
896
+ }
897
+ } else {
898
+ for (i = 0; i < 128; i ++) {
899
+ s32 tq;
900
+ /* Same reduction as above, with the yoff_s_n offsets. */
901
+ tq = q[i] + yoff_s_n[i];
902
+ tq = REDS2(tq);
903
+ tq = REDS1(tq);
904
+ tq = REDS1(tq);
905
+ q[i] = (tq <= 128 ? tq : tq - 257);
906
+ }
907
+ }
908
+ /* Working copy: state[k] = sc->state[k] XOR word k decoded from x by sph_dec32le_aligned(). sc->state itself is left untouched until the end. */
909
+ for (i = 0; i < 16; i += 4) {
910
+ state[i + 0] = sc->state[i + 0]
911
+ ^ sph_dec32le_aligned(x + 4 * (i + 0));
912
+ state[i + 1] = sc->state[i + 1]
913
+ ^ sph_dec32le_aligned(x + 4 * (i + 1));
914
+ state[i + 2] = sc->state[i + 2]
915
+ ^ sph_dec32le_aligned(x + 4 * (i + 2));
916
+ state[i + 3] = sc->state[i + 3]
917
+ ^ sph_dec32le_aligned(x + 4 * (i + 3));
918
+ }
919
+ /* WSREAD(sb, o1, o2, mm): fills w[0..31] with the INNER() words of the eight groups wsp[sb] .. wsp[sb + 7]. */
920
+ #define WSREAD(sb, o1, o2, mm) do { \
921
+ for (u = 0; u < 32; u += 4) { \
922
+ size_t v = wsp[(u >> 2) + (sb)]; \
923
+ w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
924
+ q[v + 2 * 0 + (o2)], mm); \
925
+ w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
926
+ q[v + 2 * 1 + (o2)], mm); \
927
+ w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
928
+ q[v + 2 * 2 + (o2)], mm); \
929
+ w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
930
+ q[v + 2 * 3 + (o2)], mm); \
931
+ } \
932
+ } while (0)
933
+ /* Four rounds; each reloads w[] and then runs eight steps on state[]. */
934
+ WSREAD( 0, 0, 1, 185);
935
+ one_round_small(state, w, 0, 3, 23, 17, 27);
936
+ WSREAD( 8, 0, 1, 185);
937
+ one_round_small(state, w, 2, 28, 19, 22, 7);
938
+ WSREAD(16, -128, -64, 233);
939
+ one_round_small(state, w, 1, 29, 9, 15, 5);
940
+ WSREAD(24, -191, -127, 233);
941
+ one_round_small(state, w, 0, 4, 13, 10, 25);
942
+
943
+ #undef WSREAD
944
+ /* Four closing steps on state[] (A0.. alias state[] here) whose word inputs are the still-unmodified sc->state[] entries. */
945
+ STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
946
+ IF, 4, 13, PP4_2_);
947
+ STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
948
+ IF, 13, 10, PP4_0_);
949
+ STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
950
+ IF, 10, 25, PP4_1_);
951
+ STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
952
+ IF, 25, 4, PP4_2_);
953
+ /* Publish the new chaining value. */
954
+ memcpy(sc->state, state, sizeof state);
955
+ }
956
+
957
+ #undef A0
958
+ #undef A1
959
+ #undef A2
960
+ #undef A3
961
+ #undef B0
962
+ #undef B1
963
+ #undef B2
964
+ #undef B3
965
+ #undef C0
966
+ #undef C1
967
+ #undef C2
968
+ #undef C3
969
+ #undef D0
970
+ #undef D1
971
+ #undef D2
972
+ #undef D3
973
+
974
+ #else
975
+
976
+ #if SPH_SIMD_NOCOPY
977
+ #define A0 (sc->state[ 0])
978
+ #define A1 (sc->state[ 1])
979
+ #define A2 (sc->state[ 2])
980
+ #define A3 (sc->state[ 3])
981
+ #define B0 (sc->state[ 4])
982
+ #define B1 (sc->state[ 5])
983
+ #define B2 (sc->state[ 6])
984
+ #define B3 (sc->state[ 7])
985
+ #define C0 (sc->state[ 8])
986
+ #define C1 (sc->state[ 9])
987
+ #define C2 (sc->state[10])
988
+ #define C3 (sc->state[11])
989
+ #define D0 (sc->state[12])
990
+ #define D1 (sc->state[13])
991
+ #define D2 (sc->state[14])
992
+ #define D3 (sc->state[15])
993
+ #endif
994
+ /* compress_small (unrolled variant): processes the block held in sc->buf and updates sc->state[0..15]. 'last' selects the offset table added to q[]: yoff_s_f when nonzero, yoff_s_n otherwise. With SPH_SIMD_NOCOPY the A0..D3 names alias sc->state[] directly, so the old state is kept in saved[] for the closing steps. */
995
+ static void
996
+ compress_small(sph_simd_small_context *sc, int last)
997
+ {
998
+ unsigned char *x;
999
+ s32 q[128];
1000
+ int i;
1001
+ DECL_STATE_SMALL
1002
+ #if SPH_SIMD_NOCOPY
1003
+ sph_u32 saved[16];
1004
+ #endif
1005
+ /* Snapshot the incoming state before it is modified in place. */
1006
+ #if SPH_SIMD_NOCOPY
1007
+ memcpy(saved, sc->state, sizeof saved);
1008
+ #endif
1009
+ x = sc->buf;
1010
+ FFT128(0, 1, 0, ll);
1011
+ if (last) {
1012
+ for (i = 0; i < 128; i ++) {
1013
+ s32 tq;
1014
+ /* Add the offset, reduce with REDS2/REDS1 (defined elsewhere), then map values above 128 to tq - 257. */
1015
+ tq = q[i] + yoff_s_f[i];
1016
+ tq = REDS2(tq);
1017
+ tq = REDS1(tq);
1018
+ tq = REDS1(tq);
1019
+ q[i] = (tq <= 128 ? tq : tq - 257);
1020
+ }
1021
+ } else {
1022
+ for (i = 0; i < 128; i ++) {
1023
+ s32 tq;
1024
+ /* Same reduction as above, with the yoff_s_n offsets. */
1025
+ tq = q[i] + yoff_s_n[i];
1026
+ tq = REDS2(tq);
1027
+ tq = REDS1(tq);
1028
+ tq = REDS1(tq);
1029
+ q[i] = (tq <= 128 ? tq : tq - 257);
1030
+ }
1031
+ }
1032
+ READ_STATE_SMALL(sc);
1033
+ A0 ^= sph_dec32le_aligned(x + 0);
1034
+ A1 ^= sph_dec32le_aligned(x + 4);
1035
+ A2 ^= sph_dec32le_aligned(x + 8);
1036
+ A3 ^= sph_dec32le_aligned(x + 12);
1037
+ B0 ^= sph_dec32le_aligned(x + 16);
1038
+ B1 ^= sph_dec32le_aligned(x + 20);
1039
+ B2 ^= sph_dec32le_aligned(x + 24);
1040
+ B3 ^= sph_dec32le_aligned(x + 28);
1041
+ C0 ^= sph_dec32le_aligned(x + 32);
1042
+ C1 ^= sph_dec32le_aligned(x + 36);
1043
+ C2 ^= sph_dec32le_aligned(x + 40);
1044
+ C3 ^= sph_dec32le_aligned(x + 44);
1045
+ D0 ^= sph_dec32le_aligned(x + 48);
1046
+ D1 ^= sph_dec32le_aligned(x + 52);
1047
+ D2 ^= sph_dec32le_aligned(x + 56);
1048
+ D3 ^= sph_dec32le_aligned(x + 60);
1049
+ ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
1050
+ ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
1051
+ ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
1052
+ ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
1053
+ #if SPH_SIMD_NOCOPY
1054
+ STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1055
+ IF, 4, 13, PP4_2_);
1056
+ STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1057
+ IF, 13, 10, PP4_0_);
1058
+ STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
1059
+ IF, 10, 25, PP4_1_);
1060
+ STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
1061
+ IF, 25, 4, PP4_2_);
1062
+ #else
1063
+ STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1064
+ IF, 4, 13, PP4_2_);
1065
+ STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1066
+ IF, 13, 10, PP4_0_);
1067
+ STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1068
+ IF, 10, 25, PP4_1_);
1069
+ STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1070
+ IF, 25, 4, PP4_2_);
1071
+ WRITE_STATE_SMALL(sc);
1072
+ #endif
1073
+ }
1074
+
1075
+ #if SPH_SIMD_NOCOPY
1076
+ #undef A0
1077
+ #undef A1
1078
+ #undef A2
1079
+ #undef A3
1080
+ #undef B0
1081
+ #undef B1
1082
+ #undef B2
1083
+ #undef B3
1084
+ #undef C0
1085
+ #undef C1
1086
+ #undef C2
1087
+ #undef C3
1088
+ #undef D0
1089
+ #undef D1
1090
+ #undef D2
1091
+ #undef D3
1092
+ #endif
1093
+
1094
+ #endif
1095
+
1096
+ #if SPH_SMALL_FOOTPRINT_SIMD
1097
+
1098
+ #define A0 state[ 0]
1099
+ #define A1 state[ 1]
1100
+ #define A2 state[ 2]
1101
+ #define A3 state[ 3]
1102
+ #define A4 state[ 4]
1103
+ #define A5 state[ 5]
1104
+ #define A6 state[ 6]
1105
+ #define A7 state[ 7]
1106
+ #define B0 state[ 8]
1107
+ #define B1 state[ 9]
1108
+ #define B2 state[10]
1109
+ #define B3 state[11]
1110
+ #define B4 state[12]
1111
+ #define B5 state[13]
1112
+ #define B6 state[14]
1113
+ #define B7 state[15]
1114
+ #define C0 state[16]
1115
+ #define C1 state[17]
1116
+ #define C2 state[18]
1117
+ #define C3 state[19]
1118
+ #define C4 state[20]
1119
+ #define C5 state[21]
1120
+ #define C6 state[22]
1121
+ #define C7 state[23]
1122
+ #define D0 state[24]
1123
+ #define D1 state[25]
1124
+ #define D2 state[26]
1125
+ #define D3 state[27]
1126
+ #define D4 state[28]
1127
+ #define D5 state[29]
1128
+ #define D6 state[30]
1129
+ #define D7 state[31]
1130
+
1131
+ /*
1132
+ * Not needed -- already defined for SIMD-224 / SIMD-256
1133
+ *
1134
+ #define STEP2_ELT(n, w, fun, s, ppb) do { \
1135
+ u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
1136
+ A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
1137
+ D ## n = C ## n; \
1138
+ C ## n = B ## n; \
1139
+ B ## n = tA[n]; \
1140
+ } while (0)
1141
+ */
1142
+ /* STEP2_BIG: one eight-lane step; tA[] holds the A words rotated by r before any lane is updated, pp8b is the XOR mask STEP2_ELT applies to the lane index. */
1143
+ #define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
1144
+ u32 tA[8]; \
1145
+ tA[0] = ROL32(A0, r); \
1146
+ tA[1] = ROL32(A1, r); \
1147
+ tA[2] = ROL32(A2, r); \
1148
+ tA[3] = ROL32(A3, r); \
1149
+ tA[4] = ROL32(A4, r); \
1150
+ tA[5] = ROL32(A5, r); \
1151
+ tA[6] = ROL32(A6, r); \
1152
+ tA[7] = ROL32(A7, r); \
1153
+ STEP2_ELT(0, w0, fun, s, pp8b); \
1154
+ STEP2_ELT(1, w1, fun, s, pp8b); \
1155
+ STEP2_ELT(2, w2, fun, s, pp8b); \
1156
+ STEP2_ELT(3, w3, fun, s, pp8b); \
1157
+ STEP2_ELT(4, w4, fun, s, pp8b); \
1158
+ STEP2_ELT(5, w5, fun, s, pp8b); \
1159
+ STEP2_ELT(6, w6, fun, s, pp8b); \
1160
+ STEP2_ELT(7, w7, fun, s, pp8b); \
1161
+ } while (0)
1162
+ /* one_round_big: runs eight STEP2_BIG steps on state[0..31] using w[0..63] (eight words per step): four IF steps then four MAJ steps, with rotation pairs cycling through (p0,p1), (p1,p2), (p2,p3), (p3,p0). isp is the starting offset into pp8k[]; callers in this file pass 0..3. */
1163
+ static void
1164
+ one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
1165
+ {
1166
+ static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
1167
+ /* pp8k[] repeats the XOR masks 1, 6, 2, 3, 5, 7, 4 so that pp8k[isp + k] needs no modulo. */
1168
+ STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
1169
+ IF, p0, p1, pp8k[isp + 0]);
1170
+ STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
1171
+ IF, p1, p2, pp8k[isp + 1]);
1172
+ STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
1173
+ IF, p2, p3, pp8k[isp + 2]);
1174
+ STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
1175
+ IF, p3, p0, pp8k[isp + 3]);
1176
+ STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
1177
+ MAJ, p0, p1, pp8k[isp + 4]);
1178
+ STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
1179
+ MAJ, p1, p2, pp8k[isp + 5]);
1180
+ STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
1181
+ MAJ, p2, p3, pp8k[isp + 6]);
1182
+ STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
1183
+ MAJ, p3, p0, pp8k[isp + 7]);
1184
+ }
1185
+ /* compress_big (small-footprint variant): processes the block held in sc->buf and replaces sc->state[0..31] with the result. 'last' selects the offset table added to q[]: yoff_b_f when nonzero, yoff_b_n otherwise. */
1186
+ static void
1187
+ compress_big(sph_simd_big_context *sc, int last)
1188
+ {
1189
+ unsigned char *x;
1190
+ s32 q[256];
1191
+ int i;
1192
+ u32 w[64];
1193
+ u32 state[32];
1194
+ size_t u;
1195
+ /* wbp[]: start index into q[] (slice number << 4) of the word group consumed by each of the 32 steps. */
1196
+ static const size_t wbp[32] = {
1197
+ 4 << 4, 6 << 4, 0 << 4, 2 << 4,
1198
+ 7 << 4, 5 << 4, 3 << 4, 1 << 4,
1199
+ 15 << 4, 11 << 4, 12 << 4, 8 << 4,
1200
+ 9 << 4, 13 << 4, 10 << 4, 14 << 4,
1201
+ 17 << 4, 18 << 4, 23 << 4, 20 << 4,
1202
+ 22 << 4, 21 << 4, 16 << 4, 19 << 4,
1203
+ 30 << 4, 24 << 4, 25 << 4, 31 << 4,
1204
+ 27 << 4, 29 << 4, 28 << 4, 26 << 4
1205
+ };
1206
+ /* FFT256 is a macro defined elsewhere in this file; presumably it fills q[] from the bytes at x -- TODO confirm. */
1207
+ x = sc->buf;
1208
+ FFT256(0, 1, 0, ll);
1209
+ if (last) {
1210
+ for (i = 0; i < 256; i ++) {
1211
+ s32 tq;
1212
+ /* Add the offset, reduce with REDS2/REDS1 (defined elsewhere), then map values above 128 to tq - 257. */
1213
+ tq = q[i] + yoff_b_f[i];
1214
+ tq = REDS2(tq);
1215
+ tq = REDS1(tq);
1216
+ tq = REDS1(tq);
1217
+ q[i] = (tq <= 128 ? tq : tq - 257);
1218
+ }
1219
+ } else {
1220
+ for (i = 0; i < 256; i ++) {
1221
+ s32 tq;
1222
+ /* Same reduction as above, with the yoff_b_n offsets. */
1223
+ tq = q[i] + yoff_b_n[i];
1224
+ tq = REDS2(tq);
1225
+ tq = REDS1(tq);
1226
+ tq = REDS1(tq);
1227
+ q[i] = (tq <= 128 ? tq : tq - 257);
1228
+ }
1229
+ }
1230
+ /* Working copy: state[k] = sc->state[k] XOR word k decoded from x by sph_dec32le_aligned(). sc->state itself is left untouched until the end. */
1231
+ for (i = 0; i < 32; i += 8) {
1232
+ state[i + 0] = sc->state[i + 0]
1233
+ ^ sph_dec32le_aligned(x + 4 * (i + 0));
1234
+ state[i + 1] = sc->state[i + 1]
1235
+ ^ sph_dec32le_aligned(x + 4 * (i + 1));
1236
+ state[i + 2] = sc->state[i + 2]
1237
+ ^ sph_dec32le_aligned(x + 4 * (i + 2));
1238
+ state[i + 3] = sc->state[i + 3]
1239
+ ^ sph_dec32le_aligned(x + 4 * (i + 3));
1240
+ state[i + 4] = sc->state[i + 4]
1241
+ ^ sph_dec32le_aligned(x + 4 * (i + 4));
1242
+ state[i + 5] = sc->state[i + 5]
1243
+ ^ sph_dec32le_aligned(x + 4 * (i + 5));
1244
+ state[i + 6] = sc->state[i + 6]
1245
+ ^ sph_dec32le_aligned(x + 4 * (i + 6));
1246
+ state[i + 7] = sc->state[i + 7]
1247
+ ^ sph_dec32le_aligned(x + 4 * (i + 7));
1248
+ }
1249
+ /* WBREAD(sb, o1, o2, mm): fills w[0..63] with the INNER() words of the eight groups wbp[sb] .. wbp[sb + 7]. */
1250
+ #define WBREAD(sb, o1, o2, mm) do { \
1251
+ for (u = 0; u < 64; u += 8) { \
1252
+ size_t v = wbp[(u >> 3) + (sb)]; \
1253
+ w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
1254
+ q[v + 2 * 0 + (o2)], mm); \
1255
+ w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
1256
+ q[v + 2 * 1 + (o2)], mm); \
1257
+ w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
1258
+ q[v + 2 * 2 + (o2)], mm); \
1259
+ w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
1260
+ q[v + 2 * 3 + (o2)], mm); \
1261
+ w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
1262
+ q[v + 2 * 4 + (o2)], mm); \
1263
+ w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
1264
+ q[v + 2 * 5 + (o2)], mm); \
1265
+ w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
1266
+ q[v + 2 * 6 + (o2)], mm); \
1267
+ w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
1268
+ q[v + 2 * 7 + (o2)], mm); \
1269
+ } \
1270
+ } while (0)
1271
+ /* Four rounds; each reloads w[] and then runs eight steps on state[]. */
1272
+ WBREAD( 0, 0, 1, 185);
1273
+ one_round_big(state, w, 0, 3, 23, 17, 27);
1274
+ WBREAD( 8, 0, 1, 185);
1275
+ one_round_big(state, w, 1, 28, 19, 22, 7);
1276
+ WBREAD(16, -256, -128, 233);
1277
+ one_round_big(state, w, 2, 29, 9, 15, 5);
1278
+ WBREAD(24, -383, -255, 233);
1279
+ one_round_big(state, w, 3, 4, 13, 10, 25);
1280
+
1281
+ #undef WBREAD
1282
+ /* Four closing steps on state[] (A0.. alias state[] here) whose word inputs are the still-unmodified sc->state[] entries. */
1283
+ STEP_BIG(
1284
+ sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1285
+ sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1286
+ IF, 4, 13, PP8_4_);
1287
+ STEP_BIG(
1288
+ sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1289
+ sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1290
+ IF, 13, 10, PP8_5_);
1291
+ STEP_BIG(
1292
+ sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1293
+ sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1294
+ IF, 10, 25, PP8_6_);
1295
+ STEP_BIG(
1296
+ sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1297
+ sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1298
+ IF, 25, 4, PP8_0_);
1299
+ /* Publish the new chaining value. */
1300
+ memcpy(sc->state, state, sizeof state);
1301
+ }
1302
+
1303
+ #undef A0
1304
+ #undef A1
1305
+ #undef A2
1306
+ #undef A3
1307
+ #undef A4
1308
+ #undef A5
1309
+ #undef A6
1310
+ #undef A7
1311
+ #undef B0
1312
+ #undef B1
1313
+ #undef B2
1314
+ #undef B3
1315
+ #undef B4
1316
+ #undef B5
1317
+ #undef B6
1318
+ #undef B7
1319
+ #undef C0
1320
+ #undef C1
1321
+ #undef C2
1322
+ #undef C3
1323
+ #undef C4
1324
+ #undef C5
1325
+ #undef C6
1326
+ #undef C7
1327
+ #undef D0
1328
+ #undef D1
1329
+ #undef D2
1330
+ #undef D3
1331
+ #undef D4
1332
+ #undef D5
1333
+ #undef D6
1334
+ #undef D7
1335
+
1336
+ #else
1337
+
1338
+ #if SPH_SIMD_NOCOPY
1339
+ #define A0 (sc->state[ 0])
1340
+ #define A1 (sc->state[ 1])
1341
+ #define A2 (sc->state[ 2])
1342
+ #define A3 (sc->state[ 3])
1343
+ #define A4 (sc->state[ 4])
1344
+ #define A5 (sc->state[ 5])
1345
+ #define A6 (sc->state[ 6])
1346
+ #define A7 (sc->state[ 7])
1347
+ #define B0 (sc->state[ 8])
1348
+ #define B1 (sc->state[ 9])
1349
+ #define B2 (sc->state[10])
1350
+ #define B3 (sc->state[11])
1351
+ #define B4 (sc->state[12])
1352
+ #define B5 (sc->state[13])
1353
+ #define B6 (sc->state[14])
1354
+ #define B7 (sc->state[15])
1355
+ #define C0 (sc->state[16])
1356
+ #define C1 (sc->state[17])
1357
+ #define C2 (sc->state[18])
1358
+ #define C3 (sc->state[19])
1359
+ #define C4 (sc->state[20])
1360
+ #define C5 (sc->state[21])
1361
+ #define C6 (sc->state[22])
1362
+ #define C7 (sc->state[23])
1363
+ #define D0 (sc->state[24])
1364
+ #define D1 (sc->state[25])
1365
+ #define D2 (sc->state[26])
1366
+ #define D3 (sc->state[27])
1367
+ #define D4 (sc->state[28])
1368
+ #define D5 (sc->state[29])
1369
+ #define D6 (sc->state[30])
1370
+ #define D7 (sc->state[31])
1371
+ #endif
1372
+
1373
/*
 * Run one compression call for the "big" SIMD variants on the block
 * currently held in sc->buf, updating sc->state.
 *
 * sc    context whose buf holds the block and whose state is updated
 * last  non-zero selects the yoff_b_f offset table, zero selects yoff_b_n;
 *       the finalize_big() caller passes 1 only for the length block
 *
 * When SPH_SIMD_NOCOPY is set, a copy of the incoming state is kept in
 * saved[] and used as the input of the four trailing STEP_BIG calls;
 * otherwise those steps read sc->state directly and the working state is
 * stored afterwards with WRITE_STATE_BIG.
 */
+ static void
1374
+ compress_big(sph_simd_big_context *sc, int last)
1375
+ {
1376
+ unsigned char *x;
1377
+ s32 q[256];
1378
+ int i;
1379
+ DECL_STATE_BIG
1380
+ #if SPH_SIMD_NOCOPY
1381
+ sph_u32 saved[32];
1382
+ #endif
1383
+
1384
+ #if SPH_SIMD_NOCOPY
1385
/* Snapshot the chaining value before it is modified. */
+ memcpy(saved, sc->state, sizeof saved);
1386
+ #endif
1387
+
1388
+ x = sc->buf;
1389
/* NOTE(review): FFT256 is a macro defined elsewhere; it presumably reads
   the block through x and fills q[] - confirm against its definition. */
+ FFT256(0, 1, 0, ll);
1390
/*
 * Add the per-index offset (final or non-final table) to each q[i], reduce
 * with REDS2/REDS1, then map values above 128 to tq - 257.
 */
+ if (last) {
1391
+ for (i = 0; i < 256; i ++) {
1392
+ s32 tq;
1393
+
1394
+ tq = q[i] + yoff_b_f[i];
1395
+ tq = REDS2(tq);
1396
+ tq = REDS1(tq);
1397
+ tq = REDS1(tq);
1398
+ q[i] = (tq <= 128 ? tq : tq - 257);
1399
+ }
1400
+ } else {
1401
+ for (i = 0; i < 256; i ++) {
1402
+ s32 tq;
1403
+
1404
+ tq = q[i] + yoff_b_n[i];
1405
+ tq = REDS2(tq);
1406
+ tq = REDS1(tq);
1407
+ tq = REDS1(tq);
1408
+ q[i] = (tq <= 128 ? tq : tq - 257);
1409
+ }
1410
+ }
1411
/* XOR the 32 message words (bytes 0..127 of the block) into A0..D7. */
+ READ_STATE_BIG(sc);
1412
+ A0 ^= sph_dec32le_aligned(x + 0);
1413
+ A1 ^= sph_dec32le_aligned(x + 4);
1414
+ A2 ^= sph_dec32le_aligned(x + 8);
1415
+ A3 ^= sph_dec32le_aligned(x + 12);
1416
+ A4 ^= sph_dec32le_aligned(x + 16);
1417
+ A5 ^= sph_dec32le_aligned(x + 20);
1418
+ A6 ^= sph_dec32le_aligned(x + 24);
1419
+ A7 ^= sph_dec32le_aligned(x + 28);
1420
+ B0 ^= sph_dec32le_aligned(x + 32);
1421
+ B1 ^= sph_dec32le_aligned(x + 36);
1422
+ B2 ^= sph_dec32le_aligned(x + 40);
1423
+ B3 ^= sph_dec32le_aligned(x + 44);
1424
+ B4 ^= sph_dec32le_aligned(x + 48);
1425
+ B5 ^= sph_dec32le_aligned(x + 52);
1426
+ B6 ^= sph_dec32le_aligned(x + 56);
1427
+ B7 ^= sph_dec32le_aligned(x + 60);
1428
+ C0 ^= sph_dec32le_aligned(x + 64);
1429
+ C1 ^= sph_dec32le_aligned(x + 68);
1430
+ C2 ^= sph_dec32le_aligned(x + 72);
1431
+ C3 ^= sph_dec32le_aligned(x + 76);
1432
+ C4 ^= sph_dec32le_aligned(x + 80);
1433
+ C5 ^= sph_dec32le_aligned(x + 84);
1434
+ C6 ^= sph_dec32le_aligned(x + 88);
1435
+ C7 ^= sph_dec32le_aligned(x + 92);
1436
+ D0 ^= sph_dec32le_aligned(x + 96);
1437
+ D1 ^= sph_dec32le_aligned(x + 100);
1438
+ D2 ^= sph_dec32le_aligned(x + 104);
1439
+ D3 ^= sph_dec32le_aligned(x + 108);
1440
+ D4 ^= sph_dec32le_aligned(x + 112);
1441
+ D5 ^= sph_dec32le_aligned(x + 116);
1442
+ D6 ^= sph_dec32le_aligned(x + 120);
1443
+ D7 ^= sph_dec32le_aligned(x + 124);
1444
+
1445
/* Four rounds, each with its own set of four numeric parameters. */
+ ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
1446
+ ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
1447
+ ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
1448
+ ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
1449
/*
 * Four closing steps fed with the 32 words of the incoming state, eight
 * words per step: from saved[] in the NOCOPY build, from sc->state
 * otherwise.
 */
+ #if SPH_SIMD_NOCOPY
1450
+ STEP_BIG(
1451
+ saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1452
+ saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1453
+ IF, 4, 13, PP8_4_);
1454
+ STEP_BIG(
1455
+ saved[ 8], saved[ 9], saved[10], saved[11],
1456
+ saved[12], saved[13], saved[14], saved[15],
1457
+ IF, 13, 10, PP8_5_);
1458
+ STEP_BIG(
1459
+ saved[16], saved[17], saved[18], saved[19],
1460
+ saved[20], saved[21], saved[22], saved[23],
1461
+ IF, 10, 25, PP8_6_);
1462
+ STEP_BIG(
1463
+ saved[24], saved[25], saved[26], saved[27],
1464
+ saved[28], saved[29], saved[30], saved[31],
1465
+ IF, 25, 4, PP8_0_);
1466
+ #else
1467
+ STEP_BIG(
1468
+ sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1469
+ sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1470
+ IF, 4, 13, PP8_4_);
1471
+ STEP_BIG(
1472
+ sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1473
+ sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1474
+ IF, 13, 10, PP8_5_);
1475
+ STEP_BIG(
1476
+ sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1477
+ sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1478
+ IF, 10, 25, PP8_6_);
1479
+ STEP_BIG(
1480
+ sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1481
+ sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1482
+ IF, 25, 4, PP8_0_);
1483
/* Only this (copying) build stores the working state at the end. */
+ WRITE_STATE_BIG(sc);
1484
+ #endif
1485
+ }
1486
+
1487
/*
 * In the SPH_SIMD_NOCOPY build, remove the A0..D7 macros once
 * compress_big() has been defined, so the short names do not leak into the
 * rest of the file. The definitions visible above map B5..D7 onto
 * sc->state[13..31]; A0..B4 are presumably defined the same way just
 * before them.
 */
+ #if SPH_SIMD_NOCOPY
1488
+ #undef A0
1489
+ #undef A1
1490
+ #undef A2
1491
+ #undef A3
1492
+ #undef A4
1493
+ #undef A5
1494
+ #undef A6
1495
+ #undef A7
1496
+ #undef B0
1497
+ #undef B1
1498
+ #undef B2
1499
+ #undef B3
1500
+ #undef B4
1501
+ #undef B5
1502
+ #undef B6
1503
+ #undef B7
1504
+ #undef C0
1505
+ #undef C1
1506
+ #undef C2
1507
+ #undef C3
1508
+ #undef C4
1509
+ #undef C5
1510
+ #undef C6
1511
+ #undef C7
1512
+ #undef D0
1513
+ #undef D1
1514
+ #undef D2
1515
+ #undef D3
1516
+ #undef D4
1517
+ #undef D5
1518
+ #undef D6
1519
+ #undef D7
1520
+ #endif
1521
+
1522
+ #endif
1523
+
1524
/*
 * Initial state for SIMD-224: 16 words, copied into the context by
 * init_small() when sph_simd224_init() is called.
 */
+ static const u32 IV224[] = {
1525
+ C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
1526
+ C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
1527
+ C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
1528
+ C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
1529
+ };
1530
+
1531
/*
 * Initial state for SIMD-256: 16 words, copied into the context by
 * init_small() when sph_simd256_init() is called.
 */
+ static const u32 IV256[] = {
1532
+ C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
1533
+ C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
1534
+ C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
1535
+ C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
1536
+ };
1537
+
1538
/*
 * Initial state for SIMD-384: 32 words, copied into the context by
 * init_big() when sph_simd384_init() is called.
 */
+ static const u32 IV384[] = {
1539
+ C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
1540
+ C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
1541
+ C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
1542
+ C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
1543
+ C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
1544
+ C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
1545
+ C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
1546
+ C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
1547
+ };
1548
+
1549
/*
 * Initial state for SIMD-512: 32 words, copied into the context by
 * init_big() when sph_simd512_init() is called.
 */
+ static const u32 IV512[] = {
1550
+ C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
1551
+ C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
1552
+ C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
1553
+ C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
1554
+ C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
1555
+ C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
1556
+ C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
1557
+ C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
1558
+ };
1559
+
1560
+ static void
1561
+ init_small(void *cc, const u32 *iv)
1562
+ {
1563
+ sph_simd_small_context *sc;
1564
+
1565
+ sc = cc;
1566
+ memcpy(sc->state, iv, sizeof sc->state);
1567
+ sc->count_low = sc->count_high = 0;
1568
+ sc->ptr = 0;
1569
+ }
1570
+
1571
+ static void
1572
+ init_big(void *cc, const u32 *iv)
1573
+ {
1574
+ sph_simd_big_context *sc;
1575
+
1576
+ sc = cc;
1577
+ memcpy(sc->state, iv, sizeof sc->state);
1578
+ sc->count_low = sc->count_high = 0;
1579
+ sc->ptr = 0;
1580
+ }
1581
+
1582
+ static void
1583
+ update_small(void *cc, const void *data, size_t len)
1584
+ {
1585
+ sph_simd_small_context *sc;
1586
+
1587
+ sc = cc;
1588
+ while (len > 0) {
1589
+ size_t clen;
1590
+
1591
+ clen = (sizeof sc->buf) - sc->ptr;
1592
+ if (clen > len)
1593
+ clen = len;
1594
+ memcpy(sc->buf + sc->ptr, data, clen);
1595
+ data = (const unsigned char *)data + clen;
1596
+ len -= clen;
1597
+ if ((sc->ptr += clen) == sizeof sc->buf) {
1598
+ compress_small(sc, 0);
1599
+ sc->ptr = 0;
1600
+ sc->count_low = T32(sc->count_low + 1);
1601
+ if (sc->count_low == 0)
1602
+ sc->count_high ++;
1603
+ }
1604
+ }
1605
+ }
1606
+
1607
+ static void
1608
+ update_big(void *cc, const void *data, size_t len)
1609
+ {
1610
+ sph_simd_big_context *sc;
1611
+
1612
+ sc = cc;
1613
+ while (len > 0) {
1614
+ size_t clen;
1615
+
1616
+ clen = (sizeof sc->buf) - sc->ptr;
1617
+ if (clen > len)
1618
+ clen = len;
1619
+ memcpy(sc->buf + sc->ptr, data, clen);
1620
+ data = (const unsigned char *)data + clen;
1621
+ len -= clen;
1622
+ if ((sc->ptr += clen) == sizeof sc->buf) {
1623
+ compress_big(sc, 0);
1624
+ sc->ptr = 0;
1625
+ sc->count_low = T32(sc->count_low + 1);
1626
+ if (sc->count_low == 0)
1627
+ sc->count_high ++;
1628
+ }
1629
+ }
1630
+ }
1631
+
1632
+ static void
1633
+ encode_count_small(unsigned char *dst,
1634
+ u32 low, u32 high, size_t ptr, unsigned n)
1635
+ {
1636
+ low = T32(low << 9);
1637
+ high = T32(high << 9) + (low >> 23);
1638
+ low += (ptr << 3) + n;
1639
+ sph_enc32le(dst, low);
1640
+ sph_enc32le(dst + 4, high);
1641
+ }
1642
+
1643
+ static void
1644
+ encode_count_big(unsigned char *dst,
1645
+ u32 low, u32 high, size_t ptr, unsigned n)
1646
+ {
1647
+ low = T32(low << 10);
1648
+ high = T32(high << 10) + (low >> 22);
1649
+ low += (ptr << 3) + n;
1650
+ sph_enc32le(dst, low);
1651
+ sph_enc32le(dst + 4, high);
1652
+ }
1653
+
1654
+ static void
1655
+ finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1656
+ {
1657
+ sph_simd_small_context *sc;
1658
+ unsigned char *d;
1659
+ size_t u;
1660
+
1661
+ sc = cc;
1662
+ if (sc->ptr > 0 || n > 0) {
1663
+ memset(sc->buf + sc->ptr, 0,
1664
+ (sizeof sc->buf) - sc->ptr);
1665
+ sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1666
+ compress_small(sc, 0);
1667
+ }
1668
+ memset(sc->buf, 0, sizeof sc->buf);
1669
+ encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1670
+ compress_small(sc, 1);
1671
+ d = dst;
1672
+ for (d = dst, u = 0; u < dst_len; u ++)
1673
+ sph_enc32le(d + (u << 2), sc->state[u]);
1674
+ }
1675
+
1676
+ static void
1677
+ finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1678
+ {
1679
+ sph_simd_big_context *sc;
1680
+ unsigned char *d;
1681
+ size_t u;
1682
+
1683
+ sc = cc;
1684
+ if (sc->ptr > 0 || n > 0) {
1685
+ memset(sc->buf + sc->ptr, 0,
1686
+ (sizeof sc->buf) - sc->ptr);
1687
+ sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1688
+ compress_big(sc, 0);
1689
+ }
1690
+ memset(sc->buf, 0, sizeof sc->buf);
1691
+ encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1692
+ compress_big(sc, 1);
1693
+ d = dst;
1694
+ for (d = dst, u = 0; u < dst_len; u ++)
1695
+ sph_enc32le(d + (u << 2), sc->state[u]);
1696
+ }
1697
+
1698
+ void
1699
+ sph_simd224_init(void *cc)
1700
+ {
1701
+ init_small(cc, IV224);
1702
+ }
1703
+
1704
/*
 * Feed len bytes from data into a running SIMD-224 computation; forwards to
 * update_small().
 */
void sph_simd224(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
1709
+
1710
/*
 * Finish a SIMD-224 computation with no extra trailing bits; equivalent to
 * sph_simd224_addbits_and_close(cc, 0, 0, dst).
 */
void sph_simd224_close(void *cc, void *dst)
{
	sph_simd224_addbits_and_close(cc, 0, 0, dst);
}
1715
+
1716
/*
 * Append the top n bits of ub, finish the SIMD-224 computation and write
 * 7 state words (28 bytes) to dst. The context is then re-initialized so
 * it can be used for another SIMD-224 computation.
 */
void sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_small(cc, ub, n, dst, 7);
	sph_simd224_init(cc);
}
1722
+
1723
+ void
1724
+ sph_simd256_init(void *cc)
1725
+ {
1726
+ init_small(cc, IV256);
1727
+ }
1728
+
1729
/*
 * Feed len bytes from data into a running SIMD-256 computation; forwards to
 * update_small().
 */
void sph_simd256(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
1734
+
1735
/*
 * Finish a SIMD-256 computation with no extra trailing bits; equivalent to
 * sph_simd256_addbits_and_close(cc, 0, 0, dst).
 */
void sph_simd256_close(void *cc, void *dst)
{
	sph_simd256_addbits_and_close(cc, 0, 0, dst);
}
1740
+
1741
/*
 * Append the top n bits of ub, finish the SIMD-256 computation and write
 * 8 state words (32 bytes) to dst. The context is then re-initialized so
 * it can be used for another SIMD-256 computation.
 */
void sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_small(cc, ub, n, dst, 8);
	sph_simd256_init(cc);
}
1747
+
1748
+ void
1749
+ sph_simd384_init(void *cc)
1750
+ {
1751
+ init_big(cc, IV384);
1752
+ }
1753
+
1754
/*
 * Feed len bytes from data into a running SIMD-384 computation; forwards to
 * update_big().
 */
void sph_simd384(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
1759
+
1760
/*
 * Finish a SIMD-384 computation with no extra trailing bits; equivalent to
 * sph_simd384_addbits_and_close(cc, 0, 0, dst).
 */
void sph_simd384_close(void *cc, void *dst)
{
	sph_simd384_addbits_and_close(cc, 0, 0, dst);
}
1765
+
1766
/*
 * Append the top n bits of ub, finish the SIMD-384 computation and write
 * 12 state words (48 bytes) to dst. The context is then re-initialized so
 * it can be used for another SIMD-384 computation.
 */
void sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_big(cc, ub, n, dst, 12);
	sph_simd384_init(cc);
}
1772
+
1773
+ void
1774
+ sph_simd512_init(void *cc)
1775
+ {
1776
+ init_big(cc, IV512);
1777
+ }
1778
+
1779
/*
 * Feed len bytes from data into a running SIMD-512 computation; forwards to
 * update_big().
 */
void sph_simd512(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
1784
+
1785
/*
 * Finish a SIMD-512 computation with no extra trailing bits; equivalent to
 * sph_simd512_addbits_and_close(cc, 0, 0, dst).
 */
void sph_simd512_close(void *cc, void *dst)
{
	sph_simd512_addbits_and_close(cc, 0, 0, dst);
}
1790
+
1791
/*
 * Append the top n bits of ub, finish the SIMD-512 computation and write
 * 16 state words (64 bytes) to dst. The context is then re-initialized so
 * it can be used for another SIMD-512 computation.
 */
void sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_big(cc, ub, n, dst, 16);
	sph_simd512_init(cc);
}
1797
+ #ifdef __cplusplus
1798
+ }
1799
+ #endif