react-native-quick-crypto 1.0.19 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (561) hide show
  1. package/QuickCrypto.podspec +12 -38
  2. package/README.md +2 -0
  3. package/android/CMakeLists.txt +3 -0
  4. package/android/build.gradle +5 -1
  5. package/cpp/argon2/HybridArgon2.cpp +10 -3
  6. package/cpp/blake3/HybridBlake3.cpp +5 -3
  7. package/cpp/cipher/CCMCipher.cpp +29 -16
  8. package/cpp/cipher/CCMCipher.hpp +2 -4
  9. package/cpp/cipher/ChaCha20Cipher.cpp +14 -18
  10. package/cpp/cipher/ChaCha20Cipher.hpp +2 -4
  11. package/cpp/cipher/ChaCha20Poly1305Cipher.cpp +34 -23
  12. package/cpp/cipher/ChaCha20Poly1305Cipher.hpp +2 -4
  13. package/cpp/cipher/GCMCipher.cpp +14 -15
  14. package/cpp/cipher/HybridCipher.cpp +39 -36
  15. package/cpp/cipher/HybridCipher.hpp +17 -1
  16. package/cpp/cipher/HybridRsaCipher.cpp +74 -29
  17. package/cpp/cipher/OCBCipher.cpp +4 -3
  18. package/cpp/cipher/XChaCha20Poly1305Cipher.cpp +14 -13
  19. package/cpp/cipher/XSalsa20Cipher.cpp +72 -6
  20. package/cpp/cipher/XSalsa20Cipher.hpp +25 -3
  21. package/cpp/cipher/XSalsa20Poly1305Cipher.cpp +21 -25
  22. package/cpp/dh/HybridDiffieHellman.cpp +29 -0
  23. package/cpp/ec/HybridEcKeyPair.cpp +35 -33
  24. package/cpp/ec/HybridEcKeyPair.hpp +3 -7
  25. package/cpp/ecdh/HybridECDH.cpp +23 -0
  26. package/cpp/ed25519/HybridEdKeyPair.cpp +73 -117
  27. package/cpp/ed25519/HybridEdKeyPair.hpp +5 -9
  28. package/cpp/hash/HybridHash.cpp +5 -7
  29. package/cpp/hkdf/HybridHkdf.cpp +6 -4
  30. package/cpp/hmac/HybridHmac.cpp +4 -6
  31. package/cpp/kmac/HybridKmac.cpp +4 -4
  32. package/cpp/mldsa/HybridMlDsaKeyPair.cpp +37 -49
  33. package/cpp/mlkem/HybridMlKemKeyPair.cpp +39 -43
  34. package/cpp/pbkdf2/HybridPbkdf2.cpp +7 -8
  35. package/cpp/rsa/HybridRsaKeyPair.cpp +5 -8
  36. package/cpp/rsa/HybridRsaKeyPair.hpp +4 -7
  37. package/cpp/scrypt/HybridScrypt.cpp +6 -4
  38. package/cpp/sign/HybridSignHandle.cpp +25 -68
  39. package/cpp/sign/HybridVerifyHandle.cpp +23 -60
  40. package/cpp/utils/HybridUtils.cpp +213 -111
  41. package/cpp/utils/HybridUtils.hpp +9 -2
  42. package/cpp/utils/QuickCryptoUtils.hpp +72 -0
  43. package/deps/simdutf/LICENSE-APACHE +201 -0
  44. package/deps/simdutf/LICENSE-MIT +18 -0
  45. package/deps/simdutf/README.md +2782 -0
  46. package/deps/simdutf/include/simdutf/avx512.h +79 -0
  47. package/deps/simdutf/include/simdutf/base64_implementation.h +158 -0
  48. package/deps/simdutf/include/simdutf/base64_tables.h +887 -0
  49. package/deps/simdutf/include/simdutf/common_defs.h +186 -0
  50. package/deps/simdutf/include/simdutf/compiler_check.h +50 -0
  51. package/deps/simdutf/include/simdutf/constexpr_ptr.h +138 -0
  52. package/deps/simdutf/include/simdutf/encoding_types.h +189 -0
  53. package/deps/simdutf/include/simdutf/error.h +126 -0
  54. package/deps/simdutf/include/simdutf/implementation.h +7081 -0
  55. package/deps/simdutf/include/simdutf/internal/isadetection.h +325 -0
  56. package/deps/simdutf/include/simdutf/portability.h +285 -0
  57. package/deps/simdutf/include/simdutf/scalar/ascii.h +86 -0
  58. package/deps/simdutf/include/simdutf/scalar/atomic_util.h +105 -0
  59. package/deps/simdutf/include/simdutf/scalar/base64.h +911 -0
  60. package/deps/simdutf/include/simdutf/scalar/latin1.h +26 -0
  61. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h +52 -0
  62. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h +27 -0
  63. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h +191 -0
  64. package/deps/simdutf/include/simdutf/scalar/swap_bytes.h +35 -0
  65. package/deps/simdutf/include/simdutf/scalar/utf16.h +226 -0
  66. package/deps/simdutf/include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h +108 -0
  67. package/deps/simdutf/include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h +40 -0
  68. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h +86 -0
  69. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h +44 -0
  70. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h +295 -0
  71. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h +91 -0
  72. package/deps/simdutf/include/simdutf/scalar/utf32.h +82 -0
  73. package/deps/simdutf/include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h +68 -0
  74. package/deps/simdutf/include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h +67 -0
  75. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h +84 -0
  76. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h +44 -0
  77. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h +142 -0
  78. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h +72 -0
  79. package/deps/simdutf/include/simdutf/scalar/utf8.h +326 -0
  80. package/deps/simdutf/include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h +225 -0
  81. package/deps/simdutf/include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h +87 -0
  82. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h +342 -0
  83. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h +106 -0
  84. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h +299 -0
  85. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h +83 -0
  86. package/deps/simdutf/include/simdutf/simdutf_version.h +26 -0
  87. package/deps/simdutf/include/simdutf.h +26 -0
  88. package/deps/simdutf/include/simdutf_c.h +342 -0
  89. package/deps/simdutf/src/arm64/arm_base64.cpp +791 -0
  90. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp +24 -0
  91. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp +24 -0
  92. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp +70 -0
  93. package/deps/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp +61 -0
  94. package/deps/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp +185 -0
  95. package/deps/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp +780 -0
  96. package/deps/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp +60 -0
  97. package/deps/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp +208 -0
  98. package/deps/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp +505 -0
  99. package/deps/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp +69 -0
  100. package/deps/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp +313 -0
  101. package/deps/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp +179 -0
  102. package/deps/simdutf/src/arm64/arm_find.cpp +199 -0
  103. package/deps/simdutf/src/arm64/arm_utf16fix.cpp +185 -0
  104. package/deps/simdutf/src/arm64/arm_validate_utf16.cpp +165 -0
  105. package/deps/simdutf/src/arm64/arm_validate_utf32le.cpp +65 -0
  106. package/deps/simdutf/src/arm64/implementation.cpp +1442 -0
  107. package/deps/simdutf/src/encoding_types.cpp +67 -0
  108. package/deps/simdutf/src/error.cpp +3 -0
  109. package/deps/simdutf/src/fallback/implementation.cpp +589 -0
  110. package/deps/simdutf/src/generic/ascii_validation.h +50 -0
  111. package/deps/simdutf/src/generic/base64.h +233 -0
  112. package/deps/simdutf/src/generic/base64lengths.h +63 -0
  113. package/deps/simdutf/src/generic/buf_block_reader.h +109 -0
  114. package/deps/simdutf/src/generic/find.h +75 -0
  115. package/deps/simdutf/src/generic/utf16/change_endianness.h +24 -0
  116. package/deps/simdutf/src/generic/utf16/count_code_points_bytemask.h +58 -0
  117. package/deps/simdutf/src/generic/utf16/to_well_formed.h +93 -0
  118. package/deps/simdutf/src/generic/utf16/utf32_length_from_utf16.h +15 -0
  119. package/deps/simdutf/src/generic/utf16/utf8_length_from_utf16.h +35 -0
  120. package/deps/simdutf/src/generic/utf16/utf8_length_from_utf16_bytemask.h +199 -0
  121. package/deps/simdutf/src/generic/utf16.h +73 -0
  122. package/deps/simdutf/src/generic/utf32.h +136 -0
  123. package/deps/simdutf/src/generic/utf8/utf16_length_from_utf8_bytemask.h +53 -0
  124. package/deps/simdutf/src/generic/utf8.h +92 -0
  125. package/deps/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h +316 -0
  126. package/deps/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h +78 -0
  127. package/deps/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h +332 -0
  128. package/deps/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h +74 -0
  129. package/deps/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h +318 -0
  130. package/deps/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h +42 -0
  131. package/deps/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h +223 -0
  132. package/deps/simdutf/src/generic/utf8_validation/utf8_validator.h +84 -0
  133. package/deps/simdutf/src/generic/validate_utf16.h +164 -0
  134. package/deps/simdutf/src/generic/validate_utf32.h +99 -0
  135. package/deps/simdutf/src/haswell/avx2_base64.cpp +837 -0
  136. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp +28 -0
  137. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp +20 -0
  138. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp +83 -0
  139. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp +83 -0
  140. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp +210 -0
  141. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp +602 -0
  142. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp +116 -0
  143. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp +164 -0
  144. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp +569 -0
  145. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp +60 -0
  146. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp +195 -0
  147. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp +135 -0
  148. package/deps/simdutf/src/haswell/avx2_utf16fix.cpp +173 -0
  149. package/deps/simdutf/src/haswell/avx2_validate_utf16.cpp +17 -0
  150. package/deps/simdutf/src/haswell/implementation.cpp +1447 -0
  151. package/deps/simdutf/src/icelake/icelake_ascii_validation.inl.cpp +19 -0
  152. package/deps/simdutf/src/icelake/icelake_base64.inl.cpp +630 -0
  153. package/deps/simdutf/src/icelake/icelake_common.inl.cpp +37 -0
  154. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp +36 -0
  155. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp +23 -0
  156. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp +107 -0
  157. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp +103 -0
  158. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp +136 -0
  159. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp +206 -0
  160. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp +74 -0
  161. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp +338 -0
  162. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp +574 -0
  163. package/deps/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp +104 -0
  164. package/deps/simdutf/src/icelake/icelake_convert_utf8_to_utf16.inl.cpp +75 -0
  165. package/deps/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp +69 -0
  166. package/deps/simdutf/src/icelake/icelake_find.inl.cpp +146 -0
  167. package/deps/simdutf/src/icelake/icelake_from_utf8.inl.cpp +266 -0
  168. package/deps/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp +136 -0
  169. package/deps/simdutf/src/icelake/icelake_macros.inl.cpp +143 -0
  170. package/deps/simdutf/src/icelake/icelake_utf16fix.cpp +138 -0
  171. package/deps/simdutf/src/icelake/icelake_utf32_validation.inl.cpp +63 -0
  172. package/deps/simdutf/src/icelake/icelake_utf8_common.inl.cpp +753 -0
  173. package/deps/simdutf/src/icelake/icelake_utf8_length_from_utf16.inl.cpp +269 -0
  174. package/deps/simdutf/src/icelake/icelake_utf8_validation.inl.cpp +116 -0
  175. package/deps/simdutf/src/icelake/implementation.cpp +1903 -0
  176. package/deps/simdutf/src/implementation.cpp +2526 -0
  177. package/deps/simdutf/src/lasx/implementation.cpp +1531 -0
  178. package/deps/simdutf/src/lasx/lasx_base64.cpp +695 -0
  179. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp +76 -0
  180. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp +55 -0
  181. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp +65 -0
  182. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp +64 -0
  183. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp +183 -0
  184. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp +550 -0
  185. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp +73 -0
  186. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp +218 -0
  187. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp +589 -0
  188. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp +72 -0
  189. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp +296 -0
  190. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp +190 -0
  191. package/deps/simdutf/src/lasx/lasx_find.cpp +64 -0
  192. package/deps/simdutf/src/lasx/lasx_validate_utf16.cpp +13 -0
  193. package/deps/simdutf/src/lasx/lasx_validate_utf32le.cpp +84 -0
  194. package/deps/simdutf/src/lsx/implementation.cpp +1417 -0
  195. package/deps/simdutf/src/lsx/lsx_base64.cpp +675 -0
  196. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp +39 -0
  197. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp +27 -0
  198. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp +56 -0
  199. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp +64 -0
  200. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp +133 -0
  201. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp +518 -0
  202. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp +66 -0
  203. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp +155 -0
  204. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp +459 -0
  205. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp +75 -0
  206. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp +291 -0
  207. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp +179 -0
  208. package/deps/simdutf/src/lsx/lsx_find.cpp +60 -0
  209. package/deps/simdutf/src/lsx/lsx_validate_utf16.cpp +13 -0
  210. package/deps/simdutf/src/lsx/lsx_validate_utf32le.cpp +68 -0
  211. package/deps/simdutf/src/ppc64/implementation.cpp +992 -0
  212. package/deps/simdutf/src/ppc64/ppc64_base64.cpp +480 -0
  213. package/deps/simdutf/src/ppc64/ppc64_base64_internal_tests.cpp +401 -0
  214. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf16.cpp +12 -0
  215. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf32.cpp +12 -0
  216. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf8.cpp +149 -0
  217. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_latin1.cpp +67 -0
  218. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_utf32.cpp +87 -0
  219. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_utf8.cpp +296 -0
  220. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_latin1.cpp +57 -0
  221. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_utf16.cpp +117 -0
  222. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_utf8.cpp +166 -0
  223. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_latin1.cpp +69 -0
  224. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_utf16.cpp +211 -0
  225. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_utf32.cpp +153 -0
  226. package/deps/simdutf/src/ppc64/ppc64_utf16_to_utf8_tables.h +1011 -0
  227. package/deps/simdutf/src/ppc64/ppc64_utf8_length_from_latin1.cpp +37 -0
  228. package/deps/simdutf/src/ppc64/ppc64_validate_utf16.cpp +19 -0
  229. package/deps/simdutf/src/ppc64/templates.cpp +91 -0
  230. package/deps/simdutf/src/rvv/implementation.cpp +138 -0
  231. package/deps/simdutf/src/rvv/rvv_find.cpp +27 -0
  232. package/deps/simdutf/src/rvv/rvv_helpers.inl.cpp +23 -0
  233. package/deps/simdutf/src/rvv/rvv_latin1_to.inl.cpp +71 -0
  234. package/deps/simdutf/src/rvv/rvv_length_from.inl.cpp +164 -0
  235. package/deps/simdutf/src/rvv/rvv_utf16_to.inl.cpp +399 -0
  236. package/deps/simdutf/src/rvv/rvv_utf16fix.cpp +110 -0
  237. package/deps/simdutf/src/rvv/rvv_utf32_to.inl.cpp +307 -0
  238. package/deps/simdutf/src/rvv/rvv_utf8_to.inl.cpp +435 -0
  239. package/deps/simdutf/src/rvv/rvv_validate.inl.cpp +275 -0
  240. package/deps/simdutf/src/simdutf/arm64/begin.h +2 -0
  241. package/deps/simdutf/src/simdutf/arm64/bitmanipulation.h +34 -0
  242. package/deps/simdutf/src/simdutf/arm64/end.h +2 -0
  243. package/deps/simdutf/src/simdutf/arm64/implementation.h +307 -0
  244. package/deps/simdutf/src/simdutf/arm64/intrinsics.h +10 -0
  245. package/deps/simdutf/src/simdutf/arm64/simd.h +547 -0
  246. package/deps/simdutf/src/simdutf/arm64/simd16-inl.h +403 -0
  247. package/deps/simdutf/src/simdutf/arm64/simd32-inl.h +129 -0
  248. package/deps/simdutf/src/simdutf/arm64/simd64-inl.h +28 -0
  249. package/deps/simdutf/src/simdutf/arm64.h +43 -0
  250. package/deps/simdutf/src/simdutf/fallback/begin.h +1 -0
  251. package/deps/simdutf/src/simdutf/fallback/bitmanipulation.h +13 -0
  252. package/deps/simdutf/src/simdutf/fallback/end.h +1 -0
  253. package/deps/simdutf/src/simdutf/fallback/implementation.h +331 -0
  254. package/deps/simdutf/src/simdutf/fallback.h +42 -0
  255. package/deps/simdutf/src/simdutf/haswell/begin.h +15 -0
  256. package/deps/simdutf/src/simdutf/haswell/bitmanipulation.h +35 -0
  257. package/deps/simdutf/src/simdutf/haswell/end.h +13 -0
  258. package/deps/simdutf/src/simdutf/haswell/implementation.h +338 -0
  259. package/deps/simdutf/src/simdutf/haswell/intrinsics.h +67 -0
  260. package/deps/simdutf/src/simdutf/haswell/simd.h +363 -0
  261. package/deps/simdutf/src/simdutf/haswell/simd16-inl.h +261 -0
  262. package/deps/simdutf/src/simdutf/haswell/simd32-inl.h +111 -0
  263. package/deps/simdutf/src/simdutf/haswell/simd64-inl.h +34 -0
  264. package/deps/simdutf/src/simdutf/haswell.h +63 -0
  265. package/deps/simdutf/src/simdutf/icelake/begin.h +14 -0
  266. package/deps/simdutf/src/simdutf/icelake/bitmanipulation.h +44 -0
  267. package/deps/simdutf/src/simdutf/icelake/end.h +12 -0
  268. package/deps/simdutf/src/simdutf/icelake/implementation.h +346 -0
  269. package/deps/simdutf/src/simdutf/icelake/intrinsics.h +138 -0
  270. package/deps/simdutf/src/simdutf/icelake/simd.h +17 -0
  271. package/deps/simdutf/src/simdutf/icelake/simd16-inl.h +90 -0
  272. package/deps/simdutf/src/simdutf/icelake/simd32-inl.h +47 -0
  273. package/deps/simdutf/src/simdutf/icelake.h +81 -0
  274. package/deps/simdutf/src/simdutf/lasx/begin.h +8 -0
  275. package/deps/simdutf/src/simdutf/lasx/bitmanipulation.h +25 -0
  276. package/deps/simdutf/src/simdutf/lasx/end.h +8 -0
  277. package/deps/simdutf/src/simdutf/lasx/implementation.h +310 -0
  278. package/deps/simdutf/src/simdutf/lasx/intrinsics.h +319 -0
  279. package/deps/simdutf/src/simdutf/lasx/simd.h +551 -0
  280. package/deps/simdutf/src/simdutf/lasx/simd16-inl.h +234 -0
  281. package/deps/simdutf/src/simdutf/lasx/simd32-inl.h +74 -0
  282. package/deps/simdutf/src/simdutf/lasx/simd64-inl.h +52 -0
  283. package/deps/simdutf/src/simdutf/lasx.h +49 -0
  284. package/deps/simdutf/src/simdutf/lsx/begin.h +2 -0
  285. package/deps/simdutf/src/simdutf/lsx/bitmanipulation.h +25 -0
  286. package/deps/simdutf/src/simdutf/lsx/end.h +2 -0
  287. package/deps/simdutf/src/simdutf/lsx/implementation.h +309 -0
  288. package/deps/simdutf/src/simdutf/lsx/intrinsics.h +196 -0
  289. package/deps/simdutf/src/simdutf/lsx/simd.h +421 -0
  290. package/deps/simdutf/src/simdutf/lsx/simd16-inl.h +242 -0
  291. package/deps/simdutf/src/simdutf/lsx/simd32-inl.h +69 -0
  292. package/deps/simdutf/src/simdutf/lsx/simd64-inl.h +50 -0
  293. package/deps/simdutf/src/simdutf/lsx.h +52 -0
  294. package/deps/simdutf/src/simdutf/ppc64/begin.h +1 -0
  295. package/deps/simdutf/src/simdutf/ppc64/bitmanipulation.h +29 -0
  296. package/deps/simdutf/src/simdutf/ppc64/end.h +1 -0
  297. package/deps/simdutf/src/simdutf/ppc64/implementation.h +348 -0
  298. package/deps/simdutf/src/simdutf/ppc64/intrinsics.h +19 -0
  299. package/deps/simdutf/src/simdutf/ppc64/simd.h +177 -0
  300. package/deps/simdutf/src/simdutf/ppc64/simd16-inl.h +327 -0
  301. package/deps/simdutf/src/simdutf/ppc64/simd32-inl.h +247 -0
  302. package/deps/simdutf/src/simdutf/ppc64/simd8-inl.h +618 -0
  303. package/deps/simdutf/src/simdutf/ppc64.h +40 -0
  304. package/deps/simdutf/src/simdutf/rvv/begin.h +7 -0
  305. package/deps/simdutf/src/simdutf/rvv/end.h +7 -0
  306. package/deps/simdutf/src/simdutf/rvv/implementation.h +321 -0
  307. package/deps/simdutf/src/simdutf/rvv/intrinsics.h +131 -0
  308. package/deps/simdutf/src/simdutf/rvv.h +41 -0
  309. package/deps/simdutf/src/simdutf/westmere/begin.h +8 -0
  310. package/deps/simdutf/src/simdutf/westmere/bitmanipulation.h +37 -0
  311. package/deps/simdutf/src/simdutf/westmere/end.h +8 -0
  312. package/deps/simdutf/src/simdutf/westmere/implementation.h +338 -0
  313. package/deps/simdutf/src/simdutf/westmere/intrinsics.h +38 -0
  314. package/deps/simdutf/src/simdutf/westmere/simd.h +379 -0
  315. package/deps/simdutf/src/simdutf/westmere/simd16-inl.h +242 -0
  316. package/deps/simdutf/src/simdutf/westmere/simd32-inl.h +151 -0
  317. package/deps/simdutf/src/simdutf/westmere/simd64-inl.h +33 -0
  318. package/deps/simdutf/src/simdutf/westmere.h +59 -0
  319. package/deps/simdutf/src/simdutf.cpp +152 -0
  320. package/deps/simdutf/src/simdutf_c.cpp +525 -0
  321. package/deps/simdutf/src/tables/utf16_to_utf8_tables.h +768 -0
  322. package/deps/simdutf/src/tables/utf32_to_utf16_tables.h +53 -0
  323. package/deps/simdutf/src/tables/utf8_to_utf16_tables.h +826 -0
  324. package/deps/simdutf/src/westmere/implementation.cpp +1479 -0
  325. package/deps/simdutf/src/westmere/internal/loader.cpp +7 -0
  326. package/deps/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp +66 -0
  327. package/deps/simdutf/src/westmere/sse_base64.cpp +672 -0
  328. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp +21 -0
  329. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp +31 -0
  330. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp +71 -0
  331. package/deps/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp +70 -0
  332. package/deps/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp +206 -0
  333. package/deps/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp +504 -0
  334. package/deps/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp +82 -0
  335. package/deps/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp +209 -0
  336. package/deps/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp +589 -0
  337. package/deps/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp +58 -0
  338. package/deps/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp +197 -0
  339. package/deps/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp +141 -0
  340. package/deps/simdutf/src/westmere/sse_utf16fix.cpp +82 -0
  341. package/deps/simdutf/src/westmere/sse_validate_utf16.cpp +17 -0
  342. package/lib/commonjs/argon2.js +51 -2
  343. package/lib/commonjs/argon2.js.map +1 -1
  344. package/lib/commonjs/cipher.js +109 -11
  345. package/lib/commonjs/cipher.js.map +1 -1
  346. package/lib/commonjs/dsa.js +8 -2
  347. package/lib/commonjs/dsa.js.map +1 -1
  348. package/lib/commonjs/hash.js +15 -5
  349. package/lib/commonjs/hash.js.map +1 -1
  350. package/lib/commonjs/hkdf.js +33 -6
  351. package/lib/commonjs/hkdf.js.map +1 -1
  352. package/lib/commonjs/hmac.js +15 -5
  353. package/lib/commonjs/hmac.js.map +1 -1
  354. package/lib/commonjs/keys/publicCipher.js +10 -4
  355. package/lib/commonjs/keys/publicCipher.js.map +1 -1
  356. package/lib/commonjs/random.js +11 -2
  357. package/lib/commonjs/random.js.map +1 -1
  358. package/lib/commonjs/rsa.js +12 -5
  359. package/lib/commonjs/rsa.js.map +1 -1
  360. package/lib/commonjs/scrypt.js +47 -6
  361. package/lib/commonjs/scrypt.js.map +1 -1
  362. package/lib/commonjs/subtle.js +76 -5
  363. package/lib/commonjs/subtle.js.map +1 -1
  364. package/lib/commonjs/utils/cipher.js +18 -7
  365. package/lib/commonjs/utils/cipher.js.map +1 -1
  366. package/lib/commonjs/utils/conversion.js +33 -9
  367. package/lib/commonjs/utils/conversion.js.map +1 -1
  368. package/lib/commonjs/utils/timingSafeEqual.js +7 -2
  369. package/lib/commonjs/utils/timingSafeEqual.js.map +1 -1
  370. package/lib/commonjs/x509certificate.js +6 -6
  371. package/lib/commonjs/x509certificate.js.map +1 -1
  372. package/lib/module/argon2.js +51 -2
  373. package/lib/module/argon2.js.map +1 -1
  374. package/lib/module/cipher.js +109 -11
  375. package/lib/module/cipher.js.map +1 -1
  376. package/lib/module/dsa.js +8 -2
  377. package/lib/module/dsa.js.map +1 -1
  378. package/lib/module/hash.js +15 -5
  379. package/lib/module/hash.js.map +1 -1
  380. package/lib/module/hkdf.js +33 -6
  381. package/lib/module/hkdf.js.map +1 -1
  382. package/lib/module/hmac.js +15 -5
  383. package/lib/module/hmac.js.map +1 -1
  384. package/lib/module/keys/publicCipher.js +10 -4
  385. package/lib/module/keys/publicCipher.js.map +1 -1
  386. package/lib/module/random.js +11 -2
  387. package/lib/module/random.js.map +1 -1
  388. package/lib/module/rsa.js +11 -4
  389. package/lib/module/rsa.js.map +1 -1
  390. package/lib/module/scrypt.js +47 -6
  391. package/lib/module/scrypt.js.map +1 -1
  392. package/lib/module/subtle.js +76 -5
  393. package/lib/module/subtle.js.map +1 -1
  394. package/lib/module/utils/cipher.js +18 -7
  395. package/lib/module/utils/cipher.js.map +1 -1
  396. package/lib/module/utils/conversion.js +33 -9
  397. package/lib/module/utils/conversion.js.map +1 -1
  398. package/lib/module/utils/timingSafeEqual.js +8 -3
  399. package/lib/module/utils/timingSafeEqual.js.map +1 -1
  400. package/lib/module/x509certificate.js +6 -6
  401. package/lib/module/x509certificate.js.map +1 -1
  402. package/lib/typescript/argon2.d.ts.map +1 -1
  403. package/lib/typescript/cipher.d.ts +2 -2
  404. package/lib/typescript/cipher.d.ts.map +1 -1
  405. package/lib/typescript/dsa.d.ts.map +1 -1
  406. package/lib/typescript/hash.d.ts +2 -2
  407. package/lib/typescript/hash.d.ts.map +1 -1
  408. package/lib/typescript/hkdf.d.ts.map +1 -1
  409. package/lib/typescript/hmac.d.ts +2 -2
  410. package/lib/typescript/hmac.d.ts.map +1 -1
  411. package/lib/typescript/index.d.ts +1 -1
  412. package/lib/typescript/index.d.ts.map +1 -1
  413. package/lib/typescript/keys/publicCipher.d.ts.map +1 -1
  414. package/lib/typescript/random.d.ts.map +1 -1
  415. package/lib/typescript/rsa.d.ts.map +1 -1
  416. package/lib/typescript/scrypt.d.ts.map +1 -1
  417. package/lib/typescript/specs/utils.nitro.d.ts +0 -2
  418. package/lib/typescript/specs/utils.nitro.d.ts.map +1 -1
  419. package/lib/typescript/subtle.d.ts.map +1 -1
  420. package/lib/typescript/utils/cipher.d.ts +13 -1
  421. package/lib/typescript/utils/cipher.d.ts.map +1 -1
  422. package/lib/typescript/utils/conversion.d.ts +9 -6
  423. package/lib/typescript/utils/conversion.d.ts.map +1 -1
  424. package/lib/typescript/utils/timingSafeEqual.d.ts.map +1 -1
  425. package/lib/typescript/x509certificate.d.ts.map +1 -1
  426. package/nitrogen/generated/shared/c++/HybridUtilsSpec.cpp +0 -2
  427. package/nitrogen/generated/shared/c++/HybridUtilsSpec.hpp +0 -3
  428. package/package.json +38 -6
  429. package/src/argon2.ts +80 -2
  430. package/src/cipher.ts +139 -15
  431. package/src/dsa.ts +11 -2
  432. package/src/hash.ts +17 -7
  433. package/src/hkdf.ts +44 -6
  434. package/src/hmac.ts +17 -7
  435. package/src/keys/publicCipher.ts +10 -4
  436. package/src/random.ts +11 -2
  437. package/src/rsa.ts +18 -4
  438. package/src/scrypt.ts +73 -6
  439. package/src/specs/utils.nitro.ts +0 -2
  440. package/src/subtle.ts +90 -8
  441. package/src/utils/cipher.ts +30 -8
  442. package/src/utils/conversion.ts +58 -20
  443. package/src/utils/timingSafeEqual.ts +8 -3
  444. package/src/x509certificate.ts +5 -6
  445. package/deps/blake3/.cargo/config.toml +0 -2
  446. package/deps/blake3/.git-blame-ignore-revs +0 -2
  447. package/deps/blake3/.github/workflows/build_b3sum.py +0 -38
  448. package/deps/blake3/.github/workflows/ci.yml +0 -491
  449. package/deps/blake3/.github/workflows/tag.yml +0 -43
  450. package/deps/blake3/.github/workflows/upload_github_release_asset.py +0 -73
  451. package/deps/blake3/CONTRIBUTING.md +0 -31
  452. package/deps/blake3/Cargo.toml +0 -135
  453. package/deps/blake3/b3sum/Cargo.lock +0 -513
  454. package/deps/blake3/b3sum/Cargo.toml +0 -26
  455. package/deps/blake3/b3sum/README.md +0 -72
  456. package/deps/blake3/b3sum/src/main.rs +0 -564
  457. package/deps/blake3/b3sum/src/unit_tests.rs +0 -235
  458. package/deps/blake3/b3sum/tests/cli_tests.rs +0 -680
  459. package/deps/blake3/b3sum/what_does_check_do.md +0 -176
  460. package/deps/blake3/benches/bench.rs +0 -623
  461. package/deps/blake3/build.rs +0 -389
  462. package/deps/blake3/c/CMakeLists.txt +0 -383
  463. package/deps/blake3/c/CMakePresets.json +0 -73
  464. package/deps/blake3/c/Makefile.testing +0 -82
  465. package/deps/blake3/c/blake3-config.cmake.in +0 -14
  466. package/deps/blake3/c/blake3_avx2.c +0 -326
  467. package/deps/blake3/c/blake3_avx2_x86-64_unix.S +0 -1815
  468. package/deps/blake3/c/blake3_avx2_x86-64_windows_gnu.S +0 -1817
  469. package/deps/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +0 -1828
  470. package/deps/blake3/c/blake3_avx512.c +0 -1388
  471. package/deps/blake3/c/blake3_avx512_x86-64_unix.S +0 -4824
  472. package/deps/blake3/c/blake3_avx512_x86-64_windows_gnu.S +0 -2615
  473. package/deps/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +0 -2634
  474. package/deps/blake3/c/blake3_c_rust_bindings/Cargo.toml +0 -32
  475. package/deps/blake3/c/blake3_c_rust_bindings/README.md +0 -4
  476. package/deps/blake3/c/blake3_c_rust_bindings/benches/bench.rs +0 -477
  477. package/deps/blake3/c/blake3_c_rust_bindings/build.rs +0 -253
  478. package/deps/blake3/c/blake3_c_rust_bindings/cross_test.sh +0 -31
  479. package/deps/blake3/c/blake3_c_rust_bindings/src/lib.rs +0 -333
  480. package/deps/blake3/c/blake3_c_rust_bindings/src/test.rs +0 -696
  481. package/deps/blake3/c/blake3_sse2.c +0 -566
  482. package/deps/blake3/c/blake3_sse2_x86-64_unix.S +0 -2291
  483. package/deps/blake3/c/blake3_sse2_x86-64_windows_gnu.S +0 -2332
  484. package/deps/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +0 -2350
  485. package/deps/blake3/c/blake3_sse41.c +0 -560
  486. package/deps/blake3/c/blake3_sse41_x86-64_unix.S +0 -2028
  487. package/deps/blake3/c/blake3_sse41_x86-64_windows_gnu.S +0 -2069
  488. package/deps/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +0 -2089
  489. package/deps/blake3/c/blake3_tbb.cpp +0 -37
  490. package/deps/blake3/c/dependencies/CMakeLists.txt +0 -3
  491. package/deps/blake3/c/dependencies/tbb/CMakeLists.txt +0 -28
  492. package/deps/blake3/c/example.c +0 -36
  493. package/deps/blake3/c/example_tbb.c +0 -57
  494. package/deps/blake3/c/libblake3.pc.in +0 -12
  495. package/deps/blake3/c/main.c +0 -166
  496. package/deps/blake3/c/test.py +0 -97
  497. package/deps/blake3/media/B3.svg +0 -70
  498. package/deps/blake3/media/BLAKE3.svg +0 -85
  499. package/deps/blake3/media/speed.svg +0 -1474
  500. package/deps/blake3/reference_impl/Cargo.toml +0 -8
  501. package/deps/blake3/reference_impl/README.md +0 -14
  502. package/deps/blake3/reference_impl/reference_impl.rs +0 -374
  503. package/deps/blake3/src/ffi_avx2.rs +0 -65
  504. package/deps/blake3/src/ffi_avx512.rs +0 -169
  505. package/deps/blake3/src/ffi_neon.rs +0 -82
  506. package/deps/blake3/src/ffi_sse2.rs +0 -126
  507. package/deps/blake3/src/ffi_sse41.rs +0 -126
  508. package/deps/blake3/src/guts.rs +0 -60
  509. package/deps/blake3/src/hazmat.rs +0 -704
  510. package/deps/blake3/src/io.rs +0 -64
  511. package/deps/blake3/src/join.rs +0 -92
  512. package/deps/blake3/src/lib.rs +0 -1835
  513. package/deps/blake3/src/platform.rs +0 -587
  514. package/deps/blake3/src/portable.rs +0 -198
  515. package/deps/blake3/src/rust_avx2.rs +0 -474
  516. package/deps/blake3/src/rust_sse2.rs +0 -775
  517. package/deps/blake3/src/rust_sse41.rs +0 -766
  518. package/deps/blake3/src/test.rs +0 -1049
  519. package/deps/blake3/src/traits.rs +0 -227
  520. package/deps/blake3/src/wasm32_simd.rs +0 -794
  521. package/deps/blake3/test_vectors/Cargo.toml +0 -19
  522. package/deps/blake3/test_vectors/cross_test.sh +0 -25
  523. package/deps/blake3/test_vectors/src/bin/generate.rs +0 -4
  524. package/deps/blake3/test_vectors/src/lib.rs +0 -350
  525. package/deps/blake3/test_vectors/test_vectors.json +0 -217
  526. package/deps/blake3/tools/compiler_version/Cargo.toml +0 -7
  527. package/deps/blake3/tools/compiler_version/build.rs +0 -6
  528. package/deps/blake3/tools/compiler_version/src/main.rs +0 -27
  529. package/deps/blake3/tools/instruction_set_support/Cargo.toml +0 -6
  530. package/deps/blake3/tools/instruction_set_support/src/main.rs +0 -10
  531. package/deps/blake3/tools/release.md +0 -16
  532. package/deps/ncrypto/.bazelignore +0 -4
  533. package/deps/ncrypto/.bazelrc +0 -1
  534. package/deps/ncrypto/.bazelversion +0 -1
  535. package/deps/ncrypto/.clang-format +0 -111
  536. package/deps/ncrypto/.github/workflows/bazel.yml +0 -58
  537. package/deps/ncrypto/.github/workflows/commitlint.yml +0 -16
  538. package/deps/ncrypto/.github/workflows/linter.yml +0 -38
  539. package/deps/ncrypto/.github/workflows/macos.yml +0 -43
  540. package/deps/ncrypto/.github/workflows/release-please.yml +0 -16
  541. package/deps/ncrypto/.github/workflows/ubuntu.yml +0 -128
  542. package/deps/ncrypto/.github/workflows/visual-studio.yml +0 -49
  543. package/deps/ncrypto/.python-version +0 -1
  544. package/deps/ncrypto/.release-please-manifest.json +0 -3
  545. package/deps/ncrypto/BUILD.bazel +0 -44
  546. package/deps/ncrypto/CHANGELOG.md +0 -37
  547. package/deps/ncrypto/CMakeLists.txt +0 -79
  548. package/deps/ncrypto/MODULE.bazel +0 -16
  549. package/deps/ncrypto/MODULE.bazel.lock +0 -461
  550. package/deps/ncrypto/cmake/CPM.cmake +0 -1225
  551. package/deps/ncrypto/cmake/ncrypto-flags.cmake +0 -17
  552. package/deps/ncrypto/ncrypto.pc.in +0 -10
  553. package/deps/ncrypto/patches/0001-Expose-libdecrepit-so-NodeJS-can-use-it-for-ncrypto.patch +0 -28
  554. package/deps/ncrypto/pyproject.toml +0 -38
  555. package/deps/ncrypto/release-please-config.json +0 -11
  556. package/deps/ncrypto/src/CMakeLists.txt +0 -40
  557. package/deps/ncrypto/tests/BUILD.bazel +0 -11
  558. package/deps/ncrypto/tests/CMakeLists.txt +0 -7
  559. package/deps/ncrypto/tests/basic.cpp +0 -856
  560. package/deps/ncrypto/tools/run-clang-format.sh +0 -42
  561. package/lib/tsconfig.tsbuildinfo +0 -1
@@ -1,1388 +0,0 @@
1
- #include "blake3_impl.h"
2
-
3
- #include <immintrin.h>
4
-
5
- #define _mm_shuffle_ps2(a, b, c) \
6
- (_mm_castps_si128( \
7
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
8
-
9
- INLINE __m128i loadu_128(const uint8_t src[16]) {
10
- return _mm_loadu_si128((void*)src);
11
- }
12
-
13
- INLINE __m256i loadu_256(const uint8_t src[32]) {
14
- return _mm256_loadu_si256((void*)src);
15
- }
16
-
17
- INLINE __m512i loadu_512(const uint8_t src[64]) {
18
- return _mm512_loadu_si512((void*)src);
19
- }
20
-
21
- INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
22
- _mm_storeu_si128((void*)dest, src);
23
- }
24
-
25
- INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
26
- _mm256_storeu_si256((void*)dest, src);
27
- }
28
-
29
- INLINE void storeu_512(__m512i src, uint8_t dest[16]) {
30
- _mm512_storeu_si512((void*)dest, src);
31
- }
32
-
33
- INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
34
-
35
- INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
36
-
37
- INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
38
-
39
- INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
40
-
41
- INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
42
-
43
- INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
44
-
45
- INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
46
-
47
- INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
48
-
49
- INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
50
-
51
- INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
52
- return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
53
- }
54
-
55
- INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
56
-
57
- INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
58
-
59
- INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
60
-
61
- INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
62
-
63
- INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
64
-
65
- INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
66
-
67
- INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
68
-
69
- INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
70
-
71
- INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
72
-
73
- INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
74
-
75
- INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
76
-
77
- INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
78
-
79
- /*
80
- * ----------------------------------------------------------------------------
81
- * compress_avx512
82
- * ----------------------------------------------------------------------------
83
- */
84
-
85
- INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
86
- __m128i m) {
87
- *row0 = add_128(add_128(*row0, m), *row1);
88
- *row3 = xor_128(*row3, *row0);
89
- *row3 = rot16_128(*row3);
90
- *row2 = add_128(*row2, *row3);
91
- *row1 = xor_128(*row1, *row2);
92
- *row1 = rot12_128(*row1);
93
- }
94
-
95
- INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
96
- __m128i m) {
97
- *row0 = add_128(add_128(*row0, m), *row1);
98
- *row3 = xor_128(*row3, *row0);
99
- *row3 = rot8_128(*row3);
100
- *row2 = add_128(*row2, *row3);
101
- *row1 = xor_128(*row1, *row2);
102
- *row1 = rot7_128(*row1);
103
- }
104
-
105
- // Note the optimization here of leaving row1 as the unrotated row, rather than
106
- // row0. All the message loads below are adjusted to compensate for this. See
107
- // discussion at https://github.com/sneves/blake2-avx2/pull/4
108
- INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
109
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
110
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
111
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
112
- }
113
-
114
- INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
115
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
116
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
117
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
118
- }
119
-
120
- INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
121
- const uint8_t block[BLAKE3_BLOCK_LEN],
122
- uint8_t block_len, uint64_t counter, uint8_t flags) {
123
- rows[0] = loadu_128((uint8_t *)&cv[0]);
124
- rows[1] = loadu_128((uint8_t *)&cv[4]);
125
- rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
126
- rows[3] = set4(counter_low(counter), counter_high(counter),
127
- (uint32_t)block_len, (uint32_t)flags);
128
-
129
- __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
130
- __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
131
- __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
132
- __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
133
-
134
- __m128i t0, t1, t2, t3, tt;
135
-
136
- // Round 1. The first round permutes the message words from the original
137
- // input order, into the groups that get mixed in parallel.
138
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
139
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
140
- t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
141
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
142
- diagonalize(&rows[0], &rows[2], &rows[3]);
143
- t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
144
- t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
145
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
146
- t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
147
- t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
148
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
149
- undiagonalize(&rows[0], &rows[2], &rows[3]);
150
- m0 = t0;
151
- m1 = t1;
152
- m2 = t2;
153
- m3 = t3;
154
-
155
- // Round 2. This round and all following rounds apply a fixed permutation
156
- // to the message words from the round before.
157
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
158
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
159
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
160
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
161
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
162
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
163
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
164
- diagonalize(&rows[0], &rows[2], &rows[3]);
165
- t2 = _mm_unpacklo_epi64(m3, m1);
166
- tt = _mm_blend_epi16(t2, m2, 0xC0);
167
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
168
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
169
- t3 = _mm_unpackhi_epi32(m1, m3);
170
- tt = _mm_unpacklo_epi32(m2, t3);
171
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
172
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
173
- undiagonalize(&rows[0], &rows[2], &rows[3]);
174
- m0 = t0;
175
- m1 = t1;
176
- m2 = t2;
177
- m3 = t3;
178
-
179
- // Round 3
180
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
181
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
182
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
183
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
184
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
185
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
186
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
187
- diagonalize(&rows[0], &rows[2], &rows[3]);
188
- t2 = _mm_unpacklo_epi64(m3, m1);
189
- tt = _mm_blend_epi16(t2, m2, 0xC0);
190
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
191
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
192
- t3 = _mm_unpackhi_epi32(m1, m3);
193
- tt = _mm_unpacklo_epi32(m2, t3);
194
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
195
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
196
- undiagonalize(&rows[0], &rows[2], &rows[3]);
197
- m0 = t0;
198
- m1 = t1;
199
- m2 = t2;
200
- m3 = t3;
201
-
202
- // Round 4
203
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
204
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
205
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
206
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
207
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
208
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
209
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
210
- diagonalize(&rows[0], &rows[2], &rows[3]);
211
- t2 = _mm_unpacklo_epi64(m3, m1);
212
- tt = _mm_blend_epi16(t2, m2, 0xC0);
213
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
214
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
215
- t3 = _mm_unpackhi_epi32(m1, m3);
216
- tt = _mm_unpacklo_epi32(m2, t3);
217
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
218
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
219
- undiagonalize(&rows[0], &rows[2], &rows[3]);
220
- m0 = t0;
221
- m1 = t1;
222
- m2 = t2;
223
- m3 = t3;
224
-
225
- // Round 5
226
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
227
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
228
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
229
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
230
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
231
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
232
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
233
- diagonalize(&rows[0], &rows[2], &rows[3]);
234
- t2 = _mm_unpacklo_epi64(m3, m1);
235
- tt = _mm_blend_epi16(t2, m2, 0xC0);
236
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
237
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
238
- t3 = _mm_unpackhi_epi32(m1, m3);
239
- tt = _mm_unpacklo_epi32(m2, t3);
240
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
241
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
242
- undiagonalize(&rows[0], &rows[2], &rows[3]);
243
- m0 = t0;
244
- m1 = t1;
245
- m2 = t2;
246
- m3 = t3;
247
-
248
- // Round 6
249
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
250
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
251
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
252
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
253
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
254
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
255
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
256
- diagonalize(&rows[0], &rows[2], &rows[3]);
257
- t2 = _mm_unpacklo_epi64(m3, m1);
258
- tt = _mm_blend_epi16(t2, m2, 0xC0);
259
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
260
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
261
- t3 = _mm_unpackhi_epi32(m1, m3);
262
- tt = _mm_unpacklo_epi32(m2, t3);
263
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
264
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
265
- undiagonalize(&rows[0], &rows[2], &rows[3]);
266
- m0 = t0;
267
- m1 = t1;
268
- m2 = t2;
269
- m3 = t3;
270
-
271
- // Round 7
272
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
273
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
274
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
275
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
276
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
277
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
278
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
279
- diagonalize(&rows[0], &rows[2], &rows[3]);
280
- t2 = _mm_unpacklo_epi64(m3, m1);
281
- tt = _mm_blend_epi16(t2, m2, 0xC0);
282
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
283
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
284
- t3 = _mm_unpackhi_epi32(m1, m3);
285
- tt = _mm_unpacklo_epi32(m2, t3);
286
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
287
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
288
- undiagonalize(&rows[0], &rows[2], &rows[3]);
289
- }
290
-
291
- void blake3_compress_xof_avx512(const uint32_t cv[8],
292
- const uint8_t block[BLAKE3_BLOCK_LEN],
293
- uint8_t block_len, uint64_t counter,
294
- uint8_t flags, uint8_t out[64]) {
295
- __m128i rows[4];
296
- compress_pre(rows, cv, block, block_len, counter, flags);
297
- storeu_128(xor_128(rows[0], rows[2]), &out[0]);
298
- storeu_128(xor_128(rows[1], rows[3]), &out[16]);
299
- storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
300
- storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
301
- }
302
-
303
- void blake3_compress_in_place_avx512(uint32_t cv[8],
304
- const uint8_t block[BLAKE3_BLOCK_LEN],
305
- uint8_t block_len, uint64_t counter,
306
- uint8_t flags) {
307
- __m128i rows[4];
308
- compress_pre(rows, cv, block, block_len, counter, flags);
309
- storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
310
- storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
311
- }
312
-
313
- /*
314
- * ----------------------------------------------------------------------------
315
- * hash4_avx512
316
- * ----------------------------------------------------------------------------
317
- */
318
-
319
- INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
320
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
321
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
322
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
323
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
324
- v[0] = add_128(v[0], v[4]);
325
- v[1] = add_128(v[1], v[5]);
326
- v[2] = add_128(v[2], v[6]);
327
- v[3] = add_128(v[3], v[7]);
328
- v[12] = xor_128(v[12], v[0]);
329
- v[13] = xor_128(v[13], v[1]);
330
- v[14] = xor_128(v[14], v[2]);
331
- v[15] = xor_128(v[15], v[3]);
332
- v[12] = rot16_128(v[12]);
333
- v[13] = rot16_128(v[13]);
334
- v[14] = rot16_128(v[14]);
335
- v[15] = rot16_128(v[15]);
336
- v[8] = add_128(v[8], v[12]);
337
- v[9] = add_128(v[9], v[13]);
338
- v[10] = add_128(v[10], v[14]);
339
- v[11] = add_128(v[11], v[15]);
340
- v[4] = xor_128(v[4], v[8]);
341
- v[5] = xor_128(v[5], v[9]);
342
- v[6] = xor_128(v[6], v[10]);
343
- v[7] = xor_128(v[7], v[11]);
344
- v[4] = rot12_128(v[4]);
345
- v[5] = rot12_128(v[5]);
346
- v[6] = rot12_128(v[6]);
347
- v[7] = rot12_128(v[7]);
348
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
349
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
350
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
351
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
352
- v[0] = add_128(v[0], v[4]);
353
- v[1] = add_128(v[1], v[5]);
354
- v[2] = add_128(v[2], v[6]);
355
- v[3] = add_128(v[3], v[7]);
356
- v[12] = xor_128(v[12], v[0]);
357
- v[13] = xor_128(v[13], v[1]);
358
- v[14] = xor_128(v[14], v[2]);
359
- v[15] = xor_128(v[15], v[3]);
360
- v[12] = rot8_128(v[12]);
361
- v[13] = rot8_128(v[13]);
362
- v[14] = rot8_128(v[14]);
363
- v[15] = rot8_128(v[15]);
364
- v[8] = add_128(v[8], v[12]);
365
- v[9] = add_128(v[9], v[13]);
366
- v[10] = add_128(v[10], v[14]);
367
- v[11] = add_128(v[11], v[15]);
368
- v[4] = xor_128(v[4], v[8]);
369
- v[5] = xor_128(v[5], v[9]);
370
- v[6] = xor_128(v[6], v[10]);
371
- v[7] = xor_128(v[7], v[11]);
372
- v[4] = rot7_128(v[4]);
373
- v[5] = rot7_128(v[5]);
374
- v[6] = rot7_128(v[6]);
375
- v[7] = rot7_128(v[7]);
376
-
377
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
378
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
379
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
380
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
381
- v[0] = add_128(v[0], v[5]);
382
- v[1] = add_128(v[1], v[6]);
383
- v[2] = add_128(v[2], v[7]);
384
- v[3] = add_128(v[3], v[4]);
385
- v[15] = xor_128(v[15], v[0]);
386
- v[12] = xor_128(v[12], v[1]);
387
- v[13] = xor_128(v[13], v[2]);
388
- v[14] = xor_128(v[14], v[3]);
389
- v[15] = rot16_128(v[15]);
390
- v[12] = rot16_128(v[12]);
391
- v[13] = rot16_128(v[13]);
392
- v[14] = rot16_128(v[14]);
393
- v[10] = add_128(v[10], v[15]);
394
- v[11] = add_128(v[11], v[12]);
395
- v[8] = add_128(v[8], v[13]);
396
- v[9] = add_128(v[9], v[14]);
397
- v[5] = xor_128(v[5], v[10]);
398
- v[6] = xor_128(v[6], v[11]);
399
- v[7] = xor_128(v[7], v[8]);
400
- v[4] = xor_128(v[4], v[9]);
401
- v[5] = rot12_128(v[5]);
402
- v[6] = rot12_128(v[6]);
403
- v[7] = rot12_128(v[7]);
404
- v[4] = rot12_128(v[4]);
405
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
406
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
407
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
408
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
409
- v[0] = add_128(v[0], v[5]);
410
- v[1] = add_128(v[1], v[6]);
411
- v[2] = add_128(v[2], v[7]);
412
- v[3] = add_128(v[3], v[4]);
413
- v[15] = xor_128(v[15], v[0]);
414
- v[12] = xor_128(v[12], v[1]);
415
- v[13] = xor_128(v[13], v[2]);
416
- v[14] = xor_128(v[14], v[3]);
417
- v[15] = rot8_128(v[15]);
418
- v[12] = rot8_128(v[12]);
419
- v[13] = rot8_128(v[13]);
420
- v[14] = rot8_128(v[14]);
421
- v[10] = add_128(v[10], v[15]);
422
- v[11] = add_128(v[11], v[12]);
423
- v[8] = add_128(v[8], v[13]);
424
- v[9] = add_128(v[9], v[14]);
425
- v[5] = xor_128(v[5], v[10]);
426
- v[6] = xor_128(v[6], v[11]);
427
- v[7] = xor_128(v[7], v[8]);
428
- v[4] = xor_128(v[4], v[9]);
429
- v[5] = rot7_128(v[5]);
430
- v[6] = rot7_128(v[6]);
431
- v[7] = rot7_128(v[7]);
432
- v[4] = rot7_128(v[4]);
433
- }
434
-
435
- INLINE void transpose_vecs_128(__m128i vecs[4]) {
436
- // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
437
- // 22/33. Note that this doesn't split the vector into two lanes, as the
438
- // AVX2 counterparts do.
439
- __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
440
- __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
441
- __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
442
- __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
443
-
444
- // Interleave 64-bit lanes.
445
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
446
- __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
447
- __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
448
- __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
449
-
450
- vecs[0] = abcd_0;
451
- vecs[1] = abcd_1;
452
- vecs[2] = abcd_2;
453
- vecs[3] = abcd_3;
454
- }
455
-
456
- INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
457
- size_t block_offset, __m128i out[16]) {
458
- out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
459
- out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
460
- out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
461
- out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
462
- out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
463
- out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
464
- out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
465
- out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
466
- out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
467
- out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
468
- out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
469
- out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
470
- out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
471
- out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
472
- out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
473
- out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
474
- for (size_t i = 0; i < 4; ++i) {
475
- _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
476
- }
477
- transpose_vecs_128(&out[0]);
478
- transpose_vecs_128(&out[4]);
479
- transpose_vecs_128(&out[8]);
480
- transpose_vecs_128(&out[12]);
481
- }
482
-
483
- INLINE void load_counters4(uint64_t counter, bool increment_counter,
484
- __m128i *out_lo, __m128i *out_hi) {
485
- uint64_t mask = (increment_counter ? ~0 : 0);
486
- __m256i mask_vec = _mm256_set1_epi64x(mask);
487
- __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
488
- deltas = _mm256_and_si256(mask_vec, deltas);
489
- __m256i counters =
490
- _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
491
- *out_lo = _mm256_cvtepi64_epi32(counters);
492
- *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
493
- }
494
-
495
- static
496
- void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
497
- const uint32_t key[8], uint64_t counter,
498
- bool increment_counter, uint8_t flags,
499
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
500
- __m128i h_vecs[8] = {
501
- set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
502
- set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
503
- };
504
- __m128i counter_low_vec, counter_high_vec;
505
- load_counters4(counter, increment_counter, &counter_low_vec,
506
- &counter_high_vec);
507
- uint8_t block_flags = flags | flags_start;
508
-
509
- for (size_t block = 0; block < blocks; block++) {
510
- if (block + 1 == blocks) {
511
- block_flags |= flags_end;
512
- }
513
- __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
514
- __m128i block_flags_vec = set1_128(block_flags);
515
- __m128i msg_vecs[16];
516
- transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
517
-
518
- __m128i v[16] = {
519
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
520
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
521
- set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
522
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
523
- };
524
- round_fn4(v, msg_vecs, 0);
525
- round_fn4(v, msg_vecs, 1);
526
- round_fn4(v, msg_vecs, 2);
527
- round_fn4(v, msg_vecs, 3);
528
- round_fn4(v, msg_vecs, 4);
529
- round_fn4(v, msg_vecs, 5);
530
- round_fn4(v, msg_vecs, 6);
531
- h_vecs[0] = xor_128(v[0], v[8]);
532
- h_vecs[1] = xor_128(v[1], v[9]);
533
- h_vecs[2] = xor_128(v[2], v[10]);
534
- h_vecs[3] = xor_128(v[3], v[11]);
535
- h_vecs[4] = xor_128(v[4], v[12]);
536
- h_vecs[5] = xor_128(v[5], v[13]);
537
- h_vecs[6] = xor_128(v[6], v[14]);
538
- h_vecs[7] = xor_128(v[7], v[15]);
539
-
540
- block_flags = flags;
541
- }
542
-
543
- transpose_vecs_128(&h_vecs[0]);
544
- transpose_vecs_128(&h_vecs[4]);
545
- // The first four vecs now contain the first half of each output, and the
546
- // second four vecs contain the second half of each output.
547
- storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
548
- storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
549
- storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
550
- storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
551
- storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
552
- storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
553
- storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
554
- storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
555
- }
556
-
557
- static
558
- void blake3_xof4_avx512(const uint32_t cv[8],
559
- const uint8_t block[BLAKE3_BLOCK_LEN],
560
- uint8_t block_len, uint64_t counter, uint8_t flags,
561
- uint8_t out[4 * 64]) {
562
- __m128i h_vecs[8] = {
563
- set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
564
- set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
565
- };
566
- uint32_t block_words[16];
567
- load_block_words(block, block_words);
568
- __m128i msg_vecs[16];
569
- for (size_t i = 0; i < 16; i++) {
570
- msg_vecs[i] = set1_128(block_words[i]);
571
- }
572
- __m128i counter_low_vec, counter_high_vec;
573
- load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
574
- __m128i block_len_vec = set1_128(block_len);
575
- __m128i block_flags_vec = set1_128(flags);
576
- __m128i v[16] = {
577
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
578
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
579
- set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
580
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
581
- };
582
- round_fn4(v, msg_vecs, 0);
583
- round_fn4(v, msg_vecs, 1);
584
- round_fn4(v, msg_vecs, 2);
585
- round_fn4(v, msg_vecs, 3);
586
- round_fn4(v, msg_vecs, 4);
587
- round_fn4(v, msg_vecs, 5);
588
- round_fn4(v, msg_vecs, 6);
589
- for (size_t i = 0; i < 8; i++) {
590
- v[i] = xor_128(v[i], v[i+8]);
591
- v[i+8] = xor_128(v[i+8], h_vecs[i]);
592
- }
593
- transpose_vecs_128(&v[0]);
594
- transpose_vecs_128(&v[4]);
595
- transpose_vecs_128(&v[8]);
596
- transpose_vecs_128(&v[12]);
597
- for (size_t i = 0; i < 4; i++) {
598
- storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
599
- storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
600
- storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
601
- storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
602
- }
603
- }
604
-
605
- /*
606
- * ----------------------------------------------------------------------------
607
- * hash8_avx512
608
- * ----------------------------------------------------------------------------
609
- */
610
-
611
- INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
612
- v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
613
- v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
614
- v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
615
- v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
616
- v[0] = add_256(v[0], v[4]);
617
- v[1] = add_256(v[1], v[5]);
618
- v[2] = add_256(v[2], v[6]);
619
- v[3] = add_256(v[3], v[7]);
620
- v[12] = xor_256(v[12], v[0]);
621
- v[13] = xor_256(v[13], v[1]);
622
- v[14] = xor_256(v[14], v[2]);
623
- v[15] = xor_256(v[15], v[3]);
624
- v[12] = rot16_256(v[12]);
625
- v[13] = rot16_256(v[13]);
626
- v[14] = rot16_256(v[14]);
627
- v[15] = rot16_256(v[15]);
628
- v[8] = add_256(v[8], v[12]);
629
- v[9] = add_256(v[9], v[13]);
630
- v[10] = add_256(v[10], v[14]);
631
- v[11] = add_256(v[11], v[15]);
632
- v[4] = xor_256(v[4], v[8]);
633
- v[5] = xor_256(v[5], v[9]);
634
- v[6] = xor_256(v[6], v[10]);
635
- v[7] = xor_256(v[7], v[11]);
636
- v[4] = rot12_256(v[4]);
637
- v[5] = rot12_256(v[5]);
638
- v[6] = rot12_256(v[6]);
639
- v[7] = rot12_256(v[7]);
640
- v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
641
- v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
642
- v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
643
- v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
644
- v[0] = add_256(v[0], v[4]);
645
- v[1] = add_256(v[1], v[5]);
646
- v[2] = add_256(v[2], v[6]);
647
- v[3] = add_256(v[3], v[7]);
648
- v[12] = xor_256(v[12], v[0]);
649
- v[13] = xor_256(v[13], v[1]);
650
- v[14] = xor_256(v[14], v[2]);
651
- v[15] = xor_256(v[15], v[3]);
652
- v[12] = rot8_256(v[12]);
653
- v[13] = rot8_256(v[13]);
654
- v[14] = rot8_256(v[14]);
655
- v[15] = rot8_256(v[15]);
656
- v[8] = add_256(v[8], v[12]);
657
- v[9] = add_256(v[9], v[13]);
658
- v[10] = add_256(v[10], v[14]);
659
- v[11] = add_256(v[11], v[15]);
660
- v[4] = xor_256(v[4], v[8]);
661
- v[5] = xor_256(v[5], v[9]);
662
- v[6] = xor_256(v[6], v[10]);
663
- v[7] = xor_256(v[7], v[11]);
664
- v[4] = rot7_256(v[4]);
665
- v[5] = rot7_256(v[5]);
666
- v[6] = rot7_256(v[6]);
667
- v[7] = rot7_256(v[7]);
668
-
669
- v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
670
- v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
671
- v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
672
- v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
673
- v[0] = add_256(v[0], v[5]);
674
- v[1] = add_256(v[1], v[6]);
675
- v[2] = add_256(v[2], v[7]);
676
- v[3] = add_256(v[3], v[4]);
677
- v[15] = xor_256(v[15], v[0]);
678
- v[12] = xor_256(v[12], v[1]);
679
- v[13] = xor_256(v[13], v[2]);
680
- v[14] = xor_256(v[14], v[3]);
681
- v[15] = rot16_256(v[15]);
682
- v[12] = rot16_256(v[12]);
683
- v[13] = rot16_256(v[13]);
684
- v[14] = rot16_256(v[14]);
685
- v[10] = add_256(v[10], v[15]);
686
- v[11] = add_256(v[11], v[12]);
687
- v[8] = add_256(v[8], v[13]);
688
- v[9] = add_256(v[9], v[14]);
689
- v[5] = xor_256(v[5], v[10]);
690
- v[6] = xor_256(v[6], v[11]);
691
- v[7] = xor_256(v[7], v[8]);
692
- v[4] = xor_256(v[4], v[9]);
693
- v[5] = rot12_256(v[5]);
694
- v[6] = rot12_256(v[6]);
695
- v[7] = rot12_256(v[7]);
696
- v[4] = rot12_256(v[4]);
697
- v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
698
- v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
699
- v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
700
- v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
701
- v[0] = add_256(v[0], v[5]);
702
- v[1] = add_256(v[1], v[6]);
703
- v[2] = add_256(v[2], v[7]);
704
- v[3] = add_256(v[3], v[4]);
705
- v[15] = xor_256(v[15], v[0]);
706
- v[12] = xor_256(v[12], v[1]);
707
- v[13] = xor_256(v[13], v[2]);
708
- v[14] = xor_256(v[14], v[3]);
709
- v[15] = rot8_256(v[15]);
710
- v[12] = rot8_256(v[12]);
711
- v[13] = rot8_256(v[13]);
712
- v[14] = rot8_256(v[14]);
713
- v[10] = add_256(v[10], v[15]);
714
- v[11] = add_256(v[11], v[12]);
715
- v[8] = add_256(v[8], v[13]);
716
- v[9] = add_256(v[9], v[14]);
717
- v[5] = xor_256(v[5], v[10]);
718
- v[6] = xor_256(v[6], v[11]);
719
- v[7] = xor_256(v[7], v[8]);
720
- v[4] = xor_256(v[4], v[9]);
721
- v[5] = rot7_256(v[5]);
722
- v[6] = rot7_256(v[6]);
723
- v[7] = rot7_256(v[7]);
724
- v[4] = rot7_256(v[4]);
725
- }
726
-
727
- INLINE void transpose_vecs_256(__m256i vecs[8]) {
728
- // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
729
- // is 22/33/66/77.
730
- __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
731
- __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
732
- __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
733
- __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
734
- __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
735
- __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
736
- __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
737
- __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
738
-
739
- // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
740
- // 11/33.
741
- __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
742
- __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
743
- __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
744
- __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
745
- __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
746
- __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
747
- __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
748
- __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
749
-
750
- // Interleave 128-bit lanes.
751
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
752
- vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
753
- vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
754
- vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
755
- vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
756
- vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
757
- vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
758
- vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
759
- }
760
-
761
- INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
762
- size_t block_offset, __m256i out[16]) {
763
- out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
764
- out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
765
- out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
766
- out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
767
- out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
768
- out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
769
- out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
770
- out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
771
- out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
772
- out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
773
- out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
774
- out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
775
- out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
776
- out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
777
- out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
778
- out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
779
- for (size_t i = 0; i < 8; ++i) {
780
- _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
781
- }
782
- transpose_vecs_256(&out[0]);
783
- transpose_vecs_256(&out[8]);
784
- }
785
-
786
- INLINE void load_counters8(uint64_t counter, bool increment_counter,
787
- __m256i *out_lo, __m256i *out_hi) {
788
- uint64_t mask = (increment_counter ? ~0 : 0);
789
- __m512i mask_vec = _mm512_set1_epi64(mask);
790
- __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
791
- deltas = _mm512_and_si512(mask_vec, deltas);
792
- __m512i counters =
793
- _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
794
- *out_lo = _mm512_cvtepi64_epi32(counters);
795
- *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
796
- }
797
-
798
- static
799
- void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
800
- const uint32_t key[8], uint64_t counter,
801
- bool increment_counter, uint8_t flags,
802
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
803
- __m256i h_vecs[8] = {
804
- set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
805
- set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
806
- };
807
- __m256i counter_low_vec, counter_high_vec;
808
- load_counters8(counter, increment_counter, &counter_low_vec,
809
- &counter_high_vec);
810
- uint8_t block_flags = flags | flags_start;
811
-
812
- for (size_t block = 0; block < blocks; block++) {
813
- if (block + 1 == blocks) {
814
- block_flags |= flags_end;
815
- }
816
- __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
817
- __m256i block_flags_vec = set1_256(block_flags);
818
- __m256i msg_vecs[16];
819
- transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
820
-
821
- __m256i v[16] = {
822
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
823
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
824
- set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
825
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
826
- };
827
- round_fn8(v, msg_vecs, 0);
828
- round_fn8(v, msg_vecs, 1);
829
- round_fn8(v, msg_vecs, 2);
830
- round_fn8(v, msg_vecs, 3);
831
- round_fn8(v, msg_vecs, 4);
832
- round_fn8(v, msg_vecs, 5);
833
- round_fn8(v, msg_vecs, 6);
834
- h_vecs[0] = xor_256(v[0], v[8]);
835
- h_vecs[1] = xor_256(v[1], v[9]);
836
- h_vecs[2] = xor_256(v[2], v[10]);
837
- h_vecs[3] = xor_256(v[3], v[11]);
838
- h_vecs[4] = xor_256(v[4], v[12]);
839
- h_vecs[5] = xor_256(v[5], v[13]);
840
- h_vecs[6] = xor_256(v[6], v[14]);
841
- h_vecs[7] = xor_256(v[7], v[15]);
842
-
843
- block_flags = flags;
844
- }
845
-
846
- transpose_vecs_256(h_vecs);
847
- storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
848
- storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
849
- storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
850
- storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
851
- storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
852
- storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
853
- storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
854
- storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
855
- }
856
-
857
- static
858
- void blake3_xof8_avx512(const uint32_t cv[8],
859
- const uint8_t block[BLAKE3_BLOCK_LEN],
860
- uint8_t block_len, uint64_t counter, uint8_t flags,
861
- uint8_t out[8 * 64]) {
862
- __m256i h_vecs[8] = {
863
- set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
864
- set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
865
- };
866
- uint32_t block_words[16];
867
- load_block_words(block, block_words);
868
- __m256i msg_vecs[16];
869
- for (size_t i = 0; i < 16; i++) {
870
- msg_vecs[i] = set1_256(block_words[i]);
871
- }
872
- __m256i counter_low_vec, counter_high_vec;
873
- load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
874
- __m256i block_len_vec = set1_256(block_len);
875
- __m256i block_flags_vec = set1_256(flags);
876
- __m256i v[16] = {
877
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
878
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
879
- set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
880
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
881
- };
882
- round_fn8(v, msg_vecs, 0);
883
- round_fn8(v, msg_vecs, 1);
884
- round_fn8(v, msg_vecs, 2);
885
- round_fn8(v, msg_vecs, 3);
886
- round_fn8(v, msg_vecs, 4);
887
- round_fn8(v, msg_vecs, 5);
888
- round_fn8(v, msg_vecs, 6);
889
- for (size_t i = 0; i < 8; i++) {
890
- v[i] = xor_256(v[i], v[i+8]);
891
- v[i+8] = xor_256(v[i+8], h_vecs[i]);
892
- }
893
- transpose_vecs_256(&v[0]);
894
- transpose_vecs_256(&v[8]);
895
- for (size_t i = 0; i < 8; i++) {
896
- storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
897
- storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
898
- }
899
- }
900
-
901
- /*
902
- * ----------------------------------------------------------------------------
903
- * hash16_avx512
904
- * ----------------------------------------------------------------------------
905
- */
906
-
907
- INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
908
- v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
909
- v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
910
- v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
911
- v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
912
- v[0] = add_512(v[0], v[4]);
913
- v[1] = add_512(v[1], v[5]);
914
- v[2] = add_512(v[2], v[6]);
915
- v[3] = add_512(v[3], v[7]);
916
- v[12] = xor_512(v[12], v[0]);
917
- v[13] = xor_512(v[13], v[1]);
918
- v[14] = xor_512(v[14], v[2]);
919
- v[15] = xor_512(v[15], v[3]);
920
- v[12] = rot16_512(v[12]);
921
- v[13] = rot16_512(v[13]);
922
- v[14] = rot16_512(v[14]);
923
- v[15] = rot16_512(v[15]);
924
- v[8] = add_512(v[8], v[12]);
925
- v[9] = add_512(v[9], v[13]);
926
- v[10] = add_512(v[10], v[14]);
927
- v[11] = add_512(v[11], v[15]);
928
- v[4] = xor_512(v[4], v[8]);
929
- v[5] = xor_512(v[5], v[9]);
930
- v[6] = xor_512(v[6], v[10]);
931
- v[7] = xor_512(v[7], v[11]);
932
- v[4] = rot12_512(v[4]);
933
- v[5] = rot12_512(v[5]);
934
- v[6] = rot12_512(v[6]);
935
- v[7] = rot12_512(v[7]);
936
- v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
937
- v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
938
- v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
939
- v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
940
- v[0] = add_512(v[0], v[4]);
941
- v[1] = add_512(v[1], v[5]);
942
- v[2] = add_512(v[2], v[6]);
943
- v[3] = add_512(v[3], v[7]);
944
- v[12] = xor_512(v[12], v[0]);
945
- v[13] = xor_512(v[13], v[1]);
946
- v[14] = xor_512(v[14], v[2]);
947
- v[15] = xor_512(v[15], v[3]);
948
- v[12] = rot8_512(v[12]);
949
- v[13] = rot8_512(v[13]);
950
- v[14] = rot8_512(v[14]);
951
- v[15] = rot8_512(v[15]);
952
- v[8] = add_512(v[8], v[12]);
953
- v[9] = add_512(v[9], v[13]);
954
- v[10] = add_512(v[10], v[14]);
955
- v[11] = add_512(v[11], v[15]);
956
- v[4] = xor_512(v[4], v[8]);
957
- v[5] = xor_512(v[5], v[9]);
958
- v[6] = xor_512(v[6], v[10]);
959
- v[7] = xor_512(v[7], v[11]);
960
- v[4] = rot7_512(v[4]);
961
- v[5] = rot7_512(v[5]);
962
- v[6] = rot7_512(v[6]);
963
- v[7] = rot7_512(v[7]);
964
-
965
- v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
966
- v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
967
- v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
968
- v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
969
- v[0] = add_512(v[0], v[5]);
970
- v[1] = add_512(v[1], v[6]);
971
- v[2] = add_512(v[2], v[7]);
972
- v[3] = add_512(v[3], v[4]);
973
- v[15] = xor_512(v[15], v[0]);
974
- v[12] = xor_512(v[12], v[1]);
975
- v[13] = xor_512(v[13], v[2]);
976
- v[14] = xor_512(v[14], v[3]);
977
- v[15] = rot16_512(v[15]);
978
- v[12] = rot16_512(v[12]);
979
- v[13] = rot16_512(v[13]);
980
- v[14] = rot16_512(v[14]);
981
- v[10] = add_512(v[10], v[15]);
982
- v[11] = add_512(v[11], v[12]);
983
- v[8] = add_512(v[8], v[13]);
984
- v[9] = add_512(v[9], v[14]);
985
- v[5] = xor_512(v[5], v[10]);
986
- v[6] = xor_512(v[6], v[11]);
987
- v[7] = xor_512(v[7], v[8]);
988
- v[4] = xor_512(v[4], v[9]);
989
- v[5] = rot12_512(v[5]);
990
- v[6] = rot12_512(v[6]);
991
- v[7] = rot12_512(v[7]);
992
- v[4] = rot12_512(v[4]);
993
- v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
994
- v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
995
- v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
996
- v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
997
- v[0] = add_512(v[0], v[5]);
998
- v[1] = add_512(v[1], v[6]);
999
- v[2] = add_512(v[2], v[7]);
1000
- v[3] = add_512(v[3], v[4]);
1001
- v[15] = xor_512(v[15], v[0]);
1002
- v[12] = xor_512(v[12], v[1]);
1003
- v[13] = xor_512(v[13], v[2]);
1004
- v[14] = xor_512(v[14], v[3]);
1005
- v[15] = rot8_512(v[15]);
1006
- v[12] = rot8_512(v[12]);
1007
- v[13] = rot8_512(v[13]);
1008
- v[14] = rot8_512(v[14]);
1009
- v[10] = add_512(v[10], v[15]);
1010
- v[11] = add_512(v[11], v[12]);
1011
- v[8] = add_512(v[8], v[13]);
1012
- v[9] = add_512(v[9], v[14]);
1013
- v[5] = xor_512(v[5], v[10]);
1014
- v[6] = xor_512(v[6], v[11]);
1015
- v[7] = xor_512(v[7], v[8]);
1016
- v[4] = xor_512(v[4], v[9]);
1017
- v[5] = rot7_512(v[5]);
1018
- v[6] = rot7_512(v[6]);
1019
- v[7] = rot7_512(v[7]);
1020
- v[4] = rot7_512(v[4]);
1021
- }
1022
-
1023
- // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
1024
- #define LO_IMM8 0x88
1025
-
1026
- INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
1027
- return _mm512_shuffle_i32x4(a, b, LO_IMM8);
1028
- }
1029
-
1030
- // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
1031
- #define HI_IMM8 0xdd
1032
-
1033
- INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
1034
- return _mm512_shuffle_i32x4(a, b, HI_IMM8);
1035
- }
1036
-
1037
- INLINE void transpose_vecs_512(__m512i vecs[16]) {
1038
- // Interleave 32-bit lanes. The _0 unpack is lanes
1039
- // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
1040
- // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
1041
- __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
1042
- __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
1043
- __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
1044
- __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
1045
- __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
1046
- __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
1047
- __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
1048
- __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
1049
- __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
1050
- __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
1051
- __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
1052
- __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
1053
- __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
1054
- __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
1055
- __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
1056
- __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
1057
-
1058
- // Interleave 64-bit lanes. The _0 unpack is lanes
1059
- // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
1060
- // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
1061
- // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
1062
- // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
1063
- __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
1064
- __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
1065
- __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
1066
- __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
1067
- __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
1068
- __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
1069
- __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
1070
- __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
1071
- __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
1072
- __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
1073
- __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
1074
- __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
1075
- __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
1076
- __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
1077
- __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
1078
- __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
1079
-
1080
- // Interleave 128-bit lanes. The _0 unpack is
1081
- // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
1082
- // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
1083
- __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
1084
- __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
1085
- __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
1086
- __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
1087
- __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
1088
- __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
1089
- __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
1090
- __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
1091
- __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
1092
- __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
1093
- __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
1094
- __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
1095
- __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
1096
- __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
1097
- __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
1098
- __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
1099
-
1100
- // Interleave 128-bit lanes again for the final outputs.
1101
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
1102
- vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
1103
- vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
1104
- vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
1105
- vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
1106
- vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
1107
- vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
1108
- vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
1109
- vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
1110
- vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
1111
- vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
1112
- vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
1113
- vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
1114
- vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
1115
- vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
1116
- vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
1117
- }
1118
-
1119
- INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1120
- size_t block_offset, __m512i out[16]) {
1121
- out[0] = loadu_512(&inputs[0][block_offset]);
1122
- out[1] = loadu_512(&inputs[1][block_offset]);
1123
- out[2] = loadu_512(&inputs[2][block_offset]);
1124
- out[3] = loadu_512(&inputs[3][block_offset]);
1125
- out[4] = loadu_512(&inputs[4][block_offset]);
1126
- out[5] = loadu_512(&inputs[5][block_offset]);
1127
- out[6] = loadu_512(&inputs[6][block_offset]);
1128
- out[7] = loadu_512(&inputs[7][block_offset]);
1129
- out[8] = loadu_512(&inputs[8][block_offset]);
1130
- out[9] = loadu_512(&inputs[9][block_offset]);
1131
- out[10] = loadu_512(&inputs[10][block_offset]);
1132
- out[11] = loadu_512(&inputs[11][block_offset]);
1133
- out[12] = loadu_512(&inputs[12][block_offset]);
1134
- out[13] = loadu_512(&inputs[13][block_offset]);
1135
- out[14] = loadu_512(&inputs[14][block_offset]);
1136
- out[15] = loadu_512(&inputs[15][block_offset]);
1137
- for (size_t i = 0; i < 16; ++i) {
1138
- _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1139
- }
1140
- transpose_vecs_512(out);
1141
- }
1142
-
1143
- INLINE void load_counters16(uint64_t counter, bool increment_counter,
1144
- __m512i *out_lo, __m512i *out_hi) {
1145
- const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
1146
- const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1147
- const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
1148
- const __m512i low_words = _mm512_add_epi32(
1149
- _mm512_set1_epi32((int32_t)counter),
1150
- masked_deltas);
1151
- // The carry bit is 1 if the high bit of the word was 1 before addition and is
1152
- // 0 after.
1153
- // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1154
- // compute the carry bits here, and originally we did, but that intrinsic is
1155
- // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1156
- const __m512i carries = _mm512_srli_epi32(
1157
- _mm512_andnot_si512(
1158
- low_words, // 0 after (gets inverted by andnot)
1159
- _mm512_set1_epi32((int32_t)counter)), // and 1 before
1160
- 31);
1161
- const __m512i high_words = _mm512_add_epi32(
1162
- _mm512_set1_epi32((int32_t)(counter >> 32)),
1163
- carries);
1164
- *out_lo = low_words;
1165
- *out_hi = high_words;
1166
- }
1167
-
1168
- static
1169
- void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
1170
- const uint32_t key[8], uint64_t counter,
1171
- bool increment_counter, uint8_t flags,
1172
- uint8_t flags_start, uint8_t flags_end,
1173
- uint8_t *out) {
1174
- __m512i h_vecs[8] = {
1175
- set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
1176
- set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
1177
- };
1178
- __m512i counter_low_vec, counter_high_vec;
1179
- load_counters16(counter, increment_counter, &counter_low_vec,
1180
- &counter_high_vec);
1181
- uint8_t block_flags = flags | flags_start;
1182
-
1183
- for (size_t block = 0; block < blocks; block++) {
1184
- if (block + 1 == blocks) {
1185
- block_flags |= flags_end;
1186
- }
1187
- __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
1188
- __m512i block_flags_vec = set1_512(block_flags);
1189
- __m512i msg_vecs[16];
1190
- transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1191
-
1192
- __m512i v[16] = {
1193
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1194
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1195
- set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
1196
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
1197
- };
1198
- round_fn16(v, msg_vecs, 0);
1199
- round_fn16(v, msg_vecs, 1);
1200
- round_fn16(v, msg_vecs, 2);
1201
- round_fn16(v, msg_vecs, 3);
1202
- round_fn16(v, msg_vecs, 4);
1203
- round_fn16(v, msg_vecs, 5);
1204
- round_fn16(v, msg_vecs, 6);
1205
- h_vecs[0] = xor_512(v[0], v[8]);
1206
- h_vecs[1] = xor_512(v[1], v[9]);
1207
- h_vecs[2] = xor_512(v[2], v[10]);
1208
- h_vecs[3] = xor_512(v[3], v[11]);
1209
- h_vecs[4] = xor_512(v[4], v[12]);
1210
- h_vecs[5] = xor_512(v[5], v[13]);
1211
- h_vecs[6] = xor_512(v[6], v[14]);
1212
- h_vecs[7] = xor_512(v[7], v[15]);
1213
-
1214
- block_flags = flags;
1215
- }
1216
-
1217
- // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
1218
- // state vectors. Pad the matrix with zeros. After transposition, store the
1219
- // lower half of each vector.
1220
- __m512i padded[16] = {
1221
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1222
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1223
- set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1224
- set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1225
- };
1226
- transpose_vecs_512(padded);
1227
- _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
1228
- _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
1229
- _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
1230
- _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
1231
- _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
1232
- _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
1233
- _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
1234
- _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
1235
- _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
1236
- _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
1237
- _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
1238
- _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
1239
- _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
1240
- _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
1241
- _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
1242
- _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
1243
- }
1244
-
1245
- static
1246
- void blake3_xof16_avx512(const uint32_t cv[8],
1247
- const uint8_t block[BLAKE3_BLOCK_LEN],
1248
- uint8_t block_len, uint64_t counter, uint8_t flags,
1249
- uint8_t out[16 * 64]) {
1250
- __m512i h_vecs[8] = {
1251
- set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
1252
- set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
1253
- };
1254
- uint32_t block_words[16];
1255
- load_block_words(block, block_words);
1256
- __m512i msg_vecs[16];
1257
- for (size_t i = 0; i < 16; i++) {
1258
- msg_vecs[i] = set1_512(block_words[i]);
1259
- }
1260
- __m512i counter_low_vec, counter_high_vec;
1261
- load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
1262
- __m512i block_len_vec = set1_512(block_len);
1263
- __m512i block_flags_vec = set1_512(flags);
1264
- __m512i v[16] = {
1265
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1266
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1267
- set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
1268
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
1269
- };
1270
- round_fn16(v, msg_vecs, 0);
1271
- round_fn16(v, msg_vecs, 1);
1272
- round_fn16(v, msg_vecs, 2);
1273
- round_fn16(v, msg_vecs, 3);
1274
- round_fn16(v, msg_vecs, 4);
1275
- round_fn16(v, msg_vecs, 5);
1276
- round_fn16(v, msg_vecs, 6);
1277
- for (size_t i = 0; i < 8; i++) {
1278
- v[i] = xor_512(v[i], v[i+8]);
1279
- v[i+8] = xor_512(v[i+8], h_vecs[i]);
1280
- }
1281
- transpose_vecs_512(&v[0]);
1282
- for (size_t i = 0; i < 16; i++) {
1283
- storeu_512(v[i], &out[i * sizeof(__m512i)]);
1284
- }
1285
- }
1286
-
1287
- /*
1288
- * ----------------------------------------------------------------------------
1289
- * hash_many_avx512
1290
- * ----------------------------------------------------------------------------
1291
- */
1292
-
1293
- INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
1294
- const uint32_t key[8], uint64_t counter,
1295
- uint8_t flags, uint8_t flags_start,
1296
- uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
1297
- uint32_t cv[8];
1298
- memcpy(cv, key, BLAKE3_KEY_LEN);
1299
- uint8_t block_flags = flags | flags_start;
1300
- while (blocks > 0) {
1301
- if (blocks == 1) {
1302
- block_flags |= flags_end;
1303
- }
1304
- blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
1305
- block_flags);
1306
- input = &input[BLAKE3_BLOCK_LEN];
1307
- blocks -= 1;
1308
- block_flags = flags;
1309
- }
1310
- memcpy(out, cv, BLAKE3_OUT_LEN);
1311
- }
1312
-
1313
- void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
1314
- size_t blocks, const uint32_t key[8],
1315
- uint64_t counter, bool increment_counter,
1316
- uint8_t flags, uint8_t flags_start,
1317
- uint8_t flags_end, uint8_t *out) {
1318
- while (num_inputs >= 16) {
1319
- blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
1320
- flags_start, flags_end, out);
1321
- if (increment_counter) {
1322
- counter += 16;
1323
- }
1324
- inputs += 16;
1325
- num_inputs -= 16;
1326
- out = &out[16 * BLAKE3_OUT_LEN];
1327
- }
1328
- while (num_inputs >= 8) {
1329
- blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
1330
- flags_start, flags_end, out);
1331
- if (increment_counter) {
1332
- counter += 8;
1333
- }
1334
- inputs += 8;
1335
- num_inputs -= 8;
1336
- out = &out[8 * BLAKE3_OUT_LEN];
1337
- }
1338
- while (num_inputs >= 4) {
1339
- blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
1340
- flags_start, flags_end, out);
1341
- if (increment_counter) {
1342
- counter += 4;
1343
- }
1344
- inputs += 4;
1345
- num_inputs -= 4;
1346
- out = &out[4 * BLAKE3_OUT_LEN];
1347
- }
1348
- while (num_inputs > 0) {
1349
- hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
1350
- flags_end, out);
1351
- if (increment_counter) {
1352
- counter += 1;
1353
- }
1354
- inputs += 1;
1355
- num_inputs -= 1;
1356
- out = &out[BLAKE3_OUT_LEN];
1357
- }
1358
- }
1359
-
1360
- void blake3_xof_many_avx512(const uint32_t cv[8],
1361
- const uint8_t block[BLAKE3_BLOCK_LEN],
1362
- uint8_t block_len, uint64_t counter, uint8_t flags,
1363
- uint8_t* out, size_t outblocks) {
1364
- while (outblocks >= 16) {
1365
- blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
1366
- counter += 16;
1367
- outblocks -= 16;
1368
- out += 16 * BLAKE3_BLOCK_LEN;
1369
- }
1370
- while (outblocks >= 8) {
1371
- blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
1372
- counter += 8;
1373
- outblocks -= 8;
1374
- out += 8 * BLAKE3_BLOCK_LEN;
1375
- }
1376
- while (outblocks >= 4) {
1377
- blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
1378
- counter += 4;
1379
- outblocks -= 4;
1380
- out += 4 * BLAKE3_BLOCK_LEN;
1381
- }
1382
- while (outblocks > 0) {
1383
- blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
1384
- counter += 1;
1385
- outblocks -= 1;
1386
- out += BLAKE3_BLOCK_LEN;
1387
- }
1388
- }