ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/sha/asm/sha256-586.pl
@@ -0,0 +1,1275 @@
+ #!/usr/bin/env perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
+ # SHA256 block transform for x86. September 2007.
+ #
+ # Performance improvement over compiler generated code varies from
+ # 10% to 40% [see below]. Not very impressive on some µ-archs, but
+ # it's 5 times smaller and optimizes the amount of writes.
+ #
+ # May 2012.
+ #
+ # Optimization including two of Pavel Semjanov's ideas, alternative
+ # Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
+ # ~7% on Pentium, ~40% on Atom. As the fully unrolled loop body is almost
+ # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not
+ # on P4, where it kills performance, nor Sandy Bridge, where folded
+ # loop is approximately as fast...
+ #
+ # June 2012.
+ #
+ # Add AMD XOP-specific code path, >30% improvement on Bulldozer over
+ # May version, >60% over original. Add AVX+shrd code path, >25%
+ # improvement on Sandy Bridge over May version, 60% over original.
+ #
+ # May 2013.
+ #
+ # Replace AMD XOP code path with SSSE3 to cover more processors.
+ # (Biggest improvement coefficient is on upcoming Atom Silvermont,
+ # not shown.) Add AVX+BMI code path.
+ #
+ # March 2014.
+ #
+ # Add support for Intel SHA Extensions.
+ #
+ # Performance in clock cycles per processed byte (less is better):
+ #
+ #               gcc  icc  x86 asm(*)  SIMD  x86_64 asm(**)
+ # Pentium       46   57   40/38       -     -
+ # PIII          36   33   27/24       -     -
+ # P4            41   38   28          -     17.3
+ # AMD K8        27   25   19/15.5     -     14.9
+ # Core2         26   23   18/15.6     14.3  13.8
+ # Westmere      27   -    19/15.7     13.4  12.3
+ # Sandy Bridge  25   -    15.9        12.4  11.6
+ # Ivy Bridge    24   -    15.0        11.4  10.3
+ # Haswell       22   -    13.9        9.46  7.80
+ # Bulldozer     36   -    27/22       17.0  13.6
+ # VIA Nano      36   -    25/22       16.8  16.5
+ # Atom          50   -    30/25       21.9  18.9
+ # Silvermont    40   -    34/31       22.9  20.6
+ #
+ # (*) numbers after slash are for unrolled loop, where applicable;
+ # (**) x86_64 assembly performance is presented for reference
+ #      purposes, results are best-available;
+
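To make the table concrete: cycles per processed byte converts to throughput as clock rate divided by cpb. A minimal Perl sketch of that arithmetic (the 3 GHz clock is an assumed figure for illustration, not from the table):

    # Hypothetical worked example: cpb -> throughput.
    my $clock_hz = 3.0e9;   # assumed 3 GHz core clock
    my $cpb      = 15.9;    # Sandy Bridge, folded x86 loop, from the table above
    printf("~%.0f MB/s\n", $clock_hz / $cpb / 1e6);   # ~189 MB/s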
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+ require "x86asm.pl";
+
+ &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+
+ $xmm=$avx=0;
+ for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+ # In upstream, this is controlled by shelling out to the compiler to check
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
+ # output, so this isn't useful anyway.
+ #
+ # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
+ $avx = 1;
+
+ $avx = 0 unless ($xmm);
+
+ $shaext=$xmm; ### set to zero if compiling for 1.0.1
+
+ # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
+ # been tested.
+ $shaext = 0;
+
+ $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of
+                       # fully unrolled loop was measured to run about
+                       # 3-4x slower. If slowdown coefficient is N and
+                       # unrolled loop is m times faster, then you break
+                       # even at (N-1)/(m-1) blocks. Then it needs to be
+                       # adjusted for probability of code being evicted,
+                       # code size/cache size=1/4. Typical m is 1.15...
+
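The break-even reasoning in the comment above can be checked numerically. A small sketch, assuming N=3.5 (mid-range of the quoted 3-4x eviction slowdown) and the quoted typical m=1.15:

    # Hypothetical worked example of (N-1)/(m-1):
    my ($N, $m) = (3.5, 1.15);            # assumed slowdown, quoted speedup
    my $blocks  = ($N - 1) / ($m - 1);    # ~16.7 blocks of 64 bytes, ~1KB
    printf("break even at %.1f blocks\n", $blocks);

On one reading, scaling those ~16.7 blocks by the 1/4 eviction probability gives ~4.2 blocks, consistent with the 64*4 = 256-byte threshold actually chosen.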
+ $A="eax";
+ $E="edx";
+ $T="ebx";
+ $Aoff=&DWP(4,"esp");
+ $Boff=&DWP(8,"esp");
+ $Coff=&DWP(12,"esp");
+ $Doff=&DWP(16,"esp");
+ $Eoff=&DWP(20,"esp");
+ $Foff=&DWP(24,"esp");
+ $Goff=&DWP(28,"esp");
+ $Hoff=&DWP(32,"esp");
+ $Xoff=&DWP(36,"esp");
+ $K256="ebp";
+
+ sub BODY_16_63() {
+ &mov ($T,"ecx"); # "ecx" is preloaded
+ &mov ("esi",&DWP(4*(9+15+16-14),"esp"));
+ &ror ("ecx",18-7);
+ &mov ("edi","esi");
+ &ror ("esi",19-17);
+ &xor ("ecx",$T);
+ &shr ($T,3);
+ &ror ("ecx",7);
+ &xor ("esi","edi");
+ &xor ($T,"ecx"); # T = sigma0(X[-15])
+ &ror ("esi",17);
+ &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16]
+ &shr ("edi",10);
+ &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7]
+ #&xor ("edi","esi") # sigma1(X[-2])
+ # &add ($T,"edi"); # T += sigma1(X[-2])
+ # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
+
+ &BODY_00_15(1);
+ }
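For reference while reading the interleaved ror/shr/xor scheduling above: BODY_16_63 computes the standard SHA-256 message-schedule step. A straight-line Perl model of what it amounts to (names are illustrative, not part of the file):

    # Hypothetical reference model of the message-schedule step:
    sub rotr   { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
    sub sigma0 { my $x = shift; rotr($x, 7)  ^ rotr($x, 18) ^ ($x >> 3)  }
    sub sigma1 { my $x = shift; rotr($x, 17) ^ rotr($x, 19) ^ ($x >> 10) }
    # X[0] = sigma1(X[-2]) + X[-7] + sigma0(X[-15]) + X[-16]   (mod 2^32)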
+ sub BODY_00_15() {
+ my $in_16_63=shift;
+
+ &mov ("ecx",$E);
+ &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2])
+ &mov ("esi",$Foff);
+ &ror ("ecx",25-11);
+ &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
+ &mov ("edi",$Goff);
+ &xor ("ecx",$E);
+ &xor ("esi","edi");
+ &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63);
+ &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0]
+ &ror ("ecx",11-6);
+ &and ("esi",$E);
+ &mov ($Eoff,$E); # modulo-scheduled
+ &xor ($E,"ecx");
+ &add ($T,$Hoff); # T += h
+ &xor ("esi","edi"); # Ch(e,f,g)
+ &ror ($E,6); # Sigma1(e)
+ &mov ("ecx",$A);
+ &add ($T,"esi"); # T += Ch(e,f,g)
+
+ &ror ("ecx",22-13);
+ &add ($T,$E); # T += Sigma1(e)
+ &mov ("edi",$Boff);
+ &xor ("ecx",$A);
+ &mov ($Aoff,$A); # modulo-scheduled
+ &lea ("esp",&DWP(-4,"esp"));
+ &ror ("ecx",13-2);
+ &mov ("esi",&DWP(0,$K256));
+ &xor ("ecx",$A);
+ &mov ($E,$Eoff); # e in next iteration, d in this one
+ &xor ($A,"edi"); # a ^= b
+ &ror ("ecx",2); # Sigma0(a)
+
+ &add ($T,"esi"); # T+= K[i]
+ &mov (&DWP(0,"esp"),$A); # (b^c) in next round
+ &add ($E,$T); # d += T
+ &and ($A,&DWP(4,"esp")); # a &= (b^c)
+ &add ($T,"ecx"); # T += Sigma0(a)
+ &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
+ &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T
+ &add ($K256,4);
+ &add ($A,$T); # h += T
+ }
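The "h = Maj(a,b,c) = Ch(a^b,c,b)" comment above is the alternative-Maj idea credited in the header: carrying b^c across rounds lets the majority function be computed with one and plus two xor. A hedged spot-check of the identity (illustrative, not part of the file):

    # Hypothetical spot-check that Maj(a,b,c) == Ch(a^b,c,b):
    sub Maj { my ($a,$b,$c) = @_; ($a & $b) ^ ($a & $c) ^ ($b & $c) }
    sub Ch  { my ($x,$y,$z) = @_; ($x & $y) ^ (~$x & $z) }
    for (1 .. 1000) {
        my ($a,$b,$c) = map { int(rand(2**32)) } 1 .. 3;
        die "mismatch" if ((Maj($a,$b,$c) ^ Ch($a^$b,$c,$b)) & 0xffffffff);
    }
    print "identity holds\n";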
+
+ &external_label("OPENSSL_ia32cap_P") if (!$i386);
+
+ &function_begin("sha256_block_data_order");
+ &mov ("esi",wparam(0)); # ctx
+ &mov ("edi",wparam(1)); # inp
+ &mov ("eax",wparam(2)); # num
+ &mov ("ebx","esp"); # saved sp
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($K256);
+ &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+ &sub ("esp",16);
+ &and ("esp",-64);
+
+ &shl ("eax",6);
+ &add ("eax","edi");
+ &mov (&DWP(0,"esp"),"esi"); # ctx
+ &mov (&DWP(4,"esp"),"edi"); # inp
+ &mov (&DWP(8,"esp"),"eax"); # inp+num*128
+ &mov (&DWP(12,"esp"),"ebx"); # saved sp
+ if (!$i386 && $xmm) {
+ &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
+ &mov ("ecx",&DWP(0,"edx"));
+ &mov ("ebx",&DWP(4,"edx"));
+ &test ("ecx",1<<20); # check for P4
+ &jnz (&label("loop"));
+ &mov ("edx",&DWP(8,"edx")) if ($xmm);
+ &test ("ecx",1<<24); # check for FXSR
+ &jz ($unroll_after?&label("no_xmm"):&label("loop"));
+ &and ("ecx",1<<30); # mask "Intel CPU" bit
+ &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits
+ &test ("edx",1<<29) if ($shaext); # check for SHA
+ &jnz (&label("shaext")) if ($shaext);
+ &or ("ecx","ebx");
+ &and ("ecx",1<<28|1<<30);
+ &cmp ("ecx",1<<28|1<<30);
+ if ($xmm) {
+ &je (&label("AVX")) if ($avx);
+ &test ("ebx",1<<9); # check for SSSE3
+ &jnz (&label("SSSE3"));
+ } else {
+ &je (&label("loop_shrd"));
+ }
+ if ($unroll_after) {
+ &set_label("no_xmm");
+ &sub ("eax","edi");
+ &cmp ("eax",$unroll_after);
+ &jae (&label("unrolled"));
+ } }
+ &jmp (&label("loop"));
+
+ sub COMPACT_LOOP() {
+ my $suffix=shift;
+
+ &set_label("loop$suffix",$suffix?32:16);
+ # copy input block to stack reversing byte and dword order
+ for($i=0;$i<4;$i++) {
+ &mov ("eax",&DWP($i*16+0,"edi"));
+ &mov ("ebx",&DWP($i*16+4,"edi"));
+ &mov ("ecx",&DWP($i*16+8,"edi"));
+ &bswap ("eax");
+ &mov ("edx",&DWP($i*16+12,"edi"));
+ &bswap ("ebx");
+ &push ("eax");
+ &bswap ("ecx");
+ &push ("ebx");
+ &bswap ("edx");
+ &push ("ecx");
+ &push ("edx");
+ }
+ &add ("edi",64);
+ &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
+ &mov (&DWP(4*(9+16)+4,"esp"),"edi");
+
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($A,&DWP(0,"esi"));
+ &mov ("ebx",&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ # &mov ($Aoff,$A);
+ &mov ($Boff,"ebx");
+ &xor ("ebx","ecx");
+ &mov ($Coff,"ecx");
+ &mov ($Doff,"edi");
+ &mov (&DWP(0,"esp"),"ebx"); # magic
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("ebx",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("edi",&DWP(28,"esi"));
+ # &mov ($Eoff,$E);
+ &mov ($Foff,"ebx");
+ &mov ($Goff,"ecx");
+ &mov ($Hoff,"edi");
+
+ &set_label("00_15$suffix",16);
+
+ &BODY_00_15();
+
+ &cmp ("esi",0xc19bf174);
+ &jne (&label("00_15$suffix"));
+
+ &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1)
+ &jmp (&label("16_63$suffix"));
+
+ &set_label("16_63$suffix",16);
+
+ &BODY_16_63();
+
+ &cmp ("esi",0xc67178f2);
+ &jne (&label("16_63$suffix"));
+
+ &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
+ # &mov ($A,$Aoff);
+ &mov ("ebx",$Boff);
+ # &mov ("edi",$Coff);
+ &mov ("ecx",$Doff);
+ &add ($A,&DWP(0,"esi"));
+ &add ("ebx",&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$A);
+ &mov (&DWP(4,"esi"),"ebx");
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ # &mov ($E,$Eoff);
+ &mov ("eax",$Foff);
+ &mov ("ebx",$Goff);
+ &mov ("ecx",$Hoff);
+ &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
+ &add ($E,&DWP(16,"esi"));
+ &add ("eax",&DWP(20,"esi"));
+ &add ("ebx",&DWP(24,"esi"));
+ &add ("ecx",&DWP(28,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"eax");
+ &mov (&DWP(24,"esi"),"ebx");
+ &mov (&DWP(28,"esi"),"ecx");
+
+ &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
+ &sub ($K256,4*64); # rewind K
+
+ &cmp ("edi",&DWP(8,"esp")); # are we done yet?
+ &jb (&label("loop$suffix"));
+ }
+ &COMPACT_LOOP();
+ &mov ("esp",&DWP(12,"esp")); # restore sp
+ &function_end_A();
+ if (!$i386 && !$xmm) {
+ # ~20% improvement on Sandy Bridge
+ local *ror = sub { &shrd(@_[0],@_) };
+ &COMPACT_LOOP("_shrd");
+ &mov ("esp",&DWP(12,"esp")); # restore sp
+ &function_end_A();
+ }
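The `local *ror` override above is the AVX+shrd trick from the header notes: `shrd dst,src,n` shifts dst right n bits while filling the top n bits from src, so with src == dst it computes exactly a rotate right. A hedged standalone demo of the same glob-override pattern (illustrative, not from the file):

    # Hypothetical demo: ror(...) now emits shrd with a duplicated operand.
    sub shrd { printf "shrd %s,%s,%d\n", @_ }
    {
        local *ror = sub { shrd($_[0], @_) };   # duplicate first operand
        ror("eax", 7);                          # prints: shrd eax,eax,7
    }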
+
+ &set_label("K256",64); # Yes! I keep it in the code segment!
+ @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
+ 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+ 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
+ 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+ 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
+ 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+ 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
+ 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+ 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
+ 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+ 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
+ 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+ 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
+ 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+ 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
+ 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
+ &data_word(@K256);
+ &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask
+ &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+ ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets
+ sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
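sub off implements register rotation for the unrolled loop: the eight working variables live at fixed stack slots, and instead of moving data each round, names are re-mapped to slots by indexing relative to the round counter $i modulo 8. A hedged trace of the slot arithmetic (illustrative):

    # Hypothetical trace: slot for variable $name at round $i is 4*(($name-$i)&7).
    for my $i (0 .. 2) {
        my @slots = map { 4 * (($_ - $i) & 7) } 0 .. 7;   # $a .. $h
        print "round $i: a..h at esp+[@slots]\n";
    }
    # round 0: a..h at esp+[0 4 8 12 16 20 24 28]
    # round 1: a..h at esp+[28 0 4 8 12 16 20 24]
    # round 2: a..h at esp+[24 28 0 4 8 12 16 20]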
+
+ if (!$i386 && $unroll_after) {
+ my @AH=($A,$K256);
+
+ &set_label("unrolled",16);
+ &lea ("esp",&DWP(-96,"esp"));
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("ebx",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"ebx");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("ebx",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"ebx");
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &jmp (&label("grand_loop"));
+
+ &set_label("grand_loop",16);
+ # copy input block to stack reversing byte order
+ for($i=0;$i<5;$i++) {
+ &mov ("ebx",&DWP(12*$i+0,"edi"));
+ &mov ("ecx",&DWP(12*$i+4,"edi"));
+ &bswap ("ebx");
+ &mov ("esi",&DWP(12*$i+8,"edi"));
+ &bswap ("ecx");
+ &mov (&DWP(32+12*$i+0,"esp"),"ebx");
+ &bswap ("esi");
+ &mov (&DWP(32+12*$i+4,"esp"),"ecx");
+ &mov (&DWP(32+12*$i+8,"esp"),"esi");
+ }
+ &mov ("ebx",&DWP($i*12,"edi"));
+ &add ("edi",64);
+ &bswap ("ebx");
+ &mov (&DWP(96+4,"esp"),"edi");
+ &mov (&DWP(32+12*$i,"esp"),"ebx");
+
+ my ($t1,$t2) = ("ecx","esi");
+
+ for ($i=0;$i<64;$i++) {
+
+ if ($i>=16) {
+ &mov ($T,$t1); # $t1 is preloaded
+ # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp"));
+ &ror ($t1,18-7);
+ &mov ("edi",$t2);
+ &ror ($t2,19-17);
+ &xor ($t1,$T);
+ &shr ($T,3);
+ &ror ($t1,7);
+ &xor ($t2,"edi");
+ &xor ($T,$t1); # T = sigma0(X[-15])
+ &ror ($t2,17);
+ &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16]
+ &shr ("edi",10);
+ &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7]
+ #&xor ("edi",$t2) # sigma1(X[-2])
+ # &add ($T,"edi"); # T += sigma1(X[-2])
+ # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
+ }
+ &mov ($t1,$E);
+ &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2])
+ &mov ($t2,&off($f));
+ &ror ($E,25-11);
+ &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2])
+ &mov ("edi",&off($g));
+ &xor ($E,$t1);
+ &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i]
+ &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0]
+ &xor ($t2,"edi");
+ &ror ($E,11-6);
+ &and ($t2,$t1);
+ &mov (&off($e),$t1); # save $E, modulo-scheduled
+ &xor ($E,$t1);
+ &add ($T,&off($h)); # T += h
+ &xor ("edi",$t2); # Ch(e,f,g)
+ &ror ($E,6); # Sigma1(e)
+ &mov ($t1,$AH[0]);
+ &add ($T,"edi"); # T += Ch(e,f,g)
+
+ &ror ($t1,22-13);
+ &mov ($t2,$AH[0]);
+ &mov ("edi",&off($b));
+ &xor ($t1,$AH[0]);
+ &mov (&off($a),$AH[0]); # save $A, modulo-scheduled
+ &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round
+ &ror ($t1,13-2);
+ &and ($AH[1],$AH[0]); # (b^c) &= (a^b)
+ &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(e)+K[i]
+ &xor ($t1,$t2);
+ &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
+ &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63);
+ &ror ($t1,2); # Sigma0(a)
+
+ &add ($AH[1],$E); # h += T
+ &add ($E,&off($d)); # d += T
+ &add ($AH[1],$t1); # h += Sigma0(a)
+ &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63);
+
+ @AH = reverse(@AH); # rotate(a,h)
+ ($t1,$t2) = ($t2,$t1); # rotate(t1,t2)
+ }
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ebx",&DWP(24,"esp"));
+ &mov ("ecx",&DWP(28,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ebx",&DWP(24,"esi"));
+ &add ("ecx",&DWP(28,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(24,"esi"),"ebx");
+ &mov (&DWP(28,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ebx");
+ &mov (&DWP(28,"esp"),"ecx");
+
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_loop"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &function_end_A();
+ }
+ if (!$i386 && $xmm) {{{
+ if ($shaext) {
+ ######################################################################
+ # Intel SHA Extensions implementation of SHA256 update function.
+ #
+ my ($ctx,$inp,$end)=("esi","edi","eax");
+ my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
+ my @MSG=map("xmm$_",(3..6));
+
+ sub sha256op38 {
+ my ($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); }
+ }
+ sub sha256rnds2 { sha256op38(0xcb,@_); }
+ sub sha256msg1 { sha256op38(0xcc,@_); }
+ sub sha256msg2 { sha256op38(0xcd,@_); }
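At the time this was written, assemblers commonly lacked the SHA Extensions mnemonics, so sha256op38 emits the raw opcode bytes: 0x0f 0x38, the opcodelet, then a ModR/M byte 0xC0|dst<<3|src for the register-register form (xmm0 is an implicit operand of sha256rnds2). A hedged check of the emitted bytes (illustrative):

    # Hypothetical check: sha256rnds2(xmm1, xmm2) should emit 0f 38 cb ca,
    # i.e. ModR/M = 0xC0 | (dst=1)<<3 | (src=2) = 0xCA.
    my ($dst, $src) = (1, 2);
    printf "0f 38 cb %02x\n", 0xc0 | ($dst << 3) | $src;   # -> 0f 38 cb ca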
+
+ &set_label("shaext",32);
+ &sub ("esp",32);
+
+ &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA
+ &lea ($K256,&DWP(0x80,$K256));
+ &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE
+ &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
+
+ &pshufd ($Wi,$ABEF,0x1b); # ABCD
+ &pshufd ($ABEF,$ABEF,0xb1); # CDAB
+ &pshufd ($CDGH,$CDGH,0x1b); # EFGH
+ &palignr ($ABEF,$CDGH,8); # ABEF
+ &punpcklqdq ($CDGH,$Wi); # CDGH
+ &jmp (&label("loop_shaext"));
+
+ &set_label("loop_shaext",16);
+ &movdqu (@MSG[0],&QWP(0,$inp));
+ &movdqu (@MSG[1],&QWP(0x10,$inp));
+ &movdqu (@MSG[2],&QWP(0x20,$inp));
+ &pshufb (@MSG[0],$TMP);
+ &movdqu (@MSG[3],&QWP(0x30,$inp));
+ &movdqa (&QWP(16,"esp"),$CDGH); # offload
+
+ &movdqa ($Wi,&QWP(0*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &pshufb (@MSG[1],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 0-3
+ &pshufd ($Wi,$Wi,0x0e);
+ &nop ();
+ &movdqa (&QWP(0,"esp"),$ABEF); # offload
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(1*16-0x80,$K256));
+ &paddd ($Wi,@MSG[1]);
+ &pshufb (@MSG[2],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 4-7
+ &pshufd ($Wi,$Wi,0x0e);
+ &lea ($inp,&DWP(0x40,$inp));
+ &sha256msg1 (@MSG[0],@MSG[1]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(2*16-0x80,$K256));
+ &paddd ($Wi,@MSG[2]);
+ &pshufb (@MSG[3],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 8-11
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[3]);
+ &palignr ($TMP,@MSG[2],4);
+ &nop ();
+ &paddd (@MSG[0],$TMP);
+ &sha256msg1 (@MSG[1],@MSG[2]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(3*16-0x80,$K256));
+ &paddd ($Wi,@MSG[3]);
+ &sha256msg2 (@MSG[0],@MSG[3]);
+ &sha256rnds2 ($CDGH,$ABEF); # 12-15
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[0]);
+ &palignr ($TMP,@MSG[3],4);
+ &nop ();
+ &paddd (@MSG[1],$TMP);
+ &sha256msg1 (@MSG[2],@MSG[3]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ for($i=4;$i<16-3;$i++) {
+ &movdqa ($Wi,&QWP($i*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &sha256msg2 (@MSG[1],@MSG[0]);
+ &sha256rnds2 ($CDGH,$ABEF); # 16-19...
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[1]);
+ &palignr ($TMP,@MSG[0],4);
+ &nop ();
+ &paddd (@MSG[2],$TMP);
+ &sha256msg1 (@MSG[3],@MSG[0]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ push(@MSG,shift(@MSG));
+ }
+ &movdqa ($Wi,&QWP(13*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &sha256msg2 (@MSG[1],@MSG[0]);
+ &sha256rnds2 ($CDGH,$ABEF); # 52-55
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[1]);
+ &palignr ($TMP,@MSG[0],4);
+ &sha256rnds2 ($ABEF,$CDGH);
+ &paddd (@MSG[2],$TMP);
+
+ &movdqa ($Wi,&QWP(14*16-0x80,$K256));
+ &paddd ($Wi,@MSG[1]);
+ &sha256rnds2 ($CDGH,$ABEF); # 56-59
+ &pshufd ($Wi,$Wi,0x0e);
+ &sha256msg2 (@MSG[2],@MSG[1]);
+ &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(15*16-0x80,$K256));
+ &paddd ($Wi,@MSG[2]);
+ &nop ();
+ &sha256rnds2 ($CDGH,$ABEF); # 60-63
+ &pshufd ($Wi,$Wi,0x0e);
+ &cmp ($end,$inp);
+ &nop ();
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &paddd ($CDGH,&QWP(16,"esp"));
+ &paddd ($ABEF,&QWP(0,"esp"));
+ &jnz (&label("loop_shaext"));
+
+ &pshufd ($CDGH,$CDGH,0xb1); # DCHG
+ &pshufd ($TMP,$ABEF,0x1b); # FEBA
+ &pshufd ($ABEF,$ABEF,0xb1); # BAFE
+ &punpckhqdq ($ABEF,$CDGH); # DCBA
+ &palignr ($CDGH,$TMP,8); # HGFE
+
+ &mov ("esp",&DWP(32+12,"esp"));
+ &movdqu (&QWP(0,$ctx),$ABEF);
+ &movdqu (&QWP(16,$ctx),$CDGH);
+ &function_end_A();
+ }
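The pshufd/palignr prologue and the matching epilogue above exist because sha256rnds2 expects the eight state words packed as ABEF and CDGH, not in the linear a..h order the context stores; the shuffles convert on entry and convert back before the final stores. A hedged sketch of the repacking (word order shown high to low, per the inline comments):

    # Hypothetical sketch of the state layout conversion:
    #   loaded:  xmm = [D C B A]  and  [H G F E]   (little-endian ctx->h[0..7])
    #   rounds:  ABEF = [A B E F],  CDGH = [C D G H]   as sha256rnds2 expects
    #   stored:  reverse shuffles restore [D C B A] [H G F E] for the ctx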
+
+ my @X = map("xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
+ my @AH = ($A,$T);
+
+ &set_label("SSSE3",32);
+ &lea ("esp",&DWP(-96,"esp"));
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &movdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_ssse3"));
+
+ &set_label("grand_ssse3",16);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &movdqu (@X[0],&QWP(0,"edi"));
+ &movdqu (@X[1],&QWP(16,"edi"));
+ &movdqu (@X[2],&QWP(32,"edi"));
+ &movdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &pshufb (@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &pshufb (@X[1],$t3);
+ &movdqa ($t0,&QWP(0,$K256));
+ &pshufb (@X[2],$t3);
+ &movdqa ($t1,&QWP(16,$K256));
+ &paddd ($t0,@X[0]);
+ &pshufb (@X[3],$t3);
+ &movdqa ($t2,&QWP(32,$K256));
+ &paddd ($t1,@X[1]);
+ &movdqa ($t3,&QWP(48,$K256));
+ &movdqa (&QWP(32+0,"esp"),$t0);
+ &paddd ($t2,@X[2]);
+ &movdqa (&QWP(32+16,"esp"),$t1);
+ &paddd ($t3,@X[3]);
+ &movdqa (&QWP(32+32,"esp"),$t2);
+ &movdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("ssse3_00_47"));
+
+ &set_label("ssse3_00_47",16);
+ &add ($K256,64);
+
+ sub SSSE3_00_47 () {
+ my $j = shift;
+ my $body = shift;
+ my @X = @_;
+ my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
+
+ eval(shift(@insns));
+ &movdqa ($t0,@X[1]);
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &movdqa ($t3,@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &palignr ($t0,@X[0],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &palignr ($t3,@X[2],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t1,$t0);
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &movdqa ($t2,$t0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t0,3);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t2,7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &pshufd ($t3,@X[3],0b11111010); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslld ($t1,32-18);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t2,18-7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslld ($t1,18-7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,$t3);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t1); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t3,10);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,19-17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,$t3,0b10000000);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &psrldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &pshufd ($t3,@X[0],0b01010000); # X[16..17]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,$t3);
+ eval(shift(@insns)); # @
+ &psrld ($t3,10);
+ eval(shift(@insns));
+ &psrlq ($t2,17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,19-17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,$t3,0b00001000);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &movdqa ($t2,&QWP(16*$j,$K256));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd ($t2,@X[0]);
+ eval(shift(@insns)); # @
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &movdqa (&QWP(32+16*$j,"esp"),$t2);
+ }
+
+ sub body_00_15 () {
+ (
+ '&mov ("ecx",$E);',
+ '&ror ($E,25-11);',
+ '&mov ("esi",&off($f));',
+ '&xor ($E,"ecx");',
+ '&mov ("edi",&off($g));',
+ '&xor ("esi","edi");',
+ '&ror ($E,11-6);',
+ '&and ("esi","ecx");',
+ '&mov (&off($e),"ecx");', # save $E, modulo-scheduled
+ '&xor ($E,"ecx");',
+ '&xor ("edi","esi");', # Ch(e,f,g)
+ '&ror ($E,6);', # T = Sigma1(e)
+ '&mov ("ecx",$AH[0]);',
+ '&add ($E,"edi");', # T += Ch(e,f,g)
+ '&mov ("edi",&off($b));',
+ '&mov ("esi",$AH[0]);',
+
+ '&ror ("ecx",22-13);',
+ '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
+ '&xor ("ecx",$AH[0]);',
+ '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round
+ '&add ($E,&off($h));', # T += h
+ '&ror ("ecx",13-2);',
+ '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b)
+ '&xor ("ecx","esi");',
+ '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i]
+ '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b)
+ '&ror ("ecx",2);', # Sigma0(a)
+
+ '&add ($AH[1],$E);', # h += T
+ '&add ($E,&off($d));', # d += T
+ '&add ($AH[1],"ecx");'. # h += Sigma0(a)
+
+ '@AH = reverse(@AH); $i++;' # rotate(a,h)
+ );
+ }
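SSSE3_00_47 and body_00_15 above implement the software-pipelining idiom used throughout perlasm: the scalar round is written as a list of instruction strings, and the vector message-schedule code evals a few of them between each SIMD op, interleaving the two instruction streams at generation time. A stripped-down miniature of the idiom (illustrative, not from the file):

    # Hypothetical miniature of the eval-interleaving idiom:
    sub scalar_round { ('print "round op 1\n";', 'print "round op 2\n";') }
    my @insns = (scalar_round(), scalar_round());
    for my $simd ('print "simd op A\n";', 'print "simd op B\n";') {
        eval $simd;
        eval(shift(@insns)) for 1 .. 2;   # slip scalar work between SIMD ops
    }
    foreach (@insns) { eval; }            # drain whatever remains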
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &SSSE3_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("ssse3_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &movdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_ssse3"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &function_end_A();
+ if ($avx) {
+ &set_label("AVX",32);
+ if ($avx>1) {
+ &and ("edx",1<<8|1<<3); # check for BMI2+BMI1
+ &cmp ("edx",1<<8|1<<3);
+ &je (&label("AVX_BMI"));
+ }
+ &lea ("esp",&DWP(-96,"esp"));
+ &vzeroall ();
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &vmovdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_avx"));
+
+ &set_label("grand_avx",32);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &vmovdqu (@X[0],&QWP(0,"edi"));
+ &vmovdqu (@X[1],&QWP(16,"edi"));
+ &vmovdqu (@X[2],&QWP(32,"edi"));
+ &vmovdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &vpshufb (@X[0],@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &vpshufb (@X[1],@X[1],$t3);
+ &vpshufb (@X[2],@X[2],$t3);
+ &vpaddd ($t0,@X[0],&QWP(0,$K256));
+ &vpshufb (@X[3],@X[3],$t3);
+ &vpaddd ($t1,@X[1],&QWP(16,$K256));
+ &vpaddd ($t2,@X[2],&QWP(32,$K256));
+ &vpaddd ($t3,@X[3],&QWP(48,$K256));
+ &vmovdqa (&QWP(32+0,"esp"),$t0);
+ &vmovdqa (&QWP(32+16,"esp"),$t1);
+ &vmovdqa (&QWP(32+32,"esp"),$t2);
+ &vmovdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("avx_00_47"));
+
+ &set_label("avx_00_47",16);
+ &add ($K256,64);
+
+ sub Xupdate_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4]
+ '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12]
+ '&vpsrld ($t2,$t0,7);',
+ '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16]
+ '&vpsrld ($t3,$t0,3);',
+ '&vpslld ($t1,$t0,14);',
+ '&vpxor ($t0,$t3,$t2);',
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&vpsrld ($t2,$t2,18-7);',
+ '&vpxor ($t0,$t0,$t1);',
+ '&vpslld ($t1,$t1,25-14);',
+ '&vpxor ($t0,$t0,$t2);',
+ '&vpsrld ($t2,$t3,10);',
+ '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4])
+ '&vpsrlq ($t1,$t3,17);',
+ '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
+ '&vpxor ($t2,$t2,$t1);',
+ '&vpsrlq ($t3,$t3,19);',
+ '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15])
+ '&vpshufd ($t3,$t2,0b10000100);',
+ '&vpsrldq ($t3,$t3,8);',
+ '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15])
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&vpsrld ($t2,$t3,10);',
+ '&vpsrlq ($t1,$t3,17);',
+ '&vpxor ($t2,$t2,$t1);',
+ '&vpsrlq ($t3,$t3,19);',
+ '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17])
+ '&vpshufd ($t3,$t2,0b11101000);',
+ '&vpslldq ($t3,$t3,8);',
+ '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17])
+ );
+ }
+
+ local *ror = sub { &shrd(@_[0],@_) };
+ sub AVX_00_47 () {
+ my $j = shift;
+ my $body = shift;
+ my @X = @_;
+ my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
+ my $insn;
+
+ foreach (Xupdate_AVX()) { # 31 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval($insn = shift(@insns));
+ eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
+ }
+ &vpaddd ($t2,@X[0],&QWP(16*$j,$K256));
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (&QWP(32+16*$j,"esp"),$t2);
+ }
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("avx_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &vmovdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_avx"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &vzeroall ();
+ &function_end_A();
+ if ($avx>1) {
+ sub bodyx_00_15 () { # +10%
+ (
+ '&rorx ("ecx",$E,6)',
+ '&rorx ("esi",$E,11)',
+ '&mov (&off($e),$E)', # save $E, modulo-scheduled
+ '&rorx ("edi",$E,25)',
+ '&xor ("ecx","esi")',
+ '&andn ("esi",$E,&off($g))',
+ '&xor ("ecx","edi")', # Sigma1(e)
+ '&and ($E,&off($f))',
+ '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
+ '&or ($E,"esi")', # T = Ch(e,f,g)
+
+ '&rorx ("edi",$AH[0],2)',
+ '&rorx ("esi",$AH[0],13)',
+ '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e)
+ '&rorx ("ecx",$AH[0],22)',
+ '&xor ("esi","edi")',
+ '&mov ("edi",&off($b))',
+ '&xor ("ecx","esi")', # Sigma0(a)
+
+ '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round
+ '&add ($E,&off($h))', # T += h
+ '&and ($AH[1],$AH[0])', # (b^c) &= (a^b)
+ '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i]
+ '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b)
+
+ '&add ("ecx",$E)', # h += T
+ '&add ($E,&off($d))', # d += T
+ '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a)
+
+ '@AH = reverse(@AH); $i++;' # rotate(a,h)
+ );
+ }
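bodyx_00_15 is the AVX+BMI variant from the header notes: rorx (BMI2) rotates into a fresh destination without touching flags or the source, so Sigma1(e) becomes three independent rorx results xored together, and andn (BMI1) supplies the ~e & g half of the Ch term in one instruction. A hedged scalar model of the round core (sample inputs are the SHA-256 initial e, f, g):

    # Hypothetical model of the BMI round core above:
    sub rorx32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
    my ($e, $f, $g) = (0x510e527f, 0x9b05688c, 0x1f83d9ab);
    my $Sigma1 = rorx32($e, 6) ^ rorx32($e, 11) ^ rorx32($e, 25);
    my $Ch     = ($e & $f) | (~$e & $g);   # and + andn + or, as in the code
    printf "Sigma1=%08x Ch=%08x\n", $Sigma1, $Ch & 0xffffffff;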
+
+ &set_label("AVX_BMI",32);
+ &lea ("esp",&DWP(-96,"esp"));
+ &vzeroall ();
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &vmovdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_avx_bmi"));
+
+ &set_label("grand_avx_bmi",32);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &vmovdqu (@X[0],&QWP(0,"edi"));
+ &vmovdqu (@X[1],&QWP(16,"edi"));
+ &vmovdqu (@X[2],&QWP(32,"edi"));
+ &vmovdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &vpshufb (@X[0],@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &vpshufb (@X[1],@X[1],$t3);
+ &vpshufb (@X[2],@X[2],$t3);
+ &vpaddd ($t0,@X[0],&QWP(0,$K256));
+ &vpshufb (@X[3],@X[3],$t3);
+ &vpaddd ($t1,@X[1],&QWP(16,$K256));
+ &vpaddd ($t2,@X[2],&QWP(32,$K256));
+ &vpaddd ($t3,@X[3],&QWP(48,$K256));
+ &vmovdqa (&QWP(32+0,"esp"),$t0);
+ &vmovdqa (&QWP(32+16,"esp"),$t1);
+ &vmovdqa (&QWP(32+32,"esp"),$t2);
+ &vmovdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("avx_bmi_00_47"));
+
+ &set_label("avx_bmi_00_47",16);
+ &add ($K256,64);
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("avx_bmi_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(bodyx_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &vmovdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_avx_bmi"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &vzeroall ();
+ &function_end_A();
+ }
+ }
+ }}}
+ &function_end_B("sha256_block_data_order");
+
+ &asm_finish();
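For context on how a file like this is consumed: perlasm scripts are run once at build time to produce the actual assembly, with the target "flavour" as the first argument (see crypto/perlasm/readme in the file list above). A hedged invocation sketch, assuming the stdout-based x86 perlasm convention of this BoringSSL vintage:

    # Hypothetical build-time invocation (exact paths/flags per your build):
    #   perl sha256-586.pl elf -DOPENSSL_IA32_SSE2 > sha256-586.S
    # "elf" selects the output dialect; -DOPENSSL_IA32_SSE2 sets $xmm above,
    # enabling the SSSE3/AVX paths; a trailing "386" argument would restrict
    # output to 80386-compatible instructions via the asm_init() check.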