ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/sha/asm/sha-armv8.pl
@@ -0,0 +1,436 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+ #
10
+ # SHA256/512 for ARMv8.
11
+ #
12
+ # Performance in cycles per processed byte and improvement coefficient
13
+ # over code generated with "default" compiler:
14
+ #
15
+ #              SHA256-hw    SHA256(*)       SHA512
16
+ # Apple A7     1.97         10.5 (+33%)     6.73 (-1%(**))
17
+ # Cortex-A53   2.38         15.5 (+115%)    10.0 (+150%(***))
18
+ # Cortex-A57   2.31         11.6 (+86%)     7.51 (+260%(***))
19
+ # Denver       2.01         10.5 (+26%)     6.70 (+8%)
20
+ # X-Gene       -            20.0 (+100%)    12.8 (+300%(***))
21
+ #
22
+ # (*) Software SHA256 results are of lesser relevance, presented
23
+ # mostly for informational purposes.
24
+ # (**) The result is a trade-off: it's possible to improve it by
25
+ # 10% (or by 1 cycle per round), but at the cost of 20% loss
26
+ # on Cortex-A53 (or by 4 cycles per round).
27
+ # (***) Super-impressive coefficients over gcc-generated code are
28
+ # an indication of some compiler "pathology", most notably code
29
+ # generated with -mgeneral-regs-only is significantly faster
30
+ # and the gap is only 40-90%.
31
+
32
+ $flavour=shift;
33
+ # Unlike most perlasm files, sha512-armv8.pl takes an additional argument to
34
+ # determine which hash function to emit. This differs from upstream OpenSSL so
35
+ # that the script may continue to output to stdout.
36
+ $variant=shift;
37
+ $output=shift;
38
+
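The script is therefore driven by three positional arguments: a perlasm flavour for arm-xlate.pl, the variant name ("sha256" or "sha512"), and an optional output file (left empty to keep writing to stdout). A hypothetical invocation, with the flavour and file names chosen purely for illustration rather than taken from the build files:

    perl sha-armv8.pl linux64 sha256 sha256-armv8.S     # emit the SHA-256 variant
    perl sha-armv8.pl linux64 sha512 > sha512-armv8.S   # emit SHA-512 on stdout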
39
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42
+ die "can't locate arm-xlate.pl";
43
+
44
+ open OUT,"| \"$^X\" $xlate $flavour $output";
45
+ *STDOUT=*OUT;
46
+
47
+ if ($variant eq "sha512") {
48
+ $BITS=512;
49
+ $SZ=8;
50
+ @Sigma0=(28,34,39);
51
+ @Sigma1=(14,18,41);
52
+ @sigma0=(1, 8, 7);
53
+ @sigma1=(19,61, 6);
54
+ $rounds=80;
55
+ $reg_t="x";
56
+ } elsif ($variant eq "sha256") {
57
+ $BITS=256;
58
+ $SZ=4;
59
+ @Sigma0=( 2,13,22);
60
+ @Sigma1=( 6,11,25);
61
+ @sigma0=( 7,18, 3);
62
+ @sigma1=(17,19,10);
63
+ $rounds=64;
64
+ $reg_t="w";
65
+ } else {
66
+ die "Unknown variant: $variant";
67
+ }
68
+
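The per-variant rotation and shift counts above are the standard FIPS 180-4 parameters; the last entry of each lowercase array is a plain shift rather than a rotation. In the usual notation:

    \Sigma_0(x) = \mathrm{ROTR}^{r_1}(x) \oplus \mathrm{ROTR}^{r_2}(x) \oplus \mathrm{ROTR}^{r_3}(x),  (r_1,r_2,r_3) = @Sigma0
    \Sigma_1(x) = \mathrm{ROTR}^{r_1}(x) \oplus \mathrm{ROTR}^{r_2}(x) \oplus \mathrm{ROTR}^{r_3}(x),  (r_1,r_2,r_3) = @Sigma1
    \sigma_0(x) = \mathrm{ROTR}^{r_1}(x) \oplus \mathrm{ROTR}^{r_2}(x) \oplus \mathrm{SHR}^{s}(x),     (r_1,r_2,s) = @sigma0
    \sigma_1(x) = \mathrm{ROTR}^{r_1}(x) \oplus \mathrm{ROTR}^{r_2}(x) \oplus \mathrm{SHR}^{s}(x),     (r_1,r_2,s) = @sigma1

so SHA-256 gets \sigma_0(x) = \mathrm{ROTR}^{7}(x) \oplus \mathrm{ROTR}^{18}(x) \oplus \mathrm{SHR}^{3}(x), while SHA-512 gets \sigma_0(x) = \mathrm{ROTR}^{1}(x) \oplus \mathrm{ROTR}^{8}(x) \oplus \mathrm{SHR}^{7}(x).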
69
+ $func="sha${BITS}_block_data_order";
70
+
71
+ ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
72
+
73
+ @X=map("$reg_t$_",(3..15,0..2));
74
+ @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
75
+ ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
76
+
77
+ sub BODY_00_xx {
78
+ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
79
+ my $j=($i+1)&15;
80
+ my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
81
+ $T0=@X[$i+3] if ($i<11);
82
+
83
+ $code.=<<___ if ($i<16);
84
+ #ifndef __ARMEB__
85
+ rev @X[$i],@X[$i] // $i
86
+ #endif
87
+ ___
88
+ $code.=<<___ if ($i<13 && ($i&1));
89
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
90
+ ___
91
+ $code.=<<___ if ($i==13);
92
+ ldp @X[14],@X[15],[$inp]
93
+ ___
94
+ $code.=<<___ if ($i>=14);
95
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
96
+ ___
97
+ $code.=<<___ if ($i>0 && $i<16);
98
+ add $a,$a,$t1 // h+=Sigma0(a)
99
+ ___
100
+ $code.=<<___ if ($i>=11);
101
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
102
+ ___
103
+ # While ARMv8 specifies merged rotate-and-logical operations such as
104
+ # 'eor x,y,z,ror#n', they were found to negatively affect performance
105
+ # on Apple A7. The reason seems to be that they require even 'y' to
106
+ # be available earlier. This means that such a merged instruction is
107
+ # not necessarily the best choice on the critical path... On the other
108
+ # hand, Cortex-A5x handles merged instructions much better than disjoint
109
+ # rotate and logical... See the (**) footnote above.
110
+ $code.=<<___ if ($i<15);
111
+ ror $t0,$e,#$Sigma1[0]
112
+ add $h,$h,$t2 // h+=K[i]
113
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
114
+ and $t1,$f,$e
115
+ bic $t2,$g,$e
116
+ add $h,$h,@X[$i&15] // h+=X[i]
117
+ orr $t1,$t1,$t2 // Ch(e,f,g)
118
+ eor $t2,$a,$b // a^b, b^c in next round
119
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
120
+ ror $T0,$a,#$Sigma0[0]
121
+ add $h,$h,$t1 // h+=Ch(e,f,g)
122
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
123
+ add $h,$h,$t0 // h+=Sigma1(e)
124
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
125
+ add $d,$d,$h // d+=h
126
+ eor $t3,$t3,$b // Maj(a,b,c)
127
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
128
+ add $h,$h,$t3 // h+=Maj(a,b,c)
129
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
130
+ //add $h,$h,$t1 // h+=Sigma0(a)
131
+ ___
132
+ $code.=<<___ if ($i>=15);
133
+ ror $t0,$e,#$Sigma1[0]
134
+ add $h,$h,$t2 // h+=K[i]
135
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
136
+ and $t1,$f,$e
137
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
138
+ bic $t2,$g,$e
139
+ ror $T0,$a,#$Sigma0[0]
140
+ add $h,$h,@X[$i&15] // h+=X[i]
141
+ eor $t0,$t0,$e,ror#$Sigma1[1]
142
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
143
+ orr $t1,$t1,$t2 // Ch(e,f,g)
144
+ eor $t2,$a,$b // a^b, b^c in next round
145
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
146
+ eor $T0,$T0,$a,ror#$Sigma0[1]
147
+ add $h,$h,$t1 // h+=Ch(e,f,g)
148
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
149
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
150
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
151
+ add $h,$h,$t0 // h+=Sigma1(e)
152
+ eor $t3,$t3,$b // Maj(a,b,c)
153
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
154
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
155
+ add @X[$j],@X[$j],@X[($j+9)&15]
156
+ add $d,$d,$h // d+=h
157
+ add $h,$h,$t3 // h+=Maj(a,b,c)
158
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
159
+ add @X[$j],@X[$j],$T1
160
+ add $h,$h,$t1 // h+=Sigma0(a)
161
+ add @X[$j],@X[$j],$T2
162
+ ___
163
+ ($t2,$t3)=($t3,$t2);
164
+ }
165
+
166
+ $code.=<<___;
167
+ #include <openssl/arm_arch.h>
168
+
169
+ .text
170
+
171
+ .extern OPENSSL_armcap_P
172
+ .globl $func
173
+ .type $func,%function
174
+ .align 6
175
+ $func:
176
+ ___
177
+ $code.=<<___ if ($SZ==4);
178
+ ldr x16,.LOPENSSL_armcap_P
179
+ adr x17,.LOPENSSL_armcap_P
180
+ add x16,x16,x17
181
+ ldr w16,[x16]
182
+ tst w16,#ARMV8_SHA256
183
+ b.ne .Lv8_entry
184
+ ___
185
+ $code.=<<___;
186
+ stp x29,x30,[sp,#-128]!
187
+ add x29,sp,#0
188
+
189
+ stp x19,x20,[sp,#16]
190
+ stp x21,x22,[sp,#32]
191
+ stp x23,x24,[sp,#48]
192
+ stp x25,x26,[sp,#64]
193
+ stp x27,x28,[sp,#80]
194
+ sub sp,sp,#4*$SZ
195
+
196
+ ldp $A,$B,[$ctx] // load context
197
+ ldp $C,$D,[$ctx,#2*$SZ]
198
+ ldp $E,$F,[$ctx,#4*$SZ]
199
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
200
+ ldp $G,$H,[$ctx,#6*$SZ]
201
+ adr $Ktbl,.LK$BITS
202
+ stp $ctx,$num,[x29,#96]
203
+
204
+ .Loop:
205
+ ldp @X[0],@X[1],[$inp],#2*$SZ
206
+ ldr $t2,[$Ktbl],#$SZ // *K++
207
+ eor $t3,$B,$C // magic seed
208
+ str $inp,[x29,#112]
209
+ ___
210
+ for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
211
+ $code.=".Loop_16_xx:\n";
212
+ for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
213
+ $code.=<<___;
214
+ cbnz $t2,.Loop_16_xx
215
+
216
+ ldp $ctx,$num,[x29,#96]
217
+ ldr $inp,[x29,#112]
218
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
219
+
220
+ ldp @X[0],@X[1],[$ctx]
221
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
222
+ add $inp,$inp,#14*$SZ // advance input pointer
223
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
224
+ add $A,$A,@X[0]
225
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
226
+ add $B,$B,@X[1]
227
+ add $C,$C,@X[2]
228
+ add $D,$D,@X[3]
229
+ stp $A,$B,[$ctx]
230
+ add $E,$E,@X[4]
231
+ add $F,$F,@X[5]
232
+ stp $C,$D,[$ctx,#2*$SZ]
233
+ add $G,$G,@X[6]
234
+ add $H,$H,@X[7]
235
+ cmp $inp,$num
236
+ stp $E,$F,[$ctx,#4*$SZ]
237
+ stp $G,$H,[$ctx,#6*$SZ]
238
+ b.ne .Loop
239
+
240
+ ldp x19,x20,[x29,#16]
241
+ add sp,sp,#4*$SZ
242
+ ldp x21,x22,[x29,#32]
243
+ ldp x23,x24,[x29,#48]
244
+ ldp x25,x26,[x29,#64]
245
+ ldp x27,x28,[x29,#80]
246
+ ldp x29,x30,[sp],#128
247
+ ret
248
+ .size $func,.-$func
249
+
250
+ .align 6
251
+ .type .LK$BITS,%object
252
+ .LK$BITS:
253
+ ___
254
+ $code.=<<___ if ($SZ==8);
255
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
256
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
257
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
258
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
259
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
260
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
261
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
262
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
263
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
264
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
265
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
266
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
267
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
268
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
269
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
270
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
271
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
272
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
273
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
274
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
275
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
276
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
277
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
278
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
279
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
280
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
281
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
282
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
283
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
284
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
285
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
286
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
287
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
288
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
289
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
290
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
291
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
292
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
293
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
294
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
295
+ .quad 0 // terminator
296
+ ___
297
+ $code.=<<___ if ($SZ==4);
298
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
299
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
300
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
301
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
302
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
303
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
304
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
305
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
306
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
307
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
308
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
309
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
310
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
311
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
312
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
313
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
314
+ .long 0 //terminator
315
+ ___
316
+ $code.=<<___;
317
+ .size .LK$BITS,.-.LK$BITS
318
+ .align 3
319
+ .LOPENSSL_armcap_P:
320
+ .quad OPENSSL_armcap_P-.
321
+ .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
322
+ .align 2
323
+ ___
324
+
325
+ if ($SZ==4) {
326
+ my $Ktbl="x3";
327
+
328
+ my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
329
+ my @MSG=map("v$_.16b",(4..7));
330
+ my ($W0,$W1)=("v16.4s","v17.4s");
331
+ my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
332
+
333
+ $code.=<<___;
334
+ .type sha256_block_armv8,%function
335
+ .align 6
336
+ sha256_block_armv8:
337
+ .Lv8_entry:
338
+ stp x29,x30,[sp,#-16]!
339
+ add x29,sp,#0
340
+
341
+ ld1.32 {$ABCD,$EFGH},[$ctx]
342
+ adr $Ktbl,.LK256
343
+
344
+ .Loop_hw:
345
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
346
+ sub $num,$num,#1
347
+ ld1.32 {$W0},[$Ktbl],#16
348
+ rev32 @MSG[0],@MSG[0]
349
+ rev32 @MSG[1],@MSG[1]
350
+ rev32 @MSG[2],@MSG[2]
351
+ rev32 @MSG[3],@MSG[3]
352
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
353
+ orr $EFGH_SAVE,$EFGH,$EFGH
354
+ ___
355
+ for($i=0;$i<12;$i++) {
356
+ $code.=<<___;
357
+ ld1.32 {$W1},[$Ktbl],#16
358
+ add.i32 $W0,$W0,@MSG[0]
359
+ sha256su0 @MSG[0],@MSG[1]
360
+ orr $abcd,$ABCD,$ABCD
361
+ sha256h $ABCD,$EFGH,$W0
362
+ sha256h2 $EFGH,$abcd,$W0
363
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
364
+ ___
365
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
366
+ }
367
+ $code.=<<___;
368
+ ld1.32 {$W1},[$Ktbl],#16
369
+ add.i32 $W0,$W0,@MSG[0]
370
+ orr $abcd,$ABCD,$ABCD
371
+ sha256h $ABCD,$EFGH,$W0
372
+ sha256h2 $EFGH,$abcd,$W0
373
+
374
+ ld1.32 {$W0},[$Ktbl],#16
375
+ add.i32 $W1,$W1,@MSG[1]
376
+ orr $abcd,$ABCD,$ABCD
377
+ sha256h $ABCD,$EFGH,$W1
378
+ sha256h2 $EFGH,$abcd,$W1
379
+
380
+ ld1.32 {$W1},[$Ktbl]
381
+ add.i32 $W0,$W0,@MSG[2]
382
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
383
+ orr $abcd,$ABCD,$ABCD
384
+ sha256h $ABCD,$EFGH,$W0
385
+ sha256h2 $EFGH,$abcd,$W0
386
+
387
+ add.i32 $W1,$W1,@MSG[3]
388
+ orr $abcd,$ABCD,$ABCD
389
+ sha256h $ABCD,$EFGH,$W1
390
+ sha256h2 $EFGH,$abcd,$W1
391
+
392
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
393
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
394
+
395
+ cbnz $num,.Loop_hw
396
+
397
+ st1.32 {$ABCD,$EFGH},[$ctx]
398
+
399
+ ldr x29,[sp],#16
400
+ ret
401
+ .size sha256_block_armv8,.-sha256_block_armv8
402
+ ___
403
+ }
404
+
405
+ $code.=<<___;
406
+ .comm OPENSSL_armcap_P,4,4
407
+ ___
408
+
409
+ { my %opcode = (
410
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
411
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
412
+
413
+ sub unsha256 {
414
+ my ($mnemonic,$arg)=@_;
415
+
416
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
417
+ &&
418
+ sprintf ".inst\t0x%08x\t//%s %s",
419
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
420
+ $mnemonic,$arg;
421
+ }
422
+ }
423
+
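Given the opcode table and the bit-field packing in unsha256() above, the word emitted for any one instruction can be checked by hand; a minimal sketch in the same spirit (register numbers picked arbitrarily for illustration):

    # sha256h v0.4s, v1.4s, v16.4s  ->  Rd=0, Rn=1, Rm=16
    my $word = 0x5e004000 | 0 | (1 << 5) | (16 << 16);
    printf ".inst\t0x%08x\t//%s %s\n", $word, "sha256h", "v0.4s,v1.4s,v16.4s";
    # prints: .inst 0x5e104020 //sha256h v0.4s,v1.4s,v16.4s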
424
+ foreach(split("\n",$code)) {
425
+
426
+ s/\`([^\`]*)\`/eval($1)/geo;
427
+
428
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
429
+
430
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
431
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
432
+
433
+ print $_,"\n";
434
+ }
435
+
436
+ close STDOUT;
data/vendor/ring/crypto/sha/asm/sha-x86_64.pl
@@ -0,0 +1,2390 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. Rights for redistribution and usage in source and binary
6
+ # forms are granted according to the OpenSSL license.
7
+ # ====================================================================
8
+ #
9
+ # sha256/512_block procedure for x86_64.
10
+ #
11
+ # 40% improvement over compiler-generated code on Opteron. On EM64T
12
+ # sha256 was observed to run >80% faster and sha512 - >40%. No magical
13
+ # tricks, just straight implementation... I really wonder why gcc
14
+ # [being armed with inline assembler] fails to generate as fast code.
15
+ # The only thing which is cool about this module is that it's the very
16
+ # same instruction sequence used for both SHA-256 and SHA-512. In the
17
+ # former case the instructions operate on 32-bit operands, while in the
18
+ # latter on 64-bit ones. All I had to do was get one flavor right;
19
+ # the other one passed the test right away:-)
20
+ #
21
+ # sha256_block runs in ~1005 cycles on Opteron, which gives you
22
+ # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23
+ # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24
+ # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25
+ # Well, if you compare it to IA-64 implementation, which maintains
26
+ # X[16] in register bank[!], tends to 4 instructions per CPU clock
27
+ # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28
+ # issue Opteron pipeline and X[16] maintained in memory. So that *if*
29
+ # there is a way to improve it, *then* the only way would be to try to
30
+ # offload X[16] updates to SSE unit, but that would require "deeper"
31
+ # loop unroll, which in turn would naturally cause size blow-up, not
32
+ # to mention increased complexity! And once again, only *if* it's
33
+ # actually possible to noticeably improve overall ILP, instruction
34
+ # level parallelism, on a given CPU implementation in this case.
35
+ #
36
+ # Special note on Intel EM64T. While Opteron CPU exhibits perfect
37
+ # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38
+ # [currently available] EM64T CPUs apparently are far from it. On the
39
+ # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40
+ # sha256_block:-( This is presumably because 64-bit shifts/rotates
41
+ # apparently are not atomic instructions, but implemented in microcode.
42
+ #
43
+ # May 2012.
44
+ #
45
+ # Optimization including one of Pavel Semjanov's ideas, alternative
46
+ # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
47
+ # unfortunately -2% SHA512 on P4 [which nobody should care about
48
+ # that much].
49
+ #
50
+ # June 2012.
51
+ #
52
+ # Add SIMD code paths, see below for improvement coefficients. SSSE3
53
+ # code path was not attempted for SHA512, because improvement is not
54
+ # estimated to be high enough, noticeably less than 9%, to justify
55
+ # the effort, not on pre-AVX processors. [Obviously with exclusion
56
+ # for VIA Nano, but it has a SHA512 instruction that is faster and
57
+ # should be used instead.] For reference, corresponding estimated
58
+ # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
59
+ # higher coefficients are observed on VIA Nano and Bulldozer has more
60
+ # to do with specifics of their architecture [which is topic for
61
+ # separate discussion].
62
+ #
63
+ # November 2012.
64
+ #
65
+ # Add AVX2 code path. Two consecutive input blocks are loaded to
66
+ # 256-bit %ymm registers, with data from first block to least
67
+ # significant 128-bit halves and data from second to most significant.
68
+ # The data is then processed with same SIMD instruction sequence as
69
+ # for AVX, but with %ymm as operands. Side effect is increased stack
70
+ # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
71
+ # code size increase.
72
+ #
73
+ # March 2014.
74
+ #
75
+ # Add support for Intel SHA Extensions.
76
+
77
+ ######################################################################
78
+ # Current performance in cycles per processed byte (less is better):
79
+ #
80
+ #                SHA256   SSSE3        AVX/XOP(*)        SHA512   AVX/XOP(*)
81
+ #
82
+ # AMD K8         14.9     -            -                 9.57     -
83
+ # P4             17.3     -            -                 30.8     -
84
+ # Core 2         15.6     13.8(+13%)   -                 9.97     -
85
+ # Westmere       14.8     12.3(+19%)   -                 9.58     -
86
+ # Sandy Bridge   17.4     14.2(+23%)   11.6(+50%(**))    11.2     8.10(+38%(**))
87
+ # Ivy Bridge     12.6     10.5(+20%)   10.3(+22%)        8.17     7.22(+13%)
88
+ # Haswell        12.2     9.28(+31%)   7.80(+56%)        7.66     5.40(+42%)
89
+ # Bulldozer      21.1     13.6(+54%)   13.6(+54%(***))   13.5     8.58(+57%)
90
+ # VIA Nano       23.0     16.5(+39%)   -                 14.7     -
91
+ # Atom           23.0     18.9(+22%)   -                 14.7     -
92
+ # Silvermont     27.4     20.6(+33%)   -                 17.5     -
93
+ #
94
+ # (*) whichever best applicable;
95
+ # (**) the switch from ror to shrd accounts for a fair share of the improvement;
96
+ # (***) execution time is fully determined by the remaining integer-only
97
+ # part, body_00_15; reducing the number of SIMD instructions
98
+ # below a certain limit makes no difference/sense; to conserve
99
+ # space, the SHA256 XOP code path is therefore omitted;
100
+
101
+ $flavour = shift;
102
+ $output = shift;
103
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104
+
105
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106
+
107
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110
+ die "can't locate x86_64-xlate.pl";
111
+
112
+ # In upstream, this is controlled by shelling out to the compiler to check
113
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
114
+ # output, so this isn't useful anyway.
115
+ #
116
+ # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
117
+ # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
118
+ # did not tie them together until after $shaext was added.
119
+ $avx = 1;
120
+
121
+ # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
122
+ # been tested.
123
+ $shaext=0; ### set to zero if compiling for 1.0.1
124
+ $avx=1 if (!$shaext && $avx);
125
+
126
+ open OUT,"| \"$^X\" $xlate $flavour";
127
+ *STDOUT=*OUT;
128
+
129
+ if ($output =~ /512/) {
130
+ $func="sha512_block_data_order";
131
+ $TABLE="K512";
132
+ $SZ=8;
133
+ @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
134
+ "%r8", "%r9", "%r10","%r11");
135
+ ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
136
+ @Sigma0=(28,34,39);
137
+ @Sigma1=(14,18,41);
138
+ @sigma0=(1, 8, 7);
139
+ @sigma1=(19,61, 6);
140
+ $rounds=80;
141
+ } else {
142
+ $func="sha256_block_data_order";
143
+ $TABLE="K256";
144
+ $SZ=4;
145
+ @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
146
+ "%r8d","%r9d","%r10d","%r11d");
147
+ ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
148
+ @Sigma0=( 2,13,22);
149
+ @Sigma1=( 6,11,25);
150
+ @sigma0=( 7,18, 3);
151
+ @sigma1=(17,19,10);
152
+ $rounds=64;
153
+ }
154
+
155
+ $ctx="%rdi"; # 1st arg, zapped by $a3
156
+ $inp="%rsi"; # 2nd arg
157
+ $Tbl="%rbp";
158
+
159
+ $_ctx="16*$SZ+0*8(%rsp)";
160
+ $_inp="16*$SZ+1*8(%rsp)";
161
+ $_end="16*$SZ+2*8(%rsp)";
162
+ $_rsp="16*$SZ+3*8(%rsp)";
163
+ $framesz="16*$SZ+4*8";
164
+
165
+
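A reading aid for the frame these offsets describe (implied by the code rather than spelled out): the low 16*$SZ bytes hold the rotating message-schedule window X[0..15], addressed as $SZ*($i&0xf)(%rsp) in the round bodies below, while the four 8-byte slots above it hold, in order, the saved ctx pointer ($_ctx), the saved inp pointer ($_inp), the end-of-input pointer ($_end) and the caller's %rsp ($_rsp).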
166
+ sub ROUND_00_15()
167
+ { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
168
+ my $STRIDE=$SZ;
169
+ $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
170
+
171
+ $code.=<<___;
172
+ ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
173
+ mov $f,$a2
174
+
175
+ xor $e,$a0
176
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
177
+ xor $g,$a2 # f^g
178
+
179
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
180
+ xor $a,$a1
181
+ and $e,$a2 # (f^g)&e
182
+
183
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
184
+ add $h,$T1 # T1+=h
185
+ xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
186
+
187
+ ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
188
+ xor $e,$a0
189
+ add $a2,$T1 # T1+=Ch(e,f,g)
190
+
191
+ mov $a,$a2
192
+ add ($Tbl),$T1 # T1+=K[round]
193
+ xor $a,$a1
194
+
195
+ xor $b,$a2 # a^b, b^c in next round
196
+ ror \$$Sigma1[0],$a0 # Sigma1(e)
197
+ mov $b,$h
198
+
199
+ and $a2,$a3
200
+ ror \$$Sigma0[0],$a1 # Sigma0(a)
201
+ add $a0,$T1 # T1+=Sigma1(e)
202
+
203
+ xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
204
+ add $T1,$d # d+=T1
205
+ add $T1,$h # h+=T1
206
+
207
+ lea $STRIDE($Tbl),$Tbl # round++
208
+ ___
209
+ $code.=<<___ if ($i<15);
210
+ add $a1,$h # h+=Sigma0(a)
211
+ ___
212
+ ($a2,$a3) = ($a3,$a2);
213
+ }
214
+
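Spelled out, ROUND_00_15 is the textbook FIPS 180-4 round that its comments track, with additions taken modulo 2^(8*$SZ) and, once the scheduled 16..xx phase begins, the h += Sigma0(a) term carried into the following round (the "modulo-scheduled" add noted in the comments):

    T_1 = h + \Sigma_1(e) + Ch(e,f,g) + K_t + W_t
    T_2 = \Sigma_0(a) + Maj(a,b,c)
    (h,g,f,e,d,c,b,a) \leftarrow (g,f,e,d+T_1,c,b,a,T_1+T_2)

    Ch(e,f,g)  = (e \wedge f) \oplus (\neg e \wedge g)
    Maj(a,b,c) = (a \wedge b) \oplus (a \wedge c) \oplus (b \wedge c) = Ch(a \oplus b, c, b)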
215
+ sub ROUND_16_XX()
216
+ { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
217
+
218
+ $code.=<<___;
219
+ mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
220
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
221
+
222
+ mov $a0,$T1
223
+ ror \$`$sigma0[1]-$sigma0[0]`,$a0
224
+ add $a1,$a # modulo-scheduled h+=Sigma0(a)
225
+ mov $a2,$a1
226
+ ror \$`$sigma1[1]-$sigma1[0]`,$a2
227
+
228
+ xor $T1,$a0
229
+ shr \$$sigma0[2],$T1
230
+ ror \$$sigma0[0],$a0
231
+ xor $a1,$a2
232
+ shr \$$sigma1[2],$a1
233
+
234
+ ror \$$sigma1[0],$a2
235
+ xor $a0,$T1 # sigma0(X[(i+1)&0xf])
236
+ xor $a1,$a2 # sigma1(X[(i+14)&0xf])
237
+ add `$SZ*(($i+9)&0xf)`(%rsp),$T1
238
+
239
+ add `$SZ*($i&0xf)`(%rsp),$T1
240
+ mov $e,$a0
241
+ add $a2,$T1
242
+ mov $a,$a1
243
+ ___
244
+ &ROUND_00_15(@_);
245
+ }
246
+
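ROUND_16_XX first extends the message schedule and then falls through to the round body above; in the same notation, with the 16-entry window kept on the stack (slot (i+1)&0xf holding W_{t-15}, slot (i+14)&0xf holding W_{t-2}, slot (i+9)&0xf holding W_{t-7}, and slot i&0xf still holding W_{t-16}):

    W_t = \sigma_1(W_{t-2}) + W_{t-7} + \sigma_0(W_{t-15}) + W_{t-16}    (mod 2^{8 \cdot SZ})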
247
+ $code=<<___;
248
+ .text
249
+
250
+ .extern OPENSSL_ia32cap_P
251
+ .globl $func
252
+ .type $func,\@function,3
253
+ .align 16
254
+ $func:
255
+ ___
256
+ $code.=<<___ if ($SZ==4 || $avx);
257
+ lea OPENSSL_ia32cap_P(%rip),%r11
258
+ mov 0(%r11),%r9d
259
+ mov 4(%r11),%r10d
260
+ mov 8(%r11),%r11d
261
+ ___
262
+ $code.=<<___ if ($SZ==4 && $shaext);
263
+ test \$`1<<29`,%r11d # check for SHA
264
+ jnz _shaext_shortcut
265
+ ___
266
+ $code.=<<___ if ($avx && $SZ==8);
267
+ test \$`1<<11`,%r10d # check for XOP
268
+ jnz .Lxop_shortcut
269
+ ___
270
+ $code.=<<___ if ($avx>1);
271
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
272
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
273
+ je .Lavx2_shortcut
274
+ ___
275
+ $code.=<<___ if ($avx);
276
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
277
+ and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
278
+ or %r9d,%r10d
279
+ cmp \$`1<<28|1<<9|1<<30`,%r10d
280
+ je .Lavx_shortcut
281
+ ___
282
+ $code.=<<___ if ($SZ==4);
283
+ test \$`1<<9`,%r10d
284
+ jnz .Lssse3_shortcut
285
+ ___
286
+ $code.=<<___;
287
+ push %rbx
288
+ push %rbp
289
+ push %r12
290
+ push %r13
291
+ push %r14
292
+ push %r15
293
+ mov %rsp,%r11 # copy %rsp
294
+ shl \$4,%rdx # num*16
295
+ sub \$$framesz,%rsp
296
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
297
+ and \$-64,%rsp # align stack frame
298
+ mov $ctx,$_ctx # save ctx, 1st arg
299
+ mov $inp,$_inp # save inp, 2nd arg
300
+ mov %rdx,$_end # save end pointer, "3rd" arg
301
+ mov %r11,$_rsp # save copy of %rsp
302
+ .Lprologue:
303
+
304
+ mov $SZ*0($ctx),$A
305
+ mov $SZ*1($ctx),$B
306
+ mov $SZ*2($ctx),$C
307
+ mov $SZ*3($ctx),$D
308
+ mov $SZ*4($ctx),$E
309
+ mov $SZ*5($ctx),$F
310
+ mov $SZ*6($ctx),$G
311
+ mov $SZ*7($ctx),$H
312
+ jmp .Lloop
313
+
314
+ .align 16
315
+ .Lloop:
316
+ mov $B,$a3
317
+ lea $TABLE(%rip),$Tbl
318
+ xor $C,$a3 # magic
319
+ ___
320
+ for($i=0;$i<16;$i++) {
321
+ $code.=" mov $SZ*$i($inp),$T1\n";
322
+ $code.=" mov @ROT[4],$a0\n";
323
+ $code.=" mov @ROT[0],$a1\n";
324
+ $code.=" bswap $T1\n";
325
+ &ROUND_00_15($i,@ROT);
326
+ unshift(@ROT,pop(@ROT));
327
+ }
328
+ $code.=<<___;
329
+ jmp .Lrounds_16_xx
330
+ .align 16
331
+ .Lrounds_16_xx:
332
+ ___
333
+ for(;$i<32;$i++) {
334
+ &ROUND_16_XX($i,@ROT);
335
+ unshift(@ROT,pop(@ROT));
336
+ }
337
+
338
+ $code.=<<___;
339
+ cmpb \$0,`$SZ-1`($Tbl)
340
+ jnz .Lrounds_16_xx
341
+
342
+ mov $_ctx,$ctx
343
+ add $a1,$A # modulo-scheduled h+=Sigma0(a)
344
+ lea 16*$SZ($inp),$inp
345
+
346
+ add $SZ*0($ctx),$A
347
+ add $SZ*1($ctx),$B
348
+ add $SZ*2($ctx),$C
349
+ add $SZ*3($ctx),$D
350
+ add $SZ*4($ctx),$E
351
+ add $SZ*5($ctx),$F
352
+ add $SZ*6($ctx),$G
353
+ add $SZ*7($ctx),$H
354
+
355
+ cmp $_end,$inp
356
+
357
+ mov $A,$SZ*0($ctx)
358
+ mov $B,$SZ*1($ctx)
359
+ mov $C,$SZ*2($ctx)
360
+ mov $D,$SZ*3($ctx)
361
+ mov $E,$SZ*4($ctx)
362
+ mov $F,$SZ*5($ctx)
363
+ mov $G,$SZ*6($ctx)
364
+ mov $H,$SZ*7($ctx)
365
+ jb .Lloop
366
+
367
+ mov $_rsp,%rsi
368
+ mov (%rsi),%r15
369
+ mov 8(%rsi),%r14
370
+ mov 16(%rsi),%r13
371
+ mov 24(%rsi),%r12
372
+ mov 32(%rsi),%rbp
373
+ mov 40(%rsi),%rbx
374
+ lea 48(%rsi),%rsp
375
+ .Lepilogue:
376
+ ret
377
+ .size $func,.-$func
378
+ ___
379
+
380
+ if ($SZ==4) {
381
+ $code.=<<___;
382
+ .align 64
383
+ .type $TABLE,\@object
384
+ $TABLE:
385
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
386
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
387
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
388
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
389
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
390
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
391
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
392
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
393
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
394
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
395
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
396
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
397
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
398
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
399
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
400
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
401
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
402
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
403
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
404
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
405
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
406
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
407
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
408
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
409
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
410
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
411
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
412
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
413
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
414
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
415
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
416
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
417
+
418
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
419
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
420
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
421
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
422
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
423
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
424
+ .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
425
+ ___
426
+ } else {
427
+ $code.=<<___;
428
+ .align 64
429
+ .type $TABLE,\@object
430
+ $TABLE:
431
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
432
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
433
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
435
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
436
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
437
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
438
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
439
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
440
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
441
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
443
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
444
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
445
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
446
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
447
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
448
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
449
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
451
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
452
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
453
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
455
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
456
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
457
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
458
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
459
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
460
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
461
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
462
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
463
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
464
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
465
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
467
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
468
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
469
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
470
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
471
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
472
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
473
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
474
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
475
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
476
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
477
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
478
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
479
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
480
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
481
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
483
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
485
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
487
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
488
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
489
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
490
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
491
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
492
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
493
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
494
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
495
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
496
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
497
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
499
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
500
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
501
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
502
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
503
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
504
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
505
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
507
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
509
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
510
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
511
+
512
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
513
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
514
+ .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
515
+ ___
516
+ }
517
+
518
+ ######################################################################
519
+ # SIMD code paths
520
+ #
521
+ if ($SZ==4 && $shaext) {{{
522
+ ######################################################################
523
+ # Intel SHA Extensions implementation of SHA256 update function.
524
+ #
525
+ my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
526
+
527
+ my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528
+ my @MSG=map("%xmm$_",(3..6));
529
+
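A reading aid for the loop that follows: sha256rnds2 performs two SHA-256 rounds and takes its pair of W+K words from the implicit xmm0 operand (here $Wi), so each 16-byte group of round constants is loaded, added to the corresponding message words with paddd, consumed by one sha256rnds2, and then pshufd \$0x0e,$Wi,$Wi exposes the upper pair for the second sha256rnds2; sha256msg1 and sha256msg2 carry out the sigma0 and sigma1 halves of the message-schedule update.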
530
+ $code.=<<___;
531
+ .type sha256_block_data_order_shaext,\@function,3
532
+ .align 64
533
+ sha256_block_data_order_shaext:
534
+ _shaext_shortcut:
535
+ ___
536
+ $code.=<<___ if ($win64);
537
+ lea `-8-5*16`(%rsp),%rsp
538
+ movaps %xmm6,-8-5*16(%rax)
539
+ movaps %xmm7,-8-4*16(%rax)
540
+ movaps %xmm8,-8-3*16(%rax)
541
+ movaps %xmm9,-8-2*16(%rax)
542
+ movaps %xmm10,-8-1*16(%rax)
543
+ .Lprologue_shaext:
544
+ ___
545
+ $code.=<<___;
546
+ lea K256+0x80(%rip),$Tbl
547
+ movdqu ($ctx),$ABEF # DCBA
548
+ movdqu 16($ctx),$CDGH # HGFE
549
+ movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
550
+
551
+ pshufd \$0x1b,$ABEF,$Wi # ABCD
552
+ pshufd \$0xb1,$ABEF,$ABEF # CDAB
553
+ pshufd \$0x1b,$CDGH,$CDGH # EFGH
554
+ movdqa $TMP,$BSWAP # offload
555
+ palignr \$8,$CDGH,$ABEF # ABEF
556
+ punpcklqdq $Wi,$CDGH # CDGH
557
+ jmp .Loop_shaext
558
+
559
+ .align 16
560
+ .Loop_shaext:
561
+ movdqu ($inp),@MSG[0]
562
+ movdqu 0x10($inp),@MSG[1]
563
+ movdqu 0x20($inp),@MSG[2]
564
+ pshufb $TMP,@MSG[0]
565
+ movdqu 0x30($inp),@MSG[3]
566
+
567
+ movdqa 0*32-0x80($Tbl),$Wi
568
+ paddd @MSG[0],$Wi
569
+ pshufb $TMP,@MSG[1]
570
+ movdqa $CDGH,$CDGH_SAVE # offload
571
+ sha256rnds2 $ABEF,$CDGH # 0-3
572
+ pshufd \$0x0e,$Wi,$Wi
573
+ nop
574
+ movdqa $ABEF,$ABEF_SAVE # offload
575
+ sha256rnds2 $CDGH,$ABEF
576
+
577
+ movdqa 1*32-0x80($Tbl),$Wi
578
+ paddd @MSG[1],$Wi
579
+ pshufb $TMP,@MSG[2]
580
+ sha256rnds2 $ABEF,$CDGH # 4-7
581
+ pshufd \$0x0e,$Wi,$Wi
582
+ lea 0x40($inp),$inp
583
+ sha256msg1 @MSG[1],@MSG[0]
584
+ sha256rnds2 $CDGH,$ABEF
585
+
586
+ movdqa 2*32-0x80($Tbl),$Wi
587
+ paddd @MSG[2],$Wi
588
+ pshufb $TMP,@MSG[3]
589
+ sha256rnds2 $ABEF,$CDGH # 8-11
590
+ pshufd \$0x0e,$Wi,$Wi
591
+ movdqa @MSG[3],$TMP
592
+ palignr \$4,@MSG[2],$TMP
593
+ nop
594
+ paddd $TMP,@MSG[0]
595
+ sha256msg1 @MSG[2],@MSG[1]
596
+ sha256rnds2 $CDGH,$ABEF
597
+
598
+ movdqa 3*32-0x80($Tbl),$Wi
599
+ paddd @MSG[3],$Wi
600
+ sha256msg2 @MSG[3],@MSG[0]
601
+ sha256rnds2 $ABEF,$CDGH # 12-15
602
+ pshufd \$0x0e,$Wi,$Wi
603
+ movdqa @MSG[0],$TMP
604
+ palignr \$4,@MSG[3],$TMP
605
+ nop
606
+ paddd $TMP,@MSG[1]
607
+ sha256msg1 @MSG[3],@MSG[2]
608
+ sha256rnds2 $CDGH,$ABEF
609
+ ___
610
+ for($i=4;$i<16-3;$i++) {
611
+ $code.=<<___;
612
+ movdqa $i*32-0x80($Tbl),$Wi
613
+ paddd @MSG[0],$Wi
614
+ sha256msg2 @MSG[0],@MSG[1]
615
+ sha256rnds2 $ABEF,$CDGH # 16-19...
616
+ pshufd \$0x0e,$Wi,$Wi
617
+ movdqa @MSG[1],$TMP
618
+ palignr \$4,@MSG[0],$TMP
619
+ nop
620
+ paddd $TMP,@MSG[2]
621
+ sha256msg1 @MSG[0],@MSG[3]
622
+ sha256rnds2 $CDGH,$ABEF
623
+ ___
624
+ push(@MSG,shift(@MSG));
625
+ }
626
+ $code.=<<___;
627
+ movdqa 13*32-0x80($Tbl),$Wi
628
+ paddd @MSG[0],$Wi
629
+ sha256msg2 @MSG[0],@MSG[1]
630
+ sha256rnds2 $ABEF,$CDGH # 52-55
631
+ pshufd \$0x0e,$Wi,$Wi
632
+ movdqa @MSG[1],$TMP
633
+ palignr \$4,@MSG[0],$TMP
634
+ sha256rnds2 $CDGH,$ABEF
635
+ paddd $TMP,@MSG[2]
636
+
637
+ movdqa 14*32-0x80($Tbl),$Wi
638
+ paddd @MSG[1],$Wi
639
+ sha256rnds2 $ABEF,$CDGH # 56-59
640
+ pshufd \$0x0e,$Wi,$Wi
641
+ sha256msg2 @MSG[1],@MSG[2]
642
+ movdqa $BSWAP,$TMP
643
+ sha256rnds2 $CDGH,$ABEF
644
+
645
+ movdqa 15*32-0x80($Tbl),$Wi
646
+ paddd @MSG[2],$Wi
647
+ nop
648
+ sha256rnds2 $ABEF,$CDGH # 60-63
649
+ pshufd \$0x0e,$Wi,$Wi
650
+ dec $num
651
+ nop
652
+ sha256rnds2 $CDGH,$ABEF
653
+
654
+ paddd $CDGH_SAVE,$CDGH
655
+ paddd $ABEF_SAVE,$ABEF
656
+ jnz .Loop_shaext
657
+
658
+ pshufd \$0xb1,$CDGH,$CDGH # DCHG
659
+ pshufd \$0x1b,$ABEF,$TMP # FEBA
660
+ pshufd \$0xb1,$ABEF,$ABEF # BAFE
661
+ punpckhqdq $CDGH,$ABEF # DCBA
662
+ palignr \$8,$TMP,$CDGH # HGFE
663
+
664
+ movdqu $ABEF,($ctx)
665
+ movdqu $CDGH,16($ctx)
666
+ ___
667
+ $code.=<<___ if ($win64);
668
+ movaps -8-5*16(%rax),%xmm6
669
+ movaps -8-4*16(%rax),%xmm7
670
+ movaps -8-3*16(%rax),%xmm8
671
+ movaps -8-2*16(%rax),%xmm9
672
+ movaps -8-1*16(%rax),%xmm10
673
+ mov %rax,%rsp
674
+ .Lepilogue_shaext:
675
+ ___
676
+ $code.=<<___;
677
+ ret
678
+ .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
679
+ ___
680
+ }}}
681
+ {{{
682
+
683
+ my $a4=$T1;
684
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
685
+
686
+ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
687
+ { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
688
+ my $arg = pop;
689
+ $arg = "\$$arg" if ($arg*1 eq $arg);
690
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
691
+ }
692
+
693
+ sub body_00_15 () {
694
+ (
695
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
696
+
697
+ '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
698
+ '&mov ($a,$a1)',
699
+ '&mov ($a4,$f)',
700
+
701
+ '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
702
+ '&xor ($a0,$e)',
703
+ '&xor ($a4,$g)', # f^g
704
+
705
+ '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
706
+ '&xor ($a1,$a)',
707
+ '&and ($a4,$e)', # (f^g)&e
708
+
709
+ '&xor ($a0,$e)',
710
+ '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
711
+ '&mov ($a2,$a)',
712
+
713
+ '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
714
+ '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
715
+ '&xor ($a2,$b)', # a^b, b^c in next round
716
+
717
+ '&add ($h,$a4)', # h+=Ch(e,f,g)
718
+ '&ror ($a0,$Sigma1[0])', # Sigma1(e)
719
+ '&and ($a3,$a2)', # (b^c)&(a^b)
720
+
721
+ '&xor ($a1,$a)',
722
+ '&add ($h,$a0)', # h+=Sigma1(e)
723
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
724
+
725
+ '&ror ($a1,$Sigma0[0])', # Sigma0(a)
726
+ '&add ($d,$h)', # d+=h
727
+ '&add ($h,$a3)', # h+=Maj(a,b,c)
728
+
729
+ '&mov ($a0,$d)',
730
+ '&add ($a1,$h);'. # h+=Sigma0(a)
731
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
732
+ );
733
+ }
734
+
735
+ ######################################################################
736
+ # SSSE3 code path
737
+ #
738
+ if ($SZ==4) { # SHA256 only
739
+ my @X = map("%xmm$_",(0..3));
740
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
741
+
742
+ $code.=<<___;
743
+ .type ${func}_ssse3,\@function,3
744
+ .align 64
745
+ ${func}_ssse3:
746
+ .Lssse3_shortcut:
747
+ push %rbx
748
+ push %rbp
749
+ push %r12
750
+ push %r13
751
+ push %r14
752
+ push %r15
753
+ mov %rsp,%r11 # copy %rsp
754
+ shl \$4,%rdx # num*16
755
+ sub \$`$framesz+$win64*16*4`,%rsp
756
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
757
+ and \$-64,%rsp # align stack frame
758
+ mov $ctx,$_ctx # save ctx, 1st arg
759
+ mov $inp,$_inp # save inp, 2nd arg
760
+ mov %rdx,$_end # save end pointer, "3rd" arg
761
+ mov %r11,$_rsp # save copy of %rsp
762
+ ___
763
+ $code.=<<___ if ($win64);
764
+ movaps %xmm6,16*$SZ+32(%rsp)
765
+ movaps %xmm7,16*$SZ+48(%rsp)
766
+ movaps %xmm8,16*$SZ+64(%rsp)
767
+ movaps %xmm9,16*$SZ+80(%rsp)
768
+ ___
769
+ $code.=<<___;
770
+ .Lprologue_ssse3:
771
+
772
+ mov $SZ*0($ctx),$A
773
+ mov $SZ*1($ctx),$B
774
+ mov $SZ*2($ctx),$C
775
+ mov $SZ*3($ctx),$D
776
+ mov $SZ*4($ctx),$E
777
+ mov $SZ*5($ctx),$F
778
+ mov $SZ*6($ctx),$G
779
+ mov $SZ*7($ctx),$H
780
+ ___
781
+
782
+ $code.=<<___;
783
+ #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
784
+ #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
785
+ jmp .Lloop_ssse3
786
+ .align 16
787
+ .Lloop_ssse3:
788
+ movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
789
+ movdqu 0x00($inp),@X[0]
790
+ movdqu 0x10($inp),@X[1]
791
+ movdqu 0x20($inp),@X[2]
792
+ pshufb $t3,@X[0]
793
+ movdqu 0x30($inp),@X[3]
794
+ lea $TABLE(%rip),$Tbl
795
+ pshufb $t3,@X[1]
796
+ movdqa 0x00($Tbl),$t0
797
+ movdqa 0x20($Tbl),$t1
798
+ pshufb $t3,@X[2]
799
+ paddd @X[0],$t0
800
+ movdqa 0x40($Tbl),$t2
801
+ pshufb $t3,@X[3]
802
+ movdqa 0x60($Tbl),$t3
803
+ paddd @X[1],$t1
804
+ paddd @X[2],$t2
805
+ paddd @X[3],$t3
806
+ movdqa $t0,0x00(%rsp)
807
+ mov $A,$a1
808
+ movdqa $t1,0x10(%rsp)
809
+ mov $B,$a3
810
+ movdqa $t2,0x20(%rsp)
811
+ xor $C,$a3 # magic
812
+ movdqa $t3,0x30(%rsp)
813
+ mov $E,$a0
814
+ jmp .Lssse3_00_47
815
+
816
+ .align 16
817
+ .Lssse3_00_47:
818
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
819
+ ___
820
+ sub Xupdate_256_SSSE3 () {
821
+ (
822
+ '&movdqa ($t0,@X[1]);',
823
+ '&movdqa ($t3,@X[3])',
824
+ '&palignr ($t0,@X[0],$SZ)', # X[1..4]
825
+ '&palignr ($t3,@X[2],$SZ);', # X[9..12]
826
+ '&movdqa ($t1,$t0)',
827
+ '&movdqa ($t2,$t0);',
828
+ '&psrld ($t0,$sigma0[2])',
829
+ '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
830
+ '&psrld ($t2,$sigma0[0])',
831
+ '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
832
+ '&pslld ($t1,8*$SZ-$sigma0[1]);'.
833
+ '&pxor ($t0,$t2)',
834
+ '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
835
+ '&pxor ($t0,$t1)',
836
+ '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
837
+ '&pxor ($t0,$t2);',
838
+ '&movdqa ($t2,$t3)',
839
+ '&pxor ($t0,$t1);', # sigma0(X[1..4])
840
+ '&psrld ($t3,$sigma1[2])',
841
+ '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
842
+ '&psrlq ($t2,$sigma1[0])',
843
+ '&pxor ($t3,$t2);',
844
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
845
+ '&pxor ($t3,$t2)',
846
+ '&pshufb ($t3,$t4)', # sigma1(X[14..15])
847
+ '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
848
+ '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
849
+ '&movdqa ($t2,$t3);',
850
+ '&psrld ($t3,$sigma1[2])',
851
+ '&psrlq ($t2,$sigma1[0])',
852
+ '&pxor ($t3,$t2);',
853
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
854
+ '&pxor ($t3,$t2);',
855
+ '&movdqa ($t2,16*2*$j."($Tbl)")',
856
+ '&pshufb ($t3,$t5)',
857
+ '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
858
+ );
859
+ }
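+
+ # Xupdate_256_SSSE3 above vectorizes the SHA-256 message schedule four words
+ # at a time:
+ #	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]   (mod 2^32)
+ # A minimal scalar sketch of the same recurrence (plain Perl, illustration
+ # only; ref_ror32 as defined in the sketch after body_00_15 above):
+ sub ref_sigma0 { my $x=shift; ref_ror32($x,7) ^ref_ror32($x,18)^($x>>3)  }
+ sub ref_sigma1 { my $x=shift; ref_ror32($x,17)^ref_ror32($x,19)^($x>>10) }
+ # so that, given @W[0..15] from the block, the remaining words would follow as
+ #	$W[$i] = (ref_sigma1($W[$i-2])+$W[$i-7]+ref_sigma0($W[$i-15])+$W[$i-16]) & 0xffffffff;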
860
+
861
+ sub SSSE3_256_00_47 () {
862
+ my $j = shift;
863
+ my $body = shift;
864
+ my @X = @_;
865
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
866
+
867
+ if (0) {
868
+ foreach (Xupdate_256_SSSE3()) { # 36 instructions
869
+ eval;
870
+ eval(shift(@insns));
871
+ eval(shift(@insns));
872
+ eval(shift(@insns));
873
+ }
874
+ } else { # squeeze extra 4% on Westmere and 19% on Atom
875
+ eval(shift(@insns)); #@
876
+ &movdqa ($t0,@X[1]);
877
+ eval(shift(@insns));
878
+ eval(shift(@insns));
879
+ &movdqa ($t3,@X[3]);
880
+ eval(shift(@insns)); #@
881
+ eval(shift(@insns));
882
+ eval(shift(@insns));
883
+ eval(shift(@insns)); #@
884
+ eval(shift(@insns));
885
+ &palignr ($t0,@X[0],$SZ); # X[1..4]
886
+ eval(shift(@insns));
887
+ eval(shift(@insns));
888
+ &palignr ($t3,@X[2],$SZ); # X[9..12]
889
+ eval(shift(@insns));
890
+ eval(shift(@insns));
891
+ eval(shift(@insns));
892
+ eval(shift(@insns)); #@
893
+ &movdqa ($t1,$t0);
894
+ eval(shift(@insns));
895
+ eval(shift(@insns));
896
+ &movdqa ($t2,$t0);
897
+ eval(shift(@insns)); #@
898
+ eval(shift(@insns));
899
+ &psrld ($t0,$sigma0[2]);
900
+ eval(shift(@insns));
901
+ eval(shift(@insns));
902
+ eval(shift(@insns));
903
+ &paddd (@X[0],$t3); # X[0..3] += X[9..12]
904
+ eval(shift(@insns)); #@
905
+ eval(shift(@insns));
906
+ &psrld ($t2,$sigma0[0]);
907
+ eval(shift(@insns));
908
+ eval(shift(@insns));
909
+ &pshufd ($t3,@X[3],0b11111010); # X[14..15]
910
+ eval(shift(@insns));
911
+ eval(shift(@insns)); #@
912
+ &pslld ($t1,8*$SZ-$sigma0[1]);
913
+ eval(shift(@insns));
914
+ eval(shift(@insns));
915
+ &pxor ($t0,$t2);
916
+ eval(shift(@insns)); #@
917
+ eval(shift(@insns));
918
+ eval(shift(@insns));
919
+ eval(shift(@insns)); #@
920
+ &psrld ($t2,$sigma0[1]-$sigma0[0]);
921
+ eval(shift(@insns));
922
+ &pxor ($t0,$t1);
923
+ eval(shift(@insns));
924
+ eval(shift(@insns));
925
+ &pslld ($t1,$sigma0[1]-$sigma0[0]);
926
+ eval(shift(@insns));
927
+ eval(shift(@insns));
928
+ &pxor ($t0,$t2);
929
+ eval(shift(@insns));
930
+ eval(shift(@insns)); #@
931
+ &movdqa ($t2,$t3);
932
+ eval(shift(@insns));
933
+ eval(shift(@insns));
934
+ &pxor ($t0,$t1); # sigma0(X[1..4])
935
+ eval(shift(@insns)); #@
936
+ eval(shift(@insns));
937
+ eval(shift(@insns));
938
+ &psrld ($t3,$sigma1[2]);
939
+ eval(shift(@insns));
940
+ eval(shift(@insns));
941
+ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
942
+ eval(shift(@insns)); #@
943
+ eval(shift(@insns));
944
+ &psrlq ($t2,$sigma1[0]);
945
+ eval(shift(@insns));
946
+ eval(shift(@insns));
947
+ eval(shift(@insns));
948
+ &pxor ($t3,$t2);
949
+ eval(shift(@insns)); #@
950
+ eval(shift(@insns));
951
+ eval(shift(@insns));
952
+ eval(shift(@insns)); #@
953
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
954
+ eval(shift(@insns));
955
+ eval(shift(@insns));
956
+ &pxor ($t3,$t2);
957
+ eval(shift(@insns)); #@
958
+ eval(shift(@insns));
959
+ eval(shift(@insns));
960
+ #&pshufb ($t3,$t4); # sigma1(X[14..15])
961
+ &pshufd ($t3,$t3,0b10000000);
962
+ eval(shift(@insns));
963
+ eval(shift(@insns));
964
+ eval(shift(@insns));
965
+ &psrldq ($t3,8);
966
+ eval(shift(@insns));
967
+ eval(shift(@insns)); #@
968
+ eval(shift(@insns));
969
+ eval(shift(@insns));
970
+ eval(shift(@insns)); #@
971
+ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
972
+ eval(shift(@insns));
973
+ eval(shift(@insns));
974
+ eval(shift(@insns));
975
+ &pshufd ($t3,@X[0],0b01010000); # X[16..17]
976
+ eval(shift(@insns));
977
+ eval(shift(@insns)); #@
978
+ eval(shift(@insns));
979
+ &movdqa ($t2,$t3);
980
+ eval(shift(@insns));
981
+ eval(shift(@insns));
982
+ &psrld ($t3,$sigma1[2]);
983
+ eval(shift(@insns));
984
+ eval(shift(@insns)); #@
985
+ &psrlq ($t2,$sigma1[0]);
986
+ eval(shift(@insns));
987
+ eval(shift(@insns));
988
+ &pxor ($t3,$t2);
989
+ eval(shift(@insns)); #@
990
+ eval(shift(@insns));
991
+ eval(shift(@insns));
992
+ eval(shift(@insns)); #@
993
+ eval(shift(@insns));
994
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
995
+ eval(shift(@insns));
996
+ eval(shift(@insns));
997
+ eval(shift(@insns));
998
+ &pxor ($t3,$t2);
999
+ eval(shift(@insns));
1000
+ eval(shift(@insns));
1001
+ eval(shift(@insns)); #@
1002
+ #&pshufb ($t3,$t5);
1003
+ &pshufd ($t3,$t3,0b00001000);
1004
+ eval(shift(@insns));
1005
+ eval(shift(@insns));
1006
+ &movdqa ($t2,16*2*$j."($Tbl)");
1007
+ eval(shift(@insns)); #@
1008
+ eval(shift(@insns));
1009
+ &pslldq ($t3,8);
1010
+ eval(shift(@insns));
1011
+ eval(shift(@insns));
1012
+ eval(shift(@insns));
1013
+ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1014
+ eval(shift(@insns)); #@
1015
+ eval(shift(@insns));
1016
+ eval(shift(@insns));
1017
+ }
1018
+ &paddd ($t2,@X[0]);
1019
+ foreach (@insns) { eval; } # remaining instructions
1020
+ &movdqa (16*$j."(%rsp)",$t2);
1021
+ }
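+
+ # Note: the hand-placed else branch above emits the same instruction stream
+ # as the generic foreach/eval pattern kept in the if(0) branch; only the
+ # interleaving of the Xupdate steps with the four scalar round bodies
+ # differs, which is where the Westmere/Atom gains quoted above come from.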
1022
+
1023
+ for ($i=0,$j=0; $j<4; $j++) {
1024
+ &SSSE3_256_00_47($j,\&body_00_15,@X);
1025
+ push(@X,shift(@X)); # rotate(@X)
1026
+ }
1027
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1028
+ &jne (".Lssse3_00_47");
1029
+
1030
+ for ($i=0; $i<16; ) {
1031
+ foreach(body_00_15()) { eval; }
1032
+ }
1033
+ $code.=<<___;
1034
+ mov $_ctx,$ctx
1035
+ mov $a1,$A
1036
+
1037
+ add $SZ*0($ctx),$A
1038
+ lea 16*$SZ($inp),$inp
1039
+ add $SZ*1($ctx),$B
1040
+ add $SZ*2($ctx),$C
1041
+ add $SZ*3($ctx),$D
1042
+ add $SZ*4($ctx),$E
1043
+ add $SZ*5($ctx),$F
1044
+ add $SZ*6($ctx),$G
1045
+ add $SZ*7($ctx),$H
1046
+
1047
+ cmp $_end,$inp
1048
+
1049
+ mov $A,$SZ*0($ctx)
1050
+ mov $B,$SZ*1($ctx)
1051
+ mov $C,$SZ*2($ctx)
1052
+ mov $D,$SZ*3($ctx)
1053
+ mov $E,$SZ*4($ctx)
1054
+ mov $F,$SZ*5($ctx)
1055
+ mov $G,$SZ*6($ctx)
1056
+ mov $H,$SZ*7($ctx)
1057
+ jb .Lloop_ssse3
1058
+
1059
+ mov $_rsp,%rsi
1060
+ ___
1061
+ $code.=<<___ if ($win64);
1062
+ movaps 16*$SZ+32(%rsp),%xmm6
1063
+ movaps 16*$SZ+48(%rsp),%xmm7
1064
+ movaps 16*$SZ+64(%rsp),%xmm8
1065
+ movaps 16*$SZ+80(%rsp),%xmm9
1066
+ ___
1067
+ $code.=<<___;
1068
+ mov (%rsi),%r15
1069
+ mov 8(%rsi),%r14
1070
+ mov 16(%rsi),%r13
1071
+ mov 24(%rsi),%r12
1072
+ mov 32(%rsi),%rbp
1073
+ mov 40(%rsi),%rbx
1074
+ lea 48(%rsi),%rsp
1075
+ .Lepilogue_ssse3:
1076
+ ret
1077
+ .size ${func}_ssse3,.-${func}_ssse3
1078
+ ___
1079
+ }
1080
+
1081
+ if ($avx) {{
1082
+ ######################################################################
1083
+ # XOP code path
1084
+ #
1085
+ if ($SZ==8) { # SHA512 only
1086
+ $code.=<<___;
1087
+ .type ${func}_xop,\@function,3
1088
+ .align 64
1089
+ ${func}_xop:
1090
+ .Lxop_shortcut:
1091
+ push %rbx
1092
+ push %rbp
1093
+ push %r12
1094
+ push %r13
1095
+ push %r14
1096
+ push %r15
1097
+ mov %rsp,%r11 # copy %rsp
1098
+ shl \$4,%rdx # num*16
1099
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1100
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1101
+ and \$-64,%rsp # align stack frame
1102
+ mov $ctx,$_ctx # save ctx, 1st arg
1103
+ mov $inp,$_inp # save inp, 2nd arg
1104
+ mov %rdx,$_end # save end pointer, "3rd" arg
1105
+ mov %r11,$_rsp # save copy of %rsp
1106
+ ___
1107
+ $code.=<<___ if ($win64);
1108
+ movaps %xmm6,16*$SZ+32(%rsp)
1109
+ movaps %xmm7,16*$SZ+48(%rsp)
1110
+ movaps %xmm8,16*$SZ+64(%rsp)
1111
+ movaps %xmm9,16*$SZ+80(%rsp)
1112
+ ___
1113
+ $code.=<<___ if ($win64 && $SZ>4);
1114
+ movaps %xmm10,16*$SZ+96(%rsp)
1115
+ movaps %xmm11,16*$SZ+112(%rsp)
1116
+ ___
1117
+ $code.=<<___;
1118
+ .Lprologue_xop:
1119
+
1120
+ vzeroupper
1121
+ mov $SZ*0($ctx),$A
1122
+ mov $SZ*1($ctx),$B
1123
+ mov $SZ*2($ctx),$C
1124
+ mov $SZ*3($ctx),$D
1125
+ mov $SZ*4($ctx),$E
1126
+ mov $SZ*5($ctx),$F
1127
+ mov $SZ*6($ctx),$G
1128
+ mov $SZ*7($ctx),$H
1129
+ jmp .Lloop_xop
1130
+ ___
1131
+ if ($SZ==4) { # SHA256
1132
+ my @X = map("%xmm$_",(0..3));
1133
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1134
+
1135
+ $code.=<<___;
1136
+ .align 16
1137
+ .Lloop_xop:
1138
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1139
+ vmovdqu 0x00($inp),@X[0]
1140
+ vmovdqu 0x10($inp),@X[1]
1141
+ vmovdqu 0x20($inp),@X[2]
1142
+ vmovdqu 0x30($inp),@X[3]
1143
+ vpshufb $t3,@X[0],@X[0]
1144
+ lea $TABLE(%rip),$Tbl
1145
+ vpshufb $t3,@X[1],@X[1]
1146
+ vpshufb $t3,@X[2],@X[2]
1147
+ vpaddd 0x00($Tbl),@X[0],$t0
1148
+ vpshufb $t3,@X[3],@X[3]
1149
+ vpaddd 0x20($Tbl),@X[1],$t1
1150
+ vpaddd 0x40($Tbl),@X[2],$t2
1151
+ vpaddd 0x60($Tbl),@X[3],$t3
1152
+ vmovdqa $t0,0x00(%rsp)
1153
+ mov $A,$a1
1154
+ vmovdqa $t1,0x10(%rsp)
1155
+ mov $B,$a3
1156
+ vmovdqa $t2,0x20(%rsp)
1157
+ xor $C,$a3 # magic
1158
+ vmovdqa $t3,0x30(%rsp)
1159
+ mov $E,$a0
1160
+ jmp .Lxop_00_47
1161
+
1162
+ .align 16
1163
+ .Lxop_00_47:
1164
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
1165
+ ___
1166
+ sub XOP_256_00_47 () {
1167
+ my $j = shift;
1168
+ my $body = shift;
1169
+ my @X = @_;
1170
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1171
+
1172
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1173
+ eval(shift(@insns));
1174
+ eval(shift(@insns));
1175
+ &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1176
+ eval(shift(@insns));
1177
+ eval(shift(@insns));
1178
+ &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1179
+ eval(shift(@insns));
1180
+ eval(shift(@insns));
1181
+ &vpsrld ($t0,$t0,$sigma0[2]);
1182
+ eval(shift(@insns));
1183
+ eval(shift(@insns));
1184
+ &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1185
+ eval(shift(@insns));
1186
+ eval(shift(@insns));
1187
+ eval(shift(@insns));
1188
+ eval(shift(@insns));
1189
+ &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1190
+ eval(shift(@insns));
1191
+ eval(shift(@insns));
1192
+ &vpxor ($t0,$t0,$t1);
1193
+ eval(shift(@insns));
1194
+ eval(shift(@insns));
1195
+ eval(shift(@insns));
1196
+ eval(shift(@insns));
1197
+ &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1198
+ eval(shift(@insns));
1199
+ eval(shift(@insns));
1200
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1201
+ eval(shift(@insns));
1202
+ eval(shift(@insns));
1203
+ &vpsrld ($t2,@X[3],$sigma1[2]);
1204
+ eval(shift(@insns));
1205
+ eval(shift(@insns));
1206
+ &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1207
+ eval(shift(@insns));
1208
+ eval(shift(@insns));
1209
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1210
+ eval(shift(@insns));
1211
+ eval(shift(@insns));
1212
+ &vpxor ($t3,$t3,$t2);
1213
+ eval(shift(@insns));
1214
+ eval(shift(@insns));
1215
+ eval(shift(@insns));
1216
+ eval(shift(@insns));
1217
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1218
+ eval(shift(@insns));
1219
+ eval(shift(@insns));
1220
+ eval(shift(@insns));
1221
+ eval(shift(@insns));
1222
+ &vpsrldq ($t3,$t3,8);
1223
+ eval(shift(@insns));
1224
+ eval(shift(@insns));
1225
+ eval(shift(@insns));
1226
+ eval(shift(@insns));
1227
+ &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1228
+ eval(shift(@insns));
1229
+ eval(shift(@insns));
1230
+ eval(shift(@insns));
1231
+ eval(shift(@insns));
1232
+ &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1233
+ eval(shift(@insns));
1234
+ eval(shift(@insns));
1235
+ &vpsrld ($t2,@X[0],$sigma1[2]);
1236
+ eval(shift(@insns));
1237
+ eval(shift(@insns));
1238
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1239
+ eval(shift(@insns));
1240
+ eval(shift(@insns));
1241
+ &vpxor ($t3,$t3,$t2);
1242
+ eval(shift(@insns));
1243
+ eval(shift(@insns));
1244
+ eval(shift(@insns));
1245
+ eval(shift(@insns));
1246
+ &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1247
+ eval(shift(@insns));
1248
+ eval(shift(@insns));
1249
+ eval(shift(@insns));
1250
+ eval(shift(@insns));
1251
+ &vpslldq ($t3,$t3,8); # 22 instructions
1252
+ eval(shift(@insns));
1253
+ eval(shift(@insns));
1254
+ eval(shift(@insns));
1255
+ eval(shift(@insns));
1256
+ &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1257
+ eval(shift(@insns));
1258
+ eval(shift(@insns));
1259
+ eval(shift(@insns));
1260
+ eval(shift(@insns));
1261
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1262
+ foreach (@insns) { eval; } # remaining instructions
1263
+ &vmovdqa (16*$j."(%rsp)",$t2);
1264
+ }
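+
+ # The XOP variant relies on vprotd, AMD's packed rotate, so every 32-bit
+ # rotation in sigma0/sigma1 is a single instruction instead of the
+ # shift/shift/xor combination the SSSE3 and AVX paths need; the message
+ # schedule recurrence being computed is otherwise identical.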
1265
+
1266
+ for ($i=0,$j=0; $j<4; $j++) {
1267
+ &XOP_256_00_47($j,\&body_00_15,@X);
1268
+ push(@X,shift(@X)); # rotate(@X)
1269
+ }
1270
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1271
+ &jne (".Lxop_00_47");
1272
+
1273
+ for ($i=0; $i<16; ) {
1274
+ foreach(body_00_15()) { eval; }
1275
+ }
1276
+
1277
+ } else { # SHA512
1278
+ my @X = map("%xmm$_",(0..7));
1279
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1280
+
1281
+ $code.=<<___;
1282
+ .align 16
1283
+ .Lloop_xop:
1284
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1285
+ vmovdqu 0x00($inp),@X[0]
1286
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1287
+ vmovdqu 0x10($inp),@X[1]
1288
+ vmovdqu 0x20($inp),@X[2]
1289
+ vpshufb $t3,@X[0],@X[0]
1290
+ vmovdqu 0x30($inp),@X[3]
1291
+ vpshufb $t3,@X[1],@X[1]
1292
+ vmovdqu 0x40($inp),@X[4]
1293
+ vpshufb $t3,@X[2],@X[2]
1294
+ vmovdqu 0x50($inp),@X[5]
1295
+ vpshufb $t3,@X[3],@X[3]
1296
+ vmovdqu 0x60($inp),@X[6]
1297
+ vpshufb $t3,@X[4],@X[4]
1298
+ vmovdqu 0x70($inp),@X[7]
1299
+ vpshufb $t3,@X[5],@X[5]
1300
+ vpaddq -0x80($Tbl),@X[0],$t0
1301
+ vpshufb $t3,@X[6],@X[6]
1302
+ vpaddq -0x60($Tbl),@X[1],$t1
1303
+ vpshufb $t3,@X[7],@X[7]
1304
+ vpaddq -0x40($Tbl),@X[2],$t2
1305
+ vpaddq -0x20($Tbl),@X[3],$t3
1306
+ vmovdqa $t0,0x00(%rsp)
1307
+ vpaddq 0x00($Tbl),@X[4],$t0
1308
+ vmovdqa $t1,0x10(%rsp)
1309
+ vpaddq 0x20($Tbl),@X[5],$t1
1310
+ vmovdqa $t2,0x20(%rsp)
1311
+ vpaddq 0x40($Tbl),@X[6],$t2
1312
+ vmovdqa $t3,0x30(%rsp)
1313
+ vpaddq 0x60($Tbl),@X[7],$t3
1314
+ vmovdqa $t0,0x40(%rsp)
1315
+ mov $A,$a1
1316
+ vmovdqa $t1,0x50(%rsp)
1317
+ mov $B,$a3
1318
+ vmovdqa $t2,0x60(%rsp)
1319
+ xor $C,$a3 # magic
1320
+ vmovdqa $t3,0x70(%rsp)
1321
+ mov $E,$a0
1322
+ jmp .Lxop_00_47
1323
+
1324
+ .align 16
1325
+ .Lxop_00_47:
1326
+ add \$`16*2*$SZ`,$Tbl
1327
+ ___
1328
+ sub XOP_512_00_47 () {
1329
+ my $j = shift;
1330
+ my $body = shift;
1331
+ my @X = @_;
1332
+ my @insns = (&$body,&$body); # 52 instructions
1333
+
1334
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1335
+ eval(shift(@insns));
1336
+ eval(shift(@insns));
1337
+ &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1338
+ eval(shift(@insns));
1339
+ eval(shift(@insns));
1340
+ &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1341
+ eval(shift(@insns));
1342
+ eval(shift(@insns));
1343
+ &vpsrlq ($t0,$t0,$sigma0[2]);
1344
+ eval(shift(@insns));
1345
+ eval(shift(@insns));
1346
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1347
+ eval(shift(@insns));
1348
+ eval(shift(@insns));
1349
+ eval(shift(@insns));
1350
+ eval(shift(@insns));
1351
+ &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1352
+ eval(shift(@insns));
1353
+ eval(shift(@insns));
1354
+ &vpxor ($t0,$t0,$t1);
1355
+ eval(shift(@insns));
1356
+ eval(shift(@insns));
1357
+ eval(shift(@insns));
1358
+ eval(shift(@insns));
1359
+ &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1360
+ eval(shift(@insns));
1361
+ eval(shift(@insns));
1362
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1363
+ eval(shift(@insns));
1364
+ eval(shift(@insns));
1365
+ &vpsrlq ($t2,@X[7],$sigma1[2]);
1366
+ eval(shift(@insns));
1367
+ eval(shift(@insns));
1368
+ &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1369
+ eval(shift(@insns));
1370
+ eval(shift(@insns));
1371
+ &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1372
+ eval(shift(@insns));
1373
+ eval(shift(@insns));
1374
+ &vpxor ($t3,$t3,$t2);
1375
+ eval(shift(@insns));
1376
+ eval(shift(@insns));
1377
+ eval(shift(@insns));
1378
+ eval(shift(@insns));
1379
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1380
+ eval(shift(@insns));
1381
+ eval(shift(@insns));
1382
+ eval(shift(@insns));
1383
+ eval(shift(@insns));
1384
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1385
+ eval(shift(@insns));
1386
+ eval(shift(@insns));
1387
+ eval(shift(@insns));
1388
+ eval(shift(@insns));
1389
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1390
+ foreach (@insns) { eval; } # remaining instructions
1391
+ &vmovdqa (16*$j."(%rsp)",$t2);
1392
+ }
1393
+
1394
+ for ($i=0,$j=0; $j<8; $j++) {
1395
+ &XOP_512_00_47($j,\&body_00_15,@X);
1396
+ push(@X,shift(@X)); # rotate(@X)
1397
+ }
1398
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1399
+ &jne (".Lxop_00_47");
1400
+
1401
+ for ($i=0; $i<16; ) {
1402
+ foreach(body_00_15()) { eval; }
1403
+ }
1404
+ }
1405
+ $code.=<<___;
1406
+ mov $_ctx,$ctx
1407
+ mov $a1,$A
1408
+
1409
+ add $SZ*0($ctx),$A
1410
+ lea 16*$SZ($inp),$inp
1411
+ add $SZ*1($ctx),$B
1412
+ add $SZ*2($ctx),$C
1413
+ add $SZ*3($ctx),$D
1414
+ add $SZ*4($ctx),$E
1415
+ add $SZ*5($ctx),$F
1416
+ add $SZ*6($ctx),$G
1417
+ add $SZ*7($ctx),$H
1418
+
1419
+ cmp $_end,$inp
1420
+
1421
+ mov $A,$SZ*0($ctx)
1422
+ mov $B,$SZ*1($ctx)
1423
+ mov $C,$SZ*2($ctx)
1424
+ mov $D,$SZ*3($ctx)
1425
+ mov $E,$SZ*4($ctx)
1426
+ mov $F,$SZ*5($ctx)
1427
+ mov $G,$SZ*6($ctx)
1428
+ mov $H,$SZ*7($ctx)
1429
+ jb .Lloop_xop
1430
+
1431
+ mov $_rsp,%rsi
1432
+ vzeroupper
1433
+ ___
1434
+ $code.=<<___ if ($win64);
1435
+ movaps 16*$SZ+32(%rsp),%xmm6
1436
+ movaps 16*$SZ+48(%rsp),%xmm7
1437
+ movaps 16*$SZ+64(%rsp),%xmm8
1438
+ movaps 16*$SZ+80(%rsp),%xmm9
1439
+ ___
1440
+ $code.=<<___ if ($win64 && $SZ>4);
1441
+ movaps 16*$SZ+96(%rsp),%xmm10
1442
+ movaps 16*$SZ+112(%rsp),%xmm11
1443
+ ___
1444
+ $code.=<<___;
1445
+ mov (%rsi),%r15
1446
+ mov 8(%rsi),%r14
1447
+ mov 16(%rsi),%r13
1448
+ mov 24(%rsi),%r12
1449
+ mov 32(%rsi),%rbp
1450
+ mov 40(%rsi),%rbx
1451
+ lea 48(%rsi),%rsp
1452
+ .Lepilogue_xop:
1453
+ ret
1454
+ .size ${func}_xop,.-${func}_xop
1455
+ ___
1456
+ }
1457
+ ######################################################################
1458
+ # AVX+shrd code path
1459
+ #
1460
+ local *ror = sub { &shrd(@_[0],@_) };
1461
+
1462
+ $code.=<<___;
1463
+ .type ${func}_avx,\@function,3
1464
+ .align 64
1465
+ ${func}_avx:
1466
+ .Lavx_shortcut:
1467
+ push %rbx
1468
+ push %rbp
1469
+ push %r12
1470
+ push %r13
1471
+ push %r14
1472
+ push %r15
1473
+ mov %rsp,%r11 # copy %rsp
1474
+ shl \$4,%rdx # num*16
1475
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1476
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1477
+ and \$-64,%rsp # align stack frame
1478
+ mov $ctx,$_ctx # save ctx, 1st arg
1479
+ mov $inp,$_inp # save inp, 2nd arg
1480
+ mov %rdx,$_end # save end pointer, "3rd" arg
1481
+ mov %r11,$_rsp # save copy of %rsp
1482
+ ___
1483
+ $code.=<<___ if ($win64);
1484
+ movaps %xmm6,16*$SZ+32(%rsp)
1485
+ movaps %xmm7,16*$SZ+48(%rsp)
1486
+ movaps %xmm8,16*$SZ+64(%rsp)
1487
+ movaps %xmm9,16*$SZ+80(%rsp)
1488
+ ___
1489
+ $code.=<<___ if ($win64 && $SZ>4);
1490
+ movaps %xmm10,16*$SZ+96(%rsp)
1491
+ movaps %xmm11,16*$SZ+112(%rsp)
1492
+ ___
1493
+ $code.=<<___;
1494
+ .Lprologue_avx:
1495
+
1496
+ vzeroupper
1497
+ mov $SZ*0($ctx),$A
1498
+ mov $SZ*1($ctx),$B
1499
+ mov $SZ*2($ctx),$C
1500
+ mov $SZ*3($ctx),$D
1501
+ mov $SZ*4($ctx),$E
1502
+ mov $SZ*5($ctx),$F
1503
+ mov $SZ*6($ctx),$G
1504
+ mov $SZ*7($ctx),$H
1505
+ ___
1506
+ if ($SZ==4) { # SHA256
1507
+ my @X = map("%xmm$_",(0..3));
1508
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1509
+
1510
+ $code.=<<___;
1511
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1512
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1513
+ jmp .Lloop_avx
1514
+ .align 16
1515
+ .Lloop_avx:
1516
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1517
+ vmovdqu 0x00($inp),@X[0]
1518
+ vmovdqu 0x10($inp),@X[1]
1519
+ vmovdqu 0x20($inp),@X[2]
1520
+ vmovdqu 0x30($inp),@X[3]
1521
+ vpshufb $t3,@X[0],@X[0]
1522
+ lea $TABLE(%rip),$Tbl
1523
+ vpshufb $t3,@X[1],@X[1]
1524
+ vpshufb $t3,@X[2],@X[2]
1525
+ vpaddd 0x00($Tbl),@X[0],$t0
1526
+ vpshufb $t3,@X[3],@X[3]
1527
+ vpaddd 0x20($Tbl),@X[1],$t1
1528
+ vpaddd 0x40($Tbl),@X[2],$t2
1529
+ vpaddd 0x60($Tbl),@X[3],$t3
1530
+ vmovdqa $t0,0x00(%rsp)
1531
+ mov $A,$a1
1532
+ vmovdqa $t1,0x10(%rsp)
1533
+ mov $B,$a3
1534
+ vmovdqa $t2,0x20(%rsp)
1535
+ xor $C,$a3 # magic
1536
+ vmovdqa $t3,0x30(%rsp)
1537
+ mov $E,$a0
1538
+ jmp .Lavx_00_47
1539
+
1540
+ .align 16
1541
+ .Lavx_00_47:
1542
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
1543
+ ___
1544
+ sub Xupdate_256_AVX () {
1545
+ (
1546
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1547
+ '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1548
+ '&vpsrld ($t2,$t0,$sigma0[0]);',
1549
+ '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1550
+ '&vpsrld ($t3,$t0,$sigma0[2])',
1551
+ '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1552
+ '&vpxor ($t0,$t3,$t2)',
1553
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1554
+ '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1555
+ '&vpxor ($t0,$t0,$t1)',
1556
+ '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1557
+ '&vpxor ($t0,$t0,$t2)',
1558
+ '&vpsrld ($t2,$t3,$sigma1[2]);',
1559
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1560
+ '&vpsrlq ($t3,$t3,$sigma1[0]);',
1561
+ '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1562
+ '&vpxor ($t2,$t2,$t3);',
1563
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1564
+ '&vpxor ($t2,$t2,$t3)',
1565
+ '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1566
+ '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1567
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1568
+ '&vpsrld ($t2,$t3,$sigma1[2])',
1569
+ '&vpsrlq ($t3,$t3,$sigma1[0])',
1570
+ '&vpxor ($t2,$t2,$t3);',
1571
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1572
+ '&vpxor ($t2,$t2,$t3)',
1573
+ '&vpshufb ($t2,$t2,$t5)',
1574
+ '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1575
+ );
1576
+ }
1577
+
1578
+ sub AVX_256_00_47 () {
1579
+ my $j = shift;
1580
+ my $body = shift;
1581
+ my @X = @_;
1582
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1583
+
1584
+ foreach (Xupdate_256_AVX()) { # 29 instructions
1585
+ eval;
1586
+ eval(shift(@insns));
1587
+ eval(shift(@insns));
1588
+ eval(shift(@insns));
1589
+ }
1590
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1591
+ foreach (@insns) { eval; } # remaining instructions
1592
+ &vmovdqa (16*$j."(%rsp)",$t2);
1593
+ }
1594
+
1595
+ for ($i=0,$j=0; $j<4; $j++) {
1596
+ &AVX_256_00_47($j,\&body_00_15,@X);
1597
+ push(@X,shift(@X)); # rotate(@X)
1598
+ }
1599
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1600
+ &jne (".Lavx_00_47");
1601
+
1602
+ for ($i=0; $i<16; ) {
1603
+ foreach(body_00_15()) { eval; }
1604
+ }
1605
+
1606
+ } else { # SHA512
1607
+ my @X = map("%xmm$_",(0..7));
1608
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1609
+
1610
+ $code.=<<___;
1611
+ jmp .Lloop_avx
1612
+ .align 16
1613
+ .Lloop_avx:
1614
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1615
+ vmovdqu 0x00($inp),@X[0]
1616
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1617
+ vmovdqu 0x10($inp),@X[1]
1618
+ vmovdqu 0x20($inp),@X[2]
1619
+ vpshufb $t3,@X[0],@X[0]
1620
+ vmovdqu 0x30($inp),@X[3]
1621
+ vpshufb $t3,@X[1],@X[1]
1622
+ vmovdqu 0x40($inp),@X[4]
1623
+ vpshufb $t3,@X[2],@X[2]
1624
+ vmovdqu 0x50($inp),@X[5]
1625
+ vpshufb $t3,@X[3],@X[3]
1626
+ vmovdqu 0x60($inp),@X[6]
1627
+ vpshufb $t3,@X[4],@X[4]
1628
+ vmovdqu 0x70($inp),@X[7]
1629
+ vpshufb $t3,@X[5],@X[5]
1630
+ vpaddq -0x80($Tbl),@X[0],$t0
1631
+ vpshufb $t3,@X[6],@X[6]
1632
+ vpaddq -0x60($Tbl),@X[1],$t1
1633
+ vpshufb $t3,@X[7],@X[7]
1634
+ vpaddq -0x40($Tbl),@X[2],$t2
1635
+ vpaddq -0x20($Tbl),@X[3],$t3
1636
+ vmovdqa $t0,0x00(%rsp)
1637
+ vpaddq 0x00($Tbl),@X[4],$t0
1638
+ vmovdqa $t1,0x10(%rsp)
1639
+ vpaddq 0x20($Tbl),@X[5],$t1
1640
+ vmovdqa $t2,0x20(%rsp)
1641
+ vpaddq 0x40($Tbl),@X[6],$t2
1642
+ vmovdqa $t3,0x30(%rsp)
1643
+ vpaddq 0x60($Tbl),@X[7],$t3
1644
+ vmovdqa $t0,0x40(%rsp)
1645
+ mov $A,$a1
1646
+ vmovdqa $t1,0x50(%rsp)
1647
+ mov $B,$a3
1648
+ vmovdqa $t2,0x60(%rsp)
1649
+ xor $C,$a3 # magic
1650
+ vmovdqa $t3,0x70(%rsp)
1651
+ mov $E,$a0
1652
+ jmp .Lavx_00_47
1653
+
1654
+ .align 16
1655
+ .Lavx_00_47:
1656
+ add \$`16*2*$SZ`,$Tbl
1657
+ ___
1658
+ sub Xupdate_512_AVX () {
1659
+ (
1660
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1661
+ '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1662
+ '&vpsrlq ($t2,$t0,$sigma0[0])',
1663
+ '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1664
+ '&vpsrlq ($t3,$t0,$sigma0[2])',
1665
+ '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1666
+ '&vpxor ($t0,$t3,$t2)',
1667
+ '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1668
+ '&vpxor ($t0,$t0,$t1)',
1669
+ '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1670
+ '&vpxor ($t0,$t0,$t2)',
1671
+ '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1672
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1673
+ '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1674
+ '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1675
+ '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1676
+ '&vpxor ($t3,$t3,$t2)',
1677
+ '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1678
+ '&vpxor ($t3,$t3,$t1)',
1679
+ '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1680
+ '&vpxor ($t3,$t3,$t2)',
1681
+ '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1682
+ '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1683
+ );
1684
+ }
1685
+
1686
+ sub AVX_512_00_47 () {
1687
+ my $j = shift;
1688
+ my $body = shift;
1689
+ my @X = @_;
1690
+ my @insns = (&$body,&$body); # 52 instructions
1691
+
1692
+ foreach (Xupdate_512_AVX()) { # 23 instructions
1693
+ eval;
1694
+ eval(shift(@insns));
1695
+ eval(shift(@insns));
1696
+ }
1697
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1698
+ foreach (@insns) { eval; } # remaining instructions
1699
+ &vmovdqa (16*$j."(%rsp)",$t2);
1700
+ }
1701
+
1702
+ for ($i=0,$j=0; $j<8; $j++) {
1703
+ &AVX_512_00_47($j,\&body_00_15,@X);
1704
+ push(@X,shift(@X)); # rotate(@X)
1705
+ }
1706
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1707
+ &jne (".Lavx_00_47");
1708
+
1709
+ for ($i=0; $i<16; ) {
1710
+ foreach(body_00_15()) { eval; }
1711
+ }
1712
+ }
1713
+ $code.=<<___;
1714
+ mov $_ctx,$ctx
1715
+ mov $a1,$A
1716
+
1717
+ add $SZ*0($ctx),$A
1718
+ lea 16*$SZ($inp),$inp
1719
+ add $SZ*1($ctx),$B
1720
+ add $SZ*2($ctx),$C
1721
+ add $SZ*3($ctx),$D
1722
+ add $SZ*4($ctx),$E
1723
+ add $SZ*5($ctx),$F
1724
+ add $SZ*6($ctx),$G
1725
+ add $SZ*7($ctx),$H
1726
+
1727
+ cmp $_end,$inp
1728
+
1729
+ mov $A,$SZ*0($ctx)
1730
+ mov $B,$SZ*1($ctx)
1731
+ mov $C,$SZ*2($ctx)
1732
+ mov $D,$SZ*3($ctx)
1733
+ mov $E,$SZ*4($ctx)
1734
+ mov $F,$SZ*5($ctx)
1735
+ mov $G,$SZ*6($ctx)
1736
+ mov $H,$SZ*7($ctx)
1737
+ jb .Lloop_avx
1738
+
1739
+ mov $_rsp,%rsi
1740
+ vzeroupper
1741
+ ___
1742
+ $code.=<<___ if ($win64);
1743
+ movaps 16*$SZ+32(%rsp),%xmm6
1744
+ movaps 16*$SZ+48(%rsp),%xmm7
1745
+ movaps 16*$SZ+64(%rsp),%xmm8
1746
+ movaps 16*$SZ+80(%rsp),%xmm9
1747
+ ___
1748
+ $code.=<<___ if ($win64 && $SZ>4);
1749
+ movaps 16*$SZ+96(%rsp),%xmm10
1750
+ movaps 16*$SZ+112(%rsp),%xmm11
1751
+ ___
1752
+ $code.=<<___;
1753
+ mov (%rsi),%r15
1754
+ mov 8(%rsi),%r14
1755
+ mov 16(%rsi),%r13
1756
+ mov 24(%rsi),%r12
1757
+ mov 32(%rsi),%rbp
1758
+ mov 40(%rsi),%rbx
1759
+ lea 48(%rsi),%rsp
1760
+ .Lepilogue_avx:
1761
+ ret
1762
+ .size ${func}_avx,.-${func}_avx
1763
+ ___
1764
+
1765
+ if ($avx>1) {{
1766
+ ######################################################################
1767
+ # AVX2+BMI code path
1768
+ #
1769
+ my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1770
+ my $PUSH8=8*2*$SZ;
1771
+ use integer;
1772
+
1773
+ sub bodyx_00_15 () {
1774
+ # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1775
+ (
1776
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1777
+
1778
+ '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1779
+ '&and ($a4,$e)', # f&e
1780
+ '&rorx ($a0,$e,$Sigma1[2])',
1781
+ '&rorx ($a2,$e,$Sigma1[1])',
1782
+
1783
+ '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1784
+ '&lea ($h,"($h,$a4)")',
1785
+ '&andn ($a4,$e,$g)', # ~e&g
1786
+ '&xor ($a0,$a2)',
1787
+
1788
+ '&rorx ($a1,$e,$Sigma1[0])',
1789
+ '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1790
+ '&xor ($a0,$a1)', # Sigma1(e)
1791
+ '&mov ($a2,$a)',
1792
+
1793
+ '&rorx ($a4,$a,$Sigma0[2])',
1794
+ '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1795
+ '&xor ($a2,$b)', # a^b, b^c in next round
1796
+ '&rorx ($a1,$a,$Sigma0[1])',
1797
+
1798
+ '&rorx ($a0,$a,$Sigma0[0])',
1799
+ '&lea ($d,"($d,$h)")', # d+=h
1800
+ '&and ($a3,$a2)', # (b^c)&(a^b)
1801
+ '&xor ($a1,$a4)',
1802
+
1803
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1804
+ '&xor ($a1,$a0)', # Sigma0(a)
1805
+ '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1806
+ '&mov ($a4,$e)', # copy of f in future
1807
+
1808
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1809
+ );
1810
+ # and at the finish one still has to do $a += $a1
1811
+ }
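+
+ # bodyx_00_15 is the BMI-flavoured round: rorx supplies the Sigma0/Sigma1
+ # rotations without clobbering the flags, and Ch(e,f,g) is assembled as
+ # (e&f)+(~e&g) with and/andn -- the two terms can never have the same bit
+ # set (one needs the e bit set, the other clear), so the additions done via
+ # lea are equivalent to the or in the textbook definition.  Sigma0(a) is
+ # parked in $a1 and only folded into a at the start of the next round, which
+ # is why the AVX2 loop tails below add $a1 to $A explicitly.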
1812
+
1813
+ $code.=<<___;
1814
+ .type ${func}_avx2,\@function,3
1815
+ .align 64
1816
+ ${func}_avx2:
1817
+ .Lavx2_shortcut:
1818
+ push %rbx
1819
+ push %rbp
1820
+ push %r12
1821
+ push %r13
1822
+ push %r14
1823
+ push %r15
1824
+ mov %rsp,%r11 # copy %rsp
1825
+ sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1826
+ shl \$4,%rdx # num*16
1827
+ and \$-256*$SZ,%rsp # align stack frame
1828
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1829
+ add \$`2*$SZ*($rounds-8)`,%rsp
1830
+ mov $ctx,$_ctx # save ctx, 1st arg
1831
+ mov $inp,$_inp # save inp, 2nd arg
1832
+ mov %rdx,$_end # save end pointer, "3rd" arg
1833
+ mov %r11,$_rsp # save copy of %rsp
1834
+ ___
1835
+ $code.=<<___ if ($win64);
1836
+ movaps %xmm6,16*$SZ+32(%rsp)
1837
+ movaps %xmm7,16*$SZ+48(%rsp)
1838
+ movaps %xmm8,16*$SZ+64(%rsp)
1839
+ movaps %xmm9,16*$SZ+80(%rsp)
1840
+ ___
1841
+ $code.=<<___ if ($win64 && $SZ>4);
1842
+ movaps %xmm10,16*$SZ+96(%rsp)
1843
+ movaps %xmm11,16*$SZ+112(%rsp)
1844
+ ___
1845
+ $code.=<<___;
1846
+ .Lprologue_avx2:
1847
+
1848
+ vzeroupper
1849
+ sub \$-16*$SZ,$inp # inp++, size optimization
1850
+ mov $SZ*0($ctx),$A
1851
+ mov $inp,%r12 # borrow $T1
1852
+ mov $SZ*1($ctx),$B
1853
+ cmp %rdx,$inp # $_end
1854
+ mov $SZ*2($ctx),$C
1855
+ cmove %rsp,%r12 # next block or random data
1856
+ mov $SZ*3($ctx),$D
1857
+ mov $SZ*4($ctx),$E
1858
+ mov $SZ*5($ctx),$F
1859
+ mov $SZ*6($ctx),$G
1860
+ mov $SZ*7($ctx),$H
1861
+ ___
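+
+ # The AVX2 path schedules two consecutive blocks per iteration: the low
+ # 128-bit lane of each ymm register carries the block at $inp, the high lane
+ # the following block inserted from %r12 via vinserti128.  When no second
+ # block remains, %r12 is redirected to %rsp, so the high lane processes
+ # dummy data whose results are discarded (the je .Ldone_avx2 below skips the
+ # second pass).  The pre-added X[i]+K[i] words for both lanes are kept on
+ # the stack, and .Lower_avx2 later replays the scalar rounds for the second
+ # block directly from that saved material.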
1862
+ if ($SZ==4) { # SHA256
1863
+ my @X = map("%ymm$_",(0..3));
1864
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1865
+
1866
+ $code.=<<___;
1867
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1868
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1869
+ jmp .Loop_avx2
1870
+ .align 16
1871
+ .Loop_avx2:
1872
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1873
+ vmovdqu -16*$SZ+0($inp),%xmm0
1874
+ vmovdqu -16*$SZ+16($inp),%xmm1
1875
+ vmovdqu -16*$SZ+32($inp),%xmm2
1876
+ vmovdqu -16*$SZ+48($inp),%xmm3
1877
+ #mov $inp,$_inp # offload $inp
1878
+ vinserti128 \$1,(%r12),@X[0],@X[0]
1879
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
1880
+ vpshufb $t3,@X[0],@X[0]
1881
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
1882
+ vpshufb $t3,@X[1],@X[1]
1883
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
1884
+
1885
+ lea $TABLE(%rip),$Tbl
1886
+ vpshufb $t3,@X[2],@X[2]
1887
+ vpaddd 0x00($Tbl),@X[0],$t0
1888
+ vpshufb $t3,@X[3],@X[3]
1889
+ vpaddd 0x20($Tbl),@X[1],$t1
1890
+ vpaddd 0x40($Tbl),@X[2],$t2
1891
+ vpaddd 0x60($Tbl),@X[3],$t3
1892
+ vmovdqa $t0,0x00(%rsp)
1893
+ xor $a1,$a1
1894
+ vmovdqa $t1,0x20(%rsp)
1895
+ lea -$PUSH8(%rsp),%rsp
1896
+ mov $B,$a3
1897
+ vmovdqa $t2,0x00(%rsp)
1898
+ xor $C,$a3 # magic
1899
+ vmovdqa $t3,0x20(%rsp)
1900
+ mov $F,$a4
1901
+ sub \$-16*2*$SZ,$Tbl # size optimization
1902
+ jmp .Lavx2_00_47
1903
+
1904
+ .align 16
1905
+ .Lavx2_00_47:
1906
+ ___
1907
+
1908
+ sub AVX2_256_00_47 () {
1909
+ my $j = shift;
1910
+ my $body = shift;
1911
+ my @X = @_;
1912
+ my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1913
+ my $base = "+2*$PUSH8(%rsp)";
1914
+
1915
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1916
+ foreach (Xupdate_256_AVX()) { # 29 instructions
1917
+ eval;
1918
+ eval(shift(@insns));
1919
+ eval(shift(@insns));
1920
+ eval(shift(@insns));
1921
+ }
1922
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1923
+ foreach (@insns) { eval; } # remaining instructions
1924
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1925
+ }
1926
+
1927
+ for ($i=0,$j=0; $j<4; $j++) {
1928
+ &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1929
+ push(@X,shift(@X)); # rotate(@X)
1930
+ }
1931
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
1932
+ &cmpb (($SZ-1)."($Tbl)",0);
1933
+ &jne (".Lavx2_00_47");
1934
+
1935
+ for ($i=0; $i<16; ) {
1936
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1937
+ foreach(bodyx_00_15()) { eval; }
1938
+ }
1939
+ } else { # SHA512
1940
+ my @X = map("%ymm$_",(0..7));
1941
+ my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1942
+
1943
+ $code.=<<___;
1944
+ jmp .Loop_avx2
1945
+ .align 16
1946
+ .Loop_avx2:
1947
+ vmovdqu -16*$SZ($inp),%xmm0
1948
+ vmovdqu -16*$SZ+16($inp),%xmm1
1949
+ vmovdqu -16*$SZ+32($inp),%xmm2
1950
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1951
+ vmovdqu -16*$SZ+48($inp),%xmm3
1952
+ vmovdqu -16*$SZ+64($inp),%xmm4
1953
+ vmovdqu -16*$SZ+80($inp),%xmm5
1954
+ vmovdqu -16*$SZ+96($inp),%xmm6
1955
+ vmovdqu -16*$SZ+112($inp),%xmm7
1956
+ #mov $inp,$_inp # offload $inp
1957
+ vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1958
+ vinserti128 \$1,(%r12),@X[0],@X[0]
1959
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
1960
+ vpshufb $t2,@X[0],@X[0]
1961
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
1962
+ vpshufb $t2,@X[1],@X[1]
1963
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
1964
+ vpshufb $t2,@X[2],@X[2]
1965
+ vinserti128 \$1,64(%r12),@X[4],@X[4]
1966
+ vpshufb $t2,@X[3],@X[3]
1967
+ vinserti128 \$1,80(%r12),@X[5],@X[5]
1968
+ vpshufb $t2,@X[4],@X[4]
1969
+ vinserti128 \$1,96(%r12),@X[6],@X[6]
1970
+ vpshufb $t2,@X[5],@X[5]
1971
+ vinserti128 \$1,112(%r12),@X[7],@X[7]
1972
+
1973
+ vpaddq -0x80($Tbl),@X[0],$t0
1974
+ vpshufb $t2,@X[6],@X[6]
1975
+ vpaddq -0x60($Tbl),@X[1],$t1
1976
+ vpshufb $t2,@X[7],@X[7]
1977
+ vpaddq -0x40($Tbl),@X[2],$t2
1978
+ vpaddq -0x20($Tbl),@X[3],$t3
1979
+ vmovdqa $t0,0x00(%rsp)
1980
+ vpaddq 0x00($Tbl),@X[4],$t0
1981
+ vmovdqa $t1,0x20(%rsp)
1982
+ vpaddq 0x20($Tbl),@X[5],$t1
1983
+ vmovdqa $t2,0x40(%rsp)
1984
+ vpaddq 0x40($Tbl),@X[6],$t2
1985
+ vmovdqa $t3,0x60(%rsp)
1986
+ lea -$PUSH8(%rsp),%rsp
1987
+ vpaddq 0x60($Tbl),@X[7],$t3
1988
+ vmovdqa $t0,0x00(%rsp)
1989
+ xor $a1,$a1
1990
+ vmovdqa $t1,0x20(%rsp)
1991
+ mov $B,$a3
1992
+ vmovdqa $t2,0x40(%rsp)
1993
+ xor $C,$a3 # magic
1994
+ vmovdqa $t3,0x60(%rsp)
1995
+ mov $F,$a4
1996
+ add \$16*2*$SZ,$Tbl
1997
+ jmp .Lavx2_00_47
1998
+
1999
+ .align 16
2000
+ .Lavx2_00_47:
2001
+ ___
2002
+
2003
+ sub AVX2_512_00_47 () {
2004
+ my $j = shift;
2005
+ my $body = shift;
2006
+ my @X = @_;
2007
+ my @insns = (&$body,&$body); # 48 instructions
2008
+ my $base = "+2*$PUSH8(%rsp)";
2009
+
2010
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
2011
+ foreach (Xupdate_512_AVX()) { # 23 instructions
2012
+ eval;
2013
+ if ($_ !~ /\;$/) {
2014
+ eval(shift(@insns));
2015
+ eval(shift(@insns));
2016
+ eval(shift(@insns));
2017
+ }
2018
+ }
2019
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2020
+ foreach (@insns) { eval; } # remaining instructions
2021
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2022
+ }
2023
+
2024
+ for ($i=0,$j=0; $j<8; $j++) {
2025
+ &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2026
+ push(@X,shift(@X)); # rotate(@X)
2027
+ }
2028
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
2029
+ &cmpb (($SZ-1-0x80)."($Tbl)",0);
2030
+ &jne (".Lavx2_00_47");
2031
+
2032
+ for ($i=0; $i<16; ) {
2033
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2034
+ foreach(bodyx_00_15()) { eval; }
2035
+ }
2036
+ }
2037
+ $code.=<<___;
2038
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2039
+ add $a1,$A
2040
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2041
+ lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2042
+
2043
+ add $SZ*0($ctx),$A
2044
+ add $SZ*1($ctx),$B
2045
+ add $SZ*2($ctx),$C
2046
+ add $SZ*3($ctx),$D
2047
+ add $SZ*4($ctx),$E
2048
+ add $SZ*5($ctx),$F
2049
+ add $SZ*6($ctx),$G
2050
+ add $SZ*7($ctx),$H
2051
+
2052
+ mov $A,$SZ*0($ctx)
2053
+ mov $B,$SZ*1($ctx)
2054
+ mov $C,$SZ*2($ctx)
2055
+ mov $D,$SZ*3($ctx)
2056
+ mov $E,$SZ*4($ctx)
2057
+ mov $F,$SZ*5($ctx)
2058
+ mov $G,$SZ*6($ctx)
2059
+ mov $H,$SZ*7($ctx)
2060
+
2061
+ cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2062
+ je .Ldone_avx2
2063
+
2064
+ xor $a1,$a1
2065
+ mov $B,$a3
2066
+ xor $C,$a3 # magic
2067
+ mov $F,$a4
2068
+ jmp .Lower_avx2
2069
+ .align 16
2070
+ .Lower_avx2:
2071
+ ___
2072
+ for ($i=0; $i<8; ) {
2073
+ my $base="+16($Tbl)";
2074
+ foreach(bodyx_00_15()) { eval; }
2075
+ }
2076
+ $code.=<<___;
2077
+ lea -$PUSH8($Tbl),$Tbl
2078
+ cmp %rsp,$Tbl
2079
+ jae .Lower_avx2
2080
+
2081
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2082
+ add $a1,$A
2083
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2084
+ lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2085
+
2086
+ add $SZ*0($ctx),$A
2087
+ add $SZ*1($ctx),$B
2088
+ add $SZ*2($ctx),$C
2089
+ add $SZ*3($ctx),$D
2090
+ add $SZ*4($ctx),$E
2091
+ add $SZ*5($ctx),$F
2092
+ lea `2*16*$SZ`($inp),$inp # inp+=2
2093
+ add $SZ*6($ctx),$G
2094
+ mov $inp,%r12
2095
+ add $SZ*7($ctx),$H
2096
+ cmp $_end,$inp
2097
+
2098
+ mov $A,$SZ*0($ctx)
2099
+ cmove %rsp,%r12 # next block or stale data
2100
+ mov $B,$SZ*1($ctx)
2101
+ mov $C,$SZ*2($ctx)
2102
+ mov $D,$SZ*3($ctx)
2103
+ mov $E,$SZ*4($ctx)
2104
+ mov $F,$SZ*5($ctx)
2105
+ mov $G,$SZ*6($ctx)
2106
+ mov $H,$SZ*7($ctx)
2107
+
2108
+ jbe .Loop_avx2
2109
+ lea (%rsp),$Tbl
2110
+
2111
+ .Ldone_avx2:
2112
+ lea ($Tbl),%rsp
2113
+ mov $_rsp,%rsi
2114
+ vzeroupper
2115
+ ___
2116
+ $code.=<<___ if ($win64);
2117
+ movaps 16*$SZ+32(%rsp),%xmm6
2118
+ movaps 16*$SZ+48(%rsp),%xmm7
2119
+ movaps 16*$SZ+64(%rsp),%xmm8
2120
+ movaps 16*$SZ+80(%rsp),%xmm9
2121
+ ___
2122
+ $code.=<<___ if ($win64 && $SZ>4);
2123
+ movaps 16*$SZ+96(%rsp),%xmm10
2124
+ movaps 16*$SZ+112(%rsp),%xmm11
2125
+ ___
2126
+ $code.=<<___;
2127
+ mov (%rsi),%r15
2128
+ mov 8(%rsi),%r14
2129
+ mov 16(%rsi),%r13
2130
+ mov 24(%rsi),%r12
2131
+ mov 32(%rsi),%rbp
2132
+ mov 40(%rsi),%rbx
2133
+ lea 48(%rsi),%rsp
2134
+ .Lepilogue_avx2:
2135
+ ret
2136
+ .size ${func}_avx2,.-${func}_avx2
2137
+ ___
2138
+ }}
2139
+ }}}}}
2140
+
2141
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2142
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2143
+ if ($win64) {
2144
+ $rec="%rcx";
2145
+ $frame="%rdx";
2146
+ $context="%r8";
2147
+ $disp="%r9";
2148
+
2149
+ $code.=<<___;
2150
+ .extern __imp_RtlVirtualUnwind
2151
+ .type se_handler,\@abi-omnipotent
2152
+ .align 16
2153
+ se_handler:
2154
+ push %rsi
2155
+ push %rdi
2156
+ push %rbx
2157
+ push %rbp
2158
+ push %r12
2159
+ push %r13
2160
+ push %r14
2161
+ push %r15
2162
+ pushfq
2163
+ sub \$64,%rsp
2164
+
2165
+ mov 120($context),%rax # pull context->Rax
2166
+ mov 248($context),%rbx # pull context->Rip
2167
+
2168
+ mov 8($disp),%rsi # disp->ImageBase
2169
+ mov 56($disp),%r11 # disp->HandlerData
2170
+
2171
+ mov 0(%r11),%r10d # HandlerData[0]
2172
+ lea (%rsi,%r10),%r10 # prologue label
2173
+ cmp %r10,%rbx # context->Rip<prologue label
2174
+ jb .Lin_prologue
2175
+
2176
+ mov 152($context),%rax # pull context->Rsp
2177
+
2178
+ mov 4(%r11),%r10d # HandlerData[1]
2179
+ lea (%rsi,%r10),%r10 # epilogue label
2180
+ cmp %r10,%rbx # context->Rip>=epilogue label
2181
+ jae .Lin_prologue
2182
+ ___
2183
+ $code.=<<___ if ($avx>1);
2184
+ lea .Lavx2_shortcut(%rip),%r10
2185
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
2186
+ jb .Lnot_in_avx2
2187
+
2188
+ and \$-256*$SZ,%rax
2189
+ add \$`2*$SZ*($rounds-8)`,%rax
2190
+ .Lnot_in_avx2:
2191
+ ___
2192
+ $code.=<<___;
2193
+ mov %rax,%rsi # put aside Rsp
2194
+ mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2195
+ lea 48(%rax),%rax
2196
+
2197
+ mov -8(%rax),%rbx
2198
+ mov -16(%rax),%rbp
2199
+ mov -24(%rax),%r12
2200
+ mov -32(%rax),%r13
2201
+ mov -40(%rax),%r14
2202
+ mov -48(%rax),%r15
2203
+ mov %rbx,144($context) # restore context->Rbx
2204
+ mov %rbp,160($context) # restore context->Rbp
2205
+ mov %r12,216($context) # restore context->R12
2206
+ mov %r13,224($context) # restore context->R13
2207
+ mov %r14,232($context) # restore context->R14
2208
+ mov %r15,240($context) # restore context->R15
2209
+
2210
+ lea .Lepilogue(%rip),%r10
2211
+ cmp %r10,%rbx
2212
+ jb .Lin_prologue # non-AVX code
2213
+
2214
+ lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2215
+ lea 512($context),%rdi # &context.Xmm6
2216
+ mov \$`$SZ==4?8:12`,%ecx
2217
+ .long 0xa548f3fc # cld; rep movsq
2218
+
2219
+ .Lin_prologue:
2220
+ mov 8(%rax),%rdi
2221
+ mov 16(%rax),%rsi
2222
+ mov %rax,152($context) # restore context->Rsp
2223
+ mov %rsi,168($context) # restore context->Rsi
2224
+ mov %rdi,176($context) # restore context->Rdi
2225
+
2226
+ mov 40($disp),%rdi # disp->ContextRecord
2227
+ mov $context,%rsi # context
2228
+ mov \$154,%ecx # sizeof(CONTEXT)
2229
+ .long 0xa548f3fc # cld; rep movsq
2230
+
2231
+ mov $disp,%rsi
2232
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2233
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
2234
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
2235
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2236
+ mov 40(%rsi),%r10 # disp->ContextRecord
2237
+ lea 56(%rsi),%r11 # &disp->HandlerData
2238
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
2239
+ mov %r10,32(%rsp) # arg5
2240
+ mov %r11,40(%rsp) # arg6
2241
+ mov %r12,48(%rsp) # arg7
2242
+ mov %rcx,56(%rsp) # arg8, (NULL)
2243
+ call *__imp_RtlVirtualUnwind(%rip)
2244
+
2245
+ mov \$1,%eax # ExceptionContinueSearch
2246
+ add \$64,%rsp
2247
+ popfq
2248
+ pop %r15
2249
+ pop %r14
2250
+ pop %r13
2251
+ pop %r12
2252
+ pop %rbp
2253
+ pop %rbx
2254
+ pop %rdi
2255
+ pop %rsi
2256
+ ret
2257
+ .size se_handler,.-se_handler
2258
+ ___
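+
+ # se_handler is the Win64 SEH handler shared by the code paths above: when
+ # an exception hits between the prologue/epilogue labels recorded in
+ # HandlerData[], it recovers the caller's %rsp from $_rsp, restores the
+ # non-volatile GPRs (plus the saved %xmm6+ registers for the SIMD paths)
+ # into the CONTEXT record, and hands off to RtlVirtualUnwind for the rest of
+ # the unwind.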
2259
+
2260
+ $code.=<<___ if ($SZ==4 && $shaext);
2261
+ .type shaext_handler,\@abi-omnipotent
2262
+ .align 16
2263
+ shaext_handler:
2264
+ push %rsi
2265
+ push %rdi
2266
+ push %rbx
2267
+ push %rbp
2268
+ push %r12
2269
+ push %r13
2270
+ push %r14
2271
+ push %r15
2272
+ pushfq
2273
+ sub \$64,%rsp
2274
+
2275
+ mov 120($context),%rax # pull context->Rax
2276
+ mov 248($context),%rbx # pull context->Rip
2277
+
2278
+ lea .Lprologue_shaext(%rip),%r10
2279
+ cmp %r10,%rbx # context->Rip<.Lprologue
2280
+ jb .Lin_prologue
2281
+
2282
+ lea .Lepilogue_shaext(%rip),%r10
2283
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
2284
+ jae .Lin_prologue
2285
+
2286
+ lea -8-5*16(%rax),%rsi
2287
+ lea 512($context),%rdi # &context.Xmm6
2288
+ mov \$10,%ecx
2289
+ .long 0xa548f3fc # cld; rep movsq
2290
+
2291
+ jmp .Lin_prologue
2292
+ .size shaext_handler,.-shaext_handler
2293
+ ___
2294
+
2295
+ $code.=<<___;
2296
+ .section .pdata
2297
+ .align 4
2298
+ .rva .LSEH_begin_$func
2299
+ .rva .LSEH_end_$func
2300
+ .rva .LSEH_info_$func
2301
+ ___
2302
+ $code.=<<___ if ($SZ==4 && $shaext);
2303
+ .rva .LSEH_begin_${func}_shaext
2304
+ .rva .LSEH_end_${func}_shaext
2305
+ .rva .LSEH_info_${func}_shaext
2306
+ ___
2307
+ $code.=<<___ if ($SZ==4);
2308
+ .rva .LSEH_begin_${func}_ssse3
2309
+ .rva .LSEH_end_${func}_ssse3
2310
+ .rva .LSEH_info_${func}_ssse3
2311
+ ___
2312
+ $code.=<<___ if ($avx && $SZ==8);
2313
+ .rva .LSEH_begin_${func}_xop
2314
+ .rva .LSEH_end_${func}_xop
2315
+ .rva .LSEH_info_${func}_xop
2316
+ ___
2317
+ $code.=<<___ if ($avx);
2318
+ .rva .LSEH_begin_${func}_avx
2319
+ .rva .LSEH_end_${func}_avx
2320
+ .rva .LSEH_info_${func}_avx
2321
+ ___
2322
+ $code.=<<___ if ($avx>1);
2323
+ .rva .LSEH_begin_${func}_avx2
2324
+ .rva .LSEH_end_${func}_avx2
2325
+ .rva .LSEH_info_${func}_avx2
2326
+ ___
2327
+ $code.=<<___;
2328
+ .section .xdata
2329
+ .align 8
2330
+ .LSEH_info_$func:
2331
+ .byte 9,0,0,0
2332
+ .rva se_handler
2333
+ .rva .Lprologue,.Lepilogue # HandlerData[]
2334
+ ___
2335
+ $code.=<<___ if ($SZ==4 && $shaext);
2336
+ .LSEH_info_${func}_shaext:
2337
+ .byte 9,0,0,0
2338
+ .rva shaext_handler
2339
+ ___
2340
+ $code.=<<___ if ($SZ==4);
2341
+ .LSEH_info_${func}_ssse3:
2342
+ .byte 9,0,0,0
2343
+ .rva se_handler
2344
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2345
+ ___
2346
+ $code.=<<___ if ($avx && $SZ==8);
2347
+ .LSEH_info_${func}_xop:
2348
+ .byte 9,0,0,0
2349
+ .rva se_handler
2350
+ .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2351
+ ___
2352
+ $code.=<<___ if ($avx);
2353
+ .LSEH_info_${func}_avx:
2354
+ .byte 9,0,0,0
2355
+ .rva se_handler
2356
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2357
+ ___
2358
+ $code.=<<___ if ($avx>1);
2359
+ .LSEH_info_${func}_avx2:
2360
+ .byte 9,0,0,0
2361
+ .rva se_handler
2362
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2363
+ ___
2364
+ }
2365
+
2366
+ sub sha256op38 {
2367
+ my $instr = shift;
2368
+ my %opcodelet = (
2369
+ "sha256rnds2" => 0xcb,
2370
+ "sha256msg1" => 0xcc,
2371
+ "sha256msg2" => 0xcd );
2372
+
2373
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2374
+ my @opcode=(0x0f,0x38);
2375
+ push @opcode,$opcodelet{$instr};
2376
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2377
+ return ".byte\t".join(',',@opcode);
2378
+ } else {
2379
+ return $instr."\t".@_[0];
2380
+ }
2381
+ }
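+
+ # sha256op38 hand-encodes the SHA extension instructions for assemblers that
+ # do not know the mnemonics: e.g. "sha256rnds2 %xmm0,%xmm1" is emitted as
+ # ".byte 0x0f,0x38,0xcb,0xc8" (ModR/M 0xc8 = mod 11, reg 001 for the %xmm1
+ # destination, r/m 000 for the %xmm0 source); anything that does not match
+ # the plain xmm0-7,xmm0-7 form is passed through unchanged.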
2382
+
2383
+ foreach (split("\n",$code)) {
2384
+ s/\`([^\`]*)\`/eval $1/geo;
2385
+
2386
+ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2387
+
2388
+ print $_,"\n";
2389
+ }
2390
+ close STDOUT;