ring-native 0.0.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
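
The hunks below show two of the added files: the perlasm generators data/vendor/ring/crypto/sha/asm/sha-armv8.pl (+436) and sha-x86_64.pl (+2390), Perl scripts that print SHA-256/512 assembly through the arm-xlate.pl / x86_64-xlate.pl translators. As an editorial illustration of how they are invoked (not part of the gem; the flavour strings and output paths are assumptions, and the gem's real build rules live in data/vendor/ring/mk/ring.mk):

    #!/usr/bin/env perl
    # Editorial sketch only -- based on the arguments the two generators read.
    use strict;
    use warnings;

    my $asm = 'data/vendor/ring/crypto/sha/asm';

    # sha-armv8.pl reads three arguments: flavour, variant ("sha256" or "sha512"),
    # and an output path that it hands on to arm-xlate.pl (stdout when omitted).
    system("perl $asm/sha-armv8.pl linux64 sha256 sha256-armv8.S") == 0
        or die "sha-armv8.pl failed: $?";

    # sha-x86_64.pl reads a flavour and an output name; the name only selects the
    # SHA-512 variant (when it contains "512") -- the assembly itself is written
    # to stdout via x86_64-xlate.pl, so redirect it to the file you want.
    system("perl $asm/sha-x86_64.pl elf sha512-x86_64.S > sha512-x86_64.S") == 0
        or die "sha-x86_64.pl failed: $?";
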
data/vendor/ring/crypto/sha/asm/sha-armv8.pl
@@ -0,0 +1,436 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+ #
10
+ # SHA256/512 for ARMv8.
11
+ #
12
+ # Performance in cycles per processed byte and improvement coefficient
13
+ # over code generated with "default" compiler:
14
+ #
15
+ # SHA256-hw SHA256(*) SHA512
16
+ # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
17
+ # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
18
+ # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
19
+ # Denver 2.01 10.5 (+26%) 6.70 (+8%)
20
+ # X-Gene 20.0 (+100%) 12.8 (+300%(***))
21
+ #
22
+ # (*) Software SHA256 results are of lesser relevance, presented
23
+ # mostly for informational purposes.
24
+ # (**) The result is a trade-off: it's possible to improve it by
25
+ # 10% (or by 1 cycle per round), but at the cost of 20% loss
26
+ # on Cortex-A53 (or by 4 cycles per round).
27
+ # (***) Super-impressive coefficients over gcc-generated code are
28
+ # indication of some compiler "pathology", most notably code
29
+ # generated with -mgeneral-regs-only is significantly faster
30
+ # and the gap is only 40-90%.
31
+
32
+ $flavour=shift;
33
+ # Unlike most perlasm files, sha512-armv8.pl takes an additional argument to
34
+ # determine which hash function to emit. This differs from upstream OpenSSL so
35
+ # that the script may continue to output to stdout.
36
+ $variant=shift;
37
+ $output=shift;
38
+
39
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42
+ die "can't locate arm-xlate.pl";
43
+
44
+ open OUT,"| \"$^X\" $xlate $flavour $output";
45
+ *STDOUT=*OUT;
46
+
47
+ if ($variant eq "sha512") {
48
+ $BITS=512;
49
+ $SZ=8;
50
+ @Sigma0=(28,34,39);
51
+ @Sigma1=(14,18,41);
52
+ @sigma0=(1, 8, 7);
53
+ @sigma1=(19,61, 6);
54
+ $rounds=80;
55
+ $reg_t="x";
56
+ } elsif ($variant eq "sha256") {
57
+ $BITS=256;
58
+ $SZ=4;
59
+ @Sigma0=( 2,13,22);
60
+ @Sigma1=( 6,11,25);
61
+ @sigma0=( 7,18, 3);
62
+ @sigma1=(17,19,10);
63
+ $rounds=64;
64
+ $reg_t="w";
65
+ } else {
66
+ die "Unknown variant: $variant";
67
+ }
68
+
69
+ $func="sha${BITS}_block_data_order";
70
+
71
+ ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
72
+
73
+ @X=map("$reg_t$_",(3..15,0..2));
74
+ @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
75
+ ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
76
+
77
+ sub BODY_00_xx {
78
+ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
79
+ my $j=($i+1)&15;
80
+ my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
81
+ $T0=@X[$i+3] if ($i<11);
82
+
83
+ $code.=<<___ if ($i<16);
84
+ #ifndef __ARMEB__
85
+ rev @X[$i],@X[$i] // $i
86
+ #endif
87
+ ___
88
+ $code.=<<___ if ($i<13 && ($i&1));
89
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
90
+ ___
91
+ $code.=<<___ if ($i==13);
92
+ ldp @X[14],@X[15],[$inp]
93
+ ___
94
+ $code.=<<___ if ($i>=14);
95
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
96
+ ___
97
+ $code.=<<___ if ($i>0 && $i<16);
98
+ add $a,$a,$t1 // h+=Sigma0(a)
99
+ ___
100
+ $code.=<<___ if ($i>=11);
101
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
102
+ ___
103
+ # While ARMv8 specifies merged rotate-n-logical operation such as
104
+ # 'eor x,y,z,ror#n', it was found to negatively affect performance
105
+ # on Apple A7. The reason seems to be that it requires even 'y' to
106
+ # be available earlier. This means that such merged instruction is
107
+ # not necessarily best choice on critical path... On the other hand
108
+ # Cortex-A5x handles merged instructions much better than disjoint
109
+ # rotate and logical... See (**) footnote above.
110
+ $code.=<<___ if ($i<15);
111
+ ror $t0,$e,#$Sigma1[0]
112
+ add $h,$h,$t2 // h+=K[i]
113
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
114
+ and $t1,$f,$e
115
+ bic $t2,$g,$e
116
+ add $h,$h,@X[$i&15] // h+=X[i]
117
+ orr $t1,$t1,$t2 // Ch(e,f,g)
118
+ eor $t2,$a,$b // a^b, b^c in next round
119
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
120
+ ror $T0,$a,#$Sigma0[0]
121
+ add $h,$h,$t1 // h+=Ch(e,f,g)
122
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
123
+ add $h,$h,$t0 // h+=Sigma1(e)
124
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
125
+ add $d,$d,$h // d+=h
126
+ eor $t3,$t3,$b // Maj(a,b,c)
127
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
128
+ add $h,$h,$t3 // h+=Maj(a,b,c)
129
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
130
+ //add $h,$h,$t1 // h+=Sigma0(a)
131
+ ___
132
+ $code.=<<___ if ($i>=15);
133
+ ror $t0,$e,#$Sigma1[0]
134
+ add $h,$h,$t2 // h+=K[i]
135
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
136
+ and $t1,$f,$e
137
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
138
+ bic $t2,$g,$e
139
+ ror $T0,$a,#$Sigma0[0]
140
+ add $h,$h,@X[$i&15] // h+=X[i]
141
+ eor $t0,$t0,$e,ror#$Sigma1[1]
142
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
143
+ orr $t1,$t1,$t2 // Ch(e,f,g)
144
+ eor $t2,$a,$b // a^b, b^c in next round
145
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
146
+ eor $T0,$T0,$a,ror#$Sigma0[1]
147
+ add $h,$h,$t1 // h+=Ch(e,f,g)
148
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
149
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
150
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
151
+ add $h,$h,$t0 // h+=Sigma1(e)
152
+ eor $t3,$t3,$b // Maj(a,b,c)
153
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
154
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
155
+ add @X[$j],@X[$j],@X[($j+9)&15]
156
+ add $d,$d,$h // d+=h
157
+ add $h,$h,$t3 // h+=Maj(a,b,c)
158
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
159
+ add @X[$j],@X[$j],$T1
160
+ add $h,$h,$t1 // h+=Sigma0(a)
161
+ add @X[$j],@X[$j],$T2
162
+ ___
163
+ ($t2,$t3)=($t3,$t2);
164
+ }
165
+
166
+ $code.=<<___;
167
+ #include <openssl/arm_arch.h>
168
+
169
+ .text
170
+
171
+ .extern OPENSSL_armcap_P
172
+ .globl $func
173
+ .type $func,%function
174
+ .align 6
175
+ $func:
176
+ ___
177
+ $code.=<<___ if ($SZ==4);
178
+ ldr x16,.LOPENSSL_armcap_P
179
+ adr x17,.LOPENSSL_armcap_P
180
+ add x16,x16,x17
181
+ ldr w16,[x16]
182
+ tst w16,#ARMV8_SHA256
183
+ b.ne .Lv8_entry
184
+ ___
185
+ $code.=<<___;
186
+ stp x29,x30,[sp,#-128]!
187
+ add x29,sp,#0
188
+
189
+ stp x19,x20,[sp,#16]
190
+ stp x21,x22,[sp,#32]
191
+ stp x23,x24,[sp,#48]
192
+ stp x25,x26,[sp,#64]
193
+ stp x27,x28,[sp,#80]
194
+ sub sp,sp,#4*$SZ
195
+
196
+ ldp $A,$B,[$ctx] // load context
197
+ ldp $C,$D,[$ctx,#2*$SZ]
198
+ ldp $E,$F,[$ctx,#4*$SZ]
199
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
200
+ ldp $G,$H,[$ctx,#6*$SZ]
201
+ adr $Ktbl,.LK$BITS
202
+ stp $ctx,$num,[x29,#96]
203
+
204
+ .Loop:
205
+ ldp @X[0],@X[1],[$inp],#2*$SZ
206
+ ldr $t2,[$Ktbl],#$SZ // *K++
207
+ eor $t3,$B,$C // magic seed
208
+ str $inp,[x29,#112]
209
+ ___
210
+ for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
211
+ $code.=".Loop_16_xx:\n";
212
+ for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
213
+ $code.=<<___;
214
+ cbnz $t2,.Loop_16_xx
215
+
216
+ ldp $ctx,$num,[x29,#96]
217
+ ldr $inp,[x29,#112]
218
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
219
+
220
+ ldp @X[0],@X[1],[$ctx]
221
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
222
+ add $inp,$inp,#14*$SZ // advance input pointer
223
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
224
+ add $A,$A,@X[0]
225
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
226
+ add $B,$B,@X[1]
227
+ add $C,$C,@X[2]
228
+ add $D,$D,@X[3]
229
+ stp $A,$B,[$ctx]
230
+ add $E,$E,@X[4]
231
+ add $F,$F,@X[5]
232
+ stp $C,$D,[$ctx,#2*$SZ]
233
+ add $G,$G,@X[6]
234
+ add $H,$H,@X[7]
235
+ cmp $inp,$num
236
+ stp $E,$F,[$ctx,#4*$SZ]
237
+ stp $G,$H,[$ctx,#6*$SZ]
238
+ b.ne .Loop
239
+
240
+ ldp x19,x20,[x29,#16]
241
+ add sp,sp,#4*$SZ
242
+ ldp x21,x22,[x29,#32]
243
+ ldp x23,x24,[x29,#48]
244
+ ldp x25,x26,[x29,#64]
245
+ ldp x27,x28,[x29,#80]
246
+ ldp x29,x30,[sp],#128
247
+ ret
248
+ .size $func,.-$func
249
+
250
+ .align 6
251
+ .type .LK$BITS,%object
252
+ .LK$BITS:
253
+ ___
254
+ $code.=<<___ if ($SZ==8);
255
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
256
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
257
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
258
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
259
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
260
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
261
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
262
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
263
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
264
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
265
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
266
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
267
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
268
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
269
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
270
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
271
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
272
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
273
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
274
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
275
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
276
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
277
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
278
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
279
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
280
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
281
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
282
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
283
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
284
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
285
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
286
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
287
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
288
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
289
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
290
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
291
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
292
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
293
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
294
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
295
+ .quad 0 // terminator
296
+ ___
297
+ $code.=<<___ if ($SZ==4);
298
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
299
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
300
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
301
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
302
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
303
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
304
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
305
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
306
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
307
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
308
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
309
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
310
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
311
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
312
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
313
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
314
+ .long 0 //terminator
315
+ ___
316
+ $code.=<<___;
317
+ .size .LK$BITS,.-.LK$BITS
318
+ .align 3
319
+ .LOPENSSL_armcap_P:
320
+ .quad OPENSSL_armcap_P-.
321
+ .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
322
+ .align 2
323
+ ___
324
+
325
+ if ($SZ==4) {
326
+ my $Ktbl="x3";
327
+
328
+ my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
329
+ my @MSG=map("v$_.16b",(4..7));
330
+ my ($W0,$W1)=("v16.4s","v17.4s");
331
+ my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
332
+
333
+ $code.=<<___;
334
+ .type sha256_block_armv8,%function
335
+ .align 6
336
+ sha256_block_armv8:
337
+ .Lv8_entry:
338
+ stp x29,x30,[sp,#-16]!
339
+ add x29,sp,#0
340
+
341
+ ld1.32 {$ABCD,$EFGH},[$ctx]
342
+ adr $Ktbl,.LK256
343
+
344
+ .Loop_hw:
345
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
346
+ sub $num,$num,#1
347
+ ld1.32 {$W0},[$Ktbl],#16
348
+ rev32 @MSG[0],@MSG[0]
349
+ rev32 @MSG[1],@MSG[1]
350
+ rev32 @MSG[2],@MSG[2]
351
+ rev32 @MSG[3],@MSG[3]
352
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
353
+ orr $EFGH_SAVE,$EFGH,$EFGH
354
+ ___
355
+ for($i=0;$i<12;$i++) {
356
+ $code.=<<___;
357
+ ld1.32 {$W1},[$Ktbl],#16
358
+ add.i32 $W0,$W0,@MSG[0]
359
+ sha256su0 @MSG[0],@MSG[1]
360
+ orr $abcd,$ABCD,$ABCD
361
+ sha256h $ABCD,$EFGH,$W0
362
+ sha256h2 $EFGH,$abcd,$W0
363
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
364
+ ___
365
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
366
+ }
367
+ $code.=<<___;
368
+ ld1.32 {$W1},[$Ktbl],#16
369
+ add.i32 $W0,$W0,@MSG[0]
370
+ orr $abcd,$ABCD,$ABCD
371
+ sha256h $ABCD,$EFGH,$W0
372
+ sha256h2 $EFGH,$abcd,$W0
373
+
374
+ ld1.32 {$W0},[$Ktbl],#16
375
+ add.i32 $W1,$W1,@MSG[1]
376
+ orr $abcd,$ABCD,$ABCD
377
+ sha256h $ABCD,$EFGH,$W1
378
+ sha256h2 $EFGH,$abcd,$W1
379
+
380
+ ld1.32 {$W1},[$Ktbl]
381
+ add.i32 $W0,$W0,@MSG[2]
382
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
383
+ orr $abcd,$ABCD,$ABCD
384
+ sha256h $ABCD,$EFGH,$W0
385
+ sha256h2 $EFGH,$abcd,$W0
386
+
387
+ add.i32 $W1,$W1,@MSG[3]
388
+ orr $abcd,$ABCD,$ABCD
389
+ sha256h $ABCD,$EFGH,$W1
390
+ sha256h2 $EFGH,$abcd,$W1
391
+
392
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
393
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
394
+
395
+ cbnz $num,.Loop_hw
396
+
397
+ st1.32 {$ABCD,$EFGH},[$ctx]
398
+
399
+ ldr x29,[sp],#16
400
+ ret
401
+ .size sha256_block_armv8,.-sha256_block_armv8
402
+ ___
403
+ }
404
+
405
+ $code.=<<___;
406
+ .comm OPENSSL_armcap_P,4,4
407
+ ___
408
+
409
+ { my %opcode = (
410
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
411
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
412
+
413
+ sub unsha256 {
414
+ my ($mnemonic,$arg)=@_;
415
+
416
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
417
+ &&
418
+ sprintf ".inst\t0x%08x\t//%s %s",
419
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
420
+ $mnemonic,$arg;
421
+ }
422
+ }
423
+
424
+ foreach(split("\n",$code)) {
425
+
426
+ s/\`([^\`]*)\`/eval($1)/geo;
427
+
428
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
429
+
430
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
431
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
432
+
433
+ print $_,"\n";
434
+ }
435
+
436
+ close STDOUT;
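
Both generators emit the same FIPS 180-4 compression function; the next hunk (sha-x86_64.pl) schedules it in ROUND_00_15/ROUND_16_XX and body_00_15. As a reading aid, here is a hedged plain-Perl sketch of a single SHA-256 round (editorial, not code from the gem); the assembly reaches the same result but derives Maj(a,b,c) as ((a^b) & (b^c)) ^ b so the a^b term can be reused in the next round, as its comments note.

    # Editorial reference: one SHA-256 compression round in plain Perl, matching
    # what ROUND_00_15 below computes with @Sigma0=(2,13,22) and @Sigma1=(6,11,25).
    use strict;
    use warnings;

    sub rotr32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }

    sub sha256_round {
        my ($state, $k, $w) = @_;              # $state = [a..h], $k = K[i], $w = W[i]
        my ($a, $b, $c, $d, $e, $f, $g, $h) = @$state;

        my $S1  = rotr32($e, 6) ^ rotr32($e, 11) ^ rotr32($e, 25);   # Sigma1(e)
        my $ch  = ($e & $f) ^ (~$e & $g);                            # Ch(e,f,g)
        my $T1  = ($h + $S1 + $ch + $k + $w) & 0xffffffff;

        my $S0  = rotr32($a, 2) ^ rotr32($a, 13) ^ rotr32($a, 22);   # Sigma0(a)
        my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);                 # Maj(a,b,c)
        my $T2  = ($S0 + $maj) & 0xffffffff;

        # New working state: a' = T1+T2, e' = d+T1, the rest shifts down one slot.
        return [ ($T1 + $T2) & 0xffffffff, $a, $b, $c,
                 ($d + $T1) & 0xffffffff,  $e, $f, $g ];
    }

A block is 64 (or 80, for SHA-512) such rounds over the message schedule and the K table, after which the eight words are added back into the chaining state -- the add/stp and add/mov sequences that close each .Lloop in these files.
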
data/vendor/ring/crypto/sha/asm/sha-x86_64.pl
@@ -0,0 +1,2390 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. Rights for redistribution and usage in source and binary
6
+ # forms are granted according to the OpenSSL license.
7
+ # ====================================================================
8
+ #
9
+ # sha256/512_block procedure for x86_64.
10
+ #
11
+ # 40% improvement over compiler-generated code on Opteron. On EM64T
12
+ # sha256 was observed to run >80% faster and sha512 - >40%. No magical
13
+ # tricks, just straight implementation... I really wonder why gcc
14
+ # [being armed with inline assembler] fails to generate as fast code.
15
+ # The only thing which is cool about this module is that it's very
16
+ # same instruction sequence used for both SHA-256 and SHA-512. In
17
+ # former case the instructions operate on 32-bit operands, while in
18
+ # latter - on 64-bit ones. All I had to do is to get one flavor right,
19
+ # the other one passed the test right away:-)
20
+ #
21
+ # sha256_block runs in ~1005 cycles on Opteron, which gives you
22
+ # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23
+ # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24
+ # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25
+ # Well, if you compare it to IA-64 implementation, which maintains
26
+ # X[16] in register bank[!], tends to 4 instructions per CPU clock
27
+ # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28
+ # issue Opteron pipeline and X[16] maintained in memory. So that *if*
29
+ # there is a way to improve it, *then* the only way would be to try to
30
+ # offload X[16] updates to SSE unit, but that would require "deeper"
31
+ # loop unroll, which in turn would naturally cause size blow-up, not
32
+ # to mention increased complexity! And once again, only *if* it's
33
+ # actually possible to noticeably improve overall ILP, instruction
34
+ # level parallelism, on a given CPU implementation in this case.
35
+ #
36
+ # Special note on Intel EM64T. While Opteron CPU exhibits perfect
37
+ # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38
+ # [currently available] EM64T CPUs apparently are far from it. On the
39
+ # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40
+ # sha256_block:-( This is presumably because 64-bit shifts/rotates
41
+ # apparently are not atomic instructions, but implemented in microcode.
42
+ #
43
+ # May 2012.
44
+ #
45
+ # Optimization including one of Pavel Semjanov's ideas, alternative
46
+ # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
47
+ # unfortunately -2% SHA512 on P4 [which nobody should care about
48
+ # that much].
49
+ #
50
+ # June 2012.
51
+ #
52
+ # Add SIMD code paths, see below for improvement coefficients. SSSE3
53
+ # code path was not attempted for SHA512, because improvement is not
54
+ # estimated to be high enough, noticeably less than 9%, to justify
55
+ # the effort, not on pre-AVX processors. [Obviously with exclusion
56
+ # for VIA Nano, but it has SHA512 instruction that is faster and
57
+ # should be used instead.] For reference, corresponding estimated
58
+ # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
59
+ # higher coefficients are observed on VIA Nano and Bulldozer has more
60
+ # to do with specifics of their architecture [which is topic for
61
+ # separate discussion].
62
+ #
63
+ # November 2012.
64
+ #
65
+ # Add AVX2 code path. Two consecutive input blocks are loaded to
66
+ # 256-bit %ymm registers, with data from first block to least
67
+ # significant 128-bit halves and data from second to most significant.
68
+ # The data is then processed with same SIMD instruction sequence as
69
+ # for AVX, but with %ymm as operands. Side effect is increased stack
70
+ # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
71
+ # code size increase.
72
+ #
73
+ # March 2014.
74
+ #
75
+ # Add support for Intel SHA Extensions.
76
+
77
+ ######################################################################
78
+ # Current performance in cycles per processed byte (less is better):
79
+ #
80
+ # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
81
+ #
82
+ # AMD K8 14.9 - - 9.57 -
83
+ # P4 17.3 - - 30.8 -
84
+ # Core 2 15.6 13.8(+13%) - 9.97 -
85
+ # Westmere 14.8 12.3(+19%) - 9.58 -
86
+ # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
87
+ # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
88
+ # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
89
+ # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
90
+ # VIA Nano 23.0 16.5(+39%) - 14.7 -
91
+ # Atom 23.0 18.9(+22%) - 14.7 -
92
+ # Silvermont 27.4 20.6(+33%) - 17.5 -
93
+ #
94
+ # (*) whichever best applicable;
95
+ # (**) switch from ror to shrd stands for fair share of improvement;
96
+ # (***) execution time is fully determined by remaining integer-only
97
+ # part, body_00_15; reducing the amount of SIMD instructions
98
+ # below certain limit makes no difference/sense; to conserve
99
+ # space SHA256 XOP code path is therefore omitted;
100
+
101
+ $flavour = shift;
102
+ $output = shift;
103
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104
+
105
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106
+
107
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110
+ die "can't locate x86_64-xlate.pl";
111
+
112
+ # In upstream, this is controlled by shelling out to the compiler to check
113
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
114
+ # output, so this isn't useful anyway.
115
+ #
116
+ # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
117
+ # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
118
+ # did not tie them together until after $shaext was added.
119
+ $avx = 1;
120
+
121
+ # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
122
+ # been tested.
123
+ $shaext=0; ### set to zero if compiling for 1.0.1
124
+ $avx=1 if (!$shaext && $avx);
125
+
126
+ open OUT,"| \"$^X\" $xlate $flavour";
127
+ *STDOUT=*OUT;
128
+
129
+ if ($output =~ /512/) {
130
+ $func="sha512_block_data_order";
131
+ $TABLE="K512";
132
+ $SZ=8;
133
+ @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
134
+ "%r8", "%r9", "%r10","%r11");
135
+ ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
136
+ @Sigma0=(28,34,39);
137
+ @Sigma1=(14,18,41);
138
+ @sigma0=(1, 8, 7);
139
+ @sigma1=(19,61, 6);
140
+ $rounds=80;
141
+ } else {
142
+ $func="sha256_block_data_order";
143
+ $TABLE="K256";
144
+ $SZ=4;
145
+ @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
146
+ "%r8d","%r9d","%r10d","%r11d");
147
+ ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
148
+ @Sigma0=( 2,13,22);
149
+ @Sigma1=( 6,11,25);
150
+ @sigma0=( 7,18, 3);
151
+ @sigma1=(17,19,10);
152
+ $rounds=64;
153
+ }
154
+
155
+ $ctx="%rdi"; # 1st arg, zapped by $a3
156
+ $inp="%rsi"; # 2nd arg
157
+ $Tbl="%rbp";
158
+
159
+ $_ctx="16*$SZ+0*8(%rsp)";
160
+ $_inp="16*$SZ+1*8(%rsp)";
161
+ $_end="16*$SZ+2*8(%rsp)";
162
+ $_rsp="16*$SZ+3*8(%rsp)";
163
+ $framesz="16*$SZ+4*8";
164
+
165
+
166
+ sub ROUND_00_15()
167
+ { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
168
+ my $STRIDE=$SZ;
169
+ $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
170
+
171
+ $code.=<<___;
172
+ ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
173
+ mov $f,$a2
174
+
175
+ xor $e,$a0
176
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
177
+ xor $g,$a2 # f^g
178
+
179
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
180
+ xor $a,$a1
181
+ and $e,$a2 # (f^g)&e
182
+
183
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
184
+ add $h,$T1 # T1+=h
185
+ xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
186
+
187
+ ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
188
+ xor $e,$a0
189
+ add $a2,$T1 # T1+=Ch(e,f,g)
190
+
191
+ mov $a,$a2
192
+ add ($Tbl),$T1 # T1+=K[round]
193
+ xor $a,$a1
194
+
195
+ xor $b,$a2 # a^b, b^c in next round
196
+ ror \$$Sigma1[0],$a0 # Sigma1(e)
197
+ mov $b,$h
198
+
199
+ and $a2,$a3
200
+ ror \$$Sigma0[0],$a1 # Sigma0(a)
201
+ add $a0,$T1 # T1+=Sigma1(e)
202
+
203
+ xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
204
+ add $T1,$d # d+=T1
205
+ add $T1,$h # h+=T1
206
+
207
+ lea $STRIDE($Tbl),$Tbl # round++
208
+ ___
209
+ $code.=<<___ if ($i<15);
210
+ add $a1,$h # h+=Sigma0(a)
211
+ ___
212
+ ($a2,$a3) = ($a3,$a2);
213
+ }
214
+
215
+ sub ROUND_16_XX()
216
+ { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
217
+
218
+ $code.=<<___;
219
+ mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
220
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
221
+
222
+ mov $a0,$T1
223
+ ror \$`$sigma0[1]-$sigma0[0]`,$a0
224
+ add $a1,$a # modulo-scheduled h+=Sigma0(a)
225
+ mov $a2,$a1
226
+ ror \$`$sigma1[1]-$sigma1[0]`,$a2
227
+
228
+ xor $T1,$a0
229
+ shr \$$sigma0[2],$T1
230
+ ror \$$sigma0[0],$a0
231
+ xor $a1,$a2
232
+ shr \$$sigma1[2],$a1
233
+
234
+ ror \$$sigma1[0],$a2
235
+ xor $a0,$T1 # sigma0(X[(i+1)&0xf])
236
+ xor $a1,$a2 # sigma1(X[(i+14)&0xf])
237
+ add `$SZ*(($i+9)&0xf)`(%rsp),$T1
238
+
239
+ add `$SZ*($i&0xf)`(%rsp),$T1
240
+ mov $e,$a0
241
+ add $a2,$T1
242
+ mov $a,$a1
243
+ ___
244
+ &ROUND_00_15(@_);
245
+ }
246
+
247
+ $code=<<___;
248
+ .text
249
+
250
+ .extern OPENSSL_ia32cap_P
251
+ .globl $func
252
+ .type $func,\@function,3
253
+ .align 16
254
+ $func:
255
+ ___
256
+ $code.=<<___ if ($SZ==4 || $avx);
257
+ lea OPENSSL_ia32cap_P(%rip),%r11
258
+ mov 0(%r11),%r9d
259
+ mov 4(%r11),%r10d
260
+ mov 8(%r11),%r11d
261
+ ___
262
+ $code.=<<___ if ($SZ==4 && $shaext);
263
+ test \$`1<<29`,%r11d # check for SHA
264
+ jnz _shaext_shortcut
265
+ ___
266
+ $code.=<<___ if ($avx && $SZ==8);
267
+ test \$`1<<11`,%r10d # check for XOP
268
+ jnz .Lxop_shortcut
269
+ ___
270
+ $code.=<<___ if ($avx>1);
271
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
272
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
273
+ je .Lavx2_shortcut
274
+ ___
275
+ $code.=<<___ if ($avx);
276
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
277
+ and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
278
+ or %r9d,%r10d
279
+ cmp \$`1<<28|1<<9|1<<30`,%r10d
280
+ je .Lavx_shortcut
281
+ ___
282
+ $code.=<<___ if ($SZ==4);
283
+ test \$`1<<9`,%r10d
284
+ jnz .Lssse3_shortcut
285
+ ___
286
+ $code.=<<___;
287
+ push %rbx
288
+ push %rbp
289
+ push %r12
290
+ push %r13
291
+ push %r14
292
+ push %r15
293
+ mov %rsp,%r11 # copy %rsp
294
+ shl \$4,%rdx # num*16
295
+ sub \$$framesz,%rsp
296
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
297
+ and \$-64,%rsp # align stack frame
298
+ mov $ctx,$_ctx # save ctx, 1st arg
299
+ mov $inp,$_inp # save inp, 2nd arg
300
+ mov %rdx,$_end # save end pointer, "3rd" arg
301
+ mov %r11,$_rsp # save copy of %rsp
302
+ .Lprologue:
303
+
304
+ mov $SZ*0($ctx),$A
305
+ mov $SZ*1($ctx),$B
306
+ mov $SZ*2($ctx),$C
307
+ mov $SZ*3($ctx),$D
308
+ mov $SZ*4($ctx),$E
309
+ mov $SZ*5($ctx),$F
310
+ mov $SZ*6($ctx),$G
311
+ mov $SZ*7($ctx),$H
312
+ jmp .Lloop
313
+
314
+ .align 16
315
+ .Lloop:
316
+ mov $B,$a3
317
+ lea $TABLE(%rip),$Tbl
318
+ xor $C,$a3 # magic
319
+ ___
320
+ for($i=0;$i<16;$i++) {
321
+ $code.=" mov $SZ*$i($inp),$T1\n";
322
+ $code.=" mov @ROT[4],$a0\n";
323
+ $code.=" mov @ROT[0],$a1\n";
324
+ $code.=" bswap $T1\n";
325
+ &ROUND_00_15($i,@ROT);
326
+ unshift(@ROT,pop(@ROT));
327
+ }
328
+ $code.=<<___;
329
+ jmp .Lrounds_16_xx
330
+ .align 16
331
+ .Lrounds_16_xx:
332
+ ___
333
+ for(;$i<32;$i++) {
334
+ &ROUND_16_XX($i,@ROT);
335
+ unshift(@ROT,pop(@ROT));
336
+ }
337
+
338
+ $code.=<<___;
339
+ cmpb \$0,`$SZ-1`($Tbl)
340
+ jnz .Lrounds_16_xx
341
+
342
+ mov $_ctx,$ctx
343
+ add $a1,$A # modulo-scheduled h+=Sigma0(a)
344
+ lea 16*$SZ($inp),$inp
345
+
346
+ add $SZ*0($ctx),$A
347
+ add $SZ*1($ctx),$B
348
+ add $SZ*2($ctx),$C
349
+ add $SZ*3($ctx),$D
350
+ add $SZ*4($ctx),$E
351
+ add $SZ*5($ctx),$F
352
+ add $SZ*6($ctx),$G
353
+ add $SZ*7($ctx),$H
354
+
355
+ cmp $_end,$inp
356
+
357
+ mov $A,$SZ*0($ctx)
358
+ mov $B,$SZ*1($ctx)
359
+ mov $C,$SZ*2($ctx)
360
+ mov $D,$SZ*3($ctx)
361
+ mov $E,$SZ*4($ctx)
362
+ mov $F,$SZ*5($ctx)
363
+ mov $G,$SZ*6($ctx)
364
+ mov $H,$SZ*7($ctx)
365
+ jb .Lloop
366
+
367
+ mov $_rsp,%rsi
368
+ mov (%rsi),%r15
369
+ mov 8(%rsi),%r14
370
+ mov 16(%rsi),%r13
371
+ mov 24(%rsi),%r12
372
+ mov 32(%rsi),%rbp
373
+ mov 40(%rsi),%rbx
374
+ lea 48(%rsi),%rsp
375
+ .Lepilogue:
376
+ ret
377
+ .size $func,.-$func
378
+ ___
379
+
380
+ if ($SZ==4) {
381
+ $code.=<<___;
382
+ .align 64
383
+ .type $TABLE,\@object
384
+ $TABLE:
385
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
386
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
387
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
388
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
389
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
390
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
391
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
392
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
393
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
394
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
395
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
396
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
397
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
398
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
399
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
400
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
401
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
402
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
403
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
404
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
405
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
406
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
407
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
408
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
409
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
410
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
411
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
412
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
413
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
414
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
415
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
416
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
417
+
418
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
419
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
420
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
421
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
422
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
423
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
424
+ .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
425
+ ___
426
+ } else {
427
+ $code.=<<___;
428
+ .align 64
429
+ .type $TABLE,\@object
430
+ $TABLE:
431
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
432
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
433
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
435
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
436
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
437
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
438
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
439
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
440
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
441
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
443
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
444
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
445
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
446
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
447
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
448
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
449
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
451
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
452
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
453
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
455
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
456
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
457
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
458
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
459
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
460
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
461
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
462
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
463
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
464
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
465
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
467
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
468
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
469
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
470
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
471
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
472
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
473
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
474
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
475
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
476
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
477
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
478
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
479
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
480
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
481
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
483
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
485
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
487
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
488
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
489
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
490
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
491
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
492
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
493
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
494
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
495
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
496
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
497
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
499
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
500
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
501
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
502
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
503
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
504
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
505
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
507
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
509
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
510
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
511
+
512
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
513
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
514
+ .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
515
+ ___
516
+ }
517
+
518
+ ######################################################################
519
+ # SIMD code paths
520
+ #
521
+ if ($SZ==4 && $shaext) {{{
522
+ ######################################################################
523
+ # Intel SHA Extensions implementation of SHA256 update function.
524
+ #
525
+ my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
526
+
527
+ my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528
+ my @MSG=map("%xmm$_",(3..6));
529
+
530
+ $code.=<<___;
531
+ .type sha256_block_data_order_shaext,\@function,3
532
+ .align 64
533
+ sha256_block_data_order_shaext:
534
+ _shaext_shortcut:
535
+ ___
536
+ $code.=<<___ if ($win64);
537
+ lea `-8-5*16`(%rsp),%rsp
538
+ movaps %xmm6,-8-5*16(%rax)
539
+ movaps %xmm7,-8-4*16(%rax)
540
+ movaps %xmm8,-8-3*16(%rax)
541
+ movaps %xmm9,-8-2*16(%rax)
542
+ movaps %xmm10,-8-1*16(%rax)
543
+ .Lprologue_shaext:
544
+ ___
545
+ $code.=<<___;
546
+ lea K256+0x80(%rip),$Tbl
547
+ movdqu ($ctx),$ABEF # DCBA
548
+ movdqu 16($ctx),$CDGH # HGFE
549
+ movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
550
+
551
+ pshufd \$0x1b,$ABEF,$Wi # ABCD
552
+ pshufd \$0xb1,$ABEF,$ABEF # CDAB
553
+ pshufd \$0x1b,$CDGH,$CDGH # EFGH
554
+ movdqa $TMP,$BSWAP # offload
555
+ palignr \$8,$CDGH,$ABEF # ABEF
556
+ punpcklqdq $Wi,$CDGH # CDGH
557
+ jmp .Loop_shaext
558
+
559
+ .align 16
560
+ .Loop_shaext:
561
+ movdqu ($inp),@MSG[0]
562
+ movdqu 0x10($inp),@MSG[1]
563
+ movdqu 0x20($inp),@MSG[2]
564
+ pshufb $TMP,@MSG[0]
565
+ movdqu 0x30($inp),@MSG[3]
566
+
567
+ movdqa 0*32-0x80($Tbl),$Wi
568
+ paddd @MSG[0],$Wi
569
+ pshufb $TMP,@MSG[1]
570
+ movdqa $CDGH,$CDGH_SAVE # offload
571
+ sha256rnds2 $ABEF,$CDGH # 0-3
572
+ pshufd \$0x0e,$Wi,$Wi
573
+ nop
574
+ movdqa $ABEF,$ABEF_SAVE # offload
575
+ sha256rnds2 $CDGH,$ABEF
576
+
577
+ movdqa 1*32-0x80($Tbl),$Wi
578
+ paddd @MSG[1],$Wi
579
+ pshufb $TMP,@MSG[2]
580
+ sha256rnds2 $ABEF,$CDGH # 4-7
581
+ pshufd \$0x0e,$Wi,$Wi
582
+ lea 0x40($inp),$inp
583
+ sha256msg1 @MSG[1],@MSG[0]
584
+ sha256rnds2 $CDGH,$ABEF
585
+
586
+ movdqa 2*32-0x80($Tbl),$Wi
587
+ paddd @MSG[2],$Wi
588
+ pshufb $TMP,@MSG[3]
589
+ sha256rnds2 $ABEF,$CDGH # 8-11
590
+ pshufd \$0x0e,$Wi,$Wi
591
+ movdqa @MSG[3],$TMP
592
+ palignr \$4,@MSG[2],$TMP
593
+ nop
594
+ paddd $TMP,@MSG[0]
595
+ sha256msg1 @MSG[2],@MSG[1]
596
+ sha256rnds2 $CDGH,$ABEF
597
+
598
+ movdqa 3*32-0x80($Tbl),$Wi
599
+ paddd @MSG[3],$Wi
600
+ sha256msg2 @MSG[3],@MSG[0]
601
+ sha256rnds2 $ABEF,$CDGH # 12-15
602
+ pshufd \$0x0e,$Wi,$Wi
603
+ movdqa @MSG[0],$TMP
604
+ palignr \$4,@MSG[3],$TMP
605
+ nop
606
+ paddd $TMP,@MSG[1]
607
+ sha256msg1 @MSG[3],@MSG[2]
608
+ sha256rnds2 $CDGH,$ABEF
609
+ ___
610
+ for($i=4;$i<16-3;$i++) {
611
+ $code.=<<___;
612
+ movdqa $i*32-0x80($Tbl),$Wi
613
+ paddd @MSG[0],$Wi
614
+ sha256msg2 @MSG[0],@MSG[1]
615
+ sha256rnds2 $ABEF,$CDGH # 16-19...
616
+ pshufd \$0x0e,$Wi,$Wi
617
+ movdqa @MSG[1],$TMP
618
+ palignr \$4,@MSG[0],$TMP
619
+ nop
620
+ paddd $TMP,@MSG[2]
621
+ sha256msg1 @MSG[0],@MSG[3]
622
+ sha256rnds2 $CDGH,$ABEF
623
+ ___
624
+ push(@MSG,shift(@MSG));
625
+ }
626
+ $code.=<<___;
627
+ movdqa 13*32-0x80($Tbl),$Wi
628
+ paddd @MSG[0],$Wi
629
+ sha256msg2 @MSG[0],@MSG[1]
630
+ sha256rnds2 $ABEF,$CDGH # 52-55
631
+ pshufd \$0x0e,$Wi,$Wi
632
+ movdqa @MSG[1],$TMP
633
+ palignr \$4,@MSG[0],$TMP
634
+ sha256rnds2 $CDGH,$ABEF
635
+ paddd $TMP,@MSG[2]
636
+
637
+ movdqa 14*32-0x80($Tbl),$Wi
638
+ paddd @MSG[1],$Wi
639
+ sha256rnds2 $ABEF,$CDGH # 56-59
640
+ pshufd \$0x0e,$Wi,$Wi
641
+ sha256msg2 @MSG[1],@MSG[2]
642
+ movdqa $BSWAP,$TMP
643
+ sha256rnds2 $CDGH,$ABEF
644
+
645
+ movdqa 15*32-0x80($Tbl),$Wi
646
+ paddd @MSG[2],$Wi
647
+ nop
648
+ sha256rnds2 $ABEF,$CDGH # 60-63
649
+ pshufd \$0x0e,$Wi,$Wi
650
+ dec $num
651
+ nop
652
+ sha256rnds2 $CDGH,$ABEF
653
+
654
+ paddd $CDGH_SAVE,$CDGH
655
+ paddd $ABEF_SAVE,$ABEF
656
+ jnz .Loop_shaext
657
+
658
+ pshufd \$0xb1,$CDGH,$CDGH # DCHG
659
+ pshufd \$0x1b,$ABEF,$TMP # FEBA
660
+ pshufd \$0xb1,$ABEF,$ABEF # BAFE
661
+ punpckhqdq $CDGH,$ABEF # DCBA
662
+ palignr \$8,$TMP,$CDGH # HGFE
663
+
664
+ movdqu $ABEF,($ctx)
665
+ movdqu $CDGH,16($ctx)
666
+ ___
667
+ $code.=<<___ if ($win64);
668
+ movaps -8-5*16(%rax),%xmm6
669
+ movaps -8-4*16(%rax),%xmm7
670
+ movaps -8-3*16(%rax),%xmm8
671
+ movaps -8-2*16(%rax),%xmm9
672
+ movaps -8-1*16(%rax),%xmm10
673
+ mov %rax,%rsp
674
+ .Lepilogue_shaext:
675
+ ___
676
+ $code.=<<___;
677
+ ret
678
+ .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
679
+ ___
680
+ }}}
681
+ {{{
682
+
683
+ my $a4=$T1;
684
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
685
+
686
+ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
687
+ { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
688
+ my $arg = pop;
689
+ $arg = "\$$arg" if ($arg*1 eq $arg);
690
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
691
+ }
692
+
693
+ sub body_00_15 () {
694
+ (
695
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
696
+
697
+ '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
698
+ '&mov ($a,$a1)',
699
+ '&mov ($a4,$f)',
700
+
701
+ '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
702
+ '&xor ($a0,$e)',
703
+ '&xor ($a4,$g)', # f^g
704
+
705
+ '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
706
+ '&xor ($a1,$a)',
707
+ '&and ($a4,$e)', # (f^g)&e
708
+
709
+ '&xor ($a0,$e)',
710
+ '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
711
+ '&mov ($a2,$a)',
712
+
713
+ '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
714
+ '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
715
+ '&xor ($a2,$b)', # a^b, b^c in next round
716
+
717
+ '&add ($h,$a4)', # h+=Ch(e,f,g)
718
+ '&ror ($a0,$Sigma1[0])', # Sigma1(e)
719
+ '&and ($a3,$a2)', # (b^c)&(a^b)
720
+
721
+ '&xor ($a1,$a)',
722
+ '&add ($h,$a0)', # h+=Sigma1(e)
723
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
724
+
725
+ '&ror ($a1,$Sigma0[0])', # Sigma0(a)
726
+ '&add ($d,$h)', # d+=h
727
+ '&add ($h,$a3)', # h+=Maj(a,b,c)
728
+
729
+ '&mov ($a0,$d)',
730
+ '&add ($a1,$h);'. # h+=Sigma0(a)
731
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
732
+ );
733
+ }
734
+
735
+ ######################################################################
736
+ # SSSE3 code path
737
+ #
738
+ if ($SZ==4) { # SHA256 only
739
+ my @X = map("%xmm$_",(0..3));
740
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
741
+
742
+ $code.=<<___;
743
+ .type ${func}_ssse3,\@function,3
744
+ .align 64
745
+ ${func}_ssse3:
746
+ .Lssse3_shortcut:
747
+ push %rbx
748
+ push %rbp
749
+ push %r12
750
+ push %r13
751
+ push %r14
752
+ push %r15
753
+ mov %rsp,%r11 # copy %rsp
754
+ shl \$4,%rdx # num*16
755
+ sub \$`$framesz+$win64*16*4`,%rsp
756
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
757
+ and \$-64,%rsp # align stack frame
758
+ mov $ctx,$_ctx # save ctx, 1st arg
759
+ mov $inp,$_inp # save inp, 2nd arg
760
+ mov %rdx,$_end # save end pointer, "3rd" arg
761
+ mov %r11,$_rsp # save copy of %rsp
762
+ ___
763
+ $code.=<<___ if ($win64);
764
+ movaps %xmm6,16*$SZ+32(%rsp)
765
+ movaps %xmm7,16*$SZ+48(%rsp)
766
+ movaps %xmm8,16*$SZ+64(%rsp)
767
+ movaps %xmm9,16*$SZ+80(%rsp)
768
+ ___
769
+ $code.=<<___;
770
+ .Lprologue_ssse3:
771
+
772
+ mov $SZ*0($ctx),$A
773
+ mov $SZ*1($ctx),$B
774
+ mov $SZ*2($ctx),$C
775
+ mov $SZ*3($ctx),$D
776
+ mov $SZ*4($ctx),$E
777
+ mov $SZ*5($ctx),$F
778
+ mov $SZ*6($ctx),$G
779
+ mov $SZ*7($ctx),$H
780
+ ___
781
+
782
+ $code.=<<___;
783
+ #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
784
+ #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
785
+ jmp .Lloop_ssse3
786
+ .align 16
787
+ .Lloop_ssse3:
788
+ movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
789
+ movdqu 0x00($inp),@X[0]
790
+ movdqu 0x10($inp),@X[1]
791
+ movdqu 0x20($inp),@X[2]
792
+ pshufb $t3,@X[0]
793
+ movdqu 0x30($inp),@X[3]
794
+ lea $TABLE(%rip),$Tbl
795
+ pshufb $t3,@X[1]
796
+ movdqa 0x00($Tbl),$t0
797
+ movdqa 0x20($Tbl),$t1
798
+ pshufb $t3,@X[2]
799
+ paddd @X[0],$t0
800
+ movdqa 0x40($Tbl),$t2
801
+ pshufb $t3,@X[3]
802
+ movdqa 0x60($Tbl),$t3
803
+ paddd @X[1],$t1
804
+ paddd @X[2],$t2
805
+ paddd @X[3],$t3
806
+ movdqa $t0,0x00(%rsp)
807
+ mov $A,$a1
808
+ movdqa $t1,0x10(%rsp)
809
+ mov $B,$a3
810
+ movdqa $t2,0x20(%rsp)
811
+ xor $C,$a3 # magic
812
+ movdqa $t3,0x30(%rsp)
813
+ mov $E,$a0
814
+ jmp .Lssse3_00_47
815
+
816
+ .align 16
817
+ .Lssse3_00_47:
818
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
819
+ ___
820
+ sub Xupdate_256_SSSE3 () {
821
+ (
822
+ '&movdqa ($t0,@X[1]);',
823
+ '&movdqa ($t3,@X[3])',
824
+ '&palignr ($t0,@X[0],$SZ)', # X[1..4]
825
+ '&palignr ($t3,@X[2],$SZ);', # X[9..12]
826
+ '&movdqa ($t1,$t0)',
827
+ '&movdqa ($t2,$t0);',
828
+ '&psrld ($t0,$sigma0[2])',
829
+ '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
830
+ '&psrld ($t2,$sigma0[0])',
831
+ '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
832
+ '&pslld ($t1,8*$SZ-$sigma0[1]);'.
833
+ '&pxor ($t0,$t2)',
834
+ '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
835
+ '&pxor ($t0,$t1)',
836
+ '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
837
+ '&pxor ($t0,$t2);',
838
+ '&movdqa ($t2,$t3)',
839
+ '&pxor ($t0,$t1);', # sigma0(X[1..4])
840
+ '&psrld ($t3,$sigma1[2])',
841
+ '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
842
+ '&psrlq ($t2,$sigma1[0])',
843
+ '&pxor ($t3,$t2);',
844
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
845
+ '&pxor ($t3,$t2)',
846
+ '&pshufb ($t3,$t4)', # sigma1(X[14..15])
847
+ '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
848
+ '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
849
+ '&movdqa ($t2,$t3);',
850
+ '&psrld ($t3,$sigma1[2])',
851
+ '&psrlq ($t2,$sigma1[0])',
852
+ '&pxor ($t3,$t2);',
853
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
854
+ '&pxor ($t3,$t2);',
855
+ '&movdqa ($t2,16*2*$j."($Tbl)")',
856
+ '&pshufb ($t3,$t5)',
857
+ '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
858
+ );
859
+ }
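Xupdate_256_SSSE3 advances the 16-word message schedule four W[] words per pass using shifts, xors and byte shuffles on xmm registers. As a point of reference, a scalar C sketch of the same schedule step (sigma0, sigma1 and sha256_wt are illustrative names, not part of the generated code):

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned r) { return (x >> r) | (x << (32 - r)); }
static uint32_t sigma0(uint32_t x) { return rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
 * kept in a 16-entry ring buffer indexed mod 16. */
static uint32_t sha256_wt(const uint32_t W[16], unsigned t) {
    return sigma1(W[(t + 14) & 15]) + W[(t + 9) & 15] +
           sigma0(W[(t + 1) & 15])  + W[t & 15];
}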
860
+
861
+ sub SSSE3_256_00_47 () {
862
+ my $j = shift;
863
+ my $body = shift;
864
+ my @X = @_;
865
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
866
+
867
+ if (0) {
868
+ foreach (Xupdate_256_SSSE3()) { # 36 instructions
869
+ eval;
870
+ eval(shift(@insns));
871
+ eval(shift(@insns));
872
+ eval(shift(@insns));
873
+ }
874
+ } else { # squeeze extra 4% on Westmere and 19% on Atom
875
+ eval(shift(@insns)); #@
876
+ &movdqa ($t0,@X[1]);
877
+ eval(shift(@insns));
878
+ eval(shift(@insns));
879
+ &movdqa ($t3,@X[3]);
880
+ eval(shift(@insns)); #@
881
+ eval(shift(@insns));
882
+ eval(shift(@insns));
883
+ eval(shift(@insns)); #@
884
+ eval(shift(@insns));
885
+ &palignr ($t0,@X[0],$SZ); # X[1..4]
886
+ eval(shift(@insns));
887
+ eval(shift(@insns));
888
+ &palignr ($t3,@X[2],$SZ); # X[9..12]
889
+ eval(shift(@insns));
890
+ eval(shift(@insns));
891
+ eval(shift(@insns));
892
+ eval(shift(@insns)); #@
893
+ &movdqa ($t1,$t0);
894
+ eval(shift(@insns));
895
+ eval(shift(@insns));
896
+ &movdqa ($t2,$t0);
897
+ eval(shift(@insns)); #@
898
+ eval(shift(@insns));
899
+ &psrld ($t0,$sigma0[2]);
900
+ eval(shift(@insns));
901
+ eval(shift(@insns));
902
+ eval(shift(@insns));
903
+ &paddd (@X[0],$t3); # X[0..3] += X[9..12]
904
+ eval(shift(@insns)); #@
905
+ eval(shift(@insns));
906
+ &psrld ($t2,$sigma0[0]);
907
+ eval(shift(@insns));
908
+ eval(shift(@insns));
909
+ &pshufd ($t3,@X[3],0b11111010); # X[14..15]
910
+ eval(shift(@insns));
911
+ eval(shift(@insns)); #@
912
+ &pslld ($t1,8*$SZ-$sigma0[1]);
913
+ eval(shift(@insns));
914
+ eval(shift(@insns));
915
+ &pxor ($t0,$t2);
916
+ eval(shift(@insns)); #@
917
+ eval(shift(@insns));
918
+ eval(shift(@insns));
919
+ eval(shift(@insns)); #@
920
+ &psrld ($t2,$sigma0[1]-$sigma0[0]);
921
+ eval(shift(@insns));
922
+ &pxor ($t0,$t1);
923
+ eval(shift(@insns));
924
+ eval(shift(@insns));
925
+ &pslld ($t1,$sigma0[1]-$sigma0[0]);
926
+ eval(shift(@insns));
927
+ eval(shift(@insns));
928
+ &pxor ($t0,$t2);
929
+ eval(shift(@insns));
930
+ eval(shift(@insns)); #@
931
+ &movdqa ($t2,$t3);
932
+ eval(shift(@insns));
933
+ eval(shift(@insns));
934
+ &pxor ($t0,$t1); # sigma0(X[1..4])
935
+ eval(shift(@insns)); #@
936
+ eval(shift(@insns));
937
+ eval(shift(@insns));
938
+ &psrld ($t3,$sigma1[2]);
939
+ eval(shift(@insns));
940
+ eval(shift(@insns));
941
+ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
942
+ eval(shift(@insns)); #@
943
+ eval(shift(@insns));
944
+ &psrlq ($t2,$sigma1[0]);
945
+ eval(shift(@insns));
946
+ eval(shift(@insns));
947
+ eval(shift(@insns));
948
+ &pxor ($t3,$t2);
949
+ eval(shift(@insns)); #@
950
+ eval(shift(@insns));
951
+ eval(shift(@insns));
952
+ eval(shift(@insns)); #@
953
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
954
+ eval(shift(@insns));
955
+ eval(shift(@insns));
956
+ &pxor ($t3,$t2);
957
+ eval(shift(@insns)); #@
958
+ eval(shift(@insns));
959
+ eval(shift(@insns));
960
+ #&pshufb ($t3,$t4); # sigma1(X[14..15])
961
+ &pshufd ($t3,$t3,0b10000000);
962
+ eval(shift(@insns));
963
+ eval(shift(@insns));
964
+ eval(shift(@insns));
965
+ &psrldq ($t3,8);
966
+ eval(shift(@insns));
967
+ eval(shift(@insns)); #@
968
+ eval(shift(@insns));
969
+ eval(shift(@insns));
970
+ eval(shift(@insns)); #@
971
+ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
972
+ eval(shift(@insns));
973
+ eval(shift(@insns));
974
+ eval(shift(@insns));
975
+ &pshufd ($t3,@X[0],0b01010000); # X[16..17]
976
+ eval(shift(@insns));
977
+ eval(shift(@insns)); #@
978
+ eval(shift(@insns));
979
+ &movdqa ($t2,$t3);
980
+ eval(shift(@insns));
981
+ eval(shift(@insns));
982
+ &psrld ($t3,$sigma1[2]);
983
+ eval(shift(@insns));
984
+ eval(shift(@insns)); #@
985
+ &psrlq ($t2,$sigma1[0]);
986
+ eval(shift(@insns));
987
+ eval(shift(@insns));
988
+ &pxor ($t3,$t2);
989
+ eval(shift(@insns)); #@
990
+ eval(shift(@insns));
991
+ eval(shift(@insns));
992
+ eval(shift(@insns)); #@
993
+ eval(shift(@insns));
994
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
995
+ eval(shift(@insns));
996
+ eval(shift(@insns));
997
+ eval(shift(@insns));
998
+ &pxor ($t3,$t2);
999
+ eval(shift(@insns));
1000
+ eval(shift(@insns));
1001
+ eval(shift(@insns)); #@
1002
+ #&pshufb ($t3,$t5);
1003
+ &pshufd ($t3,$t3,0b00001000);
1004
+ eval(shift(@insns));
1005
+ eval(shift(@insns));
1006
+ &movdqa ($t2,16*2*$j."($Tbl)");
1007
+ eval(shift(@insns)); #@
1008
+ eval(shift(@insns));
1009
+ &pslldq ($t3,8);
1010
+ eval(shift(@insns));
1011
+ eval(shift(@insns));
1012
+ eval(shift(@insns));
1013
+ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1014
+ eval(shift(@insns)); #@
1015
+ eval(shift(@insns));
1016
+ eval(shift(@insns));
1017
+ }
1018
+ &paddd ($t2,@X[0]);
1019
+ foreach (@insns) { eval; } # remaining instructions
1020
+ &movdqa (16*$j."(%rsp)",$t2);
1021
+ }
1022
+
1023
+ for ($i=0,$j=0; $j<4; $j++) {
1024
+ &SSSE3_256_00_47($j,\&body_00_15,@X);
1025
+ push(@X,shift(@X)); # rotate(@X)
1026
+ }
1027
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1028
+ &jne (".Lssse3_00_47");
1029
+
1030
+ for ($i=0; $i<16; ) {
1031
+ foreach(body_00_15()) { eval; }
1032
+ }
1033
+ $code.=<<___;
1034
+ mov $_ctx,$ctx
1035
+ mov $a1,$A
1036
+
1037
+ add $SZ*0($ctx),$A
1038
+ lea 16*$SZ($inp),$inp
1039
+ add $SZ*1($ctx),$B
1040
+ add $SZ*2($ctx),$C
1041
+ add $SZ*3($ctx),$D
1042
+ add $SZ*4($ctx),$E
1043
+ add $SZ*5($ctx),$F
1044
+ add $SZ*6($ctx),$G
1045
+ add $SZ*7($ctx),$H
1046
+
1047
+ cmp $_end,$inp
1048
+
1049
+ mov $A,$SZ*0($ctx)
1050
+ mov $B,$SZ*1($ctx)
1051
+ mov $C,$SZ*2($ctx)
1052
+ mov $D,$SZ*3($ctx)
1053
+ mov $E,$SZ*4($ctx)
1054
+ mov $F,$SZ*5($ctx)
1055
+ mov $G,$SZ*6($ctx)
1056
+ mov $H,$SZ*7($ctx)
1057
+ jb .Lloop_ssse3
1058
+
1059
+ mov $_rsp,%rsi
1060
+ ___
1061
+ $code.=<<___ if ($win64);
1062
+ movaps 16*$SZ+32(%rsp),%xmm6
1063
+ movaps 16*$SZ+48(%rsp),%xmm7
1064
+ movaps 16*$SZ+64(%rsp),%xmm8
1065
+ movaps 16*$SZ+80(%rsp),%xmm9
1066
+ ___
1067
+ $code.=<<___;
1068
+ mov (%rsi),%r15
1069
+ mov 8(%rsi),%r14
1070
+ mov 16(%rsi),%r13
1071
+ mov 24(%rsi),%r12
1072
+ mov 32(%rsi),%rbp
1073
+ mov 40(%rsi),%rbx
1074
+ lea 48(%rsi),%rsp
1075
+ .Lepilogue_ssse3:
1076
+ ret
1077
+ .size ${func}_ssse3,.-${func}_ssse3
1078
+ ___
1079
+ }
1080
+
1081
+ if ($avx) {{
1082
+ ######################################################################
1083
+ # XOP code path
1084
+ #
1085
+ if ($SZ==8) { # SHA512 only
1086
+ $code.=<<___;
1087
+ .type ${func}_xop,\@function,3
1088
+ .align 64
1089
+ ${func}_xop:
1090
+ .Lxop_shortcut:
1091
+ push %rbx
1092
+ push %rbp
1093
+ push %r12
1094
+ push %r13
1095
+ push %r14
1096
+ push %r15
1097
+ mov %rsp,%r11 # copy %rsp
1098
+ shl \$4,%rdx # num*16
1099
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1100
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1101
+ and \$-64,%rsp # align stack frame
1102
+ mov $ctx,$_ctx # save ctx, 1st arg
1103
+ mov $inp,$_inp # save inp, 2nd arg
1104
+ mov %rdx,$_end # save end pointer, "3rd" arg
1105
+ mov %r11,$_rsp # save copy of %rsp
1106
+ ___
1107
+ $code.=<<___ if ($win64);
1108
+ movaps %xmm6,16*$SZ+32(%rsp)
1109
+ movaps %xmm7,16*$SZ+48(%rsp)
1110
+ movaps %xmm8,16*$SZ+64(%rsp)
1111
+ movaps %xmm9,16*$SZ+80(%rsp)
1112
+ ___
1113
+ $code.=<<___ if ($win64 && $SZ>4);
1114
+ movaps %xmm10,16*$SZ+96(%rsp)
1115
+ movaps %xmm11,16*$SZ+112(%rsp)
1116
+ ___
1117
+ $code.=<<___;
1118
+ .Lprologue_xop:
1119
+
1120
+ vzeroupper
1121
+ mov $SZ*0($ctx),$A
1122
+ mov $SZ*1($ctx),$B
1123
+ mov $SZ*2($ctx),$C
1124
+ mov $SZ*3($ctx),$D
1125
+ mov $SZ*4($ctx),$E
1126
+ mov $SZ*5($ctx),$F
1127
+ mov $SZ*6($ctx),$G
1128
+ mov $SZ*7($ctx),$H
1129
+ jmp .Lloop_xop
1130
+ ___
1131
+ if ($SZ==4) { # SHA256
1132
+ my @X = map("%xmm$_",(0..3));
1133
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1134
+
1135
+ $code.=<<___;
1136
+ .align 16
1137
+ .Lloop_xop:
1138
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1139
+ vmovdqu 0x00($inp),@X[0]
1140
+ vmovdqu 0x10($inp),@X[1]
1141
+ vmovdqu 0x20($inp),@X[2]
1142
+ vmovdqu 0x30($inp),@X[3]
1143
+ vpshufb $t3,@X[0],@X[0]
1144
+ lea $TABLE(%rip),$Tbl
1145
+ vpshufb $t3,@X[1],@X[1]
1146
+ vpshufb $t3,@X[2],@X[2]
1147
+ vpaddd 0x00($Tbl),@X[0],$t0
1148
+ vpshufb $t3,@X[3],@X[3]
1149
+ vpaddd 0x20($Tbl),@X[1],$t1
1150
+ vpaddd 0x40($Tbl),@X[2],$t2
1151
+ vpaddd 0x60($Tbl),@X[3],$t3
1152
+ vmovdqa $t0,0x00(%rsp)
1153
+ mov $A,$a1
1154
+ vmovdqa $t1,0x10(%rsp)
1155
+ mov $B,$a3
1156
+ vmovdqa $t2,0x20(%rsp)
1157
+ xor $C,$a3 # magic
1158
+ vmovdqa $t3,0x30(%rsp)
1159
+ mov $E,$a0
1160
+ jmp .Lxop_00_47
1161
+
1162
+ .align 16
1163
+ .Lxop_00_47:
1164
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
1165
+ ___
1166
+ sub XOP_256_00_47 () {
1167
+ my $j = shift;
1168
+ my $body = shift;
1169
+ my @X = @_;
1170
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1171
+
1172
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1173
+ eval(shift(@insns));
1174
+ eval(shift(@insns));
1175
+ &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1176
+ eval(shift(@insns));
1177
+ eval(shift(@insns));
1178
+ &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1179
+ eval(shift(@insns));
1180
+ eval(shift(@insns));
1181
+ &vpsrld ($t0,$t0,$sigma0[2]);
1182
+ eval(shift(@insns));
1183
+ eval(shift(@insns));
1184
+ &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1185
+ eval(shift(@insns));
1186
+ eval(shift(@insns));
1187
+ eval(shift(@insns));
1188
+ eval(shift(@insns));
1189
+ &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1190
+ eval(shift(@insns));
1191
+ eval(shift(@insns));
1192
+ &vpxor ($t0,$t0,$t1);
1193
+ eval(shift(@insns));
1194
+ eval(shift(@insns));
1195
+ eval(shift(@insns));
1196
+ eval(shift(@insns));
1197
+ &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1198
+ eval(shift(@insns));
1199
+ eval(shift(@insns));
1200
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1201
+ eval(shift(@insns));
1202
+ eval(shift(@insns));
1203
+ &vpsrld ($t2,@X[3],$sigma1[2]);
1204
+ eval(shift(@insns));
1205
+ eval(shift(@insns));
1206
+ &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1207
+ eval(shift(@insns));
1208
+ eval(shift(@insns));
1209
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1210
+ eval(shift(@insns));
1211
+ eval(shift(@insns));
1212
+ &vpxor ($t3,$t3,$t2);
1213
+ eval(shift(@insns));
1214
+ eval(shift(@insns));
1215
+ eval(shift(@insns));
1216
+ eval(shift(@insns));
1217
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1218
+ eval(shift(@insns));
1219
+ eval(shift(@insns));
1220
+ eval(shift(@insns));
1221
+ eval(shift(@insns));
1222
+ &vpsrldq ($t3,$t3,8);
1223
+ eval(shift(@insns));
1224
+ eval(shift(@insns));
1225
+ eval(shift(@insns));
1226
+ eval(shift(@insns));
1227
+ &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1228
+ eval(shift(@insns));
1229
+ eval(shift(@insns));
1230
+ eval(shift(@insns));
1231
+ eval(shift(@insns));
1232
+ &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1233
+ eval(shift(@insns));
1234
+ eval(shift(@insns));
1235
+ &vpsrld ($t2,@X[0],$sigma1[2]);
1236
+ eval(shift(@insns));
1237
+ eval(shift(@insns));
1238
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1239
+ eval(shift(@insns));
1240
+ eval(shift(@insns));
1241
+ &vpxor ($t3,$t3,$t2);
1242
+ eval(shift(@insns));
1243
+ eval(shift(@insns));
1244
+ eval(shift(@insns));
1245
+ eval(shift(@insns));
1246
+ &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1247
+ eval(shift(@insns));
1248
+ eval(shift(@insns));
1249
+ eval(shift(@insns));
1250
+ eval(shift(@insns));
1251
+ &vpslldq ($t3,$t3,8); # 22 instructions
1252
+ eval(shift(@insns));
1253
+ eval(shift(@insns));
1254
+ eval(shift(@insns));
1255
+ eval(shift(@insns));
1256
+ &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1257
+ eval(shift(@insns));
1258
+ eval(shift(@insns));
1259
+ eval(shift(@insns));
1260
+ eval(shift(@insns));
1261
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1262
+ foreach (@insns) { eval; } # remaining instructions
1263
+ &vmovdqa (16*$j."(%rsp)",$t2);
1264
+ }
1265
+
1266
+ for ($i=0,$j=0; $j<4; $j++) {
1267
+ &XOP_256_00_47($j,\&body_00_15,@X);
1268
+ push(@X,shift(@X)); # rotate(@X)
1269
+ }
1270
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1271
+ &jne (".Lxop_00_47");
1272
+
1273
+ for ($i=0; $i<16; ) {
1274
+ foreach(body_00_15()) { eval; }
1275
+ }
1276
+
1277
+ } else { # SHA512
1278
+ my @X = map("%xmm$_",(0..7));
1279
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1280
+
1281
+ $code.=<<___;
1282
+ .align 16
1283
+ .Lloop_xop:
1284
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1285
+ vmovdqu 0x00($inp),@X[0]
1286
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1287
+ vmovdqu 0x10($inp),@X[1]
1288
+ vmovdqu 0x20($inp),@X[2]
1289
+ vpshufb $t3,@X[0],@X[0]
1290
+ vmovdqu 0x30($inp),@X[3]
1291
+ vpshufb $t3,@X[1],@X[1]
1292
+ vmovdqu 0x40($inp),@X[4]
1293
+ vpshufb $t3,@X[2],@X[2]
1294
+ vmovdqu 0x50($inp),@X[5]
1295
+ vpshufb $t3,@X[3],@X[3]
1296
+ vmovdqu 0x60($inp),@X[6]
1297
+ vpshufb $t3,@X[4],@X[4]
1298
+ vmovdqu 0x70($inp),@X[7]
1299
+ vpshufb $t3,@X[5],@X[5]
1300
+ vpaddq -0x80($Tbl),@X[0],$t0
1301
+ vpshufb $t3,@X[6],@X[6]
1302
+ vpaddq -0x60($Tbl),@X[1],$t1
1303
+ vpshufb $t3,@X[7],@X[7]
1304
+ vpaddq -0x40($Tbl),@X[2],$t2
1305
+ vpaddq -0x20($Tbl),@X[3],$t3
1306
+ vmovdqa $t0,0x00(%rsp)
1307
+ vpaddq 0x00($Tbl),@X[4],$t0
1308
+ vmovdqa $t1,0x10(%rsp)
1309
+ vpaddq 0x20($Tbl),@X[5],$t1
1310
+ vmovdqa $t2,0x20(%rsp)
1311
+ vpaddq 0x40($Tbl),@X[6],$t2
1312
+ vmovdqa $t3,0x30(%rsp)
1313
+ vpaddq 0x60($Tbl),@X[7],$t3
1314
+ vmovdqa $t0,0x40(%rsp)
1315
+ mov $A,$a1
1316
+ vmovdqa $t1,0x50(%rsp)
1317
+ mov $B,$a3
1318
+ vmovdqa $t2,0x60(%rsp)
1319
+ xor $C,$a3 # magic
1320
+ vmovdqa $t3,0x70(%rsp)
1321
+ mov $E,$a0
1322
+ jmp .Lxop_00_47
1323
+
1324
+ .align 16
1325
+ .Lxop_00_47:
1326
+ add \$`16*2*$SZ`,$Tbl
1327
+ ___
1328
+ sub XOP_512_00_47 () {
1329
+ my $j = shift;
1330
+ my $body = shift;
1331
+ my @X = @_;
1332
+ my @insns = (&$body,&$body); # 52 instructions
1333
+
1334
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1335
+ eval(shift(@insns));
1336
+ eval(shift(@insns));
1337
+ &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1338
+ eval(shift(@insns));
1339
+ eval(shift(@insns));
1340
+ &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1341
+ eval(shift(@insns));
1342
+ eval(shift(@insns));
1343
+ &vpsrlq ($t0,$t0,$sigma0[2]);
1344
+ eval(shift(@insns));
1345
+ eval(shift(@insns));
1346
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1347
+ eval(shift(@insns));
1348
+ eval(shift(@insns));
1349
+ eval(shift(@insns));
1350
+ eval(shift(@insns));
1351
+ &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1352
+ eval(shift(@insns));
1353
+ eval(shift(@insns));
1354
+ &vpxor ($t0,$t0,$t1);
1355
+ eval(shift(@insns));
1356
+ eval(shift(@insns));
1357
+ eval(shift(@insns));
1358
+ eval(shift(@insns));
1359
+ &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1360
+ eval(shift(@insns));
1361
+ eval(shift(@insns));
1362
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1363
+ eval(shift(@insns));
1364
+ eval(shift(@insns));
1365
+ &vpsrlq ($t2,@X[7],$sigma1[2]);
1366
+ eval(shift(@insns));
1367
+ eval(shift(@insns));
1368
+ &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1369
+ eval(shift(@insns));
1370
+ eval(shift(@insns));
1371
+ &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1372
+ eval(shift(@insns));
1373
+ eval(shift(@insns));
1374
+ &vpxor ($t3,$t3,$t2);
1375
+ eval(shift(@insns));
1376
+ eval(shift(@insns));
1377
+ eval(shift(@insns));
1378
+ eval(shift(@insns));
1379
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1380
+ eval(shift(@insns));
1381
+ eval(shift(@insns));
1382
+ eval(shift(@insns));
1383
+ eval(shift(@insns));
1384
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1385
+ eval(shift(@insns));
1386
+ eval(shift(@insns));
1387
+ eval(shift(@insns));
1388
+ eval(shift(@insns));
1389
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1390
+ foreach (@insns) { eval; } # remaining instructions
1391
+ &vmovdqa (16*$j."(%rsp)",$t2);
1392
+ }
1393
+
1394
+ for ($i=0,$j=0; $j<8; $j++) {
1395
+ &XOP_512_00_47($j,\&body_00_15,@X);
1396
+ push(@X,shift(@X)); # rotate(@X)
1397
+ }
1398
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1399
+ &jne (".Lxop_00_47");
1400
+
1401
+ for ($i=0; $i<16; ) {
1402
+ foreach(body_00_15()) { eval; }
1403
+ }
1404
+ }
1405
+ $code.=<<___;
1406
+ mov $_ctx,$ctx
1407
+ mov $a1,$A
1408
+
1409
+ add $SZ*0($ctx),$A
1410
+ lea 16*$SZ($inp),$inp
1411
+ add $SZ*1($ctx),$B
1412
+ add $SZ*2($ctx),$C
1413
+ add $SZ*3($ctx),$D
1414
+ add $SZ*4($ctx),$E
1415
+ add $SZ*5($ctx),$F
1416
+ add $SZ*6($ctx),$G
1417
+ add $SZ*7($ctx),$H
1418
+
1419
+ cmp $_end,$inp
1420
+
1421
+ mov $A,$SZ*0($ctx)
1422
+ mov $B,$SZ*1($ctx)
1423
+ mov $C,$SZ*2($ctx)
1424
+ mov $D,$SZ*3($ctx)
1425
+ mov $E,$SZ*4($ctx)
1426
+ mov $F,$SZ*5($ctx)
1427
+ mov $G,$SZ*6($ctx)
1428
+ mov $H,$SZ*7($ctx)
1429
+ jb .Lloop_xop
1430
+
1431
+ mov $_rsp,%rsi
1432
+ vzeroupper
1433
+ ___
1434
+ $code.=<<___ if ($win64);
1435
+ movaps 16*$SZ+32(%rsp),%xmm6
1436
+ movaps 16*$SZ+48(%rsp),%xmm7
1437
+ movaps 16*$SZ+64(%rsp),%xmm8
1438
+ movaps 16*$SZ+80(%rsp),%xmm9
1439
+ ___
1440
+ $code.=<<___ if ($win64 && $SZ>4);
1441
+ movaps 16*$SZ+96(%rsp),%xmm10
1442
+ movaps 16*$SZ+112(%rsp),%xmm11
1443
+ ___
1444
+ $code.=<<___;
1445
+ mov (%rsi),%r15
1446
+ mov 8(%rsi),%r14
1447
+ mov 16(%rsi),%r13
1448
+ mov 24(%rsi),%r12
1449
+ mov 32(%rsi),%rbp
1450
+ mov 40(%rsi),%rbx
1451
+ lea 48(%rsi),%rsp
1452
+ .Lepilogue_xop:
1453
+ ret
1454
+ .size ${func}_xop,.-${func}_xop
1455
+ ___
1456
+ }
1457
+ ######################################################################
1458
+ # AVX+shrd code path
1459
+ #
1460
+ local *ror = sub { &shrd(@_[0],@_) };
1461
+
1462
+ $code.=<<___;
1463
+ .type ${func}_avx,\@function,3
1464
+ .align 64
1465
+ ${func}_avx:
1466
+ .Lavx_shortcut:
1467
+ push %rbx
1468
+ push %rbp
1469
+ push %r12
1470
+ push %r13
1471
+ push %r14
1472
+ push %r15
1473
+ mov %rsp,%r11 # copy %rsp
1474
+ shl \$4,%rdx # num*16
1475
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1476
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1477
+ and \$-64,%rsp # align stack frame
1478
+ mov $ctx,$_ctx # save ctx, 1st arg
1479
+ mov $inp,$_inp # save inp, 2nd arg
1480
+ mov %rdx,$_end # save end pointer, "3rd" arg
1481
+ mov %r11,$_rsp # save copy of %rsp
1482
+ ___
1483
+ $code.=<<___ if ($win64);
1484
+ movaps %xmm6,16*$SZ+32(%rsp)
1485
+ movaps %xmm7,16*$SZ+48(%rsp)
1486
+ movaps %xmm8,16*$SZ+64(%rsp)
1487
+ movaps %xmm9,16*$SZ+80(%rsp)
1488
+ ___
1489
+ $code.=<<___ if ($win64 && $SZ>4);
1490
+ movaps %xmm10,16*$SZ+96(%rsp)
1491
+ movaps %xmm11,16*$SZ+112(%rsp)
1492
+ ___
1493
+ $code.=<<___;
1494
+ .Lprologue_avx:
1495
+
1496
+ vzeroupper
1497
+ mov $SZ*0($ctx),$A
1498
+ mov $SZ*1($ctx),$B
1499
+ mov $SZ*2($ctx),$C
1500
+ mov $SZ*3($ctx),$D
1501
+ mov $SZ*4($ctx),$E
1502
+ mov $SZ*5($ctx),$F
1503
+ mov $SZ*6($ctx),$G
1504
+ mov $SZ*7($ctx),$H
1505
+ ___
1506
+ if ($SZ==4) { # SHA256
1507
+ my @X = map("%xmm$_",(0..3));
1508
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1509
+
1510
+ $code.=<<___;
1511
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1512
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1513
+ jmp .Lloop_avx
1514
+ .align 16
1515
+ .Lloop_avx:
1516
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1517
+ vmovdqu 0x00($inp),@X[0]
1518
+ vmovdqu 0x10($inp),@X[1]
1519
+ vmovdqu 0x20($inp),@X[2]
1520
+ vmovdqu 0x30($inp),@X[3]
1521
+ vpshufb $t3,@X[0],@X[0]
1522
+ lea $TABLE(%rip),$Tbl
1523
+ vpshufb $t3,@X[1],@X[1]
1524
+ vpshufb $t3,@X[2],@X[2]
1525
+ vpaddd 0x00($Tbl),@X[0],$t0
1526
+ vpshufb $t3,@X[3],@X[3]
1527
+ vpaddd 0x20($Tbl),@X[1],$t1
1528
+ vpaddd 0x40($Tbl),@X[2],$t2
1529
+ vpaddd 0x60($Tbl),@X[3],$t3
1530
+ vmovdqa $t0,0x00(%rsp)
1531
+ mov $A,$a1
1532
+ vmovdqa $t1,0x10(%rsp)
1533
+ mov $B,$a3
1534
+ vmovdqa $t2,0x20(%rsp)
1535
+ xor $C,$a3 # magic
1536
+ vmovdqa $t3,0x30(%rsp)
1537
+ mov $E,$a0
1538
+ jmp .Lavx_00_47
1539
+
1540
+ .align 16
1541
+ .Lavx_00_47:
1542
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
1543
+ ___
1544
+ sub Xupdate_256_AVX () {
1545
+ (
1546
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1547
+ '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1548
+ '&vpsrld ($t2,$t0,$sigma0[0]);',
1549
+ '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1550
+ '&vpsrld ($t3,$t0,$sigma0[2])',
1551
+ '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1552
+ '&vpxor ($t0,$t3,$t2)',
1553
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1554
+ '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1555
+ '&vpxor ($t0,$t0,$t1)',
1556
+ '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1557
+ '&vpxor ($t0,$t0,$t2)',
1558
+ '&vpsrld ($t2,$t3,$sigma1[2]);',
1559
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1560
+ '&vpsrlq ($t3,$t3,$sigma1[0]);',
1561
+ '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1562
+ '&vpxor ($t2,$t2,$t3);',
1563
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1564
+ '&vpxor ($t2,$t2,$t3)',
1565
+ '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1566
+ '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1567
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1568
+ '&vpsrld ($t2,$t3,$sigma1[2])',
1569
+ '&vpsrlq ($t3,$t3,$sigma1[0])',
1570
+ '&vpxor ($t2,$t2,$t3);',
1571
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1572
+ '&vpxor ($t2,$t2,$t3)',
1573
+ '&vpshufb ($t2,$t2,$t5)',
1574
+ '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1575
+ );
1576
+ }
1577
+
1578
+ sub AVX_256_00_47 () {
1579
+ my $j = shift;
1580
+ my $body = shift;
1581
+ my @X = @_;
1582
+ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1583
+
1584
+ foreach (Xupdate_256_AVX()) { # 29 instructions
1585
+ eval;
1586
+ eval(shift(@insns));
1587
+ eval(shift(@insns));
1588
+ eval(shift(@insns));
1589
+ }
1590
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1591
+ foreach (@insns) { eval; } # remaining instructions
1592
+ &vmovdqa (16*$j."(%rsp)",$t2);
1593
+ }
1594
+
1595
+ for ($i=0,$j=0; $j<4; $j++) {
1596
+ &AVX_256_00_47($j,\&body_00_15,@X);
1597
+ push(@X,shift(@X)); # rotate(@X)
1598
+ }
1599
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1600
+ &jne (".Lavx_00_47");
1601
+
1602
+ for ($i=0; $i<16; ) {
1603
+ foreach(body_00_15()) { eval; }
1604
+ }
1605
+
1606
+ } else { # SHA512
1607
+ my @X = map("%xmm$_",(0..7));
1608
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1609
+
1610
+ $code.=<<___;
1611
+ jmp .Lloop_avx
1612
+ .align 16
1613
+ .Lloop_avx:
1614
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1615
+ vmovdqu 0x00($inp),@X[0]
1616
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1617
+ vmovdqu 0x10($inp),@X[1]
1618
+ vmovdqu 0x20($inp),@X[2]
1619
+ vpshufb $t3,@X[0],@X[0]
1620
+ vmovdqu 0x30($inp),@X[3]
1621
+ vpshufb $t3,@X[1],@X[1]
1622
+ vmovdqu 0x40($inp),@X[4]
1623
+ vpshufb $t3,@X[2],@X[2]
1624
+ vmovdqu 0x50($inp),@X[5]
1625
+ vpshufb $t3,@X[3],@X[3]
1626
+ vmovdqu 0x60($inp),@X[6]
1627
+ vpshufb $t3,@X[4],@X[4]
1628
+ vmovdqu 0x70($inp),@X[7]
1629
+ vpshufb $t3,@X[5],@X[5]
1630
+ vpaddq -0x80($Tbl),@X[0],$t0
1631
+ vpshufb $t3,@X[6],@X[6]
1632
+ vpaddq -0x60($Tbl),@X[1],$t1
1633
+ vpshufb $t3,@X[7],@X[7]
1634
+ vpaddq -0x40($Tbl),@X[2],$t2
1635
+ vpaddq -0x20($Tbl),@X[3],$t3
1636
+ vmovdqa $t0,0x00(%rsp)
1637
+ vpaddq 0x00($Tbl),@X[4],$t0
1638
+ vmovdqa $t1,0x10(%rsp)
1639
+ vpaddq 0x20($Tbl),@X[5],$t1
1640
+ vmovdqa $t2,0x20(%rsp)
1641
+ vpaddq 0x40($Tbl),@X[6],$t2
1642
+ vmovdqa $t3,0x30(%rsp)
1643
+ vpaddq 0x60($Tbl),@X[7],$t3
1644
+ vmovdqa $t0,0x40(%rsp)
1645
+ mov $A,$a1
1646
+ vmovdqa $t1,0x50(%rsp)
1647
+ mov $B,$a3
1648
+ vmovdqa $t2,0x60(%rsp)
1649
+ xor $C,$a3 # magic
1650
+ vmovdqa $t3,0x70(%rsp)
1651
+ mov $E,$a0
1652
+ jmp .Lavx_00_47
1653
+
1654
+ .align 16
1655
+ .Lavx_00_47:
1656
+ add \$`16*2*$SZ`,$Tbl
1657
+ ___
1658
+ sub Xupdate_512_AVX () {
1659
+ (
1660
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1661
+ '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1662
+ '&vpsrlq ($t2,$t0,$sigma0[0])',
1663
+ '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1664
+ '&vpsrlq ($t3,$t0,$sigma0[2])',
1665
+ '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1666
+ '&vpxor ($t0,$t3,$t2)',
1667
+ '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1668
+ '&vpxor ($t0,$t0,$t1)',
1669
+ '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1670
+ '&vpxor ($t0,$t0,$t2)',
1671
+ '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1672
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1673
+ '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1674
+ '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1675
+ '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1676
+ '&vpxor ($t3,$t3,$t2)',
1677
+ '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1678
+ '&vpxor ($t3,$t3,$t1)',
1679
+ '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1680
+ '&vpxor ($t3,$t3,$t2)',
1681
+ '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1682
+ '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1683
+ );
1684
+ }
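Xupdate_512_AVX is the 64-bit counterpart, producing two W[] words per iteration with the SHA-512 rotation counts. A scalar C sketch of the step it vectorizes (illustrative names only):

#include <stdint.h>

static uint64_t rotr64(uint64_t x, unsigned r) { return (x >> r) | (x << (64 - r)); }
static uint64_t s0_512(uint64_t x) { return rotr64(x, 1)  ^ rotr64(x, 8)  ^ (x >> 7); }
static uint64_t s1_512(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], ring buffer mod 16 */
static uint64_t sha512_wt(const uint64_t W[16], unsigned t) {
    return s1_512(W[(t + 14) & 15]) + W[(t + 9) & 15] +
           s0_512(W[(t + 1) & 15])  + W[t & 15];
}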
1685
+
1686
+ sub AVX_512_00_47 () {
1687
+ my $j = shift;
1688
+ my $body = shift;
1689
+ my @X = @_;
1690
+ my @insns = (&$body,&$body); # 52 instructions
1691
+
1692
+ foreach (Xupdate_512_AVX()) { # 23 instructions
1693
+ eval;
1694
+ eval(shift(@insns));
1695
+ eval(shift(@insns));
1696
+ }
1697
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1698
+ foreach (@insns) { eval; } # remaining instructions
1699
+ &vmovdqa (16*$j."(%rsp)",$t2);
1700
+ }
1701
+
1702
+ for ($i=0,$j=0; $j<8; $j++) {
1703
+ &AVX_512_00_47($j,\&body_00_15,@X);
1704
+ push(@X,shift(@X)); # rotate(@X)
1705
+ }
1706
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1707
+ &jne (".Lavx_00_47");
1708
+
1709
+ for ($i=0; $i<16; ) {
1710
+ foreach(body_00_15()) { eval; }
1711
+ }
1712
+ }
1713
+ $code.=<<___;
1714
+ mov $_ctx,$ctx
1715
+ mov $a1,$A
1716
+
1717
+ add $SZ*0($ctx),$A
1718
+ lea 16*$SZ($inp),$inp
1719
+ add $SZ*1($ctx),$B
1720
+ add $SZ*2($ctx),$C
1721
+ add $SZ*3($ctx),$D
1722
+ add $SZ*4($ctx),$E
1723
+ add $SZ*5($ctx),$F
1724
+ add $SZ*6($ctx),$G
1725
+ add $SZ*7($ctx),$H
1726
+
1727
+ cmp $_end,$inp
1728
+
1729
+ mov $A,$SZ*0($ctx)
1730
+ mov $B,$SZ*1($ctx)
1731
+ mov $C,$SZ*2($ctx)
1732
+ mov $D,$SZ*3($ctx)
1733
+ mov $E,$SZ*4($ctx)
1734
+ mov $F,$SZ*5($ctx)
1735
+ mov $G,$SZ*6($ctx)
1736
+ mov $H,$SZ*7($ctx)
1737
+ jb .Lloop_avx
1738
+
1739
+ mov $_rsp,%rsi
1740
+ vzeroupper
1741
+ ___
1742
+ $code.=<<___ if ($win64);
1743
+ movaps 16*$SZ+32(%rsp),%xmm6
1744
+ movaps 16*$SZ+48(%rsp),%xmm7
1745
+ movaps 16*$SZ+64(%rsp),%xmm8
1746
+ movaps 16*$SZ+80(%rsp),%xmm9
1747
+ ___
1748
+ $code.=<<___ if ($win64 && $SZ>4);
1749
+ movaps 16*$SZ+96(%rsp),%xmm10
1750
+ movaps 16*$SZ+112(%rsp),%xmm11
1751
+ ___
1752
+ $code.=<<___;
1753
+ mov (%rsi),%r15
1754
+ mov 8(%rsi),%r14
1755
+ mov 16(%rsi),%r13
1756
+ mov 24(%rsi),%r12
1757
+ mov 32(%rsi),%rbp
1758
+ mov 40(%rsi),%rbx
1759
+ lea 48(%rsi),%rsp
1760
+ .Lepilogue_avx:
1761
+ ret
1762
+ .size ${func}_avx,.-${func}_avx
1763
+ ___
1764
+
1765
+ if ($avx>1) {{
1766
+ ######################################################################
1767
+ # AVX2+BMI code path
1768
+ #
1769
+ my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1770
+ my $PUSH8=8*2*$SZ;
1771
+ use integer;
1772
+
1773
+ sub bodyx_00_15 () {
1774
+ # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1775
+ (
1776
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1777
+
1778
+ '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1779
+ '&and ($a4,$e)', # f&e
1780
+ '&rorx ($a0,$e,$Sigma1[2])',
1781
+ '&rorx ($a2,$e,$Sigma1[1])',
1782
+
1783
+ '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1784
+ '&lea ($h,"($h,$a4)")',
1785
+ '&andn ($a4,$e,$g)', # ~e&g
1786
+ '&xor ($a0,$a2)',
1787
+
1788
+ '&rorx ($a1,$e,$Sigma1[0])',
1789
+ '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1790
+ '&xor ($a0,$a1)', # Sigma1(e)
1791
+ '&mov ($a2,$a)',
1792
+
1793
+ '&rorx ($a4,$a,$Sigma0[2])',
1794
+ '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1795
+ '&xor ($a2,$b)', # a^b, b^c in next round
1796
+ '&rorx ($a1,$a,$Sigma0[1])',
1797
+
1798
+ '&rorx ($a0,$a,$Sigma0[0])',
1799
+ '&lea ($d,"($d,$h)")', # d+=h
1800
+ '&and ($a3,$a2)', # (b^c)&(a^b)
1801
+ '&xor ($a1,$a4)',
1802
+
1803
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1804
+ '&xor ($a1,$a0)', # Sigma0(a)
1805
+ '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1806
+ '&mov ($a4,$e)', # copy of f in future
1807
+
1808
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1809
+ );
1810
+ # and at the finish one has to $a+=$a1
1811
+ }
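Two points in bodyx_00_15 deserve a note: Sigma0(a) is kept in $a1 and only folded into $a at the start of the following round (hence the closing remark that $a+=$a1 is still owed at the end), and with BMI's andn available Ch(e,f,g) is accumulated as (e&f)+(~e&g) via two lea instructions, which is safe because the two terms can never have a set bit in common. A small self-checking C sketch of that equivalence (hypothetical test values):

#include <stdint.h>
#include <assert.h>

static uint32_t ch_xor(uint32_t e, uint32_t f, uint32_t g) { return (e & f) ^ (~e & g); }
static uint32_t ch_add(uint32_t e, uint32_t f, uint32_t g) { return (e & f) + (~e & g); }

int main(void) {
    uint32_t x = 0x510e527f;                              /* arbitrary seed */
    for (int i = 0; i < 1000; i++) {
        uint32_t e = x * 2654435761u, f = x ^ 0x9b05688c, g = ~x;
        assert(ch_xor(e, f, g) == ch_add(e, f, g));       /* terms are bitwise disjoint */
        x = x * 1103515245u + 12345u;
    }
    return 0;
}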
1812
+
1813
+ $code.=<<___;
1814
+ .type ${func}_avx2,\@function,3
1815
+ .align 64
1816
+ ${func}_avx2:
1817
+ .Lavx2_shortcut:
1818
+ push %rbx
1819
+ push %rbp
1820
+ push %r12
1821
+ push %r13
1822
+ push %r14
1823
+ push %r15
1824
+ mov %rsp,%r11 # copy %rsp
1825
+ sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1826
+ shl \$4,%rdx # num*16
1827
+ and \$-256*$SZ,%rsp # align stack frame
1828
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1829
+ add \$`2*$SZ*($rounds-8)`,%rsp
1830
+ mov $ctx,$_ctx # save ctx, 1st arg
1831
+ mov $inp,$_inp # save inp, 2nd arg
1832
+ mov %rdx,$_end # save end pointer, "3rd" arg
1833
+ mov %r11,$_rsp # save copy of %rsp
1834
+ ___
1835
+ $code.=<<___ if ($win64);
1836
+ movaps %xmm6,16*$SZ+32(%rsp)
1837
+ movaps %xmm7,16*$SZ+48(%rsp)
1838
+ movaps %xmm8,16*$SZ+64(%rsp)
1839
+ movaps %xmm9,16*$SZ+80(%rsp)
1840
+ ___
1841
+ $code.=<<___ if ($win64 && $SZ>4);
1842
+ movaps %xmm10,16*$SZ+96(%rsp)
1843
+ movaps %xmm11,16*$SZ+112(%rsp)
1844
+ ___
1845
+ $code.=<<___;
1846
+ .Lprologue_avx2:
1847
+
1848
+ vzeroupper
1849
+ sub \$-16*$SZ,$inp # inp++, size optimization
1850
+ mov $SZ*0($ctx),$A
1851
+ mov $inp,%r12 # borrow $T1
1852
+ mov $SZ*1($ctx),$B
1853
+ cmp %rdx,$inp # $_end
1854
+ mov $SZ*2($ctx),$C
1855
+ cmove %rsp,%r12 # next block or random data
1856
+ mov $SZ*3($ctx),$D
1857
+ mov $SZ*4($ctx),$E
1858
+ mov $SZ*5($ctx),$F
1859
+ mov $SZ*6($ctx),$G
1860
+ mov $SZ*7($ctx),$H
1861
+ ___
1862
+ if ($SZ==4) { # SHA256
1863
+ my @X = map("%ymm$_",(0..3));
1864
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1865
+
1866
+ $code.=<<___;
1867
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1868
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1869
+ jmp .Loop_avx2
1870
+ .align 16
1871
+ .Loop_avx2:
1872
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1873
+ vmovdqu -16*$SZ+0($inp),%xmm0
1874
+ vmovdqu -16*$SZ+16($inp),%xmm1
1875
+ vmovdqu -16*$SZ+32($inp),%xmm2
1876
+ vmovdqu -16*$SZ+48($inp),%xmm3
1877
+ #mov $inp,$_inp # offload $inp
1878
+ vinserti128 \$1,(%r12),@X[0],@X[0]
1879
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
1880
+ vpshufb $t3,@X[0],@X[0]
1881
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
1882
+ vpshufb $t3,@X[1],@X[1]
1883
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
1884
+
1885
+ lea $TABLE(%rip),$Tbl
1886
+ vpshufb $t3,@X[2],@X[2]
1887
+ vpaddd 0x00($Tbl),@X[0],$t0
1888
+ vpshufb $t3,@X[3],@X[3]
1889
+ vpaddd 0x20($Tbl),@X[1],$t1
1890
+ vpaddd 0x40($Tbl),@X[2],$t2
1891
+ vpaddd 0x60($Tbl),@X[3],$t3
1892
+ vmovdqa $t0,0x00(%rsp)
1893
+ xor $a1,$a1
1894
+ vmovdqa $t1,0x20(%rsp)
1895
+ lea -$PUSH8(%rsp),%rsp
1896
+ mov $B,$a3
1897
+ vmovdqa $t2,0x00(%rsp)
1898
+ xor $C,$a3 # magic
1899
+ vmovdqa $t3,0x20(%rsp)
1900
+ mov $F,$a4
1901
+ sub \$-16*2*$SZ,$Tbl # size optimization
1902
+ jmp .Lavx2_00_47
1903
+
1904
+ .align 16
1905
+ .Lavx2_00_47:
1906
+ ___
1907
+
1908
+ sub AVX2_256_00_47 () {
1909
+ my $j = shift;
1910
+ my $body = shift;
1911
+ my @X = @_;
1912
+ my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1913
+ my $base = "+2*$PUSH8(%rsp)";
1914
+
1915
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1916
+ foreach (Xupdate_256_AVX()) { # 29 instructions
1917
+ eval;
1918
+ eval(shift(@insns));
1919
+ eval(shift(@insns));
1920
+ eval(shift(@insns));
1921
+ }
1922
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1923
+ foreach (@insns) { eval; } # remaining instructions
1924
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1925
+ }
1926
+
1927
+ for ($i=0,$j=0; $j<4; $j++) {
1928
+ &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1929
+ push(@X,shift(@X)); # rotate(@X)
1930
+ }
1931
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
1932
+ &cmpb (($SZ-1)."($Tbl)",0);
1933
+ &jne (".Lavx2_00_47");
1934
+
1935
+ for ($i=0; $i<16; ) {
1936
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1937
+ foreach(bodyx_00_15()) { eval; }
1938
+ }
1939
+ } else { # SHA512
1940
+ my @X = map("%ymm$_",(0..7));
1941
+ my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1942
+
1943
+ $code.=<<___;
1944
+ jmp .Loop_avx2
1945
+ .align 16
1946
+ .Loop_avx2:
1947
+ vmovdqu -16*$SZ($inp),%xmm0
1948
+ vmovdqu -16*$SZ+16($inp),%xmm1
1949
+ vmovdqu -16*$SZ+32($inp),%xmm2
1950
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
1951
+ vmovdqu -16*$SZ+48($inp),%xmm3
1952
+ vmovdqu -16*$SZ+64($inp),%xmm4
1953
+ vmovdqu -16*$SZ+80($inp),%xmm5
1954
+ vmovdqu -16*$SZ+96($inp),%xmm6
1955
+ vmovdqu -16*$SZ+112($inp),%xmm7
1956
+ #mov $inp,$_inp # offload $inp
1957
+ vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1958
+ vinserti128 \$1,(%r12),@X[0],@X[0]
1959
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
1960
+ vpshufb $t2,@X[0],@X[0]
1961
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
1962
+ vpshufb $t2,@X[1],@X[1]
1963
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
1964
+ vpshufb $t2,@X[2],@X[2]
1965
+ vinserti128 \$1,64(%r12),@X[4],@X[4]
1966
+ vpshufb $t2,@X[3],@X[3]
1967
+ vinserti128 \$1,80(%r12),@X[5],@X[5]
1968
+ vpshufb $t2,@X[4],@X[4]
1969
+ vinserti128 \$1,96(%r12),@X[6],@X[6]
1970
+ vpshufb $t2,@X[5],@X[5]
1971
+ vinserti128 \$1,112(%r12),@X[7],@X[7]
1972
+
1973
+ vpaddq -0x80($Tbl),@X[0],$t0
1974
+ vpshufb $t2,@X[6],@X[6]
1975
+ vpaddq -0x60($Tbl),@X[1],$t1
1976
+ vpshufb $t2,@X[7],@X[7]
1977
+ vpaddq -0x40($Tbl),@X[2],$t2
1978
+ vpaddq -0x20($Tbl),@X[3],$t3
1979
+ vmovdqa $t0,0x00(%rsp)
1980
+ vpaddq 0x00($Tbl),@X[4],$t0
1981
+ vmovdqa $t1,0x20(%rsp)
1982
+ vpaddq 0x20($Tbl),@X[5],$t1
1983
+ vmovdqa $t2,0x40(%rsp)
1984
+ vpaddq 0x40($Tbl),@X[6],$t2
1985
+ vmovdqa $t3,0x60(%rsp)
1986
+ lea -$PUSH8(%rsp),%rsp
1987
+ vpaddq 0x60($Tbl),@X[7],$t3
1988
+ vmovdqa $t0,0x00(%rsp)
1989
+ xor $a1,$a1
1990
+ vmovdqa $t1,0x20(%rsp)
1991
+ mov $B,$a3
1992
+ vmovdqa $t2,0x40(%rsp)
1993
+ xor $C,$a3 # magic
1994
+ vmovdqa $t3,0x60(%rsp)
1995
+ mov $F,$a4
1996
+ add \$16*2*$SZ,$Tbl
1997
+ jmp .Lavx2_00_47
1998
+
1999
+ .align 16
2000
+ .Lavx2_00_47:
2001
+ ___
2002
+
2003
+ sub AVX2_512_00_47 () {
2004
+ my $j = shift;
2005
+ my $body = shift;
2006
+ my @X = @_;
2007
+ my @insns = (&$body,&$body); # 48 instructions
2008
+ my $base = "+2*$PUSH8(%rsp)";
2009
+
2010
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
2011
+ foreach (Xupdate_512_AVX()) { # 23 instructions
2012
+ eval;
2013
+ if ($_ !~ /\;$/) {
2014
+ eval(shift(@insns));
2015
+ eval(shift(@insns));
2016
+ eval(shift(@insns));
2017
+ }
2018
+ }
2019
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2020
+ foreach (@insns) { eval; } # remaining instructions
2021
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2022
+ }
2023
+
2024
+ for ($i=0,$j=0; $j<8; $j++) {
2025
+ &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2026
+ push(@X,shift(@X)); # rotate(@X)
2027
+ }
2028
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
2029
+ &cmpb (($SZ-1-0x80)."($Tbl)",0);
2030
+ &jne (".Lavx2_00_47");
2031
+
2032
+ for ($i=0; $i<16; ) {
2033
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2034
+ foreach(bodyx_00_15()) { eval; }
2035
+ }
2036
+ }
2037
+ $code.=<<___;
2038
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2039
+ add $a1,$A
2040
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2041
+ lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2042
+
2043
+ add $SZ*0($ctx),$A
2044
+ add $SZ*1($ctx),$B
2045
+ add $SZ*2($ctx),$C
2046
+ add $SZ*3($ctx),$D
2047
+ add $SZ*4($ctx),$E
2048
+ add $SZ*5($ctx),$F
2049
+ add $SZ*6($ctx),$G
2050
+ add $SZ*7($ctx),$H
2051
+
2052
+ mov $A,$SZ*0($ctx)
2053
+ mov $B,$SZ*1($ctx)
2054
+ mov $C,$SZ*2($ctx)
2055
+ mov $D,$SZ*3($ctx)
2056
+ mov $E,$SZ*4($ctx)
2057
+ mov $F,$SZ*5($ctx)
2058
+ mov $G,$SZ*6($ctx)
2059
+ mov $H,$SZ*7($ctx)
2060
+
2061
+ cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2062
+ je .Ldone_avx2
2063
+
2064
+ xor $a1,$a1
2065
+ mov $B,$a3
2066
+ xor $C,$a3 # magic
2067
+ mov $F,$a4
2068
+ jmp .Lower_avx2
2069
+ .align 16
2070
+ .Lower_avx2:
2071
+ ___
2072
+ for ($i=0; $i<8; ) {
2073
+ my $base="+16($Tbl)";
2074
+ foreach(bodyx_00_15()) { eval; }
2075
+ }
2076
+ $code.=<<___;
2077
+ lea -$PUSH8($Tbl),$Tbl
2078
+ cmp %rsp,$Tbl
2079
+ jae .Lower_avx2
2080
+
2081
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2082
+ add $a1,$A
2083
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2084
+ lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2085
+
2086
+ add $SZ*0($ctx),$A
2087
+ add $SZ*1($ctx),$B
2088
+ add $SZ*2($ctx),$C
2089
+ add $SZ*3($ctx),$D
2090
+ add $SZ*4($ctx),$E
2091
+ add $SZ*5($ctx),$F
2092
+ lea `2*16*$SZ`($inp),$inp # inp+=2
2093
+ add $SZ*6($ctx),$G
2094
+ mov $inp,%r12
2095
+ add $SZ*7($ctx),$H
2096
+ cmp $_end,$inp
2097
+
2098
+ mov $A,$SZ*0($ctx)
2099
+ cmove %rsp,%r12 # next block or stale data
2100
+ mov $B,$SZ*1($ctx)
2101
+ mov $C,$SZ*2($ctx)
2102
+ mov $D,$SZ*3($ctx)
2103
+ mov $E,$SZ*4($ctx)
2104
+ mov $F,$SZ*5($ctx)
2105
+ mov $G,$SZ*6($ctx)
2106
+ mov $H,$SZ*7($ctx)
2107
+
2108
+ jbe .Loop_avx2
2109
+ lea (%rsp),$Tbl
2110
+
2111
+ .Ldone_avx2:
2112
+ lea ($Tbl),%rsp
2113
+ mov $_rsp,%rsi
2114
+ vzeroupper
2115
+ ___
2116
+ $code.=<<___ if ($win64);
2117
+ movaps 16*$SZ+32(%rsp),%xmm6
2118
+ movaps 16*$SZ+48(%rsp),%xmm7
2119
+ movaps 16*$SZ+64(%rsp),%xmm8
2120
+ movaps 16*$SZ+80(%rsp),%xmm9
2121
+ ___
2122
+ $code.=<<___ if ($win64 && $SZ>4);
2123
+ movaps 16*$SZ+96(%rsp),%xmm10
2124
+ movaps 16*$SZ+112(%rsp),%xmm11
2125
+ ___
2126
+ $code.=<<___;
2127
+ mov (%rsi),%r15
2128
+ mov 8(%rsi),%r14
2129
+ mov 16(%rsi),%r13
2130
+ mov 24(%rsi),%r12
2131
+ mov 32(%rsi),%rbp
2132
+ mov 40(%rsi),%rbx
2133
+ lea 48(%rsi),%rsp
2134
+ .Lepilogue_avx2:
2135
+ ret
2136
+ .size ${func}_avx2,.-${func}_avx2
2137
+ ___
2138
+ }}
2139
+ }}}}}
2140
+
2141
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2142
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2143
+ if ($win64) {
2144
+ $rec="%rcx";
2145
+ $frame="%rdx";
2146
+ $context="%r8";
2147
+ $disp="%r9";
2148
+
2149
+ $code.=<<___;
2150
+ .extern __imp_RtlVirtualUnwind
2151
+ .type se_handler,\@abi-omnipotent
2152
+ .align 16
2153
+ se_handler:
2154
+ push %rsi
2155
+ push %rdi
2156
+ push %rbx
2157
+ push %rbp
2158
+ push %r12
2159
+ push %r13
2160
+ push %r14
2161
+ push %r15
2162
+ pushfq
2163
+ sub \$64,%rsp
2164
+
2165
+ mov 120($context),%rax # pull context->Rax
2166
+ mov 248($context),%rbx # pull context->Rip
2167
+
2168
+ mov 8($disp),%rsi # disp->ImageBase
2169
+ mov 56($disp),%r11 # disp->HandlerData
2170
+
2171
+ mov 0(%r11),%r10d # HandlerData[0]
2172
+ lea (%rsi,%r10),%r10 # prologue label
2173
+ cmp %r10,%rbx # context->Rip<prologue label
2174
+ jb .Lin_prologue
2175
+
2176
+ mov 152($context),%rax # pull context->Rsp
2177
+
2178
+ mov 4(%r11),%r10d # HandlerData[1]
2179
+ lea (%rsi,%r10),%r10 # epilogue label
2180
+ cmp %r10,%rbx # context->Rip>=epilogue label
2181
+ jae .Lin_prologue
2182
+ ___
2183
+ $code.=<<___ if ($avx>1);
2184
+ lea .Lavx2_shortcut(%rip),%r10
2185
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
2186
+ jb .Lnot_in_avx2
2187
+
2188
+ and \$-256*$SZ,%rax
2189
+ add \$`2*$SZ*($rounds-8)`,%rax
2190
+ .Lnot_in_avx2:
2191
+ ___
2192
+ $code.=<<___;
2193
+ mov %rax,%rsi # put aside Rsp
2194
+ mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2195
+ lea 48(%rax),%rax
2196
+
2197
+ mov -8(%rax),%rbx
2198
+ mov -16(%rax),%rbp
2199
+ mov -24(%rax),%r12
2200
+ mov -32(%rax),%r13
2201
+ mov -40(%rax),%r14
2202
+ mov -48(%rax),%r15
2203
+ mov %rbx,144($context) # restore context->Rbx
2204
+ mov %rbp,160($context) # restore context->Rbp
2205
+ mov %r12,216($context) # restore context->R12
2206
+ mov %r13,224($context) # restore context->R13
2207
+ mov %r14,232($context) # restore context->R14
2208
+ mov %r15,240($context) # restore context->R15
2209
+
2210
+ lea .Lepilogue(%rip),%r10
2211
+ cmp %r10,%rbx
2212
+ jb .Lin_prologue # non-AVX code
2213
+
2214
+ lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2215
+ lea 512($context),%rdi # &context.Xmm6
2216
+ mov \$`$SZ==4?8:12`,%ecx
2217
+ .long 0xa548f3fc # cld; rep movsq
2218
+
2219
+ .Lin_prologue:
2220
+ mov 8(%rax),%rdi
2221
+ mov 16(%rax),%rsi
2222
+ mov %rax,152($context) # restore context->Rsp
2223
+ mov %rsi,168($context) # restore context->Rsi
2224
+ mov %rdi,176($context) # restore context->Rdi
2225
+
2226
+ mov 40($disp),%rdi # disp->ContextRecord
2227
+ mov $context,%rsi # context
2228
+ mov \$154,%ecx # sizeof(CONTEXT)
2229
+ .long 0xa548f3fc # cld; rep movsq
2230
+
2231
+ mov $disp,%rsi
2232
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2233
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
2234
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
2235
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2236
+ mov 40(%rsi),%r10 # disp->ContextRecord
2237
+ lea 56(%rsi),%r11 # &disp->HandlerData
2238
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
2239
+ mov %r10,32(%rsp) # arg5
2240
+ mov %r11,40(%rsp) # arg6
2241
+ mov %r12,48(%rsp) # arg7
2242
+ mov %rcx,56(%rsp) # arg8, (NULL)
2243
+ call *__imp_RtlVirtualUnwind(%rip)
2244
+
2245
+ mov \$1,%eax # ExceptionContinueSearch
2246
+ add \$64,%rsp
2247
+ popfq
2248
+ pop %r15
2249
+ pop %r14
2250
+ pop %r13
2251
+ pop %r12
2252
+ pop %rbp
2253
+ pop %rbx
2254
+ pop %rdi
2255
+ pop %rsi
2256
+ ret
2257
+ .size se_handler,.-se_handler
2258
+ ___
2259
+
2260
+ $code.=<<___ if ($SZ==4 && $shaext);
2261
+ .type shaext_handler,\@abi-omnipotent
2262
+ .align 16
2263
+ shaext_handler:
2264
+ push %rsi
2265
+ push %rdi
2266
+ push %rbx
2267
+ push %rbp
2268
+ push %r12
2269
+ push %r13
2270
+ push %r14
2271
+ push %r15
2272
+ pushfq
2273
+ sub \$64,%rsp
2274
+
2275
+ mov 120($context),%rax # pull context->Rax
2276
+ mov 248($context),%rbx # pull context->Rip
2277
+
2278
+ lea .Lprologue_shaext(%rip),%r10
2279
+ cmp %r10,%rbx # context->Rip<.Lprologue_shaext
2280
+ jb .Lin_prologue
2281
+
2282
+ lea .Lepilogue_shaext(%rip),%r10
2283
+ cmp %r10,%rbx # context->Rip>=.Lepilogue_shaext
2284
+ jae .Lin_prologue
2285
+
2286
+ lea -8-5*16(%rax),%rsi
2287
+ lea 512($context),%rdi # &context.Xmm6
2288
+ mov \$10,%ecx
2289
+ .long 0xa548f3fc # cld; rep movsq
2290
+
2291
+ jmp .Lin_prologue
2292
+ .size shaext_handler,.-shaext_handler
2293
+ ___
2294
+
2295
+ $code.=<<___;
2296
+ .section .pdata
2297
+ .align 4
2298
+ .rva .LSEH_begin_$func
2299
+ .rva .LSEH_end_$func
2300
+ .rva .LSEH_info_$func
2301
+ ___
2302
+ $code.=<<___ if ($SZ==4 && $shaext);
2303
+ .rva .LSEH_begin_${func}_shaext
2304
+ .rva .LSEH_end_${func}_shaext
2305
+ .rva .LSEH_info_${func}_shaext
2306
+ ___
2307
+ $code.=<<___ if ($SZ==4);
2308
+ .rva .LSEH_begin_${func}_ssse3
2309
+ .rva .LSEH_end_${func}_ssse3
2310
+ .rva .LSEH_info_${func}_ssse3
2311
+ ___
2312
+ $code.=<<___ if ($avx && $SZ==8);
2313
+ .rva .LSEH_begin_${func}_xop
2314
+ .rva .LSEH_end_${func}_xop
2315
+ .rva .LSEH_info_${func}_xop
2316
+ ___
2317
+ $code.=<<___ if ($avx);
2318
+ .rva .LSEH_begin_${func}_avx
2319
+ .rva .LSEH_end_${func}_avx
2320
+ .rva .LSEH_info_${func}_avx
2321
+ ___
2322
+ $code.=<<___ if ($avx>1);
2323
+ .rva .LSEH_begin_${func}_avx2
2324
+ .rva .LSEH_end_${func}_avx2
2325
+ .rva .LSEH_info_${func}_avx2
2326
+ ___
2327
+ $code.=<<___;
2328
+ .section .xdata
2329
+ .align 8
2330
+ .LSEH_info_$func:
2331
+ .byte 9,0,0,0
2332
+ .rva se_handler
2333
+ .rva .Lprologue,.Lepilogue # HandlerData[]
2334
+ ___
2335
+ $code.=<<___ if ($SZ==4 && $shaext);
2336
+ .LSEH_info_${func}_shaext:
2337
+ .byte 9,0,0,0
2338
+ .rva shaext_handler
2339
+ ___
2340
+ $code.=<<___ if ($SZ==4);
2341
+ .LSEH_info_${func}_ssse3:
2342
+ .byte 9,0,0,0
2343
+ .rva se_handler
2344
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2345
+ ___
2346
+ $code.=<<___ if ($avx && $SZ==8);
2347
+ .LSEH_info_${func}_xop:
2348
+ .byte 9,0,0,0
2349
+ .rva se_handler
2350
+ .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2351
+ ___
2352
+ $code.=<<___ if ($avx);
2353
+ .LSEH_info_${func}_avx:
2354
+ .byte 9,0,0,0
2355
+ .rva se_handler
2356
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2357
+ ___
2358
+ $code.=<<___ if ($avx>1);
2359
+ .LSEH_info_${func}_avx2:
2360
+ .byte 9,0,0,0
2361
+ .rva se_handler
2362
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2363
+ ___
2364
+ }
2365
+
2366
+ sub sha256op38 {
2367
+ my $instr = shift;
2368
+ my %opcodelet = (
2369
+ "sha256rnds2" => 0xcb,
2370
+ "sha256msg1" => 0xcc,
2371
+ "sha256msg2" => 0xcd );
2372
+
2373
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2374
+ my @opcode=(0x0f,0x38);
2375
+ push @opcode,$opcodelet{$instr};
2376
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2377
+ return ".byte\t".join(',',@opcode);
2378
+ } else {
2379
+ return $instr."\t".@_[0];
2380
+ }
2381
+ }
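sha256op38 exists so the output still assembles when the assembler does not know the SHA extension mnemonics: it writes each sha256* instruction out as raw bytes, the 0x0f 0x38 escape, the opcode byte from %opcodelet, and a ModR/M byte with the destination register in the reg field and the source in the r/m field. A stand-alone C sketch of the same encoding (emit_sha256op38 is an illustrative name):

#include <stdio.h>
#include <stdint.h>

/* Mirrors the Perl: ModR/M = 0xc0 | src | (dst << 3) for AT&T "op %xmmSRC,%xmmDST". */
static void emit_sha256op38(uint8_t opcodelet, unsigned src_xmm, unsigned dst_xmm) {
    uint8_t modrm = 0xc0 | (src_xmm & 7) | ((dst_xmm & 7) << 3);
    printf(".byte\t0x0f,0x38,0x%02x,0x%02x\n", opcodelet, modrm);
}

int main(void) {
    emit_sha256op38(0xcb, 3, 5);   /* sha256rnds2 %xmm3,%xmm5 */
    emit_sha256op38(0xcc, 0, 1);   /* sha256msg1  %xmm0,%xmm1 */
    emit_sha256op38(0xcd, 2, 4);   /* sha256msg2  %xmm2,%xmm4 */
    return 0;
}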
2382
+
2383
+ foreach (split("\n",$code)) {
2384
+ s/\`([^\`]*)\`/eval $1/geo;
2385
+
2386
+ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2387
+
2388
+ print $_,"\n";
2389
+ }
2390
+ close STDOUT;