ring-native 0.0.0 → 0.1.0

Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
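The substance of the 0.1.0 release is visible in the list above: the full vendored `ring` source tree (`data/vendor/ring/...`, entries 17-267) is removed, and a small `data/vendor/ring-ffi` crate (entries 11-15) takes its place, so the gem now carries a thin FFI wrapper crate instead of the whole upstream tree. Below is a minimal sketch of the kind of C-ABI shim such a crate exposes for a Ruby extension to call; the function name `ring_ffi_sha256` and its signature are illustrative assumptions, not the actual contents of `data/vendor/ring-ffi/src/lib.rs`.

    // Hypothetical sketch of a C-ABI shim over the ring crate; names are
    // illustrative, not taken from the vendored ring-ffi sources.
    extern crate ring;

    use ring::digest;

    /// Hash `len` bytes at `data` with SHA-256 and write the 32-byte digest
    /// to `out`. Returns 0 on success, -1 on a null pointer.
    #[no_mangle]
    pub extern "C" fn ring_ffi_sha256(data: *const u8, len: usize, out: *mut u8) -> i32 {
        if data.is_null() || out.is_null() {
            return -1;
        }
        let input = unsafe { std::slice::from_raw_parts(data, len) };
        let d = digest::digest(&digest::SHA256, input);
        let bytes = d.as_ref();
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), out, bytes.len());
        }
        0
    }

A shim like this is what `data/ext/ring/extconf.rb` would build with Cargo and the gem would then bind via Ruby FFI, which is why the 250-odd vendored C, Perl, and assembly files above can be deleted outright.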
data/vendor/ring/crypto/sha/asm/sha-armv8.pl
@@ -1,436 +0,0 @@
- #!/usr/bin/env perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # SHA256/512 for ARMv8.
- #
- # Performance in cycles per processed byte and improvement coefficient
- # over code generated with "default" compiler:
- #
- # SHA256-hw SHA256(*) SHA512
- # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
- # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
- # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
- # Denver 2.01 10.5 (+26%) 6.70 (+8%)
- # X-Gene 20.0 (+100%) 12.8 (+300%(***))
- #
- # (*) Software SHA256 results are of lesser relevance, presented
- # mostly for informational purposes.
- # (**) The result is a trade-off: it's possible to improve it by
- # 10% (or by 1 cycle per round), but at the cost of 20% loss
- # on Cortex-A53 (or by 4 cycles per round).
- # (***) Super-impressive coefficients over gcc-generated code are
- # indication of some compiler "pathology", most notably code
- # generated with -mgeneral-regs-only is significanty faster
- # and the gap is only 40-90%.
-
- $flavour=shift;
- # Unlike most perlasm files, sha512-armv8.pl takes an additional argument to
- # determine which hash function to emit. This differs from upstream OpenSSL so
- # that the script may continue to output to stdout.
- $variant=shift;
- $output=shift;
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-
- if ($variant eq "sha512") {
- $BITS=512;
- $SZ=8;
- @Sigma0=(28,34,39);
- @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7);
- @sigma1=(19,61, 6);
- $rounds=80;
- $reg_t="x";
- } elsif ($variant eq "sha256") {
- $BITS=256;
- $SZ=4;
- @Sigma0=( 2,13,22);
- @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3);
- @sigma1=(17,19,10);
- $rounds=64;
- $reg_t="w";
- } else {
- die "Unknown variant: $variant";
- }
-
- $func="sha${BITS}_block_data_order";
-
- ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
-
- @X=map("$reg_t$_",(3..15,0..2));
- @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
- ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
-
- sub BODY_00_xx {
- my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
- my $j=($i+1)&15;
- my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
- $T0=@X[$i+3] if ($i<11);
-
- $code.=<<___ if ($i<16);
- #ifndef __ARMEB__
- rev @X[$i],@X[$i] // $i
- #endif
- ___
- $code.=<<___ if ($i<13 && ($i&1));
- ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
- ___
- $code.=<<___ if ($i==13);
- ldp @X[14],@X[15],[$inp]
- ___
- $code.=<<___ if ($i>=14);
- ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
- ___
- $code.=<<___ if ($i>0 && $i<16);
- add $a,$a,$t1 // h+=Sigma0(a)
- ___
- $code.=<<___ if ($i>=11);
- str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
- ___
- # While ARMv8 specifies merged rotate-n-logical operation such as
- # 'eor x,y,z,ror#n', it was found to negatively affect performance
- # on Apple A7. The reason seems to be that it requires even 'y' to
- # be available earlier. This means that such merged instruction is
- # not necessarily best choice on critical path... On the other hand
- # Cortex-A5x handles merged instructions much better than disjoint
- # rotate and logical... See (**) footnote above.
- $code.=<<___ if ($i<15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
- and $t1,$f,$e
- bic $t2,$g,$e
- add $h,$h,@X[$i&15] // h+=X[i]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
- add $h,$h,$t0 // h+=Sigma1(e)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- add $d,$d,$h // d+=h
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- //add $h,$h,$t1 // h+=Sigma0(a)
- ___
- $code.=<<___ if ($i>=15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- ror $T1,@X[($j+1)&15],#$sigma0[0]
- and $t1,$f,$e
- ror $T2,@X[($j+14)&15],#$sigma1[0]
- bic $t2,$g,$e
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,@X[$i&15] // h+=X[i]
- eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
- eor $T0,$T0,$a,ror#$Sigma0[1]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
- add $h,$h,$t0 // h+=Sigma1(e)
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
- eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
- add @X[$j],@X[$j],@X[($j+9)&15]
- add $d,$d,$h // d+=h
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- add @X[$j],@X[$j],$T1
- add $h,$h,$t1 // h+=Sigma0(a)
- add @X[$j],@X[$j],$T2
- ___
- ($t2,$t3)=($t3,$t2);
- }
-
- $code.=<<___;
- #include <openssl/arm_arch.h>
-
- .text
-
- .extern OPENSSL_armcap_P
- .globl $func
- .type $func,%function
- .align 6
- $func:
- ___
- $code.=<<___ if ($SZ==4);
- ldr x16,.LOPENSSL_armcap_P
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- ___
- $code.=<<___;
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*$SZ
-
- ldp $A,$B,[$ctx] // load context
- ldp $C,$D,[$ctx,#2*$SZ]
- ldp $E,$F,[$ctx,#4*$SZ]
- add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
- ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,.LK$BITS
- stp $ctx,$num,[x29,#96]
-
- .Loop:
- ldp @X[0],@X[1],[$inp],#2*$SZ
- ldr $t2,[$Ktbl],#$SZ // *K++
- eor $t3,$B,$C // magic seed
- str $inp,[x29,#112]
- ___
- for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
- $code.=".Loop_16_xx:\n";
- for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
- $code.=<<___;
- cbnz $t2,.Loop_16_xx
-
- ldp $ctx,$num,[x29,#96]
- ldr $inp,[x29,#112]
- sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
-
- ldp @X[0],@X[1],[$ctx]
- ldp @X[2],@X[3],[$ctx,#2*$SZ]
- add $inp,$inp,#14*$SZ // advance input pointer
- ldp @X[4],@X[5],[$ctx,#4*$SZ]
- add $A,$A,@X[0]
- ldp @X[6],@X[7],[$ctx,#6*$SZ]
- add $B,$B,@X[1]
- add $C,$C,@X[2]
- add $D,$D,@X[3]
- stp $A,$B,[$ctx]
- add $E,$E,@X[4]
- add $F,$F,@X[5]
- stp $C,$D,[$ctx,#2*$SZ]
- add $G,$G,@X[6]
- add $H,$H,@X[7]
- cmp $inp,$num
- stp $E,$F,[$ctx,#4*$SZ]
- stp $G,$H,[$ctx,#6*$SZ]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*$SZ
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
- .size $func,.-$func
-
- .align 6
- .type .LK$BITS,%object
- .LK$BITS:
- ___
- $code.=<<___ if ($SZ==8);
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
- ___
- $code.=<<___ if ($SZ==4);
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
- ___
- $code.=<<___;
- .size .LK$BITS,.-.LK$BITS
- .align 3
- .LOPENSSL_armcap_P:
- .quad OPENSSL_armcap_P-.
- .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
- .align 2
- ___
-
- if ($SZ==4) {
- my $Ktbl="x3";
-
- my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
- my @MSG=map("v$_.16b",(4..7));
- my ($W0,$W1)=("v16.4s","v17.4s");
- my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
-
- $code.=<<___;
- .type sha256_block_armv8,%function
- .align 6
- sha256_block_armv8:
- .Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,.LK256
-
- .Loop_hw:
- ld1 {@MSG[0]-@MSG[3]},[$inp],#64
- sub $num,$num,#1
- ld1.32 {$W0},[$Ktbl],#16
- rev32 @MSG[0],@MSG[0]
- rev32 @MSG[1],@MSG[1]
- rev32 @MSG[2],@MSG[2]
- rev32 @MSG[3],@MSG[3]
- orr $ABCD_SAVE,$ABCD,$ABCD // offload
- orr $EFGH_SAVE,$EFGH,$EFGH
- ___
- for($i=0;$i<12;$i++) {
- $code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
- ___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
- }
- $code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- ld1.32 {$W0},[$Ktbl],#16
- add.i32 $W1,$W1,@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- ld1.32 {$W1},[$Ktbl]
- add.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- add.i32 $W1,$W1,@MSG[3]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- add.i32 $ABCD,$ABCD,$ABCD_SAVE
- add.i32 $EFGH,$EFGH,$EFGH_SAVE
-
- cbnz $num,.Loop_hw
-
- st1.32 {$ABCD,$EFGH},[$ctx]
-
- ldr x29,[sp],#16
- ret
- .size sha256_block_armv8,.-sha256_block_armv8
- ___
- }
-
- $code.=<<___;
- .comm OPENSSL_armcap_P,4,4
- ___
-
- { my %opcode = (
- "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
- "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
- &&
- sprintf ".inst\t0x%08x\t//%s %s",
- $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
- $mnemonic,$arg;
- }
- }
-
- foreach(split("\n",$code)) {
-
- s/\`([^\`]*)\`/eval($1)/geo;
-
- s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
-
- s/\.\w?32\b//o and s/\.16b/\.4s/go;
- m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
-
- print $_,"\n";
- }
-
- close STDOUT;
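The deleted script above (sha-armv8.pl, entry 164 in the file list) emits either SHA-256 or SHA-512 from a single template: the `$variant` switch only changes the word size (`$SZ`, `$reg_t`), the round count, and the rotation-constant tuples `@Sigma0`/`@Sigma1`/`@sigma0`/`@sigma1`. A minimal Rust sketch (not part of the gem) of what the SHA-256 tuples mean, using the FIPS 180-4 definitions:

    // The tuples in the script map onto the FIPS 180-4 SHA-256 functions:
    // @Sigma0=(2,13,22), @Sigma1=(6,11,25), @sigma0=(7,18,3), @sigma1=(17,19,10).
    // For SHA-512 the same template runs on u64 with (28,34,39), (14,18,41),
    // (1,8,7) and (19,61,6), which is exactly what the $variant switch selects.

    fn big_sigma0(a: u32) -> u32 {
        a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22)
    }

    fn big_sigma1(e: u32) -> u32 {
        e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25)
    }

    // The message-schedule sigmas shift (not rotate) by their last constant,
    // matching the `lsr` (vs `ror`) operands in BODY_00_xx above.
    fn small_sigma0(x: u32) -> u32 {
        x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3)
    }

    fn small_sigma1(x: u32) -> u32 {
        x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10)
    }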
data/vendor/ring/crypto/sha/asm/sha-x86_64.pl
@@ -1,2390 +0,0 @@
1
- #!/usr/bin/env perl
2
- #
3
- # ====================================================================
4
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
- # project. Rights for redistribution and usage in source and binary
6
- # forms are granted according to the OpenSSL license.
7
- # ====================================================================
8
- #
9
- # sha256/512_block procedure for x86_64.
10
- #
11
- # 40% improvement over compiler-generated code on Opteron. On EM64T
12
- # sha256 was observed to run >80% faster and sha512 - >40%. No magical
13
- # tricks, just straight implementation... I really wonder why gcc
14
- # [being armed with inline assembler] fails to generate as fast code.
15
- # The only thing which is cool about this module is that it's very
16
- # same instruction sequence used for both SHA-256 and SHA-512. In
17
- # former case the instructions operate on 32-bit operands, while in
18
- # latter - on 64-bit ones. All I had to do is to get one flavor right,
19
- # the other one passed the test right away:-)
20
- #
21
- # sha256_block runs in ~1005 cycles on Opteron, which gives you
22
- # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23
- # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24
- # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25
- # Well, if you compare it to IA-64 implementation, which maintains
26
- # X[16] in register bank[!], tends to 4 instructions per CPU clock
27
- # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28
- # issue Opteron pipeline and X[16] maintained in memory. So that *if*
29
- # there is a way to improve it, *then* the only way would be to try to
30
- # offload X[16] updates to SSE unit, but that would require "deeper"
31
- # loop unroll, which in turn would naturally cause size blow-up, not
32
- # to mention increased complexity! And once again, only *if* it's
33
- # actually possible to noticeably improve overall ILP, instruction
34
- # level parallelism, on a given CPU implementation in this case.
35
- #
36
- # Special note on Intel EM64T. While Opteron CPU exhibits perfect
37
- # perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
38
- # [currently available] EM64T CPUs apparently are far from it. On the
39
- # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40
- # sha256_block:-( This is presumably because 64-bit shifts/rotates
41
- # apparently are not atomic instructions, but implemented in microcode.
42
- #
43
- # May 2012.
44
- #
45
- # Optimization including one of Pavel Semjanov's ideas, alternative
46
- # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
47
- # unfortunately -2% SHA512 on P4 [which nobody should care about
48
- # that much].
49
- #
50
- # June 2012.
51
- #
52
- # Add SIMD code paths, see below for improvement coefficients. SSSE3
53
- # code path was not attempted for SHA512, because improvement is not
54
- # estimated to be high enough, noticeably less than 9%, to justify
55
- # the effort, not on pre-AVX processors. [Obviously with exclusion
56
- # for VIA Nano, but it has SHA512 instruction that is faster and
57
- # should be used instead.] For reference, corresponding estimated
58
- # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
59
- # higher coefficients are observed on VIA Nano and Bulldozer has more
60
- # to do with specifics of their architecture [which is topic for
61
- # separate discussion].
62
- #
63
- # November 2012.
64
- #
65
- # Add AVX2 code path. Two consecutive input blocks are loaded to
66
- # 256-bit %ymm registers, with data from first block to least
67
- # significant 128-bit halves and data from second to most significant.
68
- # The data is then processed with same SIMD instruction sequence as
69
- # for AVX, but with %ymm as operands. Side effect is increased stack
70
- # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
71
- # code size increase.
72
- #
73
- # March 2014.
74
- #
75
- # Add support for Intel SHA Extensions.
76
-
77
- ######################################################################
78
- # Current performance in cycles per processed byte (less is better):
79
- #
80
- # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
81
- #
82
- # AMD K8 14.9 - - 9.57 -
83
- # P4 17.3 - - 30.8 -
84
- # Core 2 15.6 13.8(+13%) - 9.97 -
85
- # Westmere 14.8 12.3(+19%) - 9.58 -
86
- # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
87
- # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
88
- # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
89
- # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
90
- # VIA Nano 23.0 16.5(+39%) - 14.7 -
91
- # Atom 23.0 18.9(+22%) - 14.7 -
92
- # Silvermont 27.4 20.6(+33%) - 17.5 -
93
- #
94
- # (*) whichever best applicable;
95
- # (**) switch from ror to shrd stands for fair share of improvement;
96
- # (***) execution time is fully determined by remaining integer-only
97
- # part, body_00_15; reducing the amount of SIMD instructions
98
- # below certain limit makes no difference/sense; to conserve
99
- # space SHA256 XOP code path is therefore omitted;
100
-
101
- $flavour = shift;
102
- $output = shift;
103
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104
-
105
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106
-
107
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110
- die "can't locate x86_64-xlate.pl";
111
-
112
- # In upstream, this is controlled by shelling out to the compiler to check
113
- # versions, but BoringSSL is intended to be used with pre-generated perlasm
114
- # output, so this isn't useful anyway.
115
- #
116
- # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
117
- # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
118
- # did not tie them together until after $shaext was added.
119
- $avx = 1;
120
-
121
- # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
122
- # been tested.
123
- $shaext=0; ### set to zero if compiling for 1.0.1
124
- $avx=1 if (!$shaext && $avx);
125
-
126
- open OUT,"| \"$^X\" $xlate $flavour";
127
- *STDOUT=*OUT;
128
-
129
- if ($output =~ /512/) {
130
- $func="sha512_block_data_order";
131
- $TABLE="K512";
132
- $SZ=8;
133
- @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
134
- "%r8", "%r9", "%r10","%r11");
135
- ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
136
- @Sigma0=(28,34,39);
137
- @Sigma1=(14,18,41);
138
- @sigma0=(1, 8, 7);
139
- @sigma1=(19,61, 6);
140
- $rounds=80;
141
- } else {
142
- $func="sha256_block_data_order";
143
- $TABLE="K256";
144
- $SZ=4;
145
- @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
146
- "%r8d","%r9d","%r10d","%r11d");
147
- ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
148
- @Sigma0=( 2,13,22);
149
- @Sigma1=( 6,11,25);
150
- @sigma0=( 7,18, 3);
151
- @sigma1=(17,19,10);
152
- $rounds=64;
153
- }
154
-
155
- $ctx="%rdi"; # 1st arg, zapped by $a3
156
- $inp="%rsi"; # 2nd arg
157
- $Tbl="%rbp";
158
-
159
- $_ctx="16*$SZ+0*8(%rsp)";
160
- $_inp="16*$SZ+1*8(%rsp)";
161
- $_end="16*$SZ+2*8(%rsp)";
162
- $_rsp="16*$SZ+3*8(%rsp)";
163
- $framesz="16*$SZ+4*8";
164
-
165
-
166
- sub ROUND_00_15()
167
- { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
168
- my $STRIDE=$SZ;
169
- $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
170
-
171
- $code.=<<___;
172
- ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
173
- mov $f,$a2
174
-
175
- xor $e,$a0
176
- ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
177
- xor $g,$a2 # f^g
178
-
179
- mov $T1,`$SZ*($i&0xf)`(%rsp)
180
- xor $a,$a1
181
- and $e,$a2 # (f^g)&e
182
-
183
- ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
184
- add $h,$T1 # T1+=h
185
- xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
186
-
187
- ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
188
- xor $e,$a0
189
- add $a2,$T1 # T1+=Ch(e,f,g)
190
-
191
- mov $a,$a2
192
- add ($Tbl),$T1 # T1+=K[round]
193
- xor $a,$a1
194
-
195
- xor $b,$a2 # a^b, b^c in next round
196
- ror \$$Sigma1[0],$a0 # Sigma1(e)
197
- mov $b,$h
198
-
199
- and $a2,$a3
200
- ror \$$Sigma0[0],$a1 # Sigma0(a)
201
- add $a0,$T1 # T1+=Sigma1(e)
202
-
203
- xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
204
- add $T1,$d # d+=T1
205
- add $T1,$h # h+=T1
206
-
207
- lea $STRIDE($Tbl),$Tbl # round++
208
- ___
209
- $code.=<<___ if ($i<15);
210
- add $a1,$h # h+=Sigma0(a)
211
- ___
212
- ($a2,$a3) = ($a3,$a2);
213
- }
214
-
215
- sub ROUND_16_XX()
216
- { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
217
-
218
- $code.=<<___;
219
- mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
220
- mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
221
-
222
- mov $a0,$T1
223
- ror \$`$sigma0[1]-$sigma0[0]`,$a0
224
- add $a1,$a # modulo-scheduled h+=Sigma0(a)
225
- mov $a2,$a1
226
- ror \$`$sigma1[1]-$sigma1[0]`,$a2
227
-
228
- xor $T1,$a0
229
- shr \$$sigma0[2],$T1
230
- ror \$$sigma0[0],$a0
231
- xor $a1,$a2
232
- shr \$$sigma1[2],$a1
233
-
234
- ror \$$sigma1[0],$a2
235
- xor $a0,$T1 # sigma0(X[(i+1)&0xf])
236
- xor $a1,$a2 # sigma1(X[(i+14)&0xf])
237
- add `$SZ*(($i+9)&0xf)`(%rsp),$T1
238
-
239
- add `$SZ*($i&0xf)`(%rsp),$T1
240
- mov $e,$a0
241
- add $a2,$T1
242
- mov $a,$a1
243
- ___
244
- &ROUND_00_15(@_);
245
- }
246
-
247
- $code=<<___;
248
- .text
249
-
250
- .extern OPENSSL_ia32cap_P
251
- .globl $func
252
- .type $func,\@function,3
253
- .align 16
254
- $func:
255
- ___
256
- $code.=<<___ if ($SZ==4 || $avx);
257
- lea OPENSSL_ia32cap_P(%rip),%r11
258
- mov 0(%r11),%r9d
259
- mov 4(%r11),%r10d
260
- mov 8(%r11),%r11d
261
- ___
262
- $code.=<<___ if ($SZ==4 && $shaext);
263
- test \$`1<<29`,%r11d # check for SHA
264
- jnz _shaext_shortcut
265
- ___
266
- $code.=<<___ if ($avx && $SZ==8);
267
- test \$`1<<11`,%r10d # check for XOP
268
- jnz .Lxop_shortcut
269
- ___
270
- $code.=<<___ if ($avx>1);
271
- and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
272
- cmp \$`1<<8|1<<5|1<<3`,%r11d
273
- je .Lavx2_shortcut
274
- ___
275
- $code.=<<___ if ($avx);
276
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
277
- and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
278
- or %r9d,%r10d
279
- cmp \$`1<<28|1<<9|1<<30`,%r10d
280
- je .Lavx_shortcut
281
- ___
282
- $code.=<<___ if ($SZ==4);
283
- test \$`1<<9`,%r10d
284
- jnz .Lssse3_shortcut
285
- ___
286
- $code.=<<___;
287
- push %rbx
288
- push %rbp
289
- push %r12
290
- push %r13
291
- push %r14
292
- push %r15
293
- mov %rsp,%r11 # copy %rsp
294
- shl \$4,%rdx # num*16
295
- sub \$$framesz,%rsp
296
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
297
- and \$-64,%rsp # align stack frame
298
- mov $ctx,$_ctx # save ctx, 1st arg
299
- mov $inp,$_inp # save inp, 2nd arh
300
- mov %rdx,$_end # save end pointer, "3rd" arg
301
- mov %r11,$_rsp # save copy of %rsp
302
- .Lprologue:
303
-
304
- mov $SZ*0($ctx),$A
305
- mov $SZ*1($ctx),$B
306
- mov $SZ*2($ctx),$C
307
- mov $SZ*3($ctx),$D
308
- mov $SZ*4($ctx),$E
309
- mov $SZ*5($ctx),$F
310
- mov $SZ*6($ctx),$G
311
- mov $SZ*7($ctx),$H
312
- jmp .Lloop
313
-
314
- .align 16
315
- .Lloop:
316
- mov $B,$a3
317
- lea $TABLE(%rip),$Tbl
318
- xor $C,$a3 # magic
319
- ___
320
- for($i=0;$i<16;$i++) {
321
- $code.=" mov $SZ*$i($inp),$T1\n";
322
- $code.=" mov @ROT[4],$a0\n";
323
- $code.=" mov @ROT[0],$a1\n";
324
- $code.=" bswap $T1\n";
325
- &ROUND_00_15($i,@ROT);
326
- unshift(@ROT,pop(@ROT));
327
- }
328
- $code.=<<___;
329
- jmp .Lrounds_16_xx
330
- .align 16
331
- .Lrounds_16_xx:
332
- ___
333
- for(;$i<32;$i++) {
334
- &ROUND_16_XX($i,@ROT);
335
- unshift(@ROT,pop(@ROT));
336
- }
337
-
338
- $code.=<<___;
339
- cmpb \$0,`$SZ-1`($Tbl)
340
- jnz .Lrounds_16_xx
341
-
342
- mov $_ctx,$ctx
343
- add $a1,$A # modulo-scheduled h+=Sigma0(a)
344
- lea 16*$SZ($inp),$inp
345
-
346
- add $SZ*0($ctx),$A
347
- add $SZ*1($ctx),$B
348
- add $SZ*2($ctx),$C
349
- add $SZ*3($ctx),$D
350
- add $SZ*4($ctx),$E
351
- add $SZ*5($ctx),$F
352
- add $SZ*6($ctx),$G
353
- add $SZ*7($ctx),$H
354
-
355
- cmp $_end,$inp
356
-
357
- mov $A,$SZ*0($ctx)
358
- mov $B,$SZ*1($ctx)
359
- mov $C,$SZ*2($ctx)
360
- mov $D,$SZ*3($ctx)
361
- mov $E,$SZ*4($ctx)
362
- mov $F,$SZ*5($ctx)
363
- mov $G,$SZ*6($ctx)
364
- mov $H,$SZ*7($ctx)
365
- jb .Lloop
366
-
367
- mov $_rsp,%rsi
368
- mov (%rsi),%r15
369
- mov 8(%rsi),%r14
370
- mov 16(%rsi),%r13
371
- mov 24(%rsi),%r12
372
- mov 32(%rsi),%rbp
373
- mov 40(%rsi),%rbx
374
- lea 48(%rsi),%rsp
375
- .Lepilogue:
376
- ret
377
- .size $func,.-$func
378
- ___
379
-
380
- if ($SZ==4) {
381
- $code.=<<___;
382
- .align 64
383
- .type $TABLE,\@object
384
- $TABLE:
385
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
386
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
387
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
388
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
389
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
390
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
391
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
392
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
393
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
394
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
395
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
396
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
397
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
398
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
399
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
400
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
401
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
402
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
403
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
404
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
405
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
406
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
407
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
408
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
409
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
410
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
411
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
412
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
413
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
414
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
415
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
416
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
417
-
418
- .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
419
- .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
420
- .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
421
- .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
422
- .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
423
- .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
424
- .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
425
- ___
426
- } else {
427
- $code.=<<___;
428
- .align 64
429
- .type $TABLE,\@object
430
- $TABLE:
431
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
432
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
433
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
435
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
436
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
437
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
438
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
439
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
440
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
441
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
443
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
444
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
445
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
446
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
447
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
448
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
449
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
451
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
452
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
453
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
455
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
456
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
457
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
458
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
459
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
460
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
461
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
462
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
463
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
464
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
465
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
467
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
468
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
469
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
470
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
471
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
472
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
473
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
474
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
475
- .quad 0xd192e819d6ef5218,0xd69906245565a910
476
- .quad 0xd192e819d6ef5218,0xd69906245565a910
477
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
478
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
479
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
480
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
481
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
483
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
485
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
487
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
488
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
489
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
490
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
491
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
492
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
493
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
494
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
495
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
496
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
497
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
499
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
500
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
501
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
502
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
503
- .quad 0x28db77f523047d84,0x32caab7b40c72493
504
- .quad 0x28db77f523047d84,0x32caab7b40c72493
505
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
507
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
509
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
510
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
511
-
512
- .quad 0x0001020304050607,0x08090a0b0c0d0e0f
513
- .quad 0x0001020304050607,0x08090a0b0c0d0e0f
514
- .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
515
- ___
516
- }
517
-
518
- ######################################################################
519
- # SIMD code paths
520
- #
521
- if ($SZ==4 && $shaext) {{{
522
- ######################################################################
523
- # Intel SHA Extensions implementation of SHA256 update function.
524
- #
525
- my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
526
-
527
- my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528
- my @MSG=map("%xmm$_",(3..6));
529
-
530
- $code.=<<___;
531
- .type sha256_block_data_order_shaext,\@function,3
532
- .align 64
533
- sha256_block_data_order_shaext:
534
- _shaext_shortcut:
535
- ___
536
- $code.=<<___ if ($win64);
537
- lea `-8-5*16`(%rsp),%rsp
538
- movaps %xmm6,-8-5*16(%rax)
539
- movaps %xmm7,-8-4*16(%rax)
540
- movaps %xmm8,-8-3*16(%rax)
541
- movaps %xmm9,-8-2*16(%rax)
542
- movaps %xmm10,-8-1*16(%rax)
543
- .Lprologue_shaext:
544
- ___
545
- $code.=<<___;
546
- lea K256+0x80(%rip),$Tbl
547
- movdqu ($ctx),$ABEF # DCBA
548
- movdqu 16($ctx),$CDGH # HGFE
549
- movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
550
-
551
- pshufd \$0x1b,$ABEF,$Wi # ABCD
552
- pshufd \$0xb1,$ABEF,$ABEF # CDAB
553
- pshufd \$0x1b,$CDGH,$CDGH # EFGH
554
- movdqa $TMP,$BSWAP # offload
555
- palignr \$8,$CDGH,$ABEF # ABEF
556
- punpcklqdq $Wi,$CDGH # CDGH
557
- jmp .Loop_shaext
558
-
559
- .align 16
560
- .Loop_shaext:
561
- movdqu ($inp),@MSG[0]
562
- movdqu 0x10($inp),@MSG[1]
563
- movdqu 0x20($inp),@MSG[2]
564
- pshufb $TMP,@MSG[0]
565
- movdqu 0x30($inp),@MSG[3]
566
-
567
- movdqa 0*32-0x80($Tbl),$Wi
568
- paddd @MSG[0],$Wi
569
- pshufb $TMP,@MSG[1]
570
- movdqa $CDGH,$CDGH_SAVE # offload
571
- sha256rnds2 $ABEF,$CDGH # 0-3
572
- pshufd \$0x0e,$Wi,$Wi
573
- nop
574
- movdqa $ABEF,$ABEF_SAVE # offload
575
- sha256rnds2 $CDGH,$ABEF
576
-
577
- movdqa 1*32-0x80($Tbl),$Wi
578
- paddd @MSG[1],$Wi
579
- pshufb $TMP,@MSG[2]
580
- sha256rnds2 $ABEF,$CDGH # 4-7
581
- pshufd \$0x0e,$Wi,$Wi
582
- lea 0x40($inp),$inp
583
- sha256msg1 @MSG[1],@MSG[0]
584
- sha256rnds2 $CDGH,$ABEF
585
-
586
- movdqa 2*32-0x80($Tbl),$Wi
587
- paddd @MSG[2],$Wi
588
- pshufb $TMP,@MSG[3]
589
- sha256rnds2 $ABEF,$CDGH # 8-11
590
- pshufd \$0x0e,$Wi,$Wi
591
- movdqa @MSG[3],$TMP
592
- palignr \$4,@MSG[2],$TMP
593
- nop
594
- paddd $TMP,@MSG[0]
595
- sha256msg1 @MSG[2],@MSG[1]
596
- sha256rnds2 $CDGH,$ABEF
597
-
598
- movdqa 3*32-0x80($Tbl),$Wi
599
- paddd @MSG[3],$Wi
600
- sha256msg2 @MSG[3],@MSG[0]
601
- sha256rnds2 $ABEF,$CDGH # 12-15
602
- pshufd \$0x0e,$Wi,$Wi
603
- movdqa @MSG[0],$TMP
604
- palignr \$4,@MSG[3],$TMP
605
- nop
606
- paddd $TMP,@MSG[1]
607
- sha256msg1 @MSG[3],@MSG[2]
608
- sha256rnds2 $CDGH,$ABEF
609
- ___
610
- for($i=4;$i<16-3;$i++) {
611
- $code.=<<___;
612
- movdqa $i*32-0x80($Tbl),$Wi
613
- paddd @MSG[0],$Wi
614
- sha256msg2 @MSG[0],@MSG[1]
615
- sha256rnds2 $ABEF,$CDGH # 16-19...
616
- pshufd \$0x0e,$Wi,$Wi
617
- movdqa @MSG[1],$TMP
618
- palignr \$4,@MSG[0],$TMP
619
- nop
620
- paddd $TMP,@MSG[2]
621
- sha256msg1 @MSG[0],@MSG[3]
622
- sha256rnds2 $CDGH,$ABEF
623
- ___
624
- push(@MSG,shift(@MSG));
625
- }
626
- $code.=<<___;
627
- movdqa 13*32-0x80($Tbl),$Wi
628
- paddd @MSG[0],$Wi
629
- sha256msg2 @MSG[0],@MSG[1]
630
- sha256rnds2 $ABEF,$CDGH # 52-55
631
- pshufd \$0x0e,$Wi,$Wi
632
- movdqa @MSG[1],$TMP
633
- palignr \$4,@MSG[0],$TMP
634
- sha256rnds2 $CDGH,$ABEF
635
- paddd $TMP,@MSG[2]
636
-
637
- movdqa 14*32-0x80($Tbl),$Wi
638
- paddd @MSG[1],$Wi
639
- sha256rnds2 $ABEF,$CDGH # 56-59
640
- pshufd \$0x0e,$Wi,$Wi
641
- sha256msg2 @MSG[1],@MSG[2]
642
- movdqa $BSWAP,$TMP
643
- sha256rnds2 $CDGH,$ABEF
644
-
645
- movdqa 15*32-0x80($Tbl),$Wi
646
- paddd @MSG[2],$Wi
647
- nop
648
- sha256rnds2 $ABEF,$CDGH # 60-63
649
- pshufd \$0x0e,$Wi,$Wi
650
- dec $num
651
- nop
652
- sha256rnds2 $CDGH,$ABEF
653
-
654
- paddd $CDGH_SAVE,$CDGH
655
- paddd $ABEF_SAVE,$ABEF
656
- jnz .Loop_shaext
657
-
658
- pshufd \$0xb1,$CDGH,$CDGH # DCHG
659
- pshufd \$0x1b,$ABEF,$TMP # FEBA
660
- pshufd \$0xb1,$ABEF,$ABEF # BAFE
661
- punpckhqdq $CDGH,$ABEF # DCBA
662
- palignr \$8,$TMP,$CDGH # HGFE
663
-
664
- movdqu $ABEF,($ctx)
665
- movdqu $CDGH,16($ctx)
666
- ___
667
- $code.=<<___ if ($win64);
668
- movaps -8-5*16(%rax),%xmm6
669
- movaps -8-4*16(%rax),%xmm7
670
- movaps -8-3*16(%rax),%xmm8
671
- movaps -8-2*16(%rax),%xmm9
672
- movaps -8-1*16(%rax),%xmm10
673
- mov %rax,%rsp
674
- .Lepilogue_shaext:
675
- ___
676
- $code.=<<___;
677
- ret
678
- .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
679
- ___
680
- }}}
681
- {{{
682
-
683
- my $a4=$T1;
684
- my ($a,$b,$c,$d,$e,$f,$g,$h);
685
-
686
- sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
687
- { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
688
- my $arg = pop;
689
- $arg = "\$$arg" if ($arg*1 eq $arg);
690
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
691
- }
692
-
693
- sub body_00_15 () {
694
- (
695
- '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
696
-
697
- '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
698
- '&mov ($a,$a1)',
699
- '&mov ($a4,$f)',
700
-
701
- '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
702
- '&xor ($a0,$e)',
703
- '&xor ($a4,$g)', # f^g
704
-
705
- '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
706
- '&xor ($a1,$a)',
707
- '&and ($a4,$e)', # (f^g)&e
708
-
709
- '&xor ($a0,$e)',
710
- '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
711
- '&mov ($a2,$a)',
712
-
713
- '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
714
- '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
715
- '&xor ($a2,$b)', # a^b, b^c in next round
716
-
717
- '&add ($h,$a4)', # h+=Ch(e,f,g)
718
- '&ror ($a0,$Sigma1[0])', # Sigma1(e)
719
- '&and ($a3,$a2)', # (b^c)&(a^b)
720
-
721
- '&xor ($a1,$a)',
722
- '&add ($h,$a0)', # h+=Sigma1(e)
723
- '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
724
-
725
- '&ror ($a1,$Sigma0[0])', # Sigma0(a)
726
- '&add ($d,$h)', # d+=h
727
- '&add ($h,$a3)', # h+=Maj(a,b,c)
728
-
729
- '&mov ($a0,$d)',
730
- '&add ($a1,$h);'. # h+=Sigma0(a)
731
- '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
732
- );
733
- }
-
- ######################################################################
- # SSSE3 code path
- #
- if ($SZ==4) {	# SHA256 only
- my @X = map("%xmm$_",(0..3));
- my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
-
- $code.=<<___;
- .type ${func}_ssse3,\@function,3
- .align 64
- ${func}_ssse3:
- .Lssse3_shortcut:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- mov %rsp,%r11 # copy %rsp
- shl \$4,%rdx # num*16
- sub \$`$framesz+$win64*16*4`,%rsp
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- and \$-64,%rsp # align stack frame
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arg
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
- ___
- $code.=<<___;
- .Lprologue_ssse3:
-
- mov $SZ*0($ctx),$A
- mov $SZ*1($ctx),$B
- mov $SZ*2($ctx),$C
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- ___
-
- $code.=<<___;
- #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
- #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
- jmp .Lloop_ssse3
- .align 16
- .Lloop_ssse3:
- movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- movdqu 0x00($inp),@X[0]
- movdqu 0x10($inp),@X[1]
- movdqu 0x20($inp),@X[2]
- pshufb $t3,@X[0]
- movdqu 0x30($inp),@X[3]
- lea $TABLE(%rip),$Tbl
- pshufb $t3,@X[1]
- movdqa 0x00($Tbl),$t0
- movdqa 0x20($Tbl),$t1
- pshufb $t3,@X[2]
- paddd @X[0],$t0
- movdqa 0x40($Tbl),$t2
- pshufb $t3,@X[3]
- movdqa 0x60($Tbl),$t3
- paddd @X[1],$t1
- paddd @X[2],$t2
- paddd @X[3],$t3
- movdqa $t0,0x00(%rsp)
- mov $A,$a1
- movdqa $t1,0x10(%rsp)
- mov $B,$a3
- movdqa $t2,0x20(%rsp)
- xor $C,$a3 # magic
- movdqa $t3,0x30(%rsp)
- mov $E,$a0
- jmp .Lssse3_00_47
-
- .align 16
- .Lssse3_00_47:
- sub \$`-16*2*$SZ`,$Tbl # size optimization
- ___
- sub Xupdate_256_SSSE3 () {
- (
- '&movdqa ($t0,@X[1]);',
- '&movdqa ($t3,@X[3])',
- '&palignr ($t0,@X[0],$SZ)', # X[1..4]
- '&palignr ($t3,@X[2],$SZ);', # X[9..12]
- '&movdqa ($t1,$t0)',
- '&movdqa ($t2,$t0);',
- '&psrld ($t0,$sigma0[2])',
- '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
- '&psrld ($t2,$sigma0[0])',
- '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
- '&pslld ($t1,8*$SZ-$sigma0[1]);'.
- '&pxor ($t0,$t2)',
- '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
- '&pxor ($t0,$t1)',
- '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
- '&pxor ($t0,$t2);',
- '&movdqa ($t2,$t3)',
- '&pxor ($t0,$t1);', # sigma0(X[1..4])
- '&psrld ($t3,$sigma1[2])',
- '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
- '&psrlq ($t2,$sigma1[0])',
- '&pxor ($t3,$t2);',
- '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
- '&pxor ($t3,$t2)',
- '&pshufb ($t3,$t4)', # sigma1(X[14..15])
- '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
- '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
- '&movdqa ($t2,$t3);',
- '&psrld ($t3,$sigma1[2])',
- '&psrlq ($t2,$sigma1[0])',
- '&pxor ($t3,$t2);',
- '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
- '&pxor ($t3,$t2);',
- '&movdqa ($t2,16*2*$j."($Tbl)")',
- '&pshufb ($t3,$t5)',
- '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
- );
- }
-
- sub SSSE3_256_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
-
- if (0) {
- foreach (Xupdate_256_SSSE3()) { # 36 instructions
- eval;
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- }
- } else { # squeeze extra 4% on Westmere and 19% on Atom
- eval(shift(@insns)); #@
- &movdqa ($t0,@X[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &movdqa ($t3,@X[3]);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &palignr ($t0,@X[0],$SZ); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- &palignr ($t3,@X[2],$SZ); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &movdqa ($t1,$t0);
- eval(shift(@insns));
- eval(shift(@insns));
- &movdqa ($t2,$t0);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &psrld ($t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &paddd (@X[0],$t3); # X[0..3] += X[9..12]
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &psrld ($t2,$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &pshufd ($t3,@X[3],0b11111010); # X[14..15]
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &pslld ($t1,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t0,$t2);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &psrld ($t2,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- &pxor ($t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- &pslld ($t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t0,$t2);
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &movdqa ($t2,$t3);
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t0,$t1); # sigma0(X[1..4])
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- &psrld ($t3,$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &psrlq ($t2,$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t3,$t2);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &psrlq ($t2,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t3,$t2);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- #&pshufb ($t3,$t4); # sigma1(X[14..15])
- &pshufd ($t3,$t3,0b10000000);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &psrldq ($t3,8);
- eval(shift(@insns));
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &pshufd ($t3,@X[0],0b01010000); # X[16..17]
- eval(shift(@insns));
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &movdqa ($t2,$t3);
- eval(shift(@insns));
- eval(shift(@insns));
- &psrld ($t3,$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns)); #@
- &psrlq ($t2,$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t3,$t2);
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &psrlq ($t2,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &pxor ($t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns)); #@
- #&pshufb ($t3,$t5);
- &pshufd ($t3,$t3,0b00001000);
- eval(shift(@insns));
- eval(shift(@insns));
- &movdqa ($t2,16*2*$j."($Tbl)");
- eval(shift(@insns)); #@
- eval(shift(@insns));
- &pslldq ($t3,8);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns)); #@
- eval(shift(@insns));
- eval(shift(@insns));
- }
- &paddd ($t2,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &movdqa (16*$j."(%rsp)",$t2);
- }
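Editor's note: the else-branch above hand-interleaves one SIMD schedule instruction with roughly three scalar round instructions drained via eval(shift(@insns)), worth, per the comment, about 4% on Westmere and 19% on Atom. A minimal, self-contained Perl sketch of the same interleaving pattern, with placeholder instruction lists:

```perl
#!/usr/bin/env perl
# Sketch of the interleaving scheme: emit one SIMD op, then drain up to
# three scalar ops, exactly as SSSE3_256_00_47 does with eval(shift(@insns)).
# Instruction names here are placeholders.
use strict; use warnings;

my @simd   = map { "simd_op_$_" }   1..8;   # stands in for Xupdate_256_SSSE3()
my @scalar = map { "scalar_op_$_" } 1..24;  # stands in for 4x body_00_15

foreach my $op (@simd) {
    print "$op\n";
    for (1..3) {
        my $s = shift(@scalar) or last;
        print "$s\n";
    }
}
print "$_\n" foreach @scalar;               # remaining instructions
```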
-
- for ($i=0,$j=0; $j<4; $j++) {
- &SSSE3_256_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
- &jne (".Lssse3_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
- $code.=<<___;
- mov $_ctx,$ctx
- mov $a1,$A
-
- add $SZ*0($ctx),$A
- lea 16*$SZ($inp),$inp
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
- jb .Lloop_ssse3
-
- mov $_rsp,%rsi
- ___
- $code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
- ___
- $code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
- .Lepilogue_ssse3:
- ret
- .size ${func}_ssse3,.-${func}_ssse3
- ___
- }
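Editor's note: the schedule update in this path evaluates the SHA-256 small-sigma functions four lanes at a time, with @sigma0/@sigma1 holding the rotate and shift counts (7,18,3 and 17,19,10 per FIPS 180-4). A scalar Perl reference of what one lane computes, assuming those standard counts:

```perl
#!/usr/bin/env perl
# Scalar reference for the sigma0/sigma1 message-schedule functions that
# the SIMD code above computes four lanes at a time (SHA-256 counts from
# FIPS 180-4, matching the script's @sigma0/@sigma1).
use strict; use warnings;

sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub sigma0 { my $x = shift; rotr32($x,7)  ^ rotr32($x,18) ^ ($x >> 3)  }
sub sigma1 { my $x = shift; rotr32($x,17) ^ rotr32($x,19) ^ ($x >> 10) }

# Schedule expansion: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
my @W = map { int(rand(2**32)) } 1..16;
for my $t (16..63) {
    $W[$t] = (sigma1($W[$t-2]) + $W[$t-7] + sigma0($W[$t-15]) + $W[$t-16]) & 0xffffffff;
}
printf "W[63] = %08x\n", $W[63];
```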
-
- if ($avx) {{
- ######################################################################
- # XOP code path
- #
- if ($SZ==8) {	# SHA512 only
- $code.=<<___;
- .type ${func}_xop,\@function,3
- .align 64
- ${func}_xop:
- .Lxop_shortcut:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- mov %rsp,%r11 # copy %rsp
- shl \$4,%rdx # num*16
- sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- and \$-64,%rsp # align stack frame
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arg
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps %xmm10,16*$SZ+96(%rsp)
- movaps %xmm11,16*$SZ+112(%rsp)
- ___
- $code.=<<___;
- .Lprologue_xop:
-
- vzeroupper
- mov $SZ*0($ctx),$A
- mov $SZ*1($ctx),$B
- mov $SZ*2($ctx),$C
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- jmp .Lloop_xop
- ___
- if ($SZ==4) {	# SHA256
- my @X = map("%xmm$_",(0..3));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
-
- $code.=<<___;
- .align 16
- .Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[0],@X[0]
- lea $TABLE(%rip),$Tbl
- vpshufb $t3,@X[1],@X[1]
- vpshufb $t3,@X[2],@X[2]
- vpaddd 0x00($Tbl),@X[0],$t0
- vpshufb $t3,@X[3],@X[3]
- vpaddd 0x20($Tbl),@X[1],$t1
- vpaddd 0x40($Tbl),@X[2],$t2
- vpaddd 0x60($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x10(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x20(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x30(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
- .align 16
- .Lxop_00_47:
- sub \$`-16*2*$SZ`,$Tbl # size optimization
- ___
- sub XOP_256_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[3],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrldq ($t3,$t3,8);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpslldq ($t3,$t3,8); # 22 instructions
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<4; $j++) {
- &XOP_256_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-
- } else {	# SHA512
- my @X = map("%xmm$_",(0..7));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
-
- $code.=<<___;
- .align 16
- .Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- lea $TABLE+0x80(%rip),$Tbl # size optimization
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vpshufb $t3,@X[0],@X[0]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[1],@X[1]
- vmovdqu 0x40($inp),@X[4]
- vpshufb $t3,@X[2],@X[2]
- vmovdqu 0x50($inp),@X[5]
- vpshufb $t3,@X[3],@X[3]
- vmovdqu 0x60($inp),@X[6]
- vpshufb $t3,@X[4],@X[4]
- vmovdqu 0x70($inp),@X[7]
- vpshufb $t3,@X[5],@X[5]
- vpaddq -0x80($Tbl),@X[0],$t0
- vpshufb $t3,@X[6],@X[6]
- vpaddq -0x60($Tbl),@X[1],$t1
- vpshufb $t3,@X[7],@X[7]
- vpaddq -0x40($Tbl),@X[2],$t2
- vpaddq -0x20($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- vpaddq 0x00($Tbl),@X[4],$t0
- vmovdqa $t1,0x10(%rsp)
- vpaddq 0x20($Tbl),@X[5],$t1
- vmovdqa $t2,0x20(%rsp)
- vpaddq 0x40($Tbl),@X[6],$t2
- vmovdqa $t3,0x30(%rsp)
- vpaddq 0x60($Tbl),@X[7],$t3
- vmovdqa $t0,0x40(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x50(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x60(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x70(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
- .align 16
- .Lxop_00_47:
- add \$`16*2*$SZ`,$Tbl
- ___
- sub XOP_512_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body); # 52 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t2,@X[7],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<8; $j++) {
- &XOP_512_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
- }
- $code.=<<___;
- mov $_ctx,$ctx
- mov $a1,$A
-
- add $SZ*0($ctx),$A
- lea 16*$SZ($inp),$inp
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
- jb .Lloop_xop
-
- mov $_rsp,%rsi
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps 16*$SZ+96(%rsp),%xmm10
- movaps 16*$SZ+112(%rsp),%xmm11
- ___
- $code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
- .Lepilogue_xop:
- ret
- .size ${func}_xop,.-${func}_xop
- ___
- }
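Editor's note: the XOP path leans on vprotd/vprotq, AMD's native packed rotate, so each rotate costs one instruction instead of the shift/shift/xor triple the SSSE3 path needs. Since vprot rotates left, the code requests 8*$SZ minus the count to get a right rotate. A small Perl sketch of that equivalence:

```perl
#!/usr/bin/env perl
# vprotd rotates left; the code above asks for a left-rotate by
# 8*$SZ - count to obtain a right-rotate by count. Check the identity.
use strict; use warnings;

sub rotl32 { my ($x,$n) = @_; ((($x << $n) | ($x >> (32-$n))) & 0xffffffff) }
sub rotr32 { my ($x,$n) = @_; ((($x >> $n) | ($x << (32-$n))) & 0xffffffff) }

my $x = 0xdeadbeef;
for my $n (1..31) {
    die "mismatch at $n" unless rotl32($x, 32-$n) == rotr32($x, $n);
}
print "rotl(x, 32-n) == rotr(x, n) for n in 1..31\n";
```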
- ######################################################################
- # AVX+shrd code path
- #
- local *ror = sub { &shrd(@_[0],@_) };
-
- $code.=<<___;
- .type ${func}_avx,\@function,3
- .align 64
- ${func}_avx:
- .Lavx_shortcut:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- mov %rsp,%r11 # copy %rsp
- shl \$4,%rdx # num*16
- sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- and \$-64,%rsp # align stack frame
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arg
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps %xmm10,16*$SZ+96(%rsp)
- movaps %xmm11,16*$SZ+112(%rsp)
- ___
- $code.=<<___;
- .Lprologue_avx:
-
- vzeroupper
- mov $SZ*0($ctx),$A
- mov $SZ*1($ctx),$B
- mov $SZ*2($ctx),$C
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- ___
- if ($SZ==4) {	# SHA256
- my @X = map("%xmm$_",(0..3));
- my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
-
- $code.=<<___;
- vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
- vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
- jmp .Lloop_avx
- .align 16
- .Lloop_avx:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[0],@X[0]
- lea $TABLE(%rip),$Tbl
- vpshufb $t3,@X[1],@X[1]
- vpshufb $t3,@X[2],@X[2]
- vpaddd 0x00($Tbl),@X[0],$t0
- vpshufb $t3,@X[3],@X[3]
- vpaddd 0x20($Tbl),@X[1],$t1
- vpaddd 0x40($Tbl),@X[2],$t2
- vpaddd 0x60($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x10(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x20(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x30(%rsp)
- mov $E,$a0
- jmp .Lavx_00_47
-
- .align 16
- .Lavx_00_47:
- sub \$`-16*2*$SZ`,$Tbl # size optimization
- ___
- sub Xupdate_256_AVX () {
- (
- '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
- '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
- '&vpsrld ($t2,$t0,$sigma0[0]);',
- '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
- '&vpsrld ($t3,$t0,$sigma0[2])',
- '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
- '&vpxor ($t0,$t3,$t2)',
- '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
- '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
- '&vpxor ($t0,$t0,$t1)',
- '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
- '&vpxor ($t0,$t0,$t2)',
- '&vpsrld ($t2,$t3,$sigma1[2]);',
- '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
- '&vpsrlq ($t3,$t3,$sigma1[0]);',
- '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
- '&vpxor ($t2,$t2,$t3);',
- '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
- '&vpxor ($t2,$t2,$t3)',
- '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
- '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
- '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
- '&vpsrld ($t2,$t3,$sigma1[2])',
- '&vpsrlq ($t3,$t3,$sigma1[0])',
- '&vpxor ($t2,$t2,$t3);',
- '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
- '&vpxor ($t2,$t2,$t3)',
- '&vpshufb ($t2,$t2,$t5)',
- '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
- );
- }
-
- sub AVX_256_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
-
- foreach (Xupdate_256_AVX()) { # 29 instructions
- eval;
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- }
- &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<4; $j++) {
- &AVX_256_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
- &jne (".Lavx_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-
- } else {	# SHA512
- my @X = map("%xmm$_",(0..7));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
-
- $code.=<<___;
- jmp .Lloop_avx
- .align 16
- .Lloop_avx:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- lea $TABLE+0x80(%rip),$Tbl # size optimization
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vpshufb $t3,@X[0],@X[0]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[1],@X[1]
- vmovdqu 0x40($inp),@X[4]
- vpshufb $t3,@X[2],@X[2]
- vmovdqu 0x50($inp),@X[5]
- vpshufb $t3,@X[3],@X[3]
- vmovdqu 0x60($inp),@X[6]
- vpshufb $t3,@X[4],@X[4]
- vmovdqu 0x70($inp),@X[7]
- vpshufb $t3,@X[5],@X[5]
- vpaddq -0x80($Tbl),@X[0],$t0
- vpshufb $t3,@X[6],@X[6]
- vpaddq -0x60($Tbl),@X[1],$t1
- vpshufb $t3,@X[7],@X[7]
- vpaddq -0x40($Tbl),@X[2],$t2
- vpaddq -0x20($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- vpaddq 0x00($Tbl),@X[4],$t0
- vmovdqa $t1,0x10(%rsp)
- vpaddq 0x20($Tbl),@X[5],$t1
- vmovdqa $t2,0x20(%rsp)
- vpaddq 0x40($Tbl),@X[6],$t2
- vmovdqa $t3,0x30(%rsp)
- vpaddq 0x60($Tbl),@X[7],$t3
- vmovdqa $t0,0x40(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x50(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x60(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x70(%rsp)
- mov $E,$a0
- jmp .Lavx_00_47
-
- .align 16
- .Lavx_00_47:
- add \$`16*2*$SZ`,$Tbl
- ___
- sub Xupdate_512_AVX () {
- (
- '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
- '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
- '&vpsrlq ($t2,$t0,$sigma0[0])',
- '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
- '&vpsrlq ($t3,$t0,$sigma0[2])',
- '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
- '&vpxor ($t0,$t3,$t2)',
- '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
- '&vpxor ($t0,$t0,$t1)',
- '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
- '&vpxor ($t0,$t0,$t2)',
- '&vpsrlq ($t3,@X[7],$sigma1[2]);',
- '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
- '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
- '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
- '&vpsrlq ($t1,@X[7],$sigma1[0]);',
- '&vpxor ($t3,$t3,$t2)',
- '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
- '&vpxor ($t3,$t3,$t1)',
- '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
- '&vpxor ($t3,$t3,$t2)',
- '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
- '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
- );
- }
-
- sub AVX_512_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body); # 52 instructions
-
- foreach (Xupdate_512_AVX()) { # 23 instructions
- eval;
- eval(shift(@insns));
- eval(shift(@insns));
- }
- &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<8; $j++) {
- &AVX_512_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
- &jne (".Lavx_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
- }
- $code.=<<___;
- mov $_ctx,$ctx
- mov $a1,$A
-
- add $SZ*0($ctx),$A
- lea 16*$SZ($inp),$inp
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
- jb .Lloop_avx
-
- mov $_rsp,%rsi
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps 16*$SZ+96(%rsp),%xmm10
- movaps 16*$SZ+112(%rsp),%xmm11
- ___
- $code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
- .Lepilogue_avx:
- ret
- .size ${func}_avx,.-${func}_avx
- ___
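Editor's note: the `local *ror = sub { &shrd(@_[0],@_) };` line at the top of this path re-points &ror at shrd. With identical source and destination operands, the double-precision right shift behaves exactly as a rotate right, which some cores execute with better throughput than ror. A small Perl sketch of that equivalence (the 32-bit shrd model is illustrative):

```perl
#!/usr/bin/env perl
# shrd(dst, src, n) shifts dst right by n, filling vacated bits from src.
# With dst == src this is a right rotate, which is why the AVX path can
# alias &ror to &shrd.
use strict; use warnings;

sub shrd32 { my ($dst,$src,$n) = @_; (($dst >> $n) | ($src << (32-$n))) & 0xffffffff }
sub rotr32 { my ($x,$n)        = @_; (($x   >> $n) | ($x   << (32-$n))) & 0xffffffff }

my $x = 0x01234567;
for my $n (1..31) {
    die "mismatch at n=$n" unless shrd32($x,$x,$n) == rotr32($x,$n);
}
print "shrd(x,x,n) == rotr(x,n)\n";
```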
-
- if ($avx>1) {{
- ######################################################################
- # AVX2+BMI code path
- #
- my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
- my $PUSH8=8*2*$SZ;
- use integer;
-
- sub bodyx_00_15 () {
- # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
-
- '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
- '&and ($a4,$e)', # f&e
- '&rorx ($a0,$e,$Sigma1[2])',
- '&rorx ($a2,$e,$Sigma1[1])',
-
- '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
- '&lea ($h,"($h,$a4)")',
- '&andn ($a4,$e,$g)', # ~e&g
- '&xor ($a0,$a2)',
-
- '&rorx ($a1,$e,$Sigma1[0])',
- '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
- '&xor ($a0,$a1)', # Sigma1(e)
- '&mov ($a2,$a)',
-
- '&rorx ($a4,$a,$Sigma0[2])',
- '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
- '&xor ($a2,$b)', # a^b, b^c in next round
- '&rorx ($a1,$a,$Sigma0[1])',
-
- '&rorx ($a0,$a,$Sigma0[0])',
- '&lea ($d,"($d,$h)")', # d+=h
- '&and ($a3,$a2)', # (b^c)&(a^b)
- '&xor ($a1,$a4)',
-
- '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
- '&xor ($a1,$a0)', # Sigma0(a)
- '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
- '&mov ($a4,$e)', # copy of f in future
-
- '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
- );
- # and at the finish one has to $a+=$a1
- }
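Editor's note: two things distinguish this round body. rorx (BMI2) computes rotates without touching flags, so the lea-based additions can interleave freely, and the h += Sigma0(a) term is deferred: it is carried in $a1 and folded in one round later via `lea ($a,"($a,$a1)")` ("from the past"), with $a1 zeroed before the first round and a final $a += $a1 after the last. A toy Perl sketch of that deferral pattern, with made-up values:

```perl
#!/usr/bin/env perl
# Toy model of the "Sigma0 from the past" trick in bodyx_00_15: instead
# of adding a round's term immediately, stash it and fold it in on the
# next round; a final fix-up drains the carried term.
use strict; use warnings;

my @work = map { int(rand(100)) } 1..8;   # placeholder round inputs
my ($direct, $deferred, $carry) = (0, 0, 0);

foreach my $x (@work) {
    $direct   += $x;            # straightforward: add immediately
    $deferred += $carry;        # deferred: add previous round's term...
    $carry     = $x;            # ...and stash this round's term
}
$deferred += $carry;            # "at the finish one has to $a+=$a1"

die "deferral broke the sum" unless $direct == $deferred;
print "deferred accumulation matches: $direct\n";
```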
-
- $code.=<<___;
- .type ${func}_avx2,\@function,3
- .align 64
- ${func}_avx2:
- .Lavx2_shortcut:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- mov %rsp,%r11 # copy %rsp
- sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
- shl \$4,%rdx # num*16
- and \$-256*$SZ,%rsp # align stack frame
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- add \$`2*$SZ*($rounds-8)`,%rsp
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arg
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps %xmm10,16*$SZ+96(%rsp)
- movaps %xmm11,16*$SZ+112(%rsp)
- ___
- $code.=<<___;
- .Lprologue_avx2:
-
- vzeroupper
- sub \$-16*$SZ,$inp # inp++, size optimization
- mov $SZ*0($ctx),$A
- mov $inp,%r12 # borrow $T1
- mov $SZ*1($ctx),$B
- cmp %rdx,$inp # $_end
- mov $SZ*2($ctx),$C
- cmove %rsp,%r12 # next block or random data
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- ___
- if ($SZ==4) {	# SHA256
- my @X = map("%ymm$_",(0..3));
- my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
-
- $code.=<<___;
- vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
- vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
- jmp .Loop_avx2
- .align 16
- .Loop_avx2:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu -16*$SZ+0($inp),%xmm0
- vmovdqu -16*$SZ+16($inp),%xmm1
- vmovdqu -16*$SZ+32($inp),%xmm2
- vmovdqu -16*$SZ+48($inp),%xmm3
- #mov $inp,$_inp # offload $inp
- vinserti128 \$1,(%r12),@X[0],@X[0]
- vinserti128 \$1,16(%r12),@X[1],@X[1]
- vpshufb $t3,@X[0],@X[0]
- vinserti128 \$1,32(%r12),@X[2],@X[2]
- vpshufb $t3,@X[1],@X[1]
- vinserti128 \$1,48(%r12),@X[3],@X[3]
-
- lea $TABLE(%rip),$Tbl
- vpshufb $t3,@X[2],@X[2]
- vpaddd 0x00($Tbl),@X[0],$t0
- vpshufb $t3,@X[3],@X[3]
- vpaddd 0x20($Tbl),@X[1],$t1
- vpaddd 0x40($Tbl),@X[2],$t2
- vpaddd 0x60($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- xor $a1,$a1
- vmovdqa $t1,0x20(%rsp)
- lea -$PUSH8(%rsp),%rsp
- mov $B,$a3
- vmovdqa $t2,0x00(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x20(%rsp)
- mov $F,$a4
- sub \$-16*2*$SZ,$Tbl # size optimization
- jmp .Lavx2_00_47
-
- .align 16
- .Lavx2_00_47:
- ___
-
- sub AVX2_256_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
- my $base = "+2*$PUSH8(%rsp)";
-
- &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
- foreach (Xupdate_256_AVX()) { # 29 instructions
- eval;
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- }
- &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<4; $j++) {
- &AVX2_256_00_47($j,\&bodyx_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &lea ($Tbl,16*2*$SZ."($Tbl)");
- &cmpb (($SZ-1)."($Tbl)",0);
- &jne (".Lavx2_00_47");
-
- for ($i=0; $i<16; ) {
- my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
- foreach(bodyx_00_15()) { eval; }
- }
- } else {	# SHA512
- my @X = map("%ymm$_",(0..7));
- my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
-
- $code.=<<___;
- jmp .Loop_avx2
- .align 16
- .Loop_avx2:
- vmovdqu -16*$SZ($inp),%xmm0
- vmovdqu -16*$SZ+16($inp),%xmm1
- vmovdqu -16*$SZ+32($inp),%xmm2
- lea $TABLE+0x80(%rip),$Tbl # size optimization
- vmovdqu -16*$SZ+48($inp),%xmm3
- vmovdqu -16*$SZ+64($inp),%xmm4
- vmovdqu -16*$SZ+80($inp),%xmm5
- vmovdqu -16*$SZ+96($inp),%xmm6
- vmovdqu -16*$SZ+112($inp),%xmm7
- #mov $inp,$_inp # offload $inp
- vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
- vinserti128 \$1,(%r12),@X[0],@X[0]
- vinserti128 \$1,16(%r12),@X[1],@X[1]
- vpshufb $t2,@X[0],@X[0]
- vinserti128 \$1,32(%r12),@X[2],@X[2]
- vpshufb $t2,@X[1],@X[1]
- vinserti128 \$1,48(%r12),@X[3],@X[3]
- vpshufb $t2,@X[2],@X[2]
- vinserti128 \$1,64(%r12),@X[4],@X[4]
- vpshufb $t2,@X[3],@X[3]
- vinserti128 \$1,80(%r12),@X[5],@X[5]
- vpshufb $t2,@X[4],@X[4]
- vinserti128 \$1,96(%r12),@X[6],@X[6]
- vpshufb $t2,@X[5],@X[5]
- vinserti128 \$1,112(%r12),@X[7],@X[7]
-
- vpaddq -0x80($Tbl),@X[0],$t0
- vpshufb $t2,@X[6],@X[6]
- vpaddq -0x60($Tbl),@X[1],$t1
- vpshufb $t2,@X[7],@X[7]
- vpaddq -0x40($Tbl),@X[2],$t2
- vpaddq -0x20($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- vpaddq 0x00($Tbl),@X[4],$t0
- vmovdqa $t1,0x20(%rsp)
- vpaddq 0x20($Tbl),@X[5],$t1
- vmovdqa $t2,0x40(%rsp)
- vpaddq 0x40($Tbl),@X[6],$t2
- vmovdqa $t3,0x60(%rsp)
- lea -$PUSH8(%rsp),%rsp
- vpaddq 0x60($Tbl),@X[7],$t3
- vmovdqa $t0,0x00(%rsp)
- xor $a1,$a1
- vmovdqa $t1,0x20(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x40(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x60(%rsp)
- mov $F,$a4
- add \$16*2*$SZ,$Tbl
- jmp .Lavx2_00_47
-
- .align 16
- .Lavx2_00_47:
- ___
-
- sub AVX2_512_00_47 () {
- my $j = shift;
- my $body = shift;
- my @X = @_;
- my @insns = (&$body,&$body); # 48 instructions
- my $base = "+2*$PUSH8(%rsp)";
-
- &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
- foreach (Xupdate_512_AVX()) { # 23 instructions
- eval;
- if ($_ !~ /\;$/) {
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- }
- }
- &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
- }
-
- for ($i=0,$j=0; $j<8; $j++) {
- &AVX2_512_00_47($j,\&bodyx_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &lea ($Tbl,16*2*$SZ."($Tbl)");
- &cmpb (($SZ-1-0x80)."($Tbl)",0);
- &jne (".Lavx2_00_47");
-
- for ($i=0; $i<16; ) {
- my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
- foreach(bodyx_00_15()) { eval; }
- }
- }
- $code.=<<___;
- mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
- add $a1,$A
- #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
- lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
-
- add $SZ*0($ctx),$A
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
-
- cmp `$PUSH8+2*8`($Tbl),$inp # $_end
- je .Ldone_avx2
-
- xor $a1,$a1
- mov $B,$a3
- xor $C,$a3 # magic
- mov $F,$a4
- jmp .Lower_avx2
- .align 16
- .Lower_avx2:
- ___
- for ($i=0; $i<8; ) {
- my $base="+16($Tbl)";
- foreach(bodyx_00_15()) { eval; }
- }
- $code.=<<___;
- lea -$PUSH8($Tbl),$Tbl
- cmp %rsp,$Tbl
- jae .Lower_avx2
-
- mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
- add $a1,$A
- #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
- lea `2*$SZ*($rounds-8)`(%rsp),%rsp
-
- add $SZ*0($ctx),$A
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- lea `2*16*$SZ`($inp),$inp # inp+=2
- add $SZ*6($ctx),$G
- mov $inp,%r12
- add $SZ*7($ctx),$H
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- cmove %rsp,%r12 # next block or stale data
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
-
- jbe .Loop_avx2
- lea (%rsp),$Tbl
-
- .Ldone_avx2:
- lea ($Tbl),%rsp
- mov $_rsp,%rsi
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
- ___
- $code.=<<___ if ($win64 && $SZ>4);
- movaps 16*$SZ+96(%rsp),%xmm10
- movaps 16*$SZ+112(%rsp),%xmm11
- ___
- $code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
- .Lepilogue_avx2:
- ret
- .size ${func}_avx2,.-${func}_avx2
- ___
- }}
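Editor's note: the AVX2 path processes two blocks per iteration, with the 128-bit halves of each %ymm register holding the current and the next block (vinserti128 from %r12). When fewer than two blocks remain, cmove points %r12 at %rsp, so the upper lanes chew on stale stack data whose results are simply never stored, avoiding a branch in the hot loop. A toy Perl sketch of that pairing logic (block handling is schematic, not the source's layout):

```perl
#!/usr/bin/env perl
# Schematic of the AVX2 two-blocks-per-iteration flow: the second lane
# processes either the next real block or harmless dummy data, and the
# dummy result is discarded rather than branching in the hot loop.
use strict; use warnings;

my @blocks = ("b0", "b1", "b2", "b3", "b4");   # odd count on purpose
my $dummy  = "stale-stack-data";
my $i = 0;
while ($i < @blocks) {
    my $lo = $blocks[$i];
    my $hi = ($i + 1 < @blocks) ? $blocks[$i+1] : $dummy;  # cmove analogue
    print "compress($lo)\n";
    print "compress($hi)\n" unless $hi eq $dummy;          # discard dummy lane
    $i += 2;
}
```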
2139
- }}}}}
2140
-
2141
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2142
- # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2143
- if ($win64) {
2144
- $rec="%rcx";
2145
- $frame="%rdx";
2146
- $context="%r8";
2147
- $disp="%r9";
2148
-
2149
- $code.=<<___;
2150
- .extern __imp_RtlVirtualUnwind
2151
- .type se_handler,\@abi-omnipotent
2152
- .align 16
2153
- se_handler:
2154
- push %rsi
2155
- push %rdi
2156
- push %rbx
2157
- push %rbp
2158
- push %r12
2159
- push %r13
2160
- push %r14
2161
- push %r15
2162
- pushfq
2163
- sub \$64,%rsp
2164
-
2165
- mov 120($context),%rax # pull context->Rax
2166
- mov 248($context),%rbx # pull context->Rip
2167
-
2168
- mov 8($disp),%rsi # disp->ImageBase
2169
- mov 56($disp),%r11 # disp->HanderlData
2170
-
2171
- mov 0(%r11),%r10d # HandlerData[0]
2172
- lea (%rsi,%r10),%r10 # prologue label
2173
- cmp %r10,%rbx # context->Rip<prologue label
2174
- jb .Lin_prologue
2175
-
2176
- mov 152($context),%rax # pull context->Rsp
2177
-
2178
- mov 4(%r11),%r10d # HandlerData[1]
2179
- lea (%rsi,%r10),%r10 # epilogue label
2180
- cmp %r10,%rbx # context->Rip>=epilogue label
2181
- jae .Lin_prologue
2182
- ___
2183
- $code.=<<___ if ($avx>1);
2184
- lea .Lavx2_shortcut(%rip),%r10
2185
- cmp %r10,%rbx # context->Rip<avx2_shortcut
2186
- jb .Lnot_in_avx2
2187
-
2188
- and \$-256*$SZ,%rax
2189
- add \$`2*$SZ*($rounds-8)`,%rax
2190
- .Lnot_in_avx2:
2191
- ___
2192
- $code.=<<___;
2193
- mov %rax,%rsi # put aside Rsp
2194
- mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2195
- lea 48(%rax),%rax
2196
-
2197
- mov -8(%rax),%rbx
2198
- mov -16(%rax),%rbp
2199
- mov -24(%rax),%r12
2200
- mov -32(%rax),%r13
2201
- mov -40(%rax),%r14
2202
- mov -48(%rax),%r15
2203
- mov %rbx,144($context) # restore context->Rbx
2204
- mov %rbp,160($context) # restore context->Rbp
2205
- mov %r12,216($context) # restore context->R12
2206
- mov %r13,224($context) # restore context->R13
2207
- mov %r14,232($context) # restore context->R14
2208
- mov %r15,240($context) # restore context->R15
2209
-
2210
- lea .Lepilogue(%rip),%r10
2211
- cmp %r10,%rbx
2212
- jb .Lin_prologue # non-AVX code
2213
-
2214
- lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2215
- lea 512($context),%rdi # &context.Xmm6
2216
- mov \$`$SZ==4?8:12`,%ecx
2217
- .long 0xa548f3fc # cld; rep movsq
2218
-
2219
- .Lin_prologue:
2220
- mov 8(%rax),%rdi
2221
- mov 16(%rax),%rsi
2222
- mov %rax,152($context) # restore context->Rsp
2223
- mov %rsi,168($context) # restore context->Rsi
2224
- mov %rdi,176($context) # restore context->Rdi
2225
-
2226
- mov 40($disp),%rdi # disp->ContextRecord
2227
- mov $context,%rsi # context
2228
- mov \$154,%ecx # sizeof(CONTEXT)
2229
- .long 0xa548f3fc # cld; rep movsq
2230
-
2231
- mov $disp,%rsi
2232
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2233
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
2234
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
2235
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2236
- mov 40(%rsi),%r10 # disp->ContextRecord
2237
- lea 56(%rsi),%r11 # &disp->HandlerData
2238
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
2239
- mov %r10,32(%rsp) # arg5
2240
- mov %r11,40(%rsp) # arg6
2241
- mov %r12,48(%rsp) # arg7
2242
- mov %rcx,56(%rsp) # arg8, (NULL)
2243
- call *__imp_RtlVirtualUnwind(%rip)
2244
-
2245
- mov \$1,%eax # ExceptionContinueSearch
2246
- add \$64,%rsp
2247
- popfq
2248
- pop %r15
2249
- pop %r14
2250
- pop %r13
2251
- pop %r12
2252
- pop %rbp
2253
- pop %rbx
2254
- pop %rdi
2255
- pop %rsi
2256
- ret
2257
- .size se_handler,.-se_handler
2258
- ___
2259
-
2260
- $code.=<<___ if ($SZ==4 && $shaext);
2261
- .type shaext_handler,\@abi-omnipotent
2262
- .align 16
2263
- shaext_handler:
2264
- push %rsi
2265
- push %rdi
2266
- push %rbx
2267
- push %rbp
2268
- push %r12
2269
- push %r13
2270
- push %r14
2271
- push %r15
2272
- pushfq
2273
- sub \$64,%rsp
2274
-
2275
- mov 120($context),%rax # pull context->Rax
2276
- mov 248($context),%rbx # pull context->Rip
2277
-
2278
- lea .Lprologue_shaext(%rip),%r10
2279
- cmp %r10,%rbx # context->Rip<.Lprologue
2280
- jb .Lin_prologue
2281
-
2282
- lea .Lepilogue_shaext(%rip),%r10
2283
- cmp %r10,%rbx # context->Rip>=.Lepilogue
2284
- jae .Lin_prologue
2285
-
2286
- lea -8-5*16(%rax),%rsi
2287
- lea 512($context),%rdi # &context.Xmm6
2288
- mov \$10,%ecx
2289
- .long 0xa548f3fc # cld; rep movsq
2290
-
2291
- jmp .Lin_prologue
2292
- .size shaext_handler,.-shaext_handler
2293
- ___
2294
-
2295
- $code.=<<___;
2296
- .section .pdata
2297
- .align 4
2298
- .rva .LSEH_begin_$func
2299
- .rva .LSEH_end_$func
2300
- .rva .LSEH_info_$func
2301
- ___
2302
- $code.=<<___ if ($SZ==4 && $shaext);
2303
- .rva .LSEH_begin_${func}_shaext
2304
- .rva .LSEH_end_${func}_shaext
2305
- .rva .LSEH_info_${func}_shaext
2306
- ___
2307
- $code.=<<___ if ($SZ==4);
2308
- .rva .LSEH_begin_${func}_ssse3
2309
- .rva .LSEH_end_${func}_ssse3
2310
- .rva .LSEH_info_${func}_ssse3
2311
- ___
2312
- $code.=<<___ if ($avx && $SZ==8);
2313
- .rva .LSEH_begin_${func}_xop
2314
- .rva .LSEH_end_${func}_xop
2315
- .rva .LSEH_info_${func}_xop
2316
- ___
2317
- $code.=<<___ if ($avx);
2318
- .rva .LSEH_begin_${func}_avx
2319
- .rva .LSEH_end_${func}_avx
2320
- .rva .LSEH_info_${func}_avx
2321
- ___
2322
- $code.=<<___ if ($avx>1);
2323
- .rva .LSEH_begin_${func}_avx2
2324
- .rva .LSEH_end_${func}_avx2
2325
- .rva .LSEH_info_${func}_avx2
2326
- ___
2327
- $code.=<<___;
2328
- .section .xdata
2329
- .align 8
2330
- .LSEH_info_$func:
2331
- .byte 9,0,0,0
2332
- .rva se_handler
2333
- .rva .Lprologue,.Lepilogue # HandlerData[]
2334
- ___
2335
- $code.=<<___ if ($SZ==4 && $shaext);
2336
- .LSEH_info_${func}_shaext:
2337
- .byte 9,0,0,0
2338
- .rva shaext_handler
2339
- ___
2340
- $code.=<<___ if ($SZ==4);
2341
- .LSEH_info_${func}_ssse3:
2342
- .byte 9,0,0,0
2343
- .rva se_handler
2344
- .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2345
- ___
2346
- $code.=<<___ if ($avx && $SZ==8);
2347
- .LSEH_info_${func}_xop:
2348
- .byte 9,0,0,0
2349
- .rva se_handler
2350
- .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2351
- ___
2352
- $code.=<<___ if ($avx);
2353
- .LSEH_info_${func}_avx:
2354
- .byte 9,0,0,0
2355
- .rva se_handler
2356
- .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2357
- ___
2358
- $code.=<<___ if ($avx>1);
2359
- .LSEH_info_${func}_avx2:
2360
- .byte 9,0,0,0
2361
- .rva se_handler
2362
- .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2363
- ___
2364
- }
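Editor's note: se_handler classifies the faulting Rip against the prologue/epilogue labels carried in HandlerData[]; only when the Rip is inside that range has the frame been fully set up, so only then are the saved registers recovered from it. A toy Perl sketch of that classification (addresses are hypothetical):

```perl
#!/usr/bin/env perl
# Toy model of se_handler's HandlerData[] range check: only when the
# faulting Rip lies in [prologue, epilogue) must saved registers be
# restored from the function's frame.
use strict; use warnings;

sub needs_unwind {
    my ($rip, $prologue, $epilogue) = @_;   # image-relative addresses
    return $rip >= $prologue && $rip < $epilogue;
}

my ($pro, $epi) = (0x1000, 0x1800);         # hypothetical label offsets
printf "rip=%#x -> %s\n", $_,
    needs_unwind($_, $pro, $epi) ? "restore regs" : "in prologue/epilogue"
    for (0x0ff0, 0x1400, 0x1900);
```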
2365
-
2366
- sub sha256op38 {
2367
- my $instr = shift;
2368
- my %opcodelet = (
2369
- "sha256rnds2" => 0xcb,
2370
- "sha256msg1" => 0xcc,
2371
- "sha256msg2" => 0xcd );
2372
-
2373
- if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2374
- my @opcode=(0x0f,0x38);
2375
- push @opcode,$opcodelet{$instr};
2376
- push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2377
- return ".byte\t".join(',',@opcode);
2378
- } else {
2379
- return $instr."\t".@_[0];
2380
- }
2381
- }
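Editor's note: sha256op38 exists so the script still assembles on toolchains that predate the SHA extensions. It emits the raw 0f 38 cb/cc/cd opcode plus a register-register ModR/M byte instead of the mnemonic. A standalone Perl sketch reproducing that encoding for one sample operand pair (hex formatting added for readability):

```perl
#!/usr/bin/env perl
# Reproduce sha256op38's byte encoding for assemblers without SHA-NI
# support: opcode 0f 38 cb/cc/cd followed by a register-register ModR/M.
use strict; use warnings;

my %opcodelet = (sha256rnds2 => 0xcb, sha256msg1 => 0xcc, sha256msg2 => 0xcd);

sub encode {
    my ($instr, $args) = @_;
    if (defined $opcodelet{$instr} && $args =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
        my @opcode = (0x0f, 0x38, $opcodelet{$instr},
                      0xc0 | ($1 & 7) | (($2 & 7) << 3));  # mod=11, r/m=src, reg=dst
        return ".byte\t" . join(',', map { sprintf "0x%02x", $_ } @opcode);
    }
    return "$instr\t$args";   # pass through anything else unchanged
}

print encode("sha256rnds2", '%xmm0,%xmm1'), "\n";  # .byte 0x0f,0x38,0xcb,0xc8
```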
2382
-
2383
- foreach (split("\n",$code)) {
2384
- s/\`([^\`]*)\`/eval $1/geo;
2385
-
2386
- s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2387
-
2388
- print $_,"\n";
2389
- }
2390
- close STDOUT;
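Editor's note: the final loop is the perlasm post-processor. Backquoted expressions embedded in the generated assembly are folded to constants, and any sha256* mnemonic is routed through sha256op38 before printing. A minimal standalone demonstration of the backtick substitution:

```perl
#!/usr/bin/env perl
# Minimal demo of the s/\`...\`/eval $1/ge pass: arithmetic written in
# backquotes inside the emitted assembly is folded to a literal constant.
use strict; use warnings;

my $SZ   = 4;                       # SHA-256 word size, as in the script
my $line = 'movaps %xmm6,`16*$SZ+32`(%rsp)';
$line =~ s/\`([^\`]*)\`/eval $1/ge;
print "$line\n";                    # movaps %xmm6,96(%rsp)
```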