ring-native 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
@@ -1,1275 +0,0 @@
1
- #!/usr/bin/env perl
2
- #
3
- # ====================================================================
4
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
- # project. The module is, however, dual licensed under OpenSSL and
6
- # CRYPTOGAMS licenses depending on where you obtain it. For further
7
- # details see http://www.openssl.org/~appro/cryptogams/.
8
- # ====================================================================
9
- #
10
- # SHA256 block transform for x86. September 2007.
11
- #
12
- # Performance improvement over compiler generated code varies from
13
- # 10% to 40% [see below]. Not very impressive on some µ-archs, but
14
- # it's 5 times smaller and optimizes the number of writes.
15
- #
16
- # May 2012.
17
- #
18
- # Optimization including two of Pavel Semjanov's ideas, alternative
19
- # Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
20
- # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost
21
- # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not
22
- # on P4, where it kills performance, nor Sandy Bridge, where folded
23
- # loop is approximately as fast...
24
- #
25
- # June 2012.
26
- #
27
- # Add AMD XOP-specific code path, >30% improvement on Bulldozer over
28
- # May version, >60% over original. Add AVX+shrd code path, >25%
29
- # improvement on Sandy Bridge over May version, 60% over original.
30
- #
31
- # May 2013.
32
- #
33
- # Replace AMD XOP code path with SSSE3 to cover more processors.
34
- # (Biggest improvement coefficient is on upcoming Atom Silvermont,
35
- # not shown.) Add AVX+BMI code path.
36
- #
37
- # March 2014.
38
- #
39
- # Add support for Intel SHA Extensions.
40
- #
41
- # Performance in clock cycles per processed byte (less is better):
42
- #
43
- # gcc icc x86 asm(*) SIMD x86_64 asm(**)
44
- # Pentium 46 57 40/38 - -
45
- # PIII 36 33 27/24 - -
46
- # P4 41 38 28 - 17.3
47
- # AMD K8 27 25 19/15.5 - 14.9
48
- # Core2 26 23 18/15.6 14.3 13.8
49
- # Westmere 27 - 19/15.7 13.4 12.3
50
- # Sandy Bridge 25 - 15.9 12.4 11.6
51
- # Ivy Bridge 24 - 15.0 11.4 10.3
52
- # Haswell 22 - 13.9 9.46 7.80
53
- # Bulldozer 36 - 27/22 17.0 13.6
54
- # VIA Nano 36 - 25/22 16.8 16.5
55
- # Atom 50 - 30/25 21.9 18.9
56
- # Silvermont 40 - 34/31 22.9 20.6
57
- #
58
- # (*) numbers after slash are for unrolled loop, where applicable;
59
- # (**) x86_64 assembly performance is presented for reference
60
- # purposes, results are best-available;
61
-
62
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63
- push(@INC,"${dir}","${dir}../../perlasm");
64
- require "x86asm.pl";
65
-
66
- &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
67
-
68
- $xmm=$avx=0;
69
- for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
70
-
71
- # In upstream, this is controlled by shelling out to the compiler to check
72
- # versions, but BoringSSL is intended to be used with pre-generated perlasm
73
- # output, so this isn't useful anyway.
74
- #
75
- # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
76
- $avx = 1;
77
-
78
- $avx = 0 unless ($xmm);
79
-
80
- $shaext=$xmm; ### set to zero if compiling for 1.0.1
81
-
82
- # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
83
- # been tested.
84
- $shaext = 0;
85
-
86
- $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of
87
- # fully unrolled loop was measured to run about
88
- # 3-4x slower. If slowdown coefficient is N and
89
- # unrolled loop is m times faster, then you break
90
- # even at (N-1)/(m-1) blocks. Then it needs to be
91
- # adjusted for probability of code being evicted,
92
- # code size/cache size=1/4. Typical m is 1.15...
93
-
94
- $A="eax";
95
- $E="edx";
96
- $T="ebx";
97
- $Aoff=&DWP(4,"esp");
98
- $Boff=&DWP(8,"esp");
99
- $Coff=&DWP(12,"esp");
100
- $Doff=&DWP(16,"esp");
101
- $Eoff=&DWP(20,"esp");
102
- $Foff=&DWP(24,"esp");
103
- $Goff=&DWP(28,"esp");
104
- $Hoff=&DWP(32,"esp");
105
- $Xoff=&DWP(36,"esp");
106
- $K256="ebp";
107
-
108
- sub BODY_16_63() {
109
- &mov ($T,"ecx"); # "ecx" is preloaded
110
- &mov ("esi",&DWP(4*(9+15+16-14),"esp"));
111
- &ror ("ecx",18-7);
112
- &mov ("edi","esi");
113
- &ror ("esi",19-17);
114
- &xor ("ecx",$T);
115
- &shr ($T,3);
116
- &ror ("ecx",7);
117
- &xor ("esi","edi");
118
- &xor ($T,"ecx"); # T = sigma0(X[-15])
119
- &ror ("esi",17);
120
- &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16]
121
- &shr ("edi",10);
122
- &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7]
123
- #&xor ("edi","esi") # sigma1(X[-2])
124
- # &add ($T,"edi"); # T += sigma1(X[-2])
125
- # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
126
-
127
- &BODY_00_15(1);
128
- }
129
- sub BODY_00_15() {
130
- my $in_16_63=shift;
131
-
132
- &mov ("ecx",$E);
133
- &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2])
134
- &mov ("esi",$Foff);
135
- &ror ("ecx",25-11);
136
- &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
137
- &mov ("edi",$Goff);
138
- &xor ("ecx",$E);
139
- &xor ("esi","edi");
140
- &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63);
141
- &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0]
142
- &ror ("ecx",11-6);
143
- &and ("esi",$E);
144
- &mov ($Eoff,$E); # modulo-scheduled
145
- &xor ($E,"ecx");
146
- &add ($T,$Hoff); # T += h
147
- &xor ("esi","edi"); # Ch(e,f,g)
148
- &ror ($E,6); # Sigma1(e)
149
- &mov ("ecx",$A);
150
- &add ($T,"esi"); # T += Ch(e,f,g)
151
-
152
- &ror ("ecx",22-13);
153
- &add ($T,$E); # T += Sigma1(e)
154
- &mov ("edi",$Boff);
155
- &xor ("ecx",$A);
156
- &mov ($Aoff,$A); # modulo-scheduled
157
- &lea ("esp",&DWP(-4,"esp"));
158
- &ror ("ecx",13-2);
159
- &mov ("esi",&DWP(0,$K256));
160
- &xor ("ecx",$A);
161
- &mov ($E,$Eoff); # e in next iteration, d in this one
162
- &xor ($A,"edi"); # a ^= b
163
- &ror ("ecx",2); # Sigma0(a)
164
-
165
- &add ($T,"esi"); # T+= K[i]
166
- &mov (&DWP(0,"esp"),$A); # (b^c) in next round
167
- &add ($E,$T); # d += T
168
- &and ($A,&DWP(4,"esp")); # a &= (b^c)
169
- &add ($T,"ecx"); # T += Sigma0(a)
170
- &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
171
- &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T
172
- &add ($K256,4);
173
- &add ($A,$T); # h += T
174
- }
175
-
176
- &external_label("OPENSSL_ia32cap_P") if (!$i386);
177
-
178
- &function_begin("sha256_block_data_order");
179
- &mov ("esi",wparam(0)); # ctx
180
- &mov ("edi",wparam(1)); # inp
181
- &mov ("eax",wparam(2)); # num
182
- &mov ("ebx","esp"); # saved sp
183
-
184
- &call (&label("pic_point")); # make it PIC!
185
- &set_label("pic_point");
186
- &blindpop($K256);
187
- &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
188
-
189
- &sub ("esp",16);
190
- &and ("esp",-64);
191
-
192
- &shl ("eax",6);
193
- &add ("eax","edi");
194
- &mov (&DWP(0,"esp"),"esi"); # ctx
195
- &mov (&DWP(4,"esp"),"edi"); # inp
196
- &mov (&DWP(8,"esp"),"eax"); # inp+num*64
197
- &mov (&DWP(12,"esp"),"ebx"); # saved sp
198
- if (!$i386 && $xmm) {
199
- &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
200
- &mov ("ecx",&DWP(0,"edx"));
201
- &mov ("ebx",&DWP(4,"edx"));
202
- &test ("ecx",1<<20); # check for P4
203
- &jnz (&label("loop"));
204
- &mov ("edx",&DWP(8,"edx")) if ($xmm);
205
- &test ("ecx",1<<24); # check for FXSR
206
- &jz ($unroll_after?&label("no_xmm"):&label("loop"));
207
- &and ("ecx",1<<30); # mask "Intel CPU" bit
208
- &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits
209
- &test ("edx",1<<29) if ($shaext); # check for SHA
210
- &jnz (&label("shaext")) if ($shaext);
211
- &or ("ecx","ebx");
212
- &and ("ecx",1<<28|1<<30);
213
- &cmp ("ecx",1<<28|1<<30);
214
- if ($xmm) {
215
- &je (&label("AVX")) if ($avx);
216
- &test ("ebx",1<<9); # check for SSSE3
217
- &jnz (&label("SSSE3"));
218
- } else {
219
- &je (&label("loop_shrd"));
220
- }
221
- if ($unroll_after) {
222
- &set_label("no_xmm");
223
- &sub ("eax","edi");
224
- &cmp ("eax",$unroll_after);
225
- &jae (&label("unrolled"));
226
- } }
227
- &jmp (&label("loop"));
228
-
229
- sub COMPACT_LOOP() {
230
- my $suffix=shift;
231
-
232
- &set_label("loop$suffix",$suffix?32:16);
233
- # copy input block to stack reversing byte and dword order
234
- for($i=0;$i<4;$i++) {
235
- &mov ("eax",&DWP($i*16+0,"edi"));
236
- &mov ("ebx",&DWP($i*16+4,"edi"));
237
- &mov ("ecx",&DWP($i*16+8,"edi"));
238
- &bswap ("eax");
239
- &mov ("edx",&DWP($i*16+12,"edi"));
240
- &bswap ("ebx");
241
- &push ("eax");
242
- &bswap ("ecx");
243
- &push ("ebx");
244
- &bswap ("edx");
245
- &push ("ecx");
246
- &push ("edx");
247
- }
248
- &add ("edi",64);
249
- &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
250
- &mov (&DWP(4*(9+16)+4,"esp"),"edi");
251
-
252
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
253
- &mov ($A,&DWP(0,"esi"));
254
- &mov ("ebx",&DWP(4,"esi"));
255
- &mov ("ecx",&DWP(8,"esi"));
256
- &mov ("edi",&DWP(12,"esi"));
257
- # &mov ($Aoff,$A);
258
- &mov ($Boff,"ebx");
259
- &xor ("ebx","ecx");
260
- &mov ($Coff,"ecx");
261
- &mov ($Doff,"edi");
262
- &mov (&DWP(0,"esp"),"ebx"); # magic
263
- &mov ($E,&DWP(16,"esi"));
264
- &mov ("ebx",&DWP(20,"esi"));
265
- &mov ("ecx",&DWP(24,"esi"));
266
- &mov ("edi",&DWP(28,"esi"));
267
- # &mov ($Eoff,$E);
268
- &mov ($Foff,"ebx");
269
- &mov ($Goff,"ecx");
270
- &mov ($Hoff,"edi");
271
-
272
- &set_label("00_15$suffix",16);
273
-
274
- &BODY_00_15();
275
-
276
- &cmp ("esi",0xc19bf174);
277
- &jne (&label("00_15$suffix"));
278
-
279
- &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1)
280
- &jmp (&label("16_63$suffix"));
281
-
282
- &set_label("16_63$suffix",16);
283
-
284
- &BODY_16_63();
285
-
286
- &cmp ("esi",0xc67178f2);
287
- &jne (&label("16_63$suffix"));
288
-
289
- &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
290
- # &mov ($A,$Aoff);
291
- &mov ("ebx",$Boff);
292
- # &mov ("edi",$Coff);
293
- &mov ("ecx",$Doff);
294
- &add ($A,&DWP(0,"esi"));
295
- &add ("ebx",&DWP(4,"esi"));
296
- &add ("edi",&DWP(8,"esi"));
297
- &add ("ecx",&DWP(12,"esi"));
298
- &mov (&DWP(0,"esi"),$A);
299
- &mov (&DWP(4,"esi"),"ebx");
300
- &mov (&DWP(8,"esi"),"edi");
301
- &mov (&DWP(12,"esi"),"ecx");
302
- # &mov ($E,$Eoff);
303
- &mov ("eax",$Foff);
304
- &mov ("ebx",$Goff);
305
- &mov ("ecx",$Hoff);
306
- &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
307
- &add ($E,&DWP(16,"esi"));
308
- &add ("eax",&DWP(20,"esi"));
309
- &add ("ebx",&DWP(24,"esi"));
310
- &add ("ecx",&DWP(28,"esi"));
311
- &mov (&DWP(16,"esi"),$E);
312
- &mov (&DWP(20,"esi"),"eax");
313
- &mov (&DWP(24,"esi"),"ebx");
314
- &mov (&DWP(28,"esi"),"ecx");
315
-
316
- &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
317
- &sub ($K256,4*64); # rewind K
318
-
319
- &cmp ("edi",&DWP(8,"esp")); # are we done yet?
320
- &jb (&label("loop$suffix"));
321
- }
322
- &COMPACT_LOOP();
323
- &mov ("esp",&DWP(12,"esp")); # restore sp
324
- &function_end_A();
325
- if (!$i386 && !$xmm) {
326
- # ~20% improvement on Sandy Bridge
327
- local *ror = sub { &shrd(@_[0],@_) };
328
- &COMPACT_LOOP("_shrd");
329
- &mov ("esp",&DWP(12,"esp")); # restore sp
330
- &function_end_A();
331
- }
332
-
333
- &set_label("K256",64); # Yes! I keep it in the code segment!
334
- @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
335
- 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
336
- 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
337
- 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
338
- 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
339
- 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
340
- 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
341
- 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
342
- 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
343
- 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
344
- 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
345
- 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
346
- 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
347
- 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
348
- 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
349
- 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
350
- &data_word(@K256);
351
- &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask
352
- &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
353
-
354
- ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets
355
- sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
356
-
357
- if (!$i386 && $unroll_after) {
358
- my @AH=($A,$K256);
359
-
360
- &set_label("unrolled",16);
361
- &lea ("esp",&DWP(-96,"esp"));
362
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
363
- &mov ($AH[0],&DWP(0,"esi"));
364
- &mov ($AH[1],&DWP(4,"esi"));
365
- &mov ("ecx",&DWP(8,"esi"));
366
- &mov ("ebx",&DWP(12,"esi"));
367
- #&mov (&DWP(0,"esp"),$AH[0]);
368
- &mov (&DWP(4,"esp"),$AH[1]);
369
- &xor ($AH[1],"ecx"); # magic
370
- &mov (&DWP(8,"esp"),"ecx");
371
- &mov (&DWP(12,"esp"),"ebx");
372
- &mov ($E,&DWP(16,"esi"));
373
- &mov ("ebx",&DWP(20,"esi"));
374
- &mov ("ecx",&DWP(24,"esi"));
375
- &mov ("esi",&DWP(28,"esi"));
376
- #&mov (&DWP(16,"esp"),$E);
377
- &mov (&DWP(20,"esp"),"ebx");
378
- &mov (&DWP(24,"esp"),"ecx");
379
- &mov (&DWP(28,"esp"),"esi");
380
- &jmp (&label("grand_loop"));
381
-
382
- &set_label("grand_loop",16);
383
- # copy input block to stack reversing byte order
384
- for($i=0;$i<5;$i++) {
385
- &mov ("ebx",&DWP(12*$i+0,"edi"));
386
- &mov ("ecx",&DWP(12*$i+4,"edi"));
387
- &bswap ("ebx");
388
- &mov ("esi",&DWP(12*$i+8,"edi"));
389
- &bswap ("ecx");
390
- &mov (&DWP(32+12*$i+0,"esp"),"ebx");
391
- &bswap ("esi");
392
- &mov (&DWP(32+12*$i+4,"esp"),"ecx");
393
- &mov (&DWP(32+12*$i+8,"esp"),"esi");
394
- }
395
- &mov ("ebx",&DWP($i*12,"edi"));
396
- &add ("edi",64);
397
- &bswap ("ebx");
398
- &mov (&DWP(96+4,"esp"),"edi");
399
- &mov (&DWP(32+12*$i,"esp"),"ebx");
400
-
401
- my ($t1,$t2) = ("ecx","esi");
402
-
403
- for ($i=0;$i<64;$i++) {
404
-
405
- if ($i>=16) {
406
- &mov ($T,$t1); # $t1 is preloaded
407
- # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp"));
408
- &ror ($t1,18-7);
409
- &mov ("edi",$t2);
410
- &ror ($t2,19-17);
411
- &xor ($t1,$T);
412
- &shr ($T,3);
413
- &ror ($t1,7);
414
- &xor ($t2,"edi");
415
- &xor ($T,$t1); # T = sigma0(X[-15])
416
- &ror ($t2,17);
417
- &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16]
418
- &shr ("edi",10);
419
- &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7]
420
- #&xor ("edi",$t2) # sigma1(X[-2])
421
- # &add ($T,"edi"); # T += sigma1(X[-2])
422
- # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
423
- }
424
- &mov ($t1,$E);
425
- &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2])
426
- &mov ($t2,&off($f));
427
- &ror ($E,25-11);
428
- &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2])
429
- &mov ("edi",&off($g));
430
- &xor ($E,$t1);
431
- &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i]
432
- &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0]
433
- &xor ($t2,"edi");
434
- &ror ($E,11-6);
435
- &and ($t2,$t1);
436
- &mov (&off($e),$t1); # save $E, modulo-scheduled
437
- &xor ($E,$t1);
438
- &add ($T,&off($h)); # T += h
439
- &xor ("edi",$t2); # Ch(e,f,g)
440
- &ror ($E,6); # Sigma1(e)
441
- &mov ($t1,$AH[0]);
442
- &add ($T,"edi"); # T += Ch(e,f,g)
443
-
444
- &ror ($t1,22-13);
445
- &mov ($t2,$AH[0]);
446
- &mov ("edi",&off($b));
447
- &xor ($t1,$AH[0]);
448
- &mov (&off($a),$AH[0]); # save $A, modulo-scheduled
449
- &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round
450
- &ror ($t1,13-2);
451
- &and ($AH[1],$AH[0]); # (b^c) &= (a^b)
452
- &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(e)+K[i]
453
- &xor ($t1,$t2);
454
- &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
455
- &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63);
456
- &ror ($t1,2); # Sigma0(a)
457
-
458
- &add ($AH[1],$E); # h += T
459
- &add ($E,&off($d)); # d += T
460
- &add ($AH[1],$t1); # h += Sigma0(a)
461
- &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63);
462
-
463
- @AH = reverse(@AH); # rotate(a,h)
464
- ($t1,$t2) = ($t2,$t1); # rotate(t1,t2)
465
- }
466
- &mov ("esi",&DWP(96,"esp")); #ctx
467
- #&mov ($AH[0],&DWP(0,"esp"));
468
- &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
469
- #&mov ("edi", &DWP(8,"esp"));
470
- &mov ("ecx",&DWP(12,"esp"));
471
- &add ($AH[0],&DWP(0,"esi"));
472
- &add ($AH[1],&DWP(4,"esi"));
473
- &add ("edi",&DWP(8,"esi"));
474
- &add ("ecx",&DWP(12,"esi"));
475
- &mov (&DWP(0,"esi"),$AH[0]);
476
- &mov (&DWP(4,"esi"),$AH[1]);
477
- &mov (&DWP(8,"esi"),"edi");
478
- &mov (&DWP(12,"esi"),"ecx");
479
- #&mov (&DWP(0,"esp"),$AH[0]);
480
- &mov (&DWP(4,"esp"),$AH[1]);
481
- &xor ($AH[1],"edi"); # magic
482
- &mov (&DWP(8,"esp"),"edi");
483
- &mov (&DWP(12,"esp"),"ecx");
484
- #&mov ($E,&DWP(16,"esp"));
485
- &mov ("edi",&DWP(20,"esp"));
486
- &mov ("ebx",&DWP(24,"esp"));
487
- &mov ("ecx",&DWP(28,"esp"));
488
- &add ($E,&DWP(16,"esi"));
489
- &add ("edi",&DWP(20,"esi"));
490
- &add ("ebx",&DWP(24,"esi"));
491
- &add ("ecx",&DWP(28,"esi"));
492
- &mov (&DWP(16,"esi"),$E);
493
- &mov (&DWP(20,"esi"),"edi");
494
- &mov (&DWP(24,"esi"),"ebx");
495
- &mov (&DWP(28,"esi"),"ecx");
496
- #&mov (&DWP(16,"esp"),$E);
497
- &mov (&DWP(20,"esp"),"edi");
498
- &mov ("edi",&DWP(96+4,"esp")); # inp
499
- &mov (&DWP(24,"esp"),"ebx");
500
- &mov (&DWP(28,"esp"),"ecx");
501
-
502
- &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
503
- &jb (&label("grand_loop"));
504
-
505
- &mov ("esp",&DWP(96+12,"esp")); # restore sp
506
- &function_end_A();
507
- }
508
- if (!$i386 && $xmm) {{{
509
- if ($shaext) {
510
- ######################################################################
511
- # Intel SHA Extensions implementation of SHA256 update function.
512
- #
513
- my ($ctx,$inp,$end)=("esi","edi","eax");
514
- my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
515
- my @MSG=map("xmm$_",(3..6));
516
-
517
- sub sha256op38 { # hand-encode a register-to-register "0F 38 <opcodelet> /r" instruction through data_byte
518
- my ($opcodelet,$dst,$src)=@_; # opcode byte, destination register name, source register name
519
- if ("$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/) # anchored: only exact xmm0..xmm7 names are encodable here
520
- { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } else { die "sha256op38: unsupported operands '$dst','$src'\n"; } # last byte = 0xc0 | dst<<3 | src; refuse anything else instead of silently emitting nothing
521
- }
522
- sub sha256rnds2 { sha256op38(0xcb,@_); } # opcodelet 0xcb
523
- sub sha256msg1 { sha256op38(0xcc,@_); } # opcodelet 0xcc
524
- sub sha256msg2 { sha256op38(0xcd,@_); } # opcodelet 0xcd
525
-
526
- &set_label("shaext",32); # entry of the SHA-extension code path: ctx in esi, input pointer in edi, end pointer in eax (see $ctx/$inp/$end)
527
- &sub ("esp",32); # two 16-byte scratch slots; 0(esp) and 16(esp) receive the offloaded state below
528
-
529
- &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA
530
- &lea ($K256,&DWP(0x80,$K256)); # bias the table pointer; every table offset below is written as N-0x80
531
- &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE
532
- &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
533
- # shuffle the two loaded state vectors into the ABEF/CDGH register layout used by the round instructions
534
- &pshufd ($Wi,$ABEF,0x1b); # ABCD
535
- &pshufd ($ABEF,$ABEF,0xb1); # CDAB
536
- &pshufd ($CDGH,$CDGH,0x1b); # EFGH
537
- &palignr ($ABEF,$CDGH,8); # ABEF
538
- &punpcklqdq ($CDGH,$Wi); # CDGH
539
- &jmp (&label("loop_shaext"));
540
- # one pass of this loop consumes 0x40 input bytes (see the lea on $inp below)
541
- &set_label("loop_shaext",16);
542
- &movdqu (@MSG[0],&QWP(0,$inp));
543
- &movdqu (@MSG[1],&QWP(0x10,$inp));
544
- &movdqu (@MSG[2],&QWP(0x20,$inp));
545
- &pshufb (@MSG[0],$TMP); # apply the byte swap mask to the loaded words
546
- &movdqu (@MSG[3],&QWP(0x30,$inp));
547
- &movdqa (&QWP(16,"esp"),$CDGH); # offload
548
-
549
- &movdqa ($Wi,&QWP(0*16-0x80,$K256)); # $Wi = table row 0 ...
550
- &paddd ($Wi,@MSG[0]); # ... plus message words
551
- &pshufb (@MSG[1],$TMP);
552
- &sha256rnds2 ($CDGH,$ABEF); # 0-3
553
- &pshufd ($Wi,$Wi,0x0e); # move the upper two dwords of $Wi down for the second sha256rnds2
554
- &nop ();
555
- &movdqa (&QWP(0,"esp"),$ABEF); # offload
556
- &sha256rnds2 ($ABEF,$CDGH);
557
-
558
- &movdqa ($Wi,&QWP(1*16-0x80,$K256));
559
- &paddd ($Wi,@MSG[1]);
560
- &pshufb (@MSG[2],$TMP);
561
- &sha256rnds2 ($CDGH,$ABEF); # 4-7
562
- &pshufd ($Wi,$Wi,0x0e);
563
- &lea ($inp,&DWP(0x40,$inp)); # advance the input pointer by one 0x40-byte block
564
- &sha256msg1 (@MSG[0],@MSG[1]);
565
- &sha256rnds2 ($ABEF,$CDGH);
566
-
567
- &movdqa ($Wi,&QWP(2*16-0x80,$K256));
568
- &paddd ($Wi,@MSG[2]);
569
- &pshufb (@MSG[3],$TMP); # last use of the mask in $TMP; $TMP is reused as a temporary from here on
570
- &sha256rnds2 ($CDGH,$ABEF); # 8-11
571
- &pshufd ($Wi,$Wi,0x0e);
572
- &movdqa ($TMP,@MSG[3]);
573
- &palignr ($TMP,@MSG[2],4);
574
- &nop ();
575
- &paddd (@MSG[0],$TMP);
576
- &sha256msg1 (@MSG[1],@MSG[2]);
577
- &sha256rnds2 ($ABEF,$CDGH);
578
-
579
- &movdqa ($Wi,&QWP(3*16-0x80,$K256));
580
- &paddd ($Wi,@MSG[3]);
581
- &sha256msg2 (@MSG[0],@MSG[3]);
582
- &sha256rnds2 ($CDGH,$ABEF); # 12-15
583
- &pshufd ($Wi,$Wi,0x0e);
584
- &movdqa ($TMP,@MSG[0]);
585
- &palignr ($TMP,@MSG[3],4);
586
- &nop ();
587
- &paddd (@MSG[1],$TMP);
588
- &sha256msg1 (@MSG[2],@MSG[3]);
589
- &sha256rnds2 ($ABEF,$CDGH);
590
- # generator-time loop: emits nine more four-round groups (table rows 4..12), rotating @MSG after each
591
- for($i=4;$i<16-3;$i++) {
592
- &movdqa ($Wi,&QWP($i*16-0x80,$K256));
593
- &paddd ($Wi,@MSG[0]);
594
- &sha256msg2 (@MSG[1],@MSG[0]);
595
- &sha256rnds2 ($CDGH,$ABEF); # 16-19...
596
- &pshufd ($Wi,$Wi,0x0e);
597
- &movdqa ($TMP,@MSG[1]);
598
- &palignr ($TMP,@MSG[0],4);
599
- &nop ();
600
- &paddd (@MSG[2],$TMP);
601
- &sha256msg1 (@MSG[3],@MSG[0]);
602
- &sha256rnds2 ($ABEF,$CDGH);
603
-
604
- push(@MSG,shift(@MSG)); # rotate the register names so @MSG[0] is always the current group
605
- }
606
- &movdqa ($Wi,&QWP(13*16-0x80,$K256));
607
- &paddd ($Wi,@MSG[0]);
608
- &sha256msg2 (@MSG[1],@MSG[0]);
609
- &sha256rnds2 ($CDGH,$ABEF); # 52-55
610
- &pshufd ($Wi,$Wi,0x0e);
611
- &movdqa ($TMP,@MSG[1]); # terminator added: without it this call and the next parsed as one bitwise-AND expression
612
- &palignr ($TMP,@MSG[0],4);
613
- &sha256rnds2 ($ABEF,$CDGH);
614
- &paddd (@MSG[2],$TMP);
615
-
616
- &movdqa ($Wi,&QWP(14*16-0x80,$K256));
617
- &paddd ($Wi,@MSG[1]);
618
- &sha256rnds2 ($CDGH,$ABEF); # 56-59
619
- &pshufd ($Wi,$Wi,0x0e);
620
- &sha256msg2 (@MSG[2],@MSG[1]);
621
- &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
622
- &sha256rnds2 ($ABEF,$CDGH);
623
-
624
- &movdqa ($Wi,&QWP(15*16-0x80,$K256));
625
- &paddd ($Wi,@MSG[2]);
626
- &nop ();
627
- &sha256rnds2 ($CDGH,$ABEF); # 60-63
628
- &pshufd ($Wi,$Wi,0x0e);
629
- &cmp ($end,$inp); # loop condition; the flags are consumed by the jnz below
630
- &nop ();
631
- &sha256rnds2 ($ABEF,$CDGH);
632
-
633
- &paddd ($CDGH,&QWP(16,"esp")); # add back the state offloaded at the top of the loop
634
- &paddd ($ABEF,&QWP(0,"esp"));
635
- &jnz (&label("loop_shaext"));
636
- # undo the ABEF/CDGH layout before storing the state back to ctx
637
- &pshufd ($CDGH,$CDGH,0xb1); # DCHG
638
- &pshufd ($TMP,$ABEF,0x1b); # FEBA
639
- &pshufd ($ABEF,$ABEF,0xb1); # BAFE
640
- &punpckhqdq ($ABEF,$CDGH); # DCBA
641
- &palignr ($CDGH,$TMP,8); # HGFE
642
-
643
- &mov ("esp",&DWP(32+12,"esp")); # presumably the saved stack pointer, mirroring the "restore sp" loads at 96+12 in the other paths - confirm against the prologue
644
- &movdqu (&QWP(0,$ctx),$ABEF);
645
- &movdqu (&QWP(16,$ctx),$CDGH);
646
- &function_end_A();
647
- }
648
-
649
- my @X = map("xmm$_",(0..3)); # xmm0..xmm3: the four vectors holding the 16-word message window
650
- my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); # xmm4..xmm7: vector temporaries
651
- my @AH = ($A,$T); # register pair that alternates between the a and h roles; swapped by "@AH = reverse(@AH)" each round
652
- # frame as used below: 0..28(esp) state copy, 32..95(esp) X[i]+K[i], 96(esp) ctx, 96+4 inp, 96+8 end, 96+12 saved sp
653
- &set_label("SSSE3",32);
654
- &lea ("esp",&DWP(-96,"esp"));
655
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
656
- &mov ($AH[0],&DWP(0,"esi"));
657
- &mov ($AH[1],&DWP(4,"esi"));
658
- &mov ("ecx",&DWP(8,"esi"));
659
- &mov ("edi",&DWP(12,"esi"));
660
- #&mov (&DWP(0,"esp"),$AH[0]);
661
- &mov (&DWP(4,"esp"),$AH[1]);
662
- &xor ($AH[1],"ecx"); # magic
663
- &mov (&DWP(8,"esp"),"ecx");
664
- &mov (&DWP(12,"esp"),"edi");
665
- &mov ($E,&DWP(16,"esi"));
666
- &mov ("edi",&DWP(20,"esi"));
667
- &mov ("ecx",&DWP(24,"esi"));
668
- &mov ("esi",&DWP(28,"esi")); # esi (ctx) is overwritten here; the epilogue reloads ctx from 96(esp)
669
- #&mov (&DWP(16,"esp"),$E);
670
- &mov (&DWP(20,"esp"),"edi");
671
- &mov ("edi",&DWP(96+4,"esp")); # inp
672
- &mov (&DWP(24,"esp"),"ecx");
673
- &mov (&DWP(28,"esp"),"esi");
674
- &movdqa ($t3,&QWP(256,$K256)); # 16-byte constant at K256+256, used as the pshufb mask below
675
- &jmp (&label("grand_ssse3"));
676
- # outer loop: one pass per 64-byte input block
677
- &set_label("grand_ssse3",16);
678
- # load input, reverse byte order, add K256[0..15], save to stack
679
- &movdqu (@X[0],&QWP(0,"edi"));
680
- &movdqu (@X[1],&QWP(16,"edi"));
681
- &movdqu (@X[2],&QWP(32,"edi"));
682
- &movdqu (@X[3],&QWP(48,"edi"));
683
- &add ("edi",64);
684
- &pshufb (@X[0],$t3);
685
- &mov (&DWP(96+4,"esp"),"edi"); # store the advanced input pointer back to its frame slot
686
- &pshufb (@X[1],$t3);
687
- &movdqa ($t0,&QWP(0,$K256));
688
- &pshufb (@X[2],$t3);
689
- &movdqa ($t1,&QWP(16,$K256));
690
- &paddd ($t0,@X[0]);
691
- &pshufb (@X[3],$t3); # last use of the mask; $t3 is overwritten two instructions later
692
- &movdqa ($t2,&QWP(32,$K256));
693
- &paddd ($t1,@X[1]);
694
- &movdqa ($t3,&QWP(48,$K256));
695
- &movdqa (&QWP(32+0,"esp"),$t0);
696
- &paddd ($t2,@X[2]);
697
- &movdqa (&QWP(32+16,"esp"),$t1);
698
- &paddd ($t3,@X[3]);
699
- &movdqa (&QWP(32+32,"esp"),$t2);
700
- &movdqa (&QWP(32+48,"esp"),$t3);
701
- &jmp (&label("ssse3_00_47"));
702
- # inner loop head: each pass advances the table pointer by 64 bytes (16 dwords)
703
- &set_label("ssse3_00_47",16);
704
- &add ($K256,64);
705
-
706
- sub SSSE3_00_47 () { # emits four scalar rounds interleaved with the SSSE3 update of one message vector; declared "()" but takes ($j,$body,@X) - the call site uses the &-form
707
- my $j = shift; # index 0..3 of the vector being updated; selects the 16*$j table and stack offsets below
708
- my $body = shift; # code ref returning one round as a list of instruction strings
709
- my @X = @_; # local copy of the vector names, already rotated by the caller
710
- my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
711
- # each eval(shift(@insns)) below emits the next scalar instruction; the explicit SIMD calls are hand-placed between them
712
- eval(shift(@insns));
713
- &movdqa ($t0,@X[1]);
714
- eval(shift(@insns)); # @
715
- eval(shift(@insns));
716
- &movdqa ($t3,@X[3]);
717
- eval(shift(@insns));
718
- eval(shift(@insns));
719
- &palignr ($t0,@X[0],4); # X[1..4]
720
- eval(shift(@insns));
721
- eval(shift(@insns)); # @
722
- eval(shift(@insns));
723
- &palignr ($t3,@X[2],4); # X[9..12]
724
- eval(shift(@insns));
725
- eval(shift(@insns));
726
- eval(shift(@insns));
727
- &movdqa ($t1,$t0);
728
- eval(shift(@insns)); # @
729
- eval(shift(@insns));
730
- &movdqa ($t2,$t0);
731
- eval(shift(@insns));
732
- eval(shift(@insns));
733
- &psrld ($t0,3);
734
- eval(shift(@insns));
735
- eval(shift(@insns)); # @
736
- &paddd (@X[0],$t3); # X[0..3] += X[9..12]
737
- eval(shift(@insns));
738
- eval(shift(@insns));
739
- &psrld ($t2,7);
740
- eval(shift(@insns));
741
- eval(shift(@insns));
742
- eval(shift(@insns)); # @
743
- eval(shift(@insns));
744
- &pshufd ($t3,@X[3],0b11111010); # X[14..15]
745
- eval(shift(@insns));
746
- eval(shift(@insns));
747
- &pslld ($t1,32-18);
748
- eval(shift(@insns));
749
- eval(shift(@insns)); # @
750
- &pxor ($t0,$t2);
751
- eval(shift(@insns));
752
- eval(shift(@insns));
753
- &psrld ($t2,18-7);
754
- eval(shift(@insns));
755
- eval(shift(@insns));
756
- eval(shift(@insns)); # @
757
- &pxor ($t0,$t1);
758
- eval(shift(@insns));
759
- eval(shift(@insns));
760
- &pslld ($t1,18-7);
761
- eval(shift(@insns));
762
- eval(shift(@insns));
763
- eval(shift(@insns)); # @
764
- &pxor ($t0,$t2);
765
- eval(shift(@insns));
766
- eval(shift(@insns));
767
- &movdqa ($t2,$t3);
768
- eval(shift(@insns));
769
- eval(shift(@insns));
770
- eval(shift(@insns)); # @
771
- &pxor ($t0,$t1); # sigma0(X[1..4])
772
- eval(shift(@insns));
773
- eval(shift(@insns));
774
- &psrld ($t3,10);
775
- eval(shift(@insns));
776
- eval(shift(@insns));
777
- eval(shift(@insns)); # @
778
- &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
779
- eval(shift(@insns));
780
- eval(shift(@insns));
781
- &psrlq ($t2,17);
782
- eval(shift(@insns));
783
- eval(shift(@insns));
784
- eval(shift(@insns)); # @
785
- &pxor ($t3,$t2);
786
- eval(shift(@insns));
787
- eval(shift(@insns));
788
- &psrlq ($t2,19-17);
789
- eval(shift(@insns));
790
- eval(shift(@insns));
791
- eval(shift(@insns)); # @
792
- &pxor ($t3,$t2);
793
- eval(shift(@insns));
794
- eval(shift(@insns));
795
- &pshufd ($t3,$t3,0b10000000);
796
- eval(shift(@insns));
797
- eval(shift(@insns));
798
- eval(shift(@insns)); # @
799
- eval(shift(@insns));
800
- eval(shift(@insns));
801
- eval(shift(@insns));
802
- eval(shift(@insns));
803
- eval(shift(@insns)); # @
804
- eval(shift(@insns));
805
- &psrldq ($t3,8);
806
- eval(shift(@insns));
807
- eval(shift(@insns));
808
- eval(shift(@insns));
809
- &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
810
- eval(shift(@insns)); # @
811
- eval(shift(@insns));
812
- eval(shift(@insns));
813
- eval(shift(@insns));
814
- eval(shift(@insns));
815
- eval(shift(@insns)); # @
816
- eval(shift(@insns));
817
- &pshufd ($t3,@X[0],0b01010000); # X[16..17]
818
- eval(shift(@insns));
819
- eval(shift(@insns));
820
- eval(shift(@insns));
821
- &movdqa ($t2,$t3);
822
- eval(shift(@insns)); # @
823
- &psrld ($t3,10);
824
- eval(shift(@insns));
825
- &psrlq ($t2,17);
826
- eval(shift(@insns));
827
- eval(shift(@insns));
828
- eval(shift(@insns));
829
- eval(shift(@insns)); # @
830
- &pxor ($t3,$t2);
831
- eval(shift(@insns));
832
- eval(shift(@insns));
833
- &psrlq ($t2,19-17);
834
- eval(shift(@insns));
835
- eval(shift(@insns));
836
- eval(shift(@insns)); # @
837
- &pxor ($t3,$t2);
838
- eval(shift(@insns));
839
- eval(shift(@insns));
840
- eval(shift(@insns));
841
- &pshufd ($t3,$t3,0b00001000);
842
- eval(shift(@insns));
843
- eval(shift(@insns)); # @
844
- &movdqa ($t2,&QWP(16*$j,$K256)); # load the table row that is added to the finished vector below
845
- eval(shift(@insns));
846
- eval(shift(@insns));
847
- &pslldq ($t3,8);
848
- eval(shift(@insns));
849
- eval(shift(@insns));
850
- eval(shift(@insns)); # @
851
- eval(shift(@insns));
852
- eval(shift(@insns));
853
- eval(shift(@insns));
854
- eval(shift(@insns));
855
- eval(shift(@insns)); # @
856
- &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
857
- eval(shift(@insns));
858
- eval(shift(@insns));
859
- eval(shift(@insns));
860
- eval(shift(@insns));
861
- &paddd ($t2,@X[0]); # $t2 = table row + updated X[0..3]
862
- eval(shift(@insns)); # @
863
-
864
- foreach (@insns) { eval; } # remaining instructions
865
- # store only after all four rounds are emitted: the rounds read this same 16*$j stack slot
866
- &movdqa (&QWP(32+16*$j,"esp"),$t2);
867
- }
868
-
869
- sub body_00_15 () { # returns one scalar round as 30 strings of generator code; callers eval them one at a time so they can interleave other instructions
870
- ( # the strings are single-quoted: $E, $AH[..], $i and off() are resolved when each string is eval'ed, not here
871
- '&mov ("ecx",$E);',
872
- '&ror ($E,25-11);',
873
- '&mov ("esi",&off($f));',
874
- '&xor ($E,"ecx");',
875
- '&mov ("edi",&off($g));',
876
- '&xor ("esi","edi");',
877
- '&ror ($E,11-6);',
878
- '&and ("esi","ecx");',
879
- '&mov (&off($e),"ecx");', # save $E, modulo-scheduled
880
- '&xor ($E,"ecx");',
881
- '&xor ("edi","esi");', # Ch(e,f,g)
882
- '&ror ($E,6);', # T = Sigma1(e)
883
- '&mov ("ecx",$AH[0]);',
884
- '&add ($E,"edi");', # T += Ch(e,f,g)
885
- '&mov ("edi",&off($b));',
886
- '&mov ("esi",$AH[0]);',
887
-
888
- '&ror ("ecx",22-13);',
889
- '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
890
- '&xor ("ecx",$AH[0]);',
891
- '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round
892
- '&add ($E,&off($h));', # T += h
893
- '&ror ("ecx",13-2);',
894
- '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b)
895
- '&xor ("ecx","esi");',
896
- '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i]
897
- '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b)
898
- '&ror ("ecx",2);', # Sigma0(a)
899
-
900
- '&add ($AH[1],$E);', # h += T
901
- '&add ($E,&off($d));', # d += T
902
- '&add ($AH[1],"ecx");'. # h += Sigma0(a)
903
- # the "." above joins the final add with the bookkeeping below into one string, so one eval does both
904
- '@AH = reverse(@AH); $i++;' # rotate(a,h)
905
- );
906
- }
907
-
908
- for ($i=0,$j=0; $j<4; $j++) { # emit the loop body: four vector updates, each interleaved with four scalar rounds
909
- &SSSE3_00_47($j,\&body_00_15,@X);
910
- push(@X,shift(@X)); # rotate(@X)
911
- }
912
- &cmp (&DWP(16*$j,$K256),0x00010203); # $j is 4 here: tests the dword 64 bytes ahead; 0x00010203 is presumably the start of the mask that follows the table - confirm against the K256 data
913
- &jne (&label("ssse3_00_47"));
914
- # final 16 rounds: no further message update, so the round strings are emitted back to back
915
- for ($i=0; $i<16; ) { # $i is advanced by the "$i++" inside the eval'ed round strings
916
- foreach(body_00_15()) { eval; }
917
- }
918
- # add the working variables to ctx->h[0..7] and refresh the on-stack copy for the next block
919
- &mov ("esi",&DWP(96,"esp")); #ctx
920
- #&mov ($AH[0],&DWP(0,"esp"));
921
- &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
922
- #&mov ("edi", &DWP(8,"esp"));
923
- &mov ("ecx",&DWP(12,"esp"));
924
- &add ($AH[0],&DWP(0,"esi"));
925
- &add ($AH[1],&DWP(4,"esi"));
926
- &add ("edi",&DWP(8,"esi"));
927
- &add ("ecx",&DWP(12,"esi"));
928
- &mov (&DWP(0,"esi"),$AH[0]);
929
- &mov (&DWP(4,"esi"),$AH[1]);
930
- &mov (&DWP(8,"esi"),"edi");
931
- &mov (&DWP(12,"esi"),"ecx");
932
- #&mov (&DWP(0,"esp"),$AH[0]);
933
- &mov (&DWP(4,"esp"),$AH[1]);
934
- &xor ($AH[1],"edi"); # magic
935
- &mov (&DWP(8,"esp"),"edi");
936
- &mov (&DWP(12,"esp"),"ecx");
937
- #&mov ($E,&DWP(16,"esp"));
938
- &mov ("edi",&DWP(20,"esp"));
939
- &mov ("ecx",&DWP(24,"esp"));
940
- &add ($E,&DWP(16,"esi"));
941
- &add ("edi",&DWP(20,"esi"));
942
- &add ("ecx",&DWP(24,"esi"));
943
- &mov (&DWP(16,"esi"),$E);
944
- &mov (&DWP(20,"esi"),"edi");
945
- &mov (&DWP(20,"esp"),"edi");
946
- &mov ("edi",&DWP(28,"esp"));
947
- &mov (&DWP(24,"esi"),"ecx");
948
- #&mov (&DWP(16,"esp"),$E);
949
- &add ("edi",&DWP(28,"esi"));
950
- &mov (&DWP(24,"esp"),"ecx");
951
- &mov (&DWP(28,"esi"),"edi");
952
- &mov (&DWP(28,"esp"),"edi");
953
- &mov ("edi",&DWP(96+4,"esp")); # inp
954
-
955
- &movdqa ($t3,&QWP(64,$K256)); # reload the pshufb mask: the pointer is 3*64 past the table start here, so +64 is the same +256 slot as in the prologue
956
- &sub ($K256,3*64); # rewind K
957
- &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
958
- &jb (&label("grand_ssse3"));
959
-
960
- &mov ("esp",&DWP(96+12,"esp")); # restore sp
961
- &function_end_A();
962
- if ($avx) {
963
- &set_label("AVX",32); # entry of the AVX code path; same frame layout as the SSSE3 path
964
- if ($avx>1) { # generator-time switch: only emit the run-time BMI dispatch when the BMI variant is built
965
- &and ("edx",1<<8|1<<3); # check for BMI2+BMI1
966
- &cmp ("edx",1<<8|1<<3);
967
- &je (&label("AVX_BMI")); # taken only when both tested bits are set
968
- }
969
- &lea ("esp",&DWP(-96,"esp"));
970
- &vzeroall ();
971
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
972
- &mov ($AH[0],&DWP(0,"esi"));
973
- &mov ($AH[1],&DWP(4,"esi"));
974
- &mov ("ecx",&DWP(8,"esi"));
975
- &mov ("edi",&DWP(12,"esi"));
976
- #&mov (&DWP(0,"esp"),$AH[0]);
977
- &mov (&DWP(4,"esp"),$AH[1]);
978
- &xor ($AH[1],"ecx"); # magic
979
- &mov (&DWP(8,"esp"),"ecx");
980
- &mov (&DWP(12,"esp"),"edi");
981
- &mov ($E,&DWP(16,"esi"));
982
- &mov ("edi",&DWP(20,"esi"));
983
- &mov ("ecx",&DWP(24,"esi"));
984
- &mov ("esi",&DWP(28,"esi")); # esi (ctx) is overwritten here; the epilogue reloads ctx from 96(esp)
985
- #&mov (&DWP(16,"esp"),$E);
986
- &mov (&DWP(20,"esp"),"edi");
987
- &mov ("edi",&DWP(96+4,"esp")); # inp
988
- &mov (&DWP(24,"esp"),"ecx");
989
- &mov (&DWP(28,"esp"),"esi");
990
- &vmovdqa ($t3,&QWP(256,$K256)); # 16-byte constant at K256+256, used as the vpshufb mask below
991
- &jmp (&label("grand_avx"));
992
- # outer loop: one pass per 64-byte input block
993
- &set_label("grand_avx",32);
994
- # load input, reverse byte order, add K256[0..15], save to stack
995
- &vmovdqu (@X[0],&QWP(0,"edi"));
996
- &vmovdqu (@X[1],&QWP(16,"edi"));
997
- &vmovdqu (@X[2],&QWP(32,"edi"));
998
- &vmovdqu (@X[3],&QWP(48,"edi"));
999
- &add ("edi",64);
1000
- &vpshufb (@X[0],@X[0],$t3);
1001
- &mov (&DWP(96+4,"esp"),"edi"); # store the advanced input pointer back to its frame slot
1002
- &vpshufb (@X[1],@X[1],$t3);
1003
- &vpshufb (@X[2],@X[2],$t3);
1004
- &vpaddd ($t0,@X[0],&QWP(0,$K256));
1005
- &vpshufb (@X[3],@X[3],$t3); # last use of the mask; $t3 is overwritten three instructions later
1006
- &vpaddd ($t1,@X[1],&QWP(16,$K256));
1007
- &vpaddd ($t2,@X[2],&QWP(32,$K256));
1008
- &vpaddd ($t3,@X[3],&QWP(48,$K256));
1009
- &vmovdqa (&QWP(32+0,"esp"),$t0);
1010
- &vmovdqa (&QWP(32+16,"esp"),$t1);
1011
- &vmovdqa (&QWP(32+32,"esp"),$t2);
1012
- &vmovdqa (&QWP(32+48,"esp"),$t3);
1013
- &jmp (&label("avx_00_47"));
1014
- # inner loop head: each pass advances the table pointer by 64 bytes (16 dwords)
1015
- &set_label("avx_00_47",16);
1016
- &add ($K256,64);
1017
-
1018
- sub Xupdate_AVX () { # returns the 31 AVX instructions that update one message vector, as strings for the caller to eval
1019
- ( # single-quoted: @X and $t0..$t3 are resolved in the scope where each string is eval'ed
1020
- '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4]
1021
- '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12]
1022
- '&vpsrld ($t2,$t0,7);',
1023
- '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..12]
1024
- '&vpsrld ($t3,$t0,3);',
1025
- '&vpslld ($t1,$t0,14);',
1026
- '&vpxor ($t0,$t3,$t2);',
1027
- '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1028
- '&vpsrld ($t2,$t2,18-7);',
1029
- '&vpxor ($t0,$t0,$t1);',
1030
- '&vpslld ($t1,$t1,25-14);',
1031
- '&vpxor ($t0,$t0,$t2);',
1032
- '&vpsrld ($t2,$t3,10);',
1033
- '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4])
1034
- '&vpsrlq ($t1,$t3,17);',
1035
- '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
1036
- '&vpxor ($t2,$t2,$t1);',
1037
- '&vpsrlq ($t3,$t3,19);',
1038
- '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15])
1039
- '&vpshufd ($t3,$t2,0b10000100);',
1040
- '&vpsrldq ($t3,$t3,8);',
1041
- '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15])
1042
- '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1043
- '&vpsrld ($t2,$t3,10);',
1044
- '&vpsrlq ($t1,$t3,17);',
1045
- '&vpxor ($t2,$t2,$t1);',
1046
- '&vpsrlq ($t3,$t3,19);',
1047
- '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17])
1048
- '&vpshufd ($t3,$t2,0b11101000);',
1049
- '&vpslldq ($t3,$t3,8);',
1050
- '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17])
1051
- );
1052
- }
1053
-
1054
- local *ror = sub { &shrd(@_[0],@_) }; # from here on &ror(reg,n) emits shrd reg,reg,n (first argument passed twice); the SSSE3 code above was already generated with the real ror
1055
- sub AVX_00_47 () { # emits four scalar rounds interleaved with one AVX message-vector update; declared "()" but takes ($j,$body,@X) - the call sites use the &-form
1056
- my $j = shift; # index 0..3 of the vector being updated; selects the 16*$j table and stack offsets below
1057
- my $body = shift; # code ref returning one round as a list of instruction strings
1058
- my @X = @_; # local copy of the vector names, already rotated by the caller
1059
- my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
1060
- my $insn;
1061
-
1062
- foreach (Xupdate_AVX()) { # 31 instructions
1063
- eval; # one vector instruction ...
1064
- eval(shift(@insns)); # ... followed by three scalar ones
1065
- eval(shift(@insns));
1066
- eval($insn = shift(@insns));
1067
- eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); # when the third scalar one and the next are both rorx, emit the next one too
1068
- }
1069
- &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); # $t2 = updated X[0..3] + table row
1070
- foreach (@insns) { eval; } # remaining instructions
1071
- &vmovdqa (&QWP(32+16*$j,"esp"),$t2); # stored only after all four rounds are emitted: the rounds read this same stack slot
1072
- }
1073
-
1074
- for ($i=0,$j=0; $j<4; $j++) { # emit the loop body: four vector updates, each interleaved with four scalar rounds
1075
- &AVX_00_47($j,\&body_00_15,@X);
1076
- push(@X,shift(@X)); # rotate(@X)
1077
- }
1078
- &cmp (&DWP(16*$j,$K256),0x00010203); # $j is 4 here: tests the dword 64 bytes ahead; 0x00010203 is presumably the start of the mask that follows the table - confirm against the K256 data
1079
- &jne (&label("avx_00_47"));
1080
- # final 16 rounds: no further message update, so the round strings are emitted back to back
1081
- for ($i=0; $i<16; ) { # $i is advanced by the "$i++" inside the eval'ed round strings
1082
- foreach(body_00_15()) { eval; }
1083
- }
1084
- # add the working variables to ctx->h[0..7] and refresh the on-stack copy for the next block
1085
- &mov ("esi",&DWP(96,"esp")); #ctx
1086
- #&mov ($AH[0],&DWP(0,"esp"));
1087
- &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
1088
- #&mov ("edi", &DWP(8,"esp"));
1089
- &mov ("ecx",&DWP(12,"esp"));
1090
- &add ($AH[0],&DWP(0,"esi"));
1091
- &add ($AH[1],&DWP(4,"esi"));
1092
- &add ("edi",&DWP(8,"esi"));
1093
- &add ("ecx",&DWP(12,"esi"));
1094
- &mov (&DWP(0,"esi"),$AH[0]);
1095
- &mov (&DWP(4,"esi"),$AH[1]);
1096
- &mov (&DWP(8,"esi"),"edi");
1097
- &mov (&DWP(12,"esi"),"ecx");
1098
- #&mov (&DWP(0,"esp"),$AH[0]);
1099
- &mov (&DWP(4,"esp"),$AH[1]);
1100
- &xor ($AH[1],"edi"); # magic
1101
- &mov (&DWP(8,"esp"),"edi");
1102
- &mov (&DWP(12,"esp"),"ecx");
1103
- #&mov ($E,&DWP(16,"esp"));
1104
- &mov ("edi",&DWP(20,"esp"));
1105
- &mov ("ecx",&DWP(24,"esp"));
1106
- &add ($E,&DWP(16,"esi"));
1107
- &add ("edi",&DWP(20,"esi"));
1108
- &add ("ecx",&DWP(24,"esi"));
1109
- &mov (&DWP(16,"esi"),$E);
1110
- &mov (&DWP(20,"esi"),"edi");
1111
- &mov (&DWP(20,"esp"),"edi");
1112
- &mov ("edi",&DWP(28,"esp"));
1113
- &mov (&DWP(24,"esi"),"ecx");
1114
- #&mov (&DWP(16,"esp"),$E);
1115
- &add ("edi",&DWP(28,"esi"));
1116
- &mov (&DWP(24,"esp"),"ecx");
1117
- &mov (&DWP(28,"esi"),"edi");
1118
- &mov (&DWP(28,"esp"),"edi");
1119
- &mov ("edi",&DWP(96+4,"esp")); # inp
1120
-
1121
- &vmovdqa ($t3,&QWP(64,$K256)); # reload the vpshufb mask: the pointer is 3*64 past the table start here, so +64 is the same +256 slot as in the prologue
1122
- &sub ($K256,3*64); # rewind K
1123
- &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
1124
- &jb (&label("grand_avx"));
1125
-
1126
- &mov ("esp",&DWP(96+12,"esp")); # restore sp
1127
- &vzeroall ();
1128
- &function_end_A();
1129
- if ($avx>1) {
1130
- sub bodyx_00_15 () { # +10%
1131
- ( # one scalar round using rorx/andn, returned as strings for the caller to eval one at a time; resolved at eval time like body_00_15
1132
- '&rorx ("ecx",$E,6)',
1133
- '&rorx ("esi",$E,11)',
1134
- '&mov (&off($e),$E)', # save $E, modulo-scheduled
1135
- '&rorx ("edi",$E,25)',
1136
- '&xor ("ecx","esi")',
1137
- '&andn ("esi",$E,&off($g))',
1138
- '&xor ("ecx","edi")', # Sigma1(e)
1139
- '&and ($E,&off($f))',
1140
- '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
1141
- '&or ($E,"esi")', # T = Ch(e,f,g)
1142
-
1143
- '&rorx ("edi",$AH[0],2)',
1144
- '&rorx ("esi",$AH[0],13)',
1145
- '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e)
1146
- '&rorx ("ecx",$AH[0],22)',
1147
- '&xor ("esi","edi")',
1148
- '&mov ("edi",&off($b))',
1149
- '&xor ("ecx","esi")', # Sigma0(a)
1150
-
1151
- '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round
1152
- '&add ($E,&off($h))', # T += h
1153
- '&and ($AH[1],$AH[0])', # (b^c) &= (a^b)
1154
- '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i]
1155
- '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b)
1156
-
1157
- '&add ("ecx",$E)', # h += T
1158
- '&add ($E,&off($d))', # d += T
1159
- '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a)
1160
- # the "." above joins the final lea with the bookkeeping below into one string, so one eval does both
1161
- '@AH = reverse(@AH); $i++;' # rotate(a,h)
1162
- );
1163
- }
1164
-
1165
- &set_label("AVX_BMI",32); # entry of the AVX+BMI code path; same structure as the AVX path, with bodyx_00_15 as the round body
1166
- &lea ("esp",&DWP(-96,"esp"));
1167
- &vzeroall ();
1168
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
1169
- &mov ($AH[0],&DWP(0,"esi"));
1170
- &mov ($AH[1],&DWP(4,"esi"));
1171
- &mov ("ecx",&DWP(8,"esi"));
1172
- &mov ("edi",&DWP(12,"esi"));
1173
- #&mov (&DWP(0,"esp"),$AH[0]);
1174
- &mov (&DWP(4,"esp"),$AH[1]);
1175
- &xor ($AH[1],"ecx"); # magic
1176
- &mov (&DWP(8,"esp"),"ecx");
1177
- &mov (&DWP(12,"esp"),"edi");
1178
- &mov ($E,&DWP(16,"esi"));
1179
- &mov ("edi",&DWP(20,"esi"));
1180
- &mov ("ecx",&DWP(24,"esi"));
1181
- &mov ("esi",&DWP(28,"esi")); # esi (ctx) is overwritten here; the epilogue reloads ctx from 96(esp)
1182
- #&mov (&DWP(16,"esp"),$E);
1183
- &mov (&DWP(20,"esp"),"edi");
1184
- &mov ("edi",&DWP(96+4,"esp")); # inp
1185
- &mov (&DWP(24,"esp"),"ecx");
1186
- &mov (&DWP(28,"esp"),"esi");
1187
- &vmovdqa ($t3,&QWP(256,$K256)); # 16-byte constant at K256+256, used as the vpshufb mask below
1188
- &jmp (&label("grand_avx_bmi"));
1189
- # outer loop: one pass per 64-byte input block
1190
- &set_label("grand_avx_bmi",32);
1191
- # load input, reverse byte order, add K256[0..15], save to stack
1192
- &vmovdqu (@X[0],&QWP(0,"edi"));
1193
- &vmovdqu (@X[1],&QWP(16,"edi"));
1194
- &vmovdqu (@X[2],&QWP(32,"edi"));
1195
- &vmovdqu (@X[3],&QWP(48,"edi"));
1196
- &add ("edi",64);
1197
- &vpshufb (@X[0],@X[0],$t3);
1198
- &mov (&DWP(96+4,"esp"),"edi"); # store the advanced input pointer back to its frame slot
1199
- &vpshufb (@X[1],@X[1],$t3);
1200
- &vpshufb (@X[2],@X[2],$t3);
1201
- &vpaddd ($t0,@X[0],&QWP(0,$K256));
1202
- &vpshufb (@X[3],@X[3],$t3); # last use of the mask; $t3 is overwritten three instructions later
1203
- &vpaddd ($t1,@X[1],&QWP(16,$K256));
1204
- &vpaddd ($t2,@X[2],&QWP(32,$K256));
1205
- &vpaddd ($t3,@X[3],&QWP(48,$K256));
1206
- &vmovdqa (&QWP(32+0,"esp"),$t0);
1207
- &vmovdqa (&QWP(32+16,"esp"),$t1);
1208
- &vmovdqa (&QWP(32+32,"esp"),$t2);
1209
- &vmovdqa (&QWP(32+48,"esp"),$t3);
1210
- &jmp (&label("avx_bmi_00_47"));
1211
- # inner loop head: each pass advances the table pointer by 64 bytes (16 dwords)
1212
- &set_label("avx_bmi_00_47",16);
1213
- &add ($K256,64);
1214
-
1215
- for ($i=0,$j=0; $j<4; $j++) { # emit the loop body: four vector updates, each interleaved with four scalar rounds
1216
- &AVX_00_47($j,\&bodyx_00_15,@X);
1217
- push(@X,shift(@X)); # rotate(@X)
1218
- }
1219
- &cmp (&DWP(16*$j,$K256),0x00010203); # $j is 4 here: tests the dword 64 bytes ahead; 0x00010203 is presumably the start of the mask that follows the table - confirm against the K256 data
1220
- &jne (&label("avx_bmi_00_47"));
1221
- # final 16 rounds: no further message update, so the round strings are emitted back to back
1222
- for ($i=0; $i<16; ) { # $i is advanced by the "$i++" inside the eval'ed round strings
1223
- foreach(bodyx_00_15()) { eval; }
1224
- }
1225
- # add the working variables to ctx->h[0..7] and refresh the on-stack copy for the next block
1226
- &mov ("esi",&DWP(96,"esp")); #ctx
1227
- #&mov ($AH[0],&DWP(0,"esp"));
1228
- &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
1229
- #&mov ("edi", &DWP(8,"esp"));
1230
- &mov ("ecx",&DWP(12,"esp"));
1231
- &add ($AH[0],&DWP(0,"esi"));
1232
- &add ($AH[1],&DWP(4,"esi"));
1233
- &add ("edi",&DWP(8,"esi"));
1234
- &add ("ecx",&DWP(12,"esi"));
1235
- &mov (&DWP(0,"esi"),$AH[0]);
1236
- &mov (&DWP(4,"esi"),$AH[1]);
1237
- &mov (&DWP(8,"esi"),"edi");
1238
- &mov (&DWP(12,"esi"),"ecx");
1239
- #&mov (&DWP(0,"esp"),$AH[0]);
1240
- &mov (&DWP(4,"esp"),$AH[1]);
1241
- &xor ($AH[1],"edi"); # magic
1242
- &mov (&DWP(8,"esp"),"edi");
1243
- &mov (&DWP(12,"esp"),"ecx");
1244
- #&mov ($E,&DWP(16,"esp"));
1245
- &mov ("edi",&DWP(20,"esp"));
1246
- &mov ("ecx",&DWP(24,"esp"));
1247
- &add ($E,&DWP(16,"esi"));
1248
- &add ("edi",&DWP(20,"esi"));
1249
- &add ("ecx",&DWP(24,"esi"));
1250
- &mov (&DWP(16,"esi"),$E);
1251
- &mov (&DWP(20,"esi"),"edi");
1252
- &mov (&DWP(20,"esp"),"edi");
1253
- &mov ("edi",&DWP(28,"esp"));
1254
- &mov (&DWP(24,"esi"),"ecx");
1255
- #&mov (&DWP(16,"esp"),$E);
1256
- &add ("edi",&DWP(28,"esi"));
1257
- &mov (&DWP(24,"esp"),"ecx");
1258
- &mov (&DWP(28,"esi"),"edi");
1259
- &mov (&DWP(28,"esp"),"edi");
1260
- &mov ("edi",&DWP(96+4,"esp")); # inp
1261
-
1262
- &vmovdqa ($t3,&QWP(64,$K256)); # reload the vpshufb mask: the pointer is 3*64 past the table start here, so +64 is the same +256 slot as in the prologue
1263
- &sub ($K256,3*64); # rewind K
1264
- &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
1265
- &jb (&label("grand_avx_bmi"));
1266
-
1267
- &mov ("esp",&DWP(96+12,"esp")); # restore sp
1268
- &vzeroall ();
1269
- &function_end_A();
1270
- }
1271
- }
1272
- }}}
1273
- &function_end_B("sha256_block_data_order"); # presumably closes the function opened earlier in the file (each code path above already ended with function_end_A) - confirm against the perlasm helper
1274
- # NOTE(review): asm_finish is defined outside this chunk; by its name it finalizes the generated output - confirm
1275
- &asm_finish();